Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Due to inactivity, this project is scheduled to be deleted on 2035-04-24.
Why is this scheduled?
Open sidebar
Cyril Labbe
scidetect
Commits
a555af2b
Commit
a555af2b
authored
Mar 10, 2015
by
Tien
Browse files
separate text and corpus class
parent
90fbf481
Changes
7
Hide whitespace changes
Inline
Side-by-side
src/fr/imag/forge/Scidetect/Checker/Classifier.java
View file @
a555af2b
...
...
@@ -84,7 +84,7 @@ public class Classifier {
}
/**
* Check whether the distance is below, between, or above the two thresholds.
* @param result a string composed having for each classe the value of its NN
* @param result a string composed having for each classe
s
the value of its NN
* @return
*/
private
String
checkdistant
(
String
result
)
{
...
...
src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java
View file @
a555af2b
...
...
@@ -21,6 +21,7 @@ import java.util.HashSet;
import
java.util.Set
;
import
fr.imag.forge.Scidetect.Checker.Utils.DistancesSet
;
import
fr.imag.forge.scidetect.Corpus.Corpus
;
/**
*
...
...
@@ -31,7 +32,7 @@ public class DistantCalculator {
//private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>();
private
DistancesSet
distant
=
new
DistancesSet
();
public
DistancesSet
caldistant
(
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
samples
,
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
tests
)
{
public
DistancesSet
caldistant
(
Corpus
samples
,
Corpus
tests
)
{
for
(
String
key
:
tests
.
keySet
())
{
//HashMap<String, Double> distantto = new HashMap<String, Double>();
for
(
String
key2
:
samples
.
keySet
())
{
...
...
src/fr/imag/forge/Scidetect/Checker/Reader.java
View file @
a555af2b
...
...
@@ -16,8 +16,12 @@
*/
package
fr.imag.forge.Scidetect.Checker
;
import
com.sun.corba.se.spi.transport.CorbaAcceptor
;
import
fr.imag.forge.Scidetect.TextExtractor.Xmlextractor
;
import
fr.imag.forge.Scidetect.TextExtractor.pdfextractor
;
import
fr.imag.forge.scidetect.Corpus.Corpus
;
import
fr.imag.forge.scidetect.Corpus.ProcessText
;
import
fr.imag.forge.scidetect.Corpus.Text
;
import
java.io.BufferedReader
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
...
...
@@ -36,8 +40,10 @@ import java.util.List;
*/
public
class
Reader
{
private
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
samples
=
new
HashMap
<
String
,
HashMap
<
String
,
Integer
>>();
private
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
tests
=
new
HashMap
<
String
,
HashMap
<
String
,
Integer
>>();
//private HashMap<String, HashMap<String, Integer>> samples = new HashMap<String, HashMap<String, Integer>>();
// private HashMap<String, HashMap<String, Integer>> tests = new HashMap<String, HashMap<String, Integer>>();
private
Corpus
samples
=
new
Corpus
();
private
Corpus
test
=
new
Corpus
();
private
String
SamplesFolder
;
private
int
maxlength
;
...
...
@@ -56,149 +62,51 @@ public class Reader {
//other config should be read over here
if
(
b
[
0
].
equals
(
"Max_length"
))
{
maxlength
=
Integer
.
parseInt
(
b
[
1
]);
ProcessText
.
maxlength
=
Integer
.
parseInt
(
b
[
1
]);
}
}
}
}
public
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
readsamples
(
String
foldername
)
throws
IOException
{
public
Corpus
readsamples
(
String
foldername
)
throws
IOException
{
File
folder
=
new
File
(
foldername
);
File
[]
listOfFile
=
folder
.
listFiles
();
for
(
int
j
=
0
;
j
<
listOfFile
.
length
;
j
++)
{
if
(
listOfFile
[
j
].
isDirectory
())
{
readsamples
(
listOfFile
[
j
].
getPath
());
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".txt"
)
&&
!
listOfFile
[
j
].
getName
().
startsWith
(
"INDEX-"
))
{
// find if there is already index for it
String
indexname
=
"INDEX-"
+
listOfFile
[
j
].
getName
().
substring
(
0
,
listOfFile
[
j
].
getName
().
lastIndexOf
(
"."
))
+
".txt"
;
if
(
Arrays
.
asList
(
listOfFile
).
toString
().
contains
(
indexname
))
{
// System.out.println("lets read from index file");
readindexfile
(
listOfFile
[
j
].
getParent
()
+
"/"
+
indexname
);
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
))
{
pdfextractor
a
=
new
pdfextractor
();
String
content
=
a
.
pdfextract
(
listOfFile
[
j
]);
Indexer
b
=
new
Indexer
();
b
.
index
(
content
,
listOfFile
[
j
]);
readindexfile
(
listOfFile
[
j
].
getParent
()
+
"/"
+
indexname
);
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xml"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xtx"
)||(
listOfFile
[
j
].
getName
().
endsWith
(
".txt"
)
&&
!
listOfFile
[
j
].
getName
().
startsWith
(
"INDEX-"
)))
{
ArrayList
<
Text
>
text
=
new
ArrayList
<
Text
>();
ProcessText
textprocessor
=
new
ProcessText
();
text
=
textprocessor
.
newtext
(
listOfFile
[
j
],
listOfFile
);
for
(
int
i
=
0
;
i
<
text
.
size
();
i
++)
{
samples
.
put
(
text
.
get
(
i
));
}
}
}
return
samples
;
}
private
void
readindexfile
(
String
path
)
throws
IOException
{
File
index
=
new
File
(
path
);
BufferedReader
br
;
br
=
new
BufferedReader
(
new
FileReader
(
index
));
String
line
;
HashMap
<
String
,
Integer
>
a
=
new
HashMap
<
String
,
Integer
>();
while
((
line
=
br
.
readLine
())
!=
null
)
{
String
[]
b
=
line
.
split
(
"\t"
);
a
.
put
(
b
[
0
],
Integer
.
parseInt
(
b
[
1
]));
}
br
.
close
();
if
(
path
.
contains
(
SamplesFolder
))
{
samples
.
put
(
path
,
a
);
}
else
{
tests
.
put
(
path
,
a
);
}
}
private
void
readfile
(
File
pdf
)
throws
IOException
{
String
content
=
""
;
if
(
pdf
.
getName
().
endsWith
(
".pdf"
))
{
pdfextractor
a
=
new
pdfextractor
();
content
=
a
.
pdfextract
(
pdf
);
}
else
if
(
pdf
.
getName
().
endsWith
(
".xml"
)
||
pdf
.
getName
().
endsWith
(
".xtx"
))
{
Xmlextractor
a
=
new
Xmlextractor
();
content
=
a
.
xmlextract
(
pdf
);
}
//lets deal with long file over here
//split content and the index part by part
if
(
content
.
length
()
<
maxlength
)
{
String
indexname
=
"INDEX-"
+
pdf
.
getName
().
substring
(
0
,
pdf
.
getName
().
lastIndexOf
(
"."
))
+
".txt"
;
Indexer
b
=
new
Indexer
();
b
.
index
(
content
,
pdf
);
readindexfile
(
pdf
.
getParent
()
+
"/"
+
indexname
);
}
else
{
String
[]
part
=
splitcontent
(
content
);
for
(
int
i
=
0
;
i
<
part
.
length
;
i
++)
{
String
indexname
=
"INDEX-"
+
pdf
.
getName
().
substring
(
0
,
pdf
.
getName
().
lastIndexOf
(
"."
))
+
"_part"
+
i
+
".txt"
;
String
filename
=
pdf
.
getName
().
substring
(
0
,
pdf
.
getName
().
lastIndexOf
(
"."
))
+
"_part"
+
i
+
".txt"
;
Indexer
b
=
new
Indexer
();
File
a
=
new
File
(
pdf
.
getParent
()
+
"/"
+
filename
);
PrintWriter
out
=
new
PrintWriter
(
new
FileWriter
(
a
));
out
.
println
(
part
[
i
]);
//System.out.println(text);
out
.
close
();
b
.
index
(
part
[
i
],
a
);
readindexfile
(
a
.
getParent
()
+
"/"
+
indexname
);
return
samples
;
}
}
}
public
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
readtests
(
String
testpath
)
throws
IOException
{
File
folder
=
new
File
(
testpath
);
if
(
folder
.
isDirectory
()
)
{
File
[]
listOfFile
=
folder
.
listFiles
();
for
(
int
j
=
0
;
j
<
listOfFile
.
length
;
j
++
)
{
if
(
listOfFile
[
j
].
isDirectory
())
{
readtests
(
listOfFile
[
j
]
.
getPath
()
);
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xml"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xtx"
)
)
{
readfile
(
listOfFile
[
j
]
);
public
Corpus
readtests
(
String
foldername
)
throws
IOException
{
File
folder
=
new
File
(
foldername
);
File
[]
listOfFile
=
folder
.
listFiles
();
for
(
int
j
=
0
;
j
<
listOfFile
.
length
;
j
++
)
{
if
(
listOfFile
[
j
].
isDirectory
())
{
readtests
(
listOfFile
[
j
].
getPath
());
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xml"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xtx"
)
)
{
ArrayList
<
Text
>
text
=
new
ArrayList
<
Text
>();
ProcessText
textprocessor
=
new
ProcessText
();
text
=
textprocessor
.
newtext
(
listOfFile
[
j
]
,
listOfFile
);
for
(
int
i
=
0
;
i
<
text
.
size
();
i
++
)
{
test
.
put
(
text
.
get
(
i
)
);
}
}
}
else
if
(
folder
.
getName
().
endsWith
(
".pdf"
)
||
folder
.
getName
().
endsWith
(
".xml"
)
||
folder
.
getName
().
endsWith
(
".xtx"
))
{
readfile
(
folder
);
}
return
tests
;
}
private
String
[]
splitcontent
(
String
content
)
{
int
nbofpart
=
content
.
length
()
/
maxlength
;
String
[]
part
=
new
String
[
nbofpart
+
1
];
int
lower
=
0
;
int
upper
=
0
;
int
i
;
for
(
i
=
0
;
i
<
nbofpart
;
i
++)
{
upper
+=
maxlength
;
part
[
i
]
=
content
.
substring
(
lower
,
upper
);
lower
=
upper
;
}
return
test
;
if
(
upper
<=
content
.
length
()
-
1
)
{
lower
=
upper
;
upper
=
content
.
length
();
part
[
i
]
=
(
content
.
substring
(
lower
,
upper
));
}
return
part
;
}
}
src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java
View file @
a555af2b
...
...
@@ -23,6 +23,7 @@ import fr.imag.forge.Scidetect.Checker.Reader;
import
fr.imag.forge.Scidetect.Checker.Utils.DistancesSet
;
import
fr.imag.forge.Scidetect.Logger.Log
;
import
fr.imag.forge.Scidetect.TextExtractor.pdfextractor
;
import
fr.imag.forge.scidetect.Corpus.Corpus
;
import
java.io.BufferedReader
;
import
java.io.File
;
...
...
@@ -46,8 +47,8 @@ public class ScigenChecker_Local {
// private String detailloglocation;
private
String
testpath
;
//private String logtime;
private
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
samples
=
new
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
();
private
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
tests
=
new
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
();
private
Corpus
samples
=
new
Corpus
();
private
Corpus
tests
=
new
Corpus
();
private
String
SamplesFolder
;
//private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>();
DistancesSet
distant
=
new
DistancesSet
();
...
...
src/fr/imag/forge/scidetect/Corpus/Corpus.java
0 → 100644
View file @
a555af2b
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.scidetect.Corpus
;
import
java.util.HashMap
;
import
java.util.Set
;
/**
 * A set of {@link Text} entries, each stored under the name it reports
 * through {@code getname()}.
 *
 * @author tien
 */
public class Corpus {

    /** Backing map: text name -> text stored under that name. */
    private HashMap<String, Text> corpus = new HashMap<String, Text>();

    /**
     * Stores a text under its own name. A text previously stored under the
     * same name is replaced.
     *
     * @param a the text to store
     */
    public void put(Text a) {
        String key = a.getname();
        this.corpus.put(key, a);
    }

    /**
     * Returns the backing map itself; no copy is made, so changes made
     * through the returned map are changes to this corpus.
     *
     * @return the map from text name to text
     */
    public HashMap<String, Text> getcorpus() {
        return this.corpus;
    }

    /**
     * Returns the names of all stored texts.
     *
     * @return the key set of the backing map
     */
    public Set<String> keySet() {
        return this.corpus.keySet();
    }

    /**
     * Looks a text up by name and returns its index map.
     * The name is not checked: asking for a name that was never stored
     * fails with a {@link NullPointerException}.
     *
     * @param name the name the text was stored under
     * @return the map returned by that text's {@code getindex()}
     */
    public HashMap<String, Integer> get(String name) {
        Text entry = this.corpus.get(name);
        return entry.getindex();
    }
}
src/fr/imag/forge/scidetect/Corpus/ProcessText.java
0 → 100644
View file @
a555af2b
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.scidetect.Corpus
;
import
fr.imag.forge.Scidetect.Checker.Indexer
;
import
fr.imag.forge.Scidetect.TextExtractor.Xmlextractor
;
import
fr.imag.forge.Scidetect.TextExtractor.pdfextractor
;
import
java.io.BufferedReader
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
import
java.io.FileReader
;
import
java.io.FileWriter
;
import
java.io.IOException
;
import
java.io.PrintWriter
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.logging.Level
;
import
java.util.logging.Logger
;
/**
 * Turns one source file of the corpus into one or more {@link Text} entries.
 * <p>
 * If an index file named {@code INDEX-<basename>.txt} is already listed for
 * the file, that index is loaded. Otherwise the content is extracted (pdf,
 * xml or xtx), indexed through {@link Indexer}, and the resulting index file
 * is loaded. Content of {@link #maxlength} characters or more is cut into
 * parts, and every part is written out, indexed and loaded separately.
 *
 * @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
 */
public class ProcessText {

    /**
     * Number of characters from which content is split into parts.
     * A value of zero or less (the default when it was never configured)
     * disables splitting.
     */
    public static int maxlength;

    /**
     * Texts loaded so far by this instance. The list is never cleared, so
     * repeated calls to {@link #newtext(File, File[])} on the same instance
     * accumulate their results.
     */
    ArrayList<Text> text = new ArrayList<Text>();

    /**
     * Processes a file (pdf, xml, xtx) and loads the index of the file, or of
     * each of its parts when the content had to be split.
     *
     * @param original   the file to process
     * @param listOfFile the files of the directory being scanned; used only to
     *                   find out whether an index file already exists
     * @return the internal list of texts loaded by this instance (not a copy)
     * @throws IOException if an index file or a part file cannot be read or written
     */
    public ArrayList<Text> newtext(File original, File[] listOfFile) throws IOException {
        String basename = stripExtension(original.getName());
        String indexname = "INDEX-" + basename + ".txt";
        String content = "";

        // find if there is already index for it
        if (containsFileNamed(listOfFile, indexname)) {
            readindexfile(original.getParent() + "/" + indexname);
            return text;
        }

        if (original.getName().endsWith(".pdf")) {
            try {
                pdfextractor a = new pdfextractor();
                content = a.pdfextract(original);
            } catch (FileNotFoundException ex) {
                // Best effort: the failure is logged and the (empty) content
                // is still indexed below.
                Logger.getLogger(ProcessText.class.getName()).log(Level.SEVERE, null, ex);
            }
        } else if (original.getName().endsWith(".xml") || original.getName().endsWith(".xtx")) {
            Xmlextractor a = new Xmlextractor();
            content = a.xmlextract(original);
        }
        // NOTE(review): a file with any other extension (e.g. .txt) gets no
        // extractor here, so its content stays empty - confirm this is intended.

        // lets deal with long file over here
        // split content and the index part by part
        if (maxlength <= 0 || content.length() < maxlength) {
            Indexer b = new Indexer();
            // presumably writes INDEX-<basename>.txt next to the file; that is
            // the path read back right after - verify against Indexer
            b.index(content, original);
            readindexfile(original.getParent() + "/" + indexname);
        } else {
            String[] part = splitcontent(content);
            for (int i = 0; i < part.length; i++) {
                String filename = basename + "_part" + i + ".txt";
                String indexnameparti = "INDEX-" + filename;
                Indexer b = new Indexer();
                File a = new File(original.getParent() + "/" + filename);
                PrintWriter out = new PrintWriter(new FileWriter(a));
                try {
                    out.println(part[i]);
                } finally {
                    // close even if writing fails, so the file handle is not leaked
                    out.close();
                }
                b.index(part[i], a);
                readindexfile(a.getParent() + "/" + indexnameparti);
            }
        }
        return text;
    }

    /**
     * Returns a file name without its last extension; a name without any dot
     * is returned unchanged.
     */
    private static String stripExtension(String filename) {
        int dot = filename.lastIndexOf(".");
        return dot < 0 ? filename : filename.substring(0, dot);
    }

    /**
     * Tells whether one of the listed files has exactly the given name.
     * An exact comparison is used so that names merely containing the wanted
     * name (for example {@code MYINDEX-a.txt}) do not count as a match.
     */
    private static boolean containsFileNamed(File[] files, String name) {
        if (files == null) {
            return false;
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i] != null && name.equals(files[i].getName())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads an index file made of {@code key<TAB>integer} lines, wraps it in a
     * new {@link Text} named after the path, and adds that text to the
     * internal list.
     *
     * @param path path of the index file
     * @return the map read from the file
     * @throws IOException if the file cannot be opened or read
     */
    private HashMap<String, Integer> readindexfile(String path) throws IOException {
        HashMap<String, Integer> a = new HashMap<String, Integer>();
        BufferedReader br = new BufferedReader(new FileReader(new File(path)));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                String[] b = line.split("\t");
                a.put(b[0], Integer.parseInt(b[1]));
            }
        } finally {
            // close even when a line cannot be parsed, so the reader is not leaked
            br.close();
        }
        Text c = new Text();
        c.setindex(a);
        c.setname(path);
        text.add(c);
        return a;
    }

    /**
     * Cuts the content into consecutive parts of {@link #maxlength}
     * characters; the last part holds whatever remains. No part is ever
     * {@code null}: content whose length is an exact multiple of
     * {@code maxlength} yields exactly {@code length / maxlength} parts.
     * When {@code maxlength} is zero or less the content is returned as a
     * single part.
     *
     * @param content the text to split
     * @return the parts, in order
     */
    private String[] splitcontent(String content) {
        if (maxlength <= 0) {
            return new String[] { content };
        }
        int length = content.length();
        int nbofpart = length / maxlength;
        if (length % maxlength != 0) {
            nbofpart++; // one extra, shorter part for the remainder
        }
        String[] part = new String[nbofpart];
        int lower = 0;
        for (int i = 0; i < nbofpart; i++) {
            // written as a subtraction so that lower + maxlength cannot overflow
            int upper = (length - lower > maxlength) ? lower + maxlength : length;
            part[i] = content.substring(lower, upper);
            lower = upper;
        }
        return part;
    }
}
src/fr/imag/forge/scidetect/Corpus/Text.java
0 → 100644
View file @
a555af2b
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.scidetect.Corpus
;
import
java.util.HashMap
;
/**
 * One text of a corpus: a name together with an index map.
 *
 * @author tien
 */
public class Text {

    /** Name of this text; empty until {@link #setname(String)} is called. */
    private String name = "";

    /** Initialised to the empty string; no method of this class reads or writes it. */
    private String cleantext = "";

    /** Index map of this text; empty until {@link #setindex(HashMap)} is called. */
    private HashMap<String, Integer> index = new HashMap<String, Integer>();

    /**
     * @return the name of this text
     */
    public String getname() {
        return this.name;
    }

    /**
     * @param a the new name of this text
     */
    public void setname(String a) {
        this.name = a;
    }

    /**
     * @return the index map currently held (the stored reference, not a copy)
     */
    public HashMap<String, Integer> getindex() {
        return this.index;
    }

    /**
     * Replaces the index map; the given map is stored as is, without copying.
     *
     * @param a the new index map
     */
    public void setindex(HashMap<String, Integer> a) {
        this.index = a;
    }
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment