Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
scidetect
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Cyril Labbe
scidetect
Commits
a555af2b
Commit
a555af2b
authored
Mar 10, 2015
by
Tien
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
separate text and corpus class
parent
90fbf481
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
282 additions
and
126 deletions
+282
-126
src/fr/imag/forge/Scidetect/Checker/Classifier.java
src/fr/imag/forge/Scidetect/Checker/Classifier.java
+1
-1
src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java
src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java
+2
-1
src/fr/imag/forge/Scidetect/Checker/Reader.java
src/fr/imag/forge/Scidetect/Checker/Reader.java
+30
-122
src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java
...ge/Scidetect/scigenchecker_local/ScigenChecker_Local.java
+3
-2
src/fr/imag/forge/scidetect/Corpus/Corpus.java
src/fr/imag/forge/scidetect/Corpus/Corpus.java
+45
-0
src/fr/imag/forge/scidetect/Corpus/ProcessText.java
src/fr/imag/forge/scidetect/Corpus/ProcessText.java
+153
-0
src/fr/imag/forge/scidetect/Corpus/Text.java
src/fr/imag/forge/scidetect/Corpus/Text.java
+48
-0
No files found.
src/fr/imag/forge/Scidetect/Checker/Classifier.java
View file @
a555af2b
...
...
@@ -84,7 +84,7 @@ public class Classifier {
}
/**
* Check whether the distance is below, between, or above the two thresholds.
* @param result a string composed having for each classe the value of its NN
* @param result a string composed having for each classe
s
the value of its NN
* @return
*/
private
String
checkdistant
(
String
result
)
{
...
...
src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java
View file @
a555af2b
...
...
@@ -21,6 +21,7 @@ import java.util.HashSet;
import
java.util.Set
;
import
fr.imag.forge.Scidetect.Checker.Utils.DistancesSet
;
import
fr.imag.forge.scidetect.Corpus.Corpus
;
/**
*
...
...
@@ -31,7 +32,7 @@ public class DistantCalculator {
//private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>();
private
DistancesSet
distant
=
new
DistancesSet
();
public
DistancesSet
caldistant
(
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
samples
,
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
tests
)
{
public
DistancesSet
caldistant
(
Corpus
samples
,
Corpus
tests
)
{
for
(
String
key
:
tests
.
keySet
())
{
//HashMap<String, Double> distantto = new HashMap<String, Double>();
for
(
String
key2
:
samples
.
keySet
())
{
...
...
src/fr/imag/forge/Scidetect/Checker/Reader.java
View file @
a555af2b
...
...
@@ -16,8 +16,12 @@
*/
package
fr.imag.forge.Scidetect.Checker
;
import
com.sun.corba.se.spi.transport.CorbaAcceptor
;
import
fr.imag.forge.Scidetect.TextExtractor.Xmlextractor
;
import
fr.imag.forge.Scidetect.TextExtractor.pdfextractor
;
import
fr.imag.forge.scidetect.Corpus.Corpus
;
import
fr.imag.forge.scidetect.Corpus.ProcessText
;
import
fr.imag.forge.scidetect.Corpus.Text
;
import
java.io.BufferedReader
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
...
...
@@ -36,8 +40,10 @@ import java.util.List;
*/
public
class
Reader
{
private
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
samples
=
new
HashMap
<
String
,
HashMap
<
String
,
Integer
>>();
private
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
tests
=
new
HashMap
<
String
,
HashMap
<
String
,
Integer
>>();
//private HashMap<String, HashMap<String, Integer>> samples = new HashMap<String, HashMap<String, Integer>>();
// private HashMap<String, HashMap<String, Integer>> tests = new HashMap<String, HashMap<String, Integer>>();
private
Corpus
samples
=
new
Corpus
();
private
Corpus
test
=
new
Corpus
();
private
String
SamplesFolder
;
private
int
maxlength
;
...
...
@@ -56,149 +62,51 @@ public class Reader {
//other config should be read over here
if
(
b
[
0
].
equals
(
"Max_length"
))
{
maxlength
=
Integer
.
parseInt
(
b
[
1
]);
ProcessText
.
maxlength
=
Integer
.
parseInt
(
b
[
1
]);
}
}
}
}
public
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
readsamples
(
String
foldername
)
throws
IOException
{
public
Corpus
readsamples
(
String
foldername
)
throws
IOException
{
File
folder
=
new
File
(
foldername
);
File
[]
listOfFile
=
folder
.
listFiles
();
for
(
int
j
=
0
;
j
<
listOfFile
.
length
;
j
++)
{
if
(
listOfFile
[
j
].
isDirectory
())
{
readsamples
(
listOfFile
[
j
].
getPath
());
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".txt"
)
&&
!
listOfFile
[
j
].
getName
().
startsWith
(
"INDEX-"
))
{
// find if there is already index for it
String
indexname
=
"INDEX-"
+
listOfFile
[
j
].
getName
().
substring
(
0
,
listOfFile
[
j
].
getName
().
lastIndexOf
(
"."
))
+
".txt"
;
if
(
Arrays
.
asList
(
listOfFile
).
toString
().
contains
(
indexname
))
{
// System.out.println("lets read from index file");
readindexfile
(
listOfFile
[
j
].
getParent
()
+
"/"
+
indexname
);
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
))
{
pdfextractor
a
=
new
pdfextractor
();
String
content
=
a
.
pdfextract
(
listOfFile
[
j
]);
Indexer
b
=
new
Indexer
();
b
.
index
(
content
,
listOfFile
[
j
]);
readindexfile
(
listOfFile
[
j
].
getParent
()
+
"/"
+
indexname
);
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xml"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xtx"
)||(
listOfFile
[
j
].
getName
().
endsWith
(
".txt"
)
&&
!
listOfFile
[
j
].
getName
().
startsWith
(
"INDEX-"
)))
{
ArrayList
<
Text
>
text
=
new
ArrayList
<
Text
>();
ProcessText
textprocessor
=
new
ProcessText
();
text
=
textprocessor
.
newtext
(
listOfFile
[
j
],
listOfFile
);
for
(
int
i
=
0
;
i
<
text
.
size
();
i
++)
{
samples
.
put
(
text
.
get
(
i
));
}
}
}
return
samples
;
}
private
void
readindexfile
(
String
path
)
throws
IOException
{
File
index
=
new
File
(
path
);
BufferedReader
br
;
br
=
new
BufferedReader
(
new
FileReader
(
index
));
String
line
;
HashMap
<
String
,
Integer
>
a
=
new
HashMap
<
String
,
Integer
>();
while
((
line
=
br
.
readLine
())
!=
null
)
{
String
[]
b
=
line
.
split
(
"\t"
);
a
.
put
(
b
[
0
],
Integer
.
parseInt
(
b
[
1
]));
}
br
.
close
();
if
(
path
.
contains
(
SamplesFolder
))
{
samples
.
put
(
path
,
a
);
}
else
{
tests
.
put
(
path
,
a
);
}
}
private
void
readfile
(
File
pdf
)
throws
IOException
{
String
content
=
""
;
if
(
pdf
.
getName
().
endsWith
(
".pdf"
))
{
pdfextractor
a
=
new
pdfextractor
();
content
=
a
.
pdfextract
(
pdf
);
}
else
if
(
pdf
.
getName
().
endsWith
(
".xml"
)
||
pdf
.
getName
().
endsWith
(
".xtx"
))
{
Xmlextractor
a
=
new
Xmlextractor
();
content
=
a
.
xmlextract
(
pdf
);
}
//lets deal with long file over here
//split content and the index part by part
if
(
content
.
length
()
<
maxlength
)
{
String
indexname
=
"INDEX-"
+
pdf
.
getName
().
substring
(
0
,
pdf
.
getName
().
lastIndexOf
(
"."
))
+
".txt"
;
Indexer
b
=
new
Indexer
();
b
.
index
(
content
,
pdf
);
readindexfile
(
pdf
.
getParent
()
+
"/"
+
indexname
);
}
else
{
String
[]
part
=
splitcontent
(
content
);
for
(
int
i
=
0
;
i
<
part
.
length
;
i
++)
{
String
indexname
=
"INDEX-"
+
pdf
.
getName
().
substring
(
0
,
pdf
.
getName
().
lastIndexOf
(
"."
))
+
"_part"
+
i
+
".txt"
;
String
filename
=
pdf
.
getName
().
substring
(
0
,
pdf
.
getName
().
lastIndexOf
(
"."
))
+
"_part"
+
i
+
".txt"
;
Indexer
b
=
new
Indexer
();
File
a
=
new
File
(
pdf
.
getParent
()
+
"/"
+
filename
);
PrintWriter
out
=
new
PrintWriter
(
new
FileWriter
(
a
));
out
.
println
(
part
[
i
]);
//System.out.println(text);
out
.
close
();
b
.
index
(
part
[
i
],
a
);
readindexfile
(
a
.
getParent
()
+
"/"
+
indexname
);
return
samples
;
}
}
}
public
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
readtests
(
String
testpath
)
throws
IOException
{
File
folder
=
new
File
(
testpath
);
if
(
folder
.
isDirectory
()
)
{
File
[]
listOfFile
=
folder
.
listFiles
();
for
(
int
j
=
0
;
j
<
listOfFile
.
length
;
j
++
)
{
if
(
listOfFile
[
j
].
isDirectory
())
{
readtests
(
listOfFile
[
j
].
getPath
()
);
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xml"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xtx"
)
)
{
readfile
(
listOfFile
[
j
]
);
public
Corpus
readtests
(
String
foldername
)
throws
IOException
{
File
folder
=
new
File
(
foldername
);
File
[]
listOfFile
=
folder
.
listFiles
();
for
(
int
j
=
0
;
j
<
listOfFile
.
length
;
j
++
)
{
if
(
listOfFile
[
j
].
isDirectory
())
{
readtests
(
listOfFile
[
j
].
getPath
());
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xml"
)
||
listOfFile
[
j
].
getName
().
endsWith
(
".xtx"
)
)
{
ArrayList
<
Text
>
text
=
new
ArrayList
<
Text
>();
ProcessText
textprocessor
=
new
ProcessText
();
text
=
textprocessor
.
newtext
(
listOfFile
[
j
],
listOfFile
);
for
(
int
i
=
0
;
i
<
text
.
size
();
i
++
)
{
test
.
put
(
text
.
get
(
i
)
);
}
}
}
else
if
(
folder
.
getName
().
endsWith
(
".pdf"
)
||
folder
.
getName
().
endsWith
(
".xml"
)
||
folder
.
getName
().
endsWith
(
".xtx"
))
{
readfile
(
folder
);
}
return
tests
;
}
private
String
[]
splitcontent
(
String
content
)
{
int
nbofpart
=
content
.
length
()
/
maxlength
;
String
[]
part
=
new
String
[
nbofpart
+
1
];
int
lower
=
0
;
int
upper
=
0
;
int
i
;
for
(
i
=
0
;
i
<
nbofpart
;
i
++)
{
upper
+=
maxlength
;
part
[
i
]
=
content
.
substring
(
lower
,
upper
);
lower
=
upper
;
}
return
test
;
if
(
upper
<=
content
.
length
()
-
1
)
{
lower
=
upper
;
upper
=
content
.
length
();
part
[
i
]
=
(
content
.
substring
(
lower
,
upper
));
}
return
part
;
}
}
src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java
View file @
a555af2b
...
...
@@ -23,6 +23,7 @@ import fr.imag.forge.Scidetect.Checker.Reader;
import
fr.imag.forge.Scidetect.Checker.Utils.DistancesSet
;
import
fr.imag.forge.Scidetect.Logger.Log
;
import
fr.imag.forge.Scidetect.TextExtractor.pdfextractor
;
import
fr.imag.forge.scidetect.Corpus.Corpus
;
import
java.io.BufferedReader
;
import
java.io.File
;
...
...
@@ -46,8 +47,8 @@ public class ScigenChecker_Local {
// private String detailloglocation;
private
String
testpath
;
//private String logtime;
private
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
samples
=
new
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
();
private
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
tests
=
new
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
();
private
Corpus
samples
=
new
Corpus
();
private
Corpus
tests
=
new
Corpus
();
private
String
SamplesFolder
;
//private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>();
DistancesSet
distant
=
new
DistancesSet
();
...
...
src/fr/imag/forge/scidetect/Corpus/Corpus.java
0 → 100644
View file @
a555af2b
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.scidetect.Corpus
;
import
java.util.HashMap
;
import
java.util.Set
;
/**
 * A set of {@link Text} objects addressed by their name.
 *
 * @author tien
 */
public class Corpus {

    /** Texts of this corpus, keyed by the value returned by {@code Text.getname()}. */
    private HashMap<String, Text> corpus = new HashMap<String, Text>();

    /**
     * Stores a text under its own name. A text already stored under the same
     * name is replaced.
     *
     * @param a the text to add
     */
    public void put(Text a) {
        corpus.put(a.getname(), a);
    }

    /**
     * @return the backing map itself (not a copy), so changes made through it
     *         are visible in this corpus
     */
    public HashMap<String, Text> getcorpus() {
        return corpus;
    }

    /**
     * @return the names of all texts currently stored
     */
    public Set<String> keySet() {
        return corpus.keySet();
    }

    /**
     * Looks up the index of the text stored under the given name.
     *
     * @param name name of the text
     * @return the index of that text, or {@code null} when no text is stored
     *         under {@code name} (same convention as {@code HashMap.get})
     */
    public HashMap<String, Integer> get(String name) {
        Text found = corpus.get(name);
        // An unknown name used to end in a NullPointerException here.
        if (found == null) {
            return null;
        }
        return found.getindex();
    }
}
src/fr/imag/forge/scidetect/Corpus/ProcessText.java
0 → 100644
View file @
a555af2b
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.scidetect.Corpus
;
import
fr.imag.forge.Scidetect.Checker.Indexer
;
import
fr.imag.forge.Scidetect.TextExtractor.Xmlextractor
;
import
fr.imag.forge.Scidetect.TextExtractor.pdfextractor
;
import
java.io.BufferedReader
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
import
java.io.FileReader
;
import
java.io.FileWriter
;
import
java.io.IOException
;
import
java.io.PrintWriter
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.logging.Level
;
import
java.util.logging.Logger
;
/**
 * Manage texts file in the corpus: turns a source file into one or more
 * {@link Text} objects by reading (and, when missing, first creating) its
 * "INDEX-" files.
 *
 * @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
 */
public class ProcessText {

    /**
     * Maximum number of characters handled as a single text; longer contents
     * are cut into parts of at most this size. A value of zero or less (the
     * default when nothing was configured) disables splitting.
     */
    public static int maxlength;

    /**
     * Texts read so far by this instance. The list is never cleared, so it
     * keeps growing when the same instance processes several files.
     */
    ArrayList<Text> text = new ArrayList<Text>();

    /**
     * Process a File (pdf, xml, xtx) to create the list of texts for it (more
     * than one text in case the content had to be split).
     * <p>
     * When an index file named {@code INDEX-<basename>.txt} is present in
     * {@code listOfFile} it is read directly; otherwise the content is
     * extracted, indexed (part by part when it is too long) and the freshly
     * written index files are read back.
     *
     * @param original   the file to process
     * @param listOfFile the files located next to {@code original}, used to
     *                   detect an already existing index file; may be
     *                   {@code null}, which is treated as "no index present"
     * @return the list held by this instance, including every text read by
     *         this call
     * @throws IOException if an index or part file cannot be read or written
     */
    public ArrayList<Text> newtext(File original, File[] listOfFile) throws IOException {
        String basename = basename(original.getName());
        String indexname = "INDEX-" + basename + ".txt";

        // find if there is already index for it
        if (hasfilenamed(listOfFile, indexname)) {
            readindexfile(original.getParent() + "/" + indexname);
            return text;
        }

        String content = extractcontent(original);

        //lets deal with long file over here
        //split content and the index part by part
        if (maxlength <= 0 || content.length() < maxlength) {
            Indexer b = new Indexer();
            b.index(content, original);
            readindexfile(original.getParent() + "/" + indexname);
            return text;
        }

        String[] part = splitcontent(content);
        for (int i = 0; i < part.length; i++) {
            String filename = basename + "_part" + i + ".txt";
            String indexnameparti = "INDEX-" + filename;
            File a = new File(original.getParent() + "/" + filename);
            writepart(a, part[i]);
            Indexer b = new Indexer();
            b.index(part[i], a);
            readindexfile(a.getParent() + "/" + indexnameparti);
        }
        return text;
    }

    /** Returns the file name without its last extension (whole name when it has none). */
    private String basename(String filename) {
        int dot = filename.lastIndexOf(".");
        return dot < 0 ? filename : filename.substring(0, dot);
    }

    /**
     * Tells whether one of the given files is named exactly {@code wanted}.
     * An exact comparison avoids the false hits of a substring search, e.g.
     * "OLDINDEX-a.txt" being taken for "INDEX-a.txt".
     */
    private boolean hasfilenamed(File[] files, String wanted) {
        if (files == null) {
            return false;
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i] != null && wanted.equals(files[i].getName())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts the raw content of a pdf, xml or xtx file. Any other extension
     * yields an empty string, as does a pdf that cannot be found (the failure
     * is logged).
     */
    private String extractcontent(File original) throws IOException {
        String filename = original.getName();
        if (filename.endsWith(".pdf")) {
            try {
                pdfextractor a = new pdfextractor();
                return a.pdfextract(original);
            } catch (FileNotFoundException ex) {
                Logger.getLogger(ProcessText.class.getName()).log(Level.SEVERE, null, ex);
                return "";
            }
        }
        if (filename.endsWith(".xml") || filename.endsWith(".xtx")) {
            Xmlextractor a = new Xmlextractor();
            return a.xmlextract(original);
        }
        return "";
    }

    /** Writes one part of a split content to the given file, followed by a line break. */
    private void writepart(File target, String content) throws IOException {
        PrintWriter out = new PrintWriter(new FileWriter(target));
        try {
            out.println(content);
        } finally {
            // close even when writing fails so the file handle is not leaked
            out.close();
        }
    }

    /**
     * Reads an index file made of "key TAB integer" lines, wraps it in a new
     * {@link Text} named after {@code path} and appends that text to
     * {@link #text}.
     *
     * @param path location of the index file
     * @return the index that was read
     * @throws IOException if the file cannot be read
     */
    private HashMap<String, Integer> readindexfile(String path) throws IOException {
        HashMap<String, Integer> a = new HashMap<String, Integer>();
        BufferedReader br = new BufferedReader(new FileReader(new File(path)));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                String[] b = line.split("\t");
                a.put(b[0], Integer.parseInt(b[1]));
            }
        } finally {
            // close even when a line cannot be parsed
            br.close();
        }
        Text c = new Text();
        c.setindex(a);
        c.setname(path);
        text.add(c);
        return a;
    }

    /**
     * Cuts a content into consecutive parts of at most {@link #maxlength}
     * characters. Only the last part may be shorter; no empty or {@code null}
     * part is produced.
     *
     * @param content the content to split
     * @return the parts in order; the content itself as single part when
     *         {@code maxlength} is zero or less
     */
    private String[] splitcontent(String content) {
        if (maxlength <= 0) {
            // no usable limit: avoid dividing by zero and keep the content whole
            return new String[]{content};
        }
        int length = content.length();
        int nbofpart = length / maxlength;
        if (length % maxlength != 0) {
            // room for the shorter remainder
            nbofpart++;
        }
        String[] part = new String[nbofpart];
        int lower = 0;
        for (int i = 0; i < nbofpart; i++) {
            int upper = (length - lower > maxlength) ? lower + maxlength : length;
            part[i] = content.substring(lower, upper);
            lower = upper;
        }
        return part;
    }
}
src/fr/imag/forge/scidetect/Corpus/Text.java
0 → 100644
View file @
a555af2b
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.scidetect.Corpus
;
import
java.util.HashMap
;
/**
 * One text of a corpus: a name together with its index.
 *
 * @author tien
 */
public class Text {

    /** Index of this text: string keys mapped to integer values; empty at creation. */
    private HashMap<String, Integer> index = new HashMap<String, Integer>();

    /** Neither read nor written by any method of this class. */
    private String cleantext = "";

    /** Name of this text; the empty string until {@link #setname(String)} is called. */
    private String name = "";

    /**
     * @return the name last given to {@link #setname(String)}
     */
    public String getname() {
        return this.name;
    }

    /**
     * @param a the new name of this text
     */
    public void setname(String a) {
        this.name = a;
    }

    /**
     * @return the index map currently held (the same object, not a copy)
     */
    public HashMap<String, Integer> getindex() {
        return this.index;
    }

    /**
     * Replaces the index of this text; the given map is stored as is.
     *
     * @param a the new index
     */
    public void setindex(HashMap<String, Integer> a) {
        this.index = a;
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment