Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
scidetect
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Cyril Labbe
scidetect
Commits
06786a2e
Commit
06786a2e
authored
Feb 26, 2015
by
Cyril Labbe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Committer: Cyril Labbe
parent
94700418
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
123 additions
and
29 deletions
+123
-29
Makefile
Makefile
+4
-1
manifest.mf
manifest.mf
+2
-0
src/fr/imag/Scidetect/Checker/Classifier.java
src/fr/imag/Scidetect/Checker/Classifier.java
+38
-7
src/fr/imag/Scidetect/Checker/Indexer.java
src/fr/imag/Scidetect/Checker/Indexer.java
+5
-0
src/fr/imag/Scidetect/Logger/Log.java
src/fr/imag/Scidetect/Logger/Log.java
+6
-2
src/fr/imag/Scidetect/TextExtractor/pdfextractor.java
src/fr/imag/Scidetect/TextExtractor/pdfextractor.java
+9
-2
src/fr/imag/Scidetect/scigenchecker_local/ScigenChecker_Local.java
...ag/Scidetect/scigenchecker_local/ScigenChecker_Local.java
+59
-17
No files found.
Makefile
View file @
06786a2e
...
...
@@ -11,7 +11,7 @@ JAVACLASSPATH = -cp lib/pdfbox-app-1.8.8.jar
default
:
all
all
:
classes doc jar
all
:
classes doc jar
run
classes
:
mkdir
-p
classes
...
...
@@ -24,5 +24,8 @@ jar:
cd
classes
;
jar
-cfvm
../ScigenChecker_Local
`
date
+%Y-%m-%d
`
.jar ../MANIFEST.MF
*
;
cd
..
cp
ScigenChecker_Local
`
date
+%Y-%m-%d
`
.jar ScigenChecker_Local.jar
# Run the built checker on the Test directory, writing results to checklog.txt.
# The jar name must match the ScigenChecker_Local.jar copy made by the build
# (file names are case-sensitive on most systems).
run:
	java -jar ScigenChecker_Local.jar -l checklog.txt -c Test
clean
:
rm
-r
classes
;
rm
-r
doc
;
manifest.mf
View file @
06786a2e
Manifest-Version: 1.0
Class-Path: lib/pdfbox-app-1.8.8.jar
X-COMMENT: Main-Class will be added automatically by build
Main-Class: fr.imag.Scidetect.scigenchecker_local.ScigenChecker_Local
src/fr/imag/Scidetect/Checker/Classifier.java
View file @
06786a2e
...
...
@@ -25,13 +25,23 @@ import java.io.PrintWriter;
import
java.util.HashMap
;
/**
*
* Classifier tags input files as belonging to a certain class. Examples of classes are SCIgen, Mathgen, ...
* The decision is made according to the distance between the tested file and its nearest neighbor
* in each class. Thresholds for assignment are read from the file specified in the configuration file
* (default config.txt).
* @author Nguyen Minh Tien - minh-tien.nguyen@imag.frs
*/
public
class
Classifier
{
/**
* The key is the class name and the value is a pair of Doubles.
* This pair is composed of a threshold for quasi-certain classification and another for suspicion.
*/
HashMap
<
String
,
Double
[]>
Threshold
=
new
HashMap
<
String
,
Double
[]>();
/**
* Reads threshold in the configuration file (default config.txt).
* @throws FileNotFoundException
* @throws IOException
*/
public
void
readconfig
()
throws
FileNotFoundException
,
IOException
{
File
conf
=
new
File
(
"config.txt"
);
BufferedReader
br
=
new
BufferedReader
(
new
FileReader
(
conf
));
...
...
@@ -49,7 +59,13 @@ public class Classifier {
}
}
/**
* Classifies each document given the matrix of distances (distant).
* For each entry it gives the class (or classes) to which the text can be assigned.
* @param distant is a matrix of distances
* @return the assigned class
* @throws IOException
*/
public
String
classify
(
HashMap
<
String
,
HashMap
<
String
,
Double
>>
distant
)
throws
IOException
{
String
result
=
""
;
...
...
@@ -77,7 +93,11 @@ public class Classifier {
return
conclusion
;
}
/**
* Checks whether the distance is below, between, or above the two thresholds.
* @param result a string holding, for each class, the value of its nearest neighbor (NN)
* @return
*/
private
String
checkdistant
(
String
result
)
{
String
conclution
=
""
;
String
[]
eachtype
=
result
.
split
(
"\n"
);
...
...
@@ -105,7 +125,10 @@ public class Classifier {
}
return
conclution
;
}
/**
* @param result
* @return
*/
private
String
findmindistant
(
String
result
)
{
Double
mindistant
=
1.0
;
String
[]
eachtype
=
result
.
split
(
"\n"
);
...
...
@@ -120,7 +143,11 @@ public class Classifier {
}
return
conclu
;
}
/**
*
* @param indexpath
* @return
*/
private
String
gettype
(
String
indexpath
)
{
File
indexfile
=
new
File
(
indexpath
);
String
parent
=
indexfile
.
getParent
();
...
...
@@ -129,6 +156,10 @@ public class Classifier {
return
parent
;
}
/**
* @param distantto
* @return
*/
private
String
find_NN
(
HashMap
<
String
,
Double
>
distantto
)
{
HashMap
<
String
,
Double
>
distotype
=
new
HashMap
<
String
,
Double
>();
HashMap
<
String
,
String
>
NNname
=
new
HashMap
<
String
,
String
>();
...
...
src/fr/imag/Scidetect/Checker/Indexer.java
View file @
06786a2e
...
...
@@ -29,6 +29,11 @@ public class Indexer {
private
Object
content
;
/**
* @param content
* @param textfile
* @throws FileNotFoundException
*/
public
void
index
(
String
content
,
File
textfile
)
throws
FileNotFoundException
{
String
filename
=
textfile
.
getName
().
substring
(
0
,
textfile
.
getName
().
lastIndexOf
(
"."
));
filename
+=
".txt"
;
...
...
src/fr/imag/Scidetect/Logger/Log.java
View file @
06786a2e
...
...
@@ -53,6 +53,9 @@ public class Log {
}
}
/**
* @param conclusion
*/
public
void
savelog
(
String
conclusion
)
{
File
distantout
;
if
(!
loglocation
.
equals
(
"logs/"
))
{
...
...
@@ -70,8 +73,9 @@ public class Log {
out
.
close
();
}
catch
(
FileNotFoundException
e
)
{
// TODO Auto-generated catch block
e
.
printStackTrace
();
System
.
out
.
println
(
"***** Scidetect : Output file error \n"
);
System
.
out
.
println
(
"***** Most probably the specified file is a Dir \n"
);
//e.printStackTrace();
}
}
}
src/fr/imag/Scidetect/TextExtractor/pdfextractor.java
View file @
06786a2e
...
...
@@ -44,14 +44,21 @@ public class pdfextractor {
try
{
pd
=
PDDocument
.
load
(
pdf
.
getPath
());
wr
=
new
BufferedWriter
(
new
OutputStreamWriter
(
new
FileOutputStream
(
totxt
)));
stripper
.
writeText
(
pd
,
wr
);
try
{
stripper
.
writeText
(
pd
,
wr
);}
catch
(
Exception
e
)
{
System
.
out
.
println
(
"* Something went wrong during:"
);
System
.
out
.
println
(
" - txt extraction from pdf:"
+
pdf
);
System
.
out
.
println
(
"* Continuing anyway..."
);
}
if
(
pd
!=
null
)
{
pd
.
close
();
}
// I use close() to flush the stream.
wr
.
close
();
}
catch
(
Exception
e
)
{
// TODO: handle exception
System
.
out
.
println
(
"* Something went wrong during:"
);
System
.
out
.
println
(
" - txt extraction from pdf:"
+
pdf
);
System
.
out
.
println
(
"* Continuing anyway..."
);
}
//this seems to be faster but it seems like the app server does not support pdftotext
//commandexecutor cm = new commandexecutor();
...
...
src/fr/imag/Scidetect/scigenchecker_local/ScigenChecker_Local.java
View file @
06786a2e
...
...
@@ -50,6 +50,13 @@ public class ScigenChecker_Local {
private
HashMap
<
String
,
HashMap
<
String
,
Double
>>
distant
=
new
HashMap
<
String
,
HashMap
<
String
,
Double
>>();
private
Boolean
savedetaillog
=
false
;
/**
* Reads from the config file:
*- the places where samples of each class can be found
*- the default places where results are written.
* @throws FileNotFoundException
* @throws IOException
*/
private
void
readconfig
()
throws
FileNotFoundException
,
IOException
{
File
conf
=
new
File
(
"config.txt"
);
BufferedReader
br
=
new
BufferedReader
(
new
FileReader
(
conf
));
...
...
@@ -75,6 +82,9 @@ public class ScigenChecker_Local {
}
/**
* @throws IOException
*/
private
void
compute
()
throws
IOException
{
if
(
testpath
!=
null
)
{
DateFormat
dateFormat
=
new
SimpleDateFormat
(
"HH:mm dd.MM.yyyy"
);
...
...
@@ -88,8 +98,12 @@ public class ScigenChecker_Local {
tests
=
reader
.
readtests
(
testpath
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
System
.
out
.
println
(
"* Something went wrong during:"
);
System
.
out
.
println
(
" - reading the config file"
);
System
.
out
.
println
(
" - reading the samples (dir data)"
);
System
.
out
.
println
(
" - txt extraction from pdf"
);
System
.
out
.
println
(
"* Continuing anyway..."
);
//e.printStackTrace();
}
DistantCalculator
dc
=
new
DistantCalculator
();
...
...
@@ -103,31 +117,59 @@ public class ScigenChecker_Local {
log
.
savedetaillog
(
distant
);
}
}
else
{
System
.
out
.
println
(
"can not read path to test folder"
);
System
.
out
.
println
(
"***** Can not read path to the folder:"
+
testpath
);
System
.
out
.
println
(
"***** The folder should contains file to check"
);
}
}
/**
 * Parses the command line arguments.
 * <ul>
 *   <li>{@code -l <path>}: stores the path in {@code Log.loglocation};</li>
 *   <li>{@code -c <path>}: stores the path in {@code testpath};</li>
 *   <li>{@code -d}: sets {@code savedetaillog} to true;</li>
 *   <li>{@code -h}: prints the usage message.</li>
 * </ul>
 * When no argument is given, the usage message is printed. When {@code -l} or
 * {@code -c} is the last argument (no value follows), a message and the usage
 * are printed instead of failing with an ArrayIndexOutOfBoundsException.
 *
 * @param args the command line arguments
 */
public void readargs(String[] args) {
    if (args.length == 0) {
        printUsage();
        return;
    }
    for (int i = 0; i < args.length; i += 1) {
        String option = args[i];
        boolean hasValue = i + 1 < args.length;
        if (option.equals("-l") || option.equals("-c")) {
            if (!hasValue) {
                System.out.println("***** Missing value after option " + option);
                printUsage();
            } else if (option.equals("-l")) {
                Log.loglocation = args[i + 1];
            } else {
                testpath = args[i + 1];
            }
        }
        // The index is deliberately not advanced past an option value: every
        // argument is still examined as a potential flag, as before.
        if (option.equals("-d")) {
            savedetaillog = true;
        }
        if (option.equals("-h")) {
            printUsage();
        }
    }
}
/**
 * Prints the usage message (shown for -h) on standard output.
 */
private static void printUsage() {
    final String[] usageLines = {
        "***** Scigen & Co Checker \n",
        "To test all files in a directory <pathToFilesDirToTest>:",
        "java -jar ScigenChecker_local.jar -l <pathToLogFile> -c <pathToFilesDirToTest> \n",
        "To print usage:",
        "java -jar ScigenChecker_local.jar -h \n",
        "***** \n"
    };
    // One println per entry, in order, so the output is the same as before.
    for (String usageLine : usageLines) {
        System.out.println(usageLine);
    }
}
/**
* This is the standalone checker. All pdf files in the directory specified after -c are
* checked against the classes found in the directory "data". Results are written to the log
* file specified by the -l option. If -d is given, a detailed log is produced.
* Example: testing all pdf files in the directory MyConf/PDF/ and writing the results
* to MyConf/checklog.txt:
* java -jar ScigenChecker_local.jar -l MyConf/checklog.txt -c MyConf/PDF/
* @param args the command line arguments
*/
public
static
void
main
(
String
[]
args
)
throws
IOException
{
ScigenChecker_Local
a
=
new
ScigenChecker_Local
();
a
.
readconfig
();
a
.
readargs
(
args
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment