Commit 06786a2e authored by Cyril Labbe's avatar Cyril Labbe

Committer: Cyril Labbe

parent 94700418
...@@ -11,7 +11,7 @@ JAVACLASSPATH = -cp lib/pdfbox-app-1.8.8.jar ...@@ -11,7 +11,7 @@ JAVACLASSPATH = -cp lib/pdfbox-app-1.8.8.jar
default: all default: all
all: classes doc jar all: classes doc jar run
classes: classes:
mkdir -p classes mkdir -p classes
...@@ -24,5 +24,8 @@ jar: ...@@ -24,5 +24,8 @@ jar:
cd classes ; jar -cfvm ../ScigenChecker_Local`date +%Y-%m-%d`.jar ../MANIFEST.MF *; cd .. cd classes ; jar -cfvm ../ScigenChecker_Local`date +%Y-%m-%d`.jar ../MANIFEST.MF *; cd ..
cp ScigenChecker_Local`date +%Y-%m-%d`.jar ScigenChecker_Local.jar cp ScigenChecker_Local`date +%Y-%m-%d`.jar ScigenChecker_Local.jar
# Run the freshly built checker on the Test directory, logging to checklog.txt.
# The jar name must match the file produced by the jar target
# (ScigenChecker_Local.jar); file names are case-sensitive on most Unix systems.
run:
	java -jar ScigenChecker_Local.jar -l checklog.txt -c Test
clean: clean:
rm -r classes; rm -r doc; rm -r classes; rm -r doc;
Manifest-Version: 1.0 Manifest-Version: 1.0
Class-Path: lib/pdfbox-app-1.8.8.jar
X-COMMENT: Main-Class will be added automatically by build X-COMMENT: Main-Class will be added automatically by build
Main-Class: fr.imag.Scidetect.scigenchecker_local.ScigenChecker_Local
...@@ -25,13 +25,23 @@ import java.io.PrintWriter; ...@@ -25,13 +25,23 @@ import java.io.PrintWriter;
import java.util.HashMap; import java.util.HashMap;
/** /**
* * Classifier tags input files as belonging to a certain class. Examples of classes are SCIgen, Mathgen, ...
* The decision is made according to the distance between the tested file and its nearest neighbor
* in each class. Thresholds for assignment are read from the file specified in the configuration file
* (default config.txt).
* @author Nguyen Minh Tien - minh-tien.nguyen@imag.frs * @author Nguyen Minh Tien - minh-tien.nguyen@imag.frs
*/ */
public class Classifier { public class Classifier {
/**
* The key is the class name and the value is a pair of Doubles.
* This pair consists of a threshold for quasi-certain classification and another for suspicion.
*/
HashMap<String, Double[]> Threshold = new HashMap<String, Double[]>(); HashMap<String, Double[]> Threshold = new HashMap<String, Double[]>();
/**
* Reads the thresholds from the configuration file (default config.txt).
* @throws FileNotFoundException
* @throws IOException
*/
public void readconfig() throws FileNotFoundException, IOException { public void readconfig() throws FileNotFoundException, IOException {
File conf = new File("config.txt"); File conf = new File("config.txt");
BufferedReader br = new BufferedReader(new FileReader(conf)); BufferedReader br = new BufferedReader(new FileReader(conf));
...@@ -49,7 +59,13 @@ public class Classifier { ...@@ -49,7 +59,13 @@ public class Classifier {
} }
} }
/**
* Classifies each document given the matrix of distances (distant).
* For each entry it gives the class (or classes) to which the text can be assigned.
* @param distant is a matrix of distances
* @return the assigned class
* @throws IOException
*/
public String classify(HashMap<String, HashMap<String, Double>> distant) throws IOException { public String classify(HashMap<String, HashMap<String, Double>> distant) throws IOException {
String result = ""; String result = "";
...@@ -77,7 +93,11 @@ public class Classifier { ...@@ -77,7 +93,11 @@ public class Classifier {
return conclusion; return conclusion;
} }
/**
* Checks whether the distance is below, between, or above the two thresholds.
* @param result a string holding, for each class, the value of its nearest neighbor (NN)
* @return
*/
private String checkdistant(String result) { private String checkdistant(String result) {
String conclution = ""; String conclution = "";
String[] eachtype = result.split("\n"); String[] eachtype = result.split("\n");
...@@ -105,7 +125,10 @@ public class Classifier { ...@@ -105,7 +125,10 @@ public class Classifier {
} }
return conclution; return conclution;
} }
/**
* @param result
* @return
*/
private String findmindistant(String result) { private String findmindistant(String result) {
Double mindistant = 1.0; Double mindistant = 1.0;
String[] eachtype = result.split("\n"); String[] eachtype = result.split("\n");
...@@ -120,7 +143,11 @@ public class Classifier { ...@@ -120,7 +143,11 @@ public class Classifier {
} }
return conclu; return conclu;
} }
/**
*
* @param indexpath
* @return
*/
private String gettype(String indexpath) { private String gettype(String indexpath) {
File indexfile = new File(indexpath); File indexfile = new File(indexpath);
String parent = indexfile.getParent(); String parent = indexfile.getParent();
...@@ -129,6 +156,10 @@ public class Classifier { ...@@ -129,6 +156,10 @@ public class Classifier {
return parent; return parent;
} }
/**
* @param distantto
* @return
*/
private String find_NN(HashMap<String, Double> distantto) { private String find_NN(HashMap<String, Double> distantto) {
HashMap<String, Double> distotype = new HashMap<String, Double>(); HashMap<String, Double> distotype = new HashMap<String, Double>();
HashMap<String, String> NNname = new HashMap<String, String>(); HashMap<String, String> NNname = new HashMap<String, String>();
......
...@@ -29,6 +29,11 @@ public class Indexer { ...@@ -29,6 +29,11 @@ public class Indexer {
private Object content; private Object content;
/**
* @param content
* @param textfile
* @throws FileNotFoundException
*/
public void index(String content, File textfile) throws FileNotFoundException { public void index(String content, File textfile) throws FileNotFoundException {
String filename = textfile.getName().substring(0,textfile.getName().lastIndexOf(".")); String filename = textfile.getName().substring(0,textfile.getName().lastIndexOf("."));
filename+=".txt"; filename+=".txt";
......
...@@ -53,6 +53,9 @@ public class Log { ...@@ -53,6 +53,9 @@ public class Log {
} }
} }
/**
* @param conclusion
*/
public void savelog(String conclusion) { public void savelog(String conclusion) {
File distantout ; File distantout ;
if (!loglocation.equals("logs/")) { if (!loglocation.equals("logs/")) {
...@@ -70,8 +73,9 @@ public class Log { ...@@ -70,8 +73,9 @@ public class Log {
out.close(); out.close();
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
// TODO Auto-generated catch block System.out.println("***** Scidetect : Output file error \n");
e.printStackTrace(); System.out.println("***** Most probably the specified file is a Dir \n");
//e.printStackTrace();
} }
} }
} }
...@@ -44,14 +44,21 @@ public class pdfextractor { ...@@ -44,14 +44,21 @@ public class pdfextractor {
try { try {
pd = PDDocument.load(pdf.getPath()); pd = PDDocument.load(pdf.getPath());
wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(totxt))); wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(totxt)));
stripper.writeText(pd, wr); try {stripper.writeText(pd, wr);}
catch (Exception e) {
System.out.println("* Something went wrong during:");
System.out.println(" - txt extraction from pdf:"+pdf);
System.out.println("* Continuing anyway...");
}
if (pd != null) { if (pd != null) {
pd.close(); pd.close();
} }
// I use close() to flush the stream. // I use close() to flush the stream.
wr.close(); wr.close();
} catch (Exception e) { } catch (Exception e) {
// TODO: handle exception System.out.println("* Something went wrong during:");
System.out.println(" - txt extraction from pdf:"+pdf);
System.out.println("* Continuing anyway...");
} }
//this seems to be faster but it seems like the app server does not support pdftotext //this seems to be faster but it seems like the app server does not support pdftotext
//commandexecutor cm = new commandexecutor(); //commandexecutor cm = new commandexecutor();
......
...@@ -50,6 +50,13 @@ public class ScigenChecker_Local { ...@@ -50,6 +50,13 @@ public class ScigenChecker_Local {
private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>(); private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>();
private Boolean savedetaillog = false; private Boolean savedetaillog = false;
/**
* Read in the config file:
*- places where to find samples of each class
*- default places where to write results.
* @throws FileNotFoundException
* @throws IOException
*/
private void readconfig() throws FileNotFoundException, IOException { private void readconfig() throws FileNotFoundException, IOException {
File conf = new File("config.txt"); File conf = new File("config.txt");
BufferedReader br = new BufferedReader(new FileReader(conf)); BufferedReader br = new BufferedReader(new FileReader(conf));
...@@ -75,6 +82,9 @@ public class ScigenChecker_Local { ...@@ -75,6 +82,9 @@ public class ScigenChecker_Local {
} }
/**
* @throws IOException
*/
private void compute() throws IOException { private void compute() throws IOException {
if (testpath != null) { if (testpath != null) {
DateFormat dateFormat = new SimpleDateFormat("HH:mm dd.MM.yyyy"); DateFormat dateFormat = new SimpleDateFormat("HH:mm dd.MM.yyyy");
...@@ -88,8 +98,12 @@ public class ScigenChecker_Local { ...@@ -88,8 +98,12 @@ public class ScigenChecker_Local {
tests = reader.readtests(testpath); tests = reader.readtests(testpath);
} catch (Exception e) { } catch (Exception e) {
System.out.println("* Something went wrong during:");
e.printStackTrace(); System.out.println(" - reading the config file");
System.out.println(" - reading the samples (dir data)");
System.out.println(" - txt extraction from pdf");
System.out.println("* Continuing anyway...");
//e.printStackTrace();
} }
DistantCalculator dc = new DistantCalculator(); DistantCalculator dc = new DistantCalculator();
...@@ -103,12 +117,18 @@ public class ScigenChecker_Local { ...@@ -103,12 +117,18 @@ public class ScigenChecker_Local {
log.savedetaillog(distant); log.savedetaillog(distant);
} }
} else { } else {
System.out.println("can not read path to test folder"); System.out.println("***** Can not read path to the folder:"+testpath);
System.out.println("***** The folder should contains file to check");
} }
} }
/**
* Parsing of the command line arguments:
* where to find pdf files, where results should be written
* @param args
*/
public void readargs(String[] args) { public void readargs(String[] args) {
if (args.length > 0) {
for (int i = 0; i < args.length; i += 1) { for (int i = 0; i < args.length; i += 1) {
// System.out.println(args[i]); // System.out.println(args[i]);
if (args[i].equals("-l")) { if (args[i].equals("-l")) {
...@@ -120,14 +140,36 @@ public class ScigenChecker_Local { ...@@ -120,14 +140,36 @@ public class ScigenChecker_Local {
if (args[i].equals("-d")) { if (args[i].equals("-d")) {
savedetaillog = true; savedetaillog = true;
} }
if (args[i].equals("-h")) {
printUsage();
}
} }
} else
{printUsage();}
} }
/** /**
* To print usage (-h)
*/
private static void printUsage() {
    // The jar name below matches the artifact the build actually produces
    // (ScigenChecker_Local.jar); file names are case-sensitive on most Unix
    // systems, so the previously printed "ScigenChecker_local.jar" could not
    // be copy-pasted as-is.
    System.out.println("***** Scigen & Co Checker \n");
    System.out.println("To test all files in a directory <pathToFilesDirToTest>:");
    System.out.println("java -jar ScigenChecker_Local.jar -l <pathToLogFile> -c <pathToFilesDirToTest> \n");
    // -d is accepted by readargs but was missing from the usage text.
    System.out.println("To additionally produce a detailed log, add -d:");
    System.out.println("java -jar ScigenChecker_Local.jar -l <pathToLogFile> -c <pathToFilesDirToTest> -d \n");
    System.out.println("To print usage:");
    System.out.println("java -jar ScigenChecker_Local.jar -h \n");
    System.out.println("***** \n");
}
/**
* This is the standalone checker. All pdf files in the dir specified after -c are
* checked against classes found in the dir "data". Results are written in the log
* file specified by the -l option. If -d is given a detailed log is produced.
* Example: testing all pdf files in a directory MyConf/PDF/ and having results
* in the MyConf/checklog.txt:
* java -jar ScigenChecker_local.jar -l MyConf/checklog.txt -c MyConf/PDF/
* @param args the command line arguments * @param args the command line arguments
*/ */
public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
ScigenChecker_Local a = new ScigenChecker_Local(); ScigenChecker_Local a = new ScigenChecker_Local();
a.readconfig(); a.readconfig();
a.readargs(args); a.readargs(args);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment