From 06786a2eb926fbc1870ac8abad04c4394951883b Mon Sep 17 00:00:00 2001 From: Cyril Labbe Date: Thu, 26 Feb 2015 16:31:58 +0100 Subject: [PATCH] Committer: Cyril Labbe --- Makefile | 5 +- manifest.mf | 2 + src/fr/imag/Scidetect/Checker/Classifier.java | 45 +++++++++-- src/fr/imag/Scidetect/Checker/Indexer.java | 5 ++ src/fr/imag/Scidetect/Logger/Log.java | 8 +- .../Scidetect/TextExtractor/pdfextractor.java | 11 ++- .../ScigenChecker_Local.java | 76 ++++++++++++++----- 7 files changed, 123 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 33f7ca4..b0d955d 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ JAVACLASSPATH = -cp lib/pdfbox-app-1.8.8.jar default: all -all: classes doc jar +all: classes doc jar run classes: mkdir -p classes @@ -24,5 +24,8 @@ jar: cd classes ; jar -cfvm ../ScigenChecker_Local`date +%Y-%m-%d`.jar ../MANIFEST.MF *; cd .. cp ScigenChecker_Local`date +%Y-%m-%d`.jar ScigenChecker_Local.jar +run: + java -jar ScigenChecker_local.jar -l checklog.txt -c Test + clean: rm -r classes; rm -r doc; diff --git a/manifest.mf b/manifest.mf index 328e8e5..b3e260f 100644 --- a/manifest.mf +++ b/manifest.mf @@ -1,3 +1,5 @@ Manifest-Version: 1.0 +Class-Path: lib/pdfbox-app-1.8.8.jar X-COMMENT: Main-Class will be added automatically by build +Main-Class: fr.imag.Scidetect.scigenchecker_local.ScigenChecker_Local diff --git a/src/fr/imag/Scidetect/Checker/Classifier.java b/src/fr/imag/Scidetect/Checker/Classifier.java index 4daa7f6..6dd61d4 100644 --- a/src/fr/imag/Scidetect/Checker/Classifier.java +++ b/src/fr/imag/Scidetect/Checker/Classifier.java @@ -25,13 +25,23 @@ import java.io.PrintWriter; import java.util.HashMap; /** - * + * Classifier is tagging input files has being of a certain class. Example of classes are SCIgen, Mathgen,... + * The decision is made according to the distance between the tested file and its nearest neighbor + * in each class. Thresholds for assignation are read in file specified in the configuration file + * (default config.txt). * @author Nguyen Minh Tien - minh-tien.nguyen@imag.frs */ public class Classifier { - + /** + * The key is the class name and value is a couple of Double. + * This couple is composed of a threshold for quasi-certain classification and another for suspicion. + */ HashMap Threshold = new HashMap(); - + /** + * Reads threshold in the configuration file (default config.txt). + * @throws FileNotFoundException + * @throws IOException + */ public void readconfig() throws FileNotFoundException, IOException { File conf = new File("config.txt"); BufferedReader br = new BufferedReader(new FileReader(conf)); @@ -49,7 +59,13 @@ public class Classifier { } } - + /** + * Classify is classifying each document given the matrix of distances (distant). + * For each entry it gives the class (or more) to which the text can be assigned + * @param distant is a matrix of distances + * @return the assigned class + * @throws IOException + */ public String classify(HashMap> distant) throws IOException { String result = ""; @@ -77,7 +93,11 @@ public class Classifier { return conclusion; } - + /** + * Check if the distance is lower, between of upper the two threshold. + * @param result a string composed having for each classe the value of its NN + * @return + */ private String checkdistant(String result) { String conclution = ""; String[] eachtype = result.split("\n"); @@ -105,7 +125,10 @@ public class Classifier { } return conclution; } - + /** + * @param result + * @return + */ private String findmindistant(String result) { Double mindistant = 1.0; String[] eachtype = result.split("\n"); @@ -120,7 +143,11 @@ public class Classifier { } return conclu; } - + /** + * + * @param indexpath + * @return + */ private String gettype(String indexpath) { File indexfile = new File(indexpath); String parent = indexfile.getParent(); @@ -129,6 +156,10 @@ public class Classifier { return parent; } + /** + * @param distantto + * @return + */ private String find_NN(HashMap distantto) { HashMap distotype = new HashMap(); HashMap NNname = new HashMap(); diff --git a/src/fr/imag/Scidetect/Checker/Indexer.java b/src/fr/imag/Scidetect/Checker/Indexer.java index 479fef8..088332f 100644 --- a/src/fr/imag/Scidetect/Checker/Indexer.java +++ b/src/fr/imag/Scidetect/Checker/Indexer.java @@ -29,6 +29,11 @@ public class Indexer { private Object content; + /** + * @param content + * @param textfile + * @throws FileNotFoundException + */ public void index(String content, File textfile) throws FileNotFoundException { String filename = textfile.getName().substring(0,textfile.getName().lastIndexOf(".")); filename+=".txt"; diff --git a/src/fr/imag/Scidetect/Logger/Log.java b/src/fr/imag/Scidetect/Logger/Log.java index cdbf868..7435d05 100644 --- a/src/fr/imag/Scidetect/Logger/Log.java +++ b/src/fr/imag/Scidetect/Logger/Log.java @@ -53,6 +53,9 @@ public class Log { } } + /** + * @param conclusion + */ public void savelog(String conclusion) { File distantout ; if (!loglocation.equals("logs/")) { @@ -70,8 +73,9 @@ public class Log { out.close(); } catch (FileNotFoundException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + System.out.println("***** Scidetect : Output file error \n"); + System.out.println("***** Most probably the specified file is a Dir \n"); + //e.printStackTrace(); } } } diff --git a/src/fr/imag/Scidetect/TextExtractor/pdfextractor.java b/src/fr/imag/Scidetect/TextExtractor/pdfextractor.java index e06e766..6dabac0 100644 --- a/src/fr/imag/Scidetect/TextExtractor/pdfextractor.java +++ b/src/fr/imag/Scidetect/TextExtractor/pdfextractor.java @@ -44,14 +44,21 @@ public class pdfextractor { try { pd = PDDocument.load(pdf.getPath()); wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(totxt))); - stripper.writeText(pd, wr); + try {stripper.writeText(pd, wr);} + catch (Exception e) { + System.out.println("* Something went wrong during:"); + System.out.println(" - txt extraction from pdf:"+pdf); + System.out.println("* Continuing anyway..."); + } if (pd != null) { pd.close(); } // I use close() to flush the stream. wr.close(); } catch (Exception e) { - // TODO: handle exception + System.out.println("* Something went wrong during:"); + System.out.println(" - txt extraction from pdf:"+pdf); + System.out.println("* Continuing anyway..."); } //this seems to be faster but it seems like the app server does not support pdftotext //commandexecutor cm = new commandexecutor(); diff --git a/src/fr/imag/Scidetect/scigenchecker_local/ScigenChecker_Local.java b/src/fr/imag/Scidetect/scigenchecker_local/ScigenChecker_Local.java index fd944ba..9d22a1d 100644 --- a/src/fr/imag/Scidetect/scigenchecker_local/ScigenChecker_Local.java +++ b/src/fr/imag/Scidetect/scigenchecker_local/ScigenChecker_Local.java @@ -50,6 +50,13 @@ public class ScigenChecker_Local { private HashMap> distant = new HashMap>(); private Boolean savedetaillog = false; + /** + * Read in the config file: + *- places where to find samples of each class + *- default places where to write results. + * @throws FileNotFoundException + * @throws IOException + */ private void readconfig() throws FileNotFoundException, IOException { File conf = new File("config.txt"); BufferedReader br = new BufferedReader(new FileReader(conf)); @@ -75,6 +82,9 @@ public class ScigenChecker_Local { } + /** + * @throws IOException + */ private void compute() throws IOException { if (testpath != null) { DateFormat dateFormat = new SimpleDateFormat("HH:mm dd.MM.yyyy"); @@ -88,8 +98,12 @@ public class ScigenChecker_Local { tests = reader.readtests(testpath); } catch (Exception e) { - - e.printStackTrace(); + System.out.println("* Something went wrong during:"); + System.out.println(" - reading the config file"); + System.out.println(" - reading the samples (dir data)"); + System.out.println(" - txt extraction from pdf"); + System.out.println("* Continuing anyway..."); + //e.printStackTrace(); } DistantCalculator dc = new DistantCalculator(); @@ -103,31 +117,59 @@ public class ScigenChecker_Local { log.savedetaillog(distant); } } else { - System.out.println("can not read path to test folder"); + System.out.println("***** Can not read path to the folder:"+testpath); + System.out.println("***** The folder should contains file to check"); } } + /** + * Parsing of the command line arguments: + * where to find pdf files, where results should be written + * @param args + */ public void readargs(String[] args) { - - for (int i = 0; i < args.length; i += 1) { - // System.out.println(args[i]); - if (args[i].equals("-l")) { - Log.loglocation = args[i + 1]; - } - if (args[i].equals("-c")) { - testpath = args[i + 1]; - } - if (args[i].equals("-d")) { - savedetaillog = true; - } - } + if (args.length > 0) { + for (int i = 0; i < args.length; i += 1) { + // System.out.println(args[i]); + if (args[i].equals("-l")) { + Log.loglocation = args[i + 1]; + } + if (args[i].equals("-c")) { + testpath = args[i + 1]; + } + if (args[i].equals("-d")) { + savedetaillog = true; + } + if (args[i].equals("-h")) { + printUsage(); + } + } + } else + {printUsage();} } /** + * To print usage (-h) + */ + private static void printUsage() { + System.out.println("***** Scigen & Co Checker \n"); + System.out.println("To test all files in a directory :"); + System.out.println("java -jar ScigenChecker_local.jar -l -c \n"); + System.out.println("To print usage:"); + System.out.println("java -jar ScigenChecker_local.jar -h \n"); + System.out.println("***** \n"); + } + + /** + * This is the standalone checker. All pdf files in the dir specified after -c are + * checked against classes found in the dir "data". Results are written in the log + * file specified by the -l option. If -d is given a detailled log is produced. + * Example: testing all pdf files in a directory MyConf/PDF/ and having results + * in the MyConf/checklog.txt: + * java -jar ScigenChecker_local.jar -l MyConf/checklog.txt -c MyConf/PDF/ * @param args the command line arguments */ public static void main(String[] args) throws IOException { - ScigenChecker_Local a = new ScigenChecker_Local(); a.readconfig(); a.readargs(args); -- GitLab