From 29705e427fcd64314e63bd0280d8f26ede7241cc Mon Sep 17 00:00:00 2001 From: Tien Date: Thu, 26 Feb 2015 14:38:56 +0100 Subject: [PATCH] Corrected packet naming --- Makefile | 4 +- .../Scidetect/Checker/Classifier.java | 2 +- .../forge/Scidetect/Checker/Classifier.java~ | 158 ++++++++++++++ .../Scidetect/Checker/DistantCalculator.java | 2 +- .../Scidetect/Checker/DistantCalculator.java~ | 103 +++++++++ .../Scidetect/Checker/Indexer.java | 2 +- .../forge/Scidetect/Checker/Indexer.java~ | 59 +++++ .../{ => forge}/Scidetect/Checker/Reader.java | 6 +- .../imag/forge/Scidetect/Checker/Reader.java~ | 206 ++++++++++++++++++ .../{ => forge}/Scidetect/Logger/Log.java | 2 +- src/fr/imag/forge/Scidetect/Logger/Log.java~ | 79 +++++++ .../Scidetect/TextExtractor/Xmlextractor.java | 2 +- .../TextExtractor/Xmlextractor.java~ | 95 ++++++++ .../TextExtractor/commandexecutor.java | 2 +- .../TextExtractor/commandexecutor.java~ | 56 +++++ .../Scidetect/TextExtractor/normalizer.java | 2 +- .../Scidetect/TextExtractor/normalizer.java~ | 57 +++++ .../Scidetect/TextExtractor/pdfextractor.java | 2 +- .../TextExtractor/pdfextractor.java~ | 68 ++++++ .../ScigenChecker_Local.java | 138 ++++++++++++ .../ScigenChecker_Local.java~} | 3 +- 21 files changed, 1034 insertions(+), 14 deletions(-) rename src/fr/imag/{ => forge}/Scidetect/Checker/Classifier.java (99%) create mode 100644 src/fr/imag/forge/Scidetect/Checker/Classifier.java~ rename src/fr/imag/{ => forge}/Scidetect/Checker/DistantCalculator.java (98%) create mode 100644 src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java~ rename src/fr/imag/{ => forge}/Scidetect/Checker/Indexer.java (97%) create mode 100644 src/fr/imag/forge/Scidetect/Checker/Indexer.java~ rename src/fr/imag/{ => forge}/Scidetect/Checker/Reader.java (97%) create mode 100644 src/fr/imag/forge/Scidetect/Checker/Reader.java~ rename src/fr/imag/{ => forge}/Scidetect/Logger/Log.java (98%) create mode 100644 src/fr/imag/forge/Scidetect/Logger/Log.java~ rename 
src/fr/imag/{ => forge}/Scidetect/TextExtractor/Xmlextractor.java (98%) create mode 100644 src/fr/imag/forge/Scidetect/TextExtractor/Xmlextractor.java~ rename src/fr/imag/{ => forge}/Scidetect/TextExtractor/commandexecutor.java (97%) create mode 100644 src/fr/imag/forge/Scidetect/TextExtractor/commandexecutor.java~ rename src/fr/imag/{ => forge}/Scidetect/TextExtractor/normalizer.java (97%) create mode 100644 src/fr/imag/forge/Scidetect/TextExtractor/normalizer.java~ rename src/fr/imag/{ => forge}/Scidetect/TextExtractor/pdfextractor.java (97%) create mode 100644 src/fr/imag/forge/Scidetect/TextExtractor/pdfextractor.java~ create mode 100644 src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java rename src/fr/imag/{Scidetect/scigenchecker_local/ScigenChecker_Local.java => forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java~} (98%) diff --git a/Makefile b/Makefile index 33f7ca4..e4ad248 100644 --- a/Makefile +++ b/Makefile @@ -15,10 +15,10 @@ all: classes doc jar classes: mkdir -p classes - $(JAVAC) $(JAVAFLAGS) $(JAVACLASSPATH) src/fr/imag/Scidetect/*/*.java + $(JAVAC) $(JAVAFLAGS) $(JAVACLASSPATH) src/fr/imag/forge/Scidetect/*/*.java doc: - $(JAVADOC) src/*/*/*/*/* + $(JAVADOC) src/*/*/*/*/*/* jar: cd classes ; jar -cfvm ../ScigenChecker_Local`date +%Y-%m-%d`.jar ../MANIFEST.MF *; cd .. diff --git a/src/fr/imag/Scidetect/Checker/Classifier.java b/src/fr/imag/forge/Scidetect/Checker/Classifier.java similarity index 99% rename from src/fr/imag/Scidetect/Checker/Classifier.java rename to src/fr/imag/forge/Scidetect/Checker/Classifier.java index 4daa7f6..563efac 100644 --- a/src/fr/imag/Scidetect/Checker/Classifier.java +++ b/src/fr/imag/forge/Scidetect/Checker/Classifier.java @@ -14,7 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -package fr.imag.Scidetect.Checker; +package fr.imag.forge.Scidetect.Checker; import java.io.BufferedReader; import java.io.File; diff --git a/src/fr/imag/forge/Scidetect/Checker/Classifier.java~ b/src/fr/imag/forge/Scidetect/Checker/Classifier.java~ new file mode 100644 index 0000000..35de5de --- /dev/null +++ b/src/fr/imag/forge/Scidetect/Checker/Classifier.java~ @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +package fr.imag.Scidetect.Checker; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.HashMap; + +/** + * + * @author tien + */ +public class Classifier { + + HashMap Threshold = new HashMap(); + + public void readconfig() throws FileNotFoundException, IOException { + File conf = new File("config.txt"); + BufferedReader br = new BufferedReader(new FileReader(conf)); + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("Threshold_")) { + // System.out.println(line); + String[] b = line.split("\t"); + Double[] temp = new Double[2]; + temp[0] = Double.parseDouble(b[1]); + temp[1] = Double.parseDouble(b[2]); + Threshold.put(b[0].substring(10, b[0].length()), temp); + //10 because i want to cut Threshold_ + } + } + + } + + public String classify(HashMap> distant) throws IOException { + + String result = ""; + String conclusion = ""; + readconfig(); + for (String key : distant.keySet()) { + //for each file in the test + result = find_NN(distant.get(key)); + //System.out.println(result); + //System.out.println(key); + //System.out.println(result); + String[] a = checkdistant(result).split("\n"); + + if (a[0].length() == 0) { + conclusion += key + "\t" + "cant classify\t1\tnull\n"; + } else { + for (int i = 0; i < a.length; i++) { + + conclusion += key + "\t" + a[i] + "\n"; + } + } + + } + //System.out.println(conclusion); + return conclusion; + + } + + private String checkdistant(String result) { + String conclution = ""; + String[] eachtype = result.split("\n"); + + for (int i = 0; i < eachtype.length; i++) { + String[] eachNN = eachtype[i].split("\t"); + //System.out.println(eachtype[i]); + //get threshold for the corresponding type + Double[] threshold = new Double[2]; + if (Threshold.containsKey(eachNN[0])) { + threshold = Threshold.get(eachNN[0]); + } else { + threshold = 
Threshold.get("Default"); + } + //check distant with threshold + if (Double.parseDouble(eachNN[1]) < threshold[0]) { + conclution += "is a " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n"; + } else if (Double.parseDouble(eachNN[1]) < threshold[1]) { + conclution += "is suppected " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n"; + } + + } + if (conclution == "") { + conclution = findmindistant(result); + } + return conclution; + } + + private String findmindistant(String result) { + Double mindistant = 1.0; + String[] eachtype = result.split("\n"); + + String conclu = ""; + for (int i = 0; i < eachtype.length; i++) { + String[] eachNN = eachtype[i].split("\t"); + if (Double.parseDouble(eachNN[1]) < mindistant) { + mindistant = Double.parseDouble(eachNN[1]); + conclu = "is Genuine \t" + eachNN[1] + "\t" + eachNN[2] + "\n"; + } + } + return conclu; + } + + private String gettype(String indexpath) { + File indexfile = new File(indexpath); + String parent = indexfile.getParent(); + // String type = parent.substring(0, parent.lastIndexOf("/")); + parent = parent.substring(parent.lastIndexOf("/") + 1, parent.length()); + return parent; + } + + private String find_NN(HashMap distantto) { + HashMap distotype = new HashMap(); + HashMap NNname = new HashMap(); + Double MinNN = 1.0; + String NN = ""; + + for (String key : distantto.keySet()) { + String type = gettype(key); + if (!distotype.containsKey(type)) { + distotype.put(type, distantto.get(key)); + NNname.put(type, key); + } else if (distantto.get(key) < distotype.get(type)) { + distotype.put(type, distantto.get(key)); + NNname.put(type, key); + } + + } + // it returns the path to the NN + String result = ""; + for (String key : distotype.keySet()) { + result += key + "\t" + distotype.get(key) + "\t" + NNname.get(key) + "\n"; + } + return result; + } +} diff --git a/src/fr/imag/Scidetect/Checker/DistantCalculator.java b/src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java similarity index 98% 
rename from src/fr/imag/Scidetect/Checker/DistantCalculator.java rename to src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java index fab67b8..c8f3fd3 100644 --- a/src/fr/imag/Scidetect/Checker/DistantCalculator.java +++ b/src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java @@ -14,7 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -package fr.imag.Scidetect.Checker; +package fr.imag.forge.Scidetect.Checker; import java.util.HashMap; import java.util.HashSet; diff --git a/src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java~ b/src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java~ new file mode 100644 index 0000000..c6a33d9 --- /dev/null +++ b/src/fr/imag/forge/Scidetect/Checker/DistantCalculator.java~ @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +package fr.imag.Scidetect.Checker; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +/** + * + * @author tien + */ +public class DistantCalculator { + + private HashMap> distant = new HashMap>(); + + public HashMap> caldistant(HashMap> samples, HashMap> tests) { + for (String key : tests.keySet()) { + HashMap distantto = new HashMap(); + for (String key2 : samples.keySet()) { + double distanttt = cal_textdistant(tests.get(key), + samples.get(key2)); + // System.out.println("distant between " + key + " and " + key2 + // + ": " + distanttt); + distantto.put(key2, distanttt); + + } + distant.put(key, distantto); + } + + return distant; + } + + private double cal_textdistant(HashMap text1, + HashMap text2) { + double nboftoken = 0.0; + double sum = 0.0; + + Set keys1 = text1.keySet(); + Set keys2 = text2.keySet(); + Set allkeys = new HashSet(); + allkeys.addAll(keys1); + allkeys.addAll(keys2); + Integer Na = 0, Nb = 0; + // get the nb of token in each text + for (String key : allkeys) { + Integer Fa = 0; + Integer Fb = 0; + if (text1.containsKey(key)) { + Fa = text1.get(key); + } + if (text2.containsKey(key)) { + Fb = text2.get(key); + } + Na += Fa; + Nb += Fb; + } + // reduce propotion for text of different lenght + if (Na <= Nb) { + for (String key : allkeys) { + Integer Fa = 0; + Integer Fb = 0; + if (text1.containsKey(key)) { + Fa = text1.get(key); + } + if (text2.containsKey(key)) { + Fb = text2.get(key); + } + sum += Math.abs(Fa - (double) Fb * (Na / (double) Nb)); + } + return sum / (2 * Na); + } else { + for (String key : allkeys) { + Integer Fa = 0; + Integer Fb = 0; + if (text1.containsKey(key)) { + Fa = text1.get(key); + } + if (text2.containsKey(key)) { + Fb = text2.get(key); + } + sum += Math.abs(Fa * (Nb / (double) Na) - (double) Fb); + } + return sum / (2 * Nb); + } + } +} diff --git a/src/fr/imag/Scidetect/Checker/Indexer.java b/src/fr/imag/forge/Scidetect/Checker/Indexer.java similarity index 97% rename from 
src/fr/imag/Scidetect/Checker/Indexer.java rename to src/fr/imag/forge/Scidetect/Checker/Indexer.java index 479fef8..f386b08 100644 --- a/src/fr/imag/Scidetect/Checker/Indexer.java +++ b/src/fr/imag/forge/Scidetect/Checker/Indexer.java @@ -14,7 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -package fr.imag.Scidetect.Checker; +package fr.imag.forge.Scidetect.Checker; import java.io.File; import java.io.FileNotFoundException; diff --git a/src/fr/imag/forge/Scidetect/Checker/Indexer.java~ b/src/fr/imag/forge/Scidetect/Checker/Indexer.java~ new file mode 100644 index 0000000..23216fc --- /dev/null +++ b/src/fr/imag/forge/Scidetect/Checker/Indexer.java~ @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +package fr.imag.Scidetect.Checker; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintWriter; +import java.util.HashMap; + +/** + * + * @author tien + */ +public class Indexer { + + private Object content; + + public void index(String content, File textfile) throws FileNotFoundException { + String filename = textfile.getName().substring(0,textfile.getName().lastIndexOf(".")); + filename+=".txt"; + String path = textfile.getParent(); + + String[] words = content.split(" "); + //System.out.println(words.length); + HashMap counter = new HashMap(); + for (int i = 0; i < words.length; i++) { + if (!counter.containsKey(words[i])) { + counter.put(words[i], 1); + } else { + counter.put(words[i], counter.get(words[i]) + 1); + } + } + + File indexout = new File(path + "/INDEX-" + filename); + // String filepath = (indexout.getPath()); + PrintWriter out = new PrintWriter(indexout); + + for (String key : counter.keySet()) { + out.println(key + "\t" + counter.get(key)); + } + out.close(); + } +} diff --git a/src/fr/imag/Scidetect/Checker/Reader.java b/src/fr/imag/forge/Scidetect/Checker/Reader.java similarity index 97% rename from src/fr/imag/Scidetect/Checker/Reader.java rename to src/fr/imag/forge/Scidetect/Checker/Reader.java index e187804..5edf143 100644 --- a/src/fr/imag/Scidetect/Checker/Reader.java +++ b/src/fr/imag/forge/Scidetect/Checker/Reader.java @@ -14,10 +14,10 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -package fr.imag.Scidetect.Checker; +package fr.imag.forge.Scidetect.Checker; -import fr.imag.Scidetect.TextExtractor.Xmlextractor; -import fr.imag.Scidetect.TextExtractor.pdfextractor; +import fr.imag.forge.Scidetect.TextExtractor.Xmlextractor; +import fr.imag.forge.Scidetect.TextExtractor.pdfextractor; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; diff --git a/src/fr/imag/forge/Scidetect/Checker/Reader.java~ b/src/fr/imag/forge/Scidetect/Checker/Reader.java~ new file mode 100644 index 0000000..879d14d --- /dev/null +++ b/src/fr/imag/forge/Scidetect/Checker/Reader.java~ @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +package fr.imag.Scidetect.Checker; + +import fr.imag.Scidetect.TextExtractor.Xmlextractor; +import fr.imag.Scidetect.TextExtractor.pdfextractor; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +/** + * + * @author tien + */ +public class Reader { + + private HashMap> samples = new HashMap>(); + private HashMap> tests = new HashMap>(); + private String SamplesFolder; + private int maxlength; + + public void readconfig() throws FileNotFoundException, IOException { + File conf = new File("config.txt"); + BufferedReader br = new BufferedReader(new FileReader(conf)); + String line; + + while ((line = br.readLine()) != null) { + if (!line.startsWith("#")) { + // System.out.println(line); + String[] b = line.split("\t"); + if (b[0].equals("samples")) { + SamplesFolder = b[1]; + } + //other config should be read over here + if (b[0].equals("Max_length")) { + maxlength = Integer.parseInt(b[1]); + } + } + } + + } + + public HashMap> readsamples(String foldername) throws IOException { + File folder = new File(foldername); + File[] listOfFile = folder.listFiles(); + + for (int j = 0; j < listOfFile.length; j++) { + if (listOfFile[j].isDirectory()) { + readsamples(listOfFile[j].getPath()); + } else if (listOfFile[j].getName().endsWith(".pdf")) { + // find if there is already index for it + String indexname = "INDEX-" + + listOfFile[j].getName().substring(0, + listOfFile[j].getName().lastIndexOf(".")) + + ".txt"; + if (Arrays.asList(listOfFile).toString().contains(indexname)) { + // System.out.println("lets read from index file"); + readindexfile(listOfFile[j].getParent() + "/" + indexname); + } else { + pdfextractor a = new pdfextractor(); + String content = a.pdfextract(listOfFile[j]); + Indexer b = new 
Indexer(); + b.index(content, listOfFile[j]); + readindexfile(listOfFile[j].getParent() + "/" + indexname); + } + } + } + return samples; + } + + private void readindexfile(String path) throws IOException { + File index = new File(path); + BufferedReader br; + br = new BufferedReader(new FileReader(index)); + String line; + HashMap a = new HashMap(); + while ((line = br.readLine()) != null) { + String[] b = line.split("\t"); + a.put(b[0], Integer.parseInt(b[1])); + } + br.close(); + if (path.contains(SamplesFolder)) { + samples.put(path, a); + } else { + tests.put(path, a); + } + } + + private void readfile(File pdf) throws IOException { + + String content = ""; + if (pdf.getName().endsWith(".pdf")) { + + pdfextractor a = new pdfextractor(); + content = a.pdfextract(pdf); + } else if (pdf.getName().endsWith(".xml") || pdf.getName().endsWith(".xtx")) { + Xmlextractor a = new Xmlextractor(); + content = a.xmlextract(pdf); + } + + //lets deal with long file over here + //split content and the index part by part + if (content.length() < maxlength) { + String indexname = "INDEX-" + + pdf.getName().substring(0, + pdf.getName().lastIndexOf(".")) + + ".txt"; + + Indexer b = new Indexer(); + b.index(content, pdf); + + readindexfile(pdf.getParent() + "/" + indexname); + } else { + String[] part = splitcontent(content); + for (int i = 0; i < part.length; i++) { + String indexname = "INDEX-" + + pdf.getName().substring(0, + pdf.getName().lastIndexOf(".")) + + "_part" + i + ".txt"; + String filename = pdf.getName().substring(0, + pdf.getName().lastIndexOf(".")) + + "_part" + i + ".txt"; + Indexer b = new Indexer(); + File a = new File(pdf.getParent() + "/" + filename); + PrintWriter out = new PrintWriter(new FileWriter(a)); + out.println(part[i]); + //System.out.println(text); + out.close(); + b.index(part[i], a); + + readindexfile(a.getParent() + "/" + indexname); + + } + } + } + + public HashMap> readtests(String testpath) throws IOException { + File folder = new 
File(testpath); + + if (folder.isDirectory()) { + File[] listOfFile = folder.listFiles(); + + for (int j = 0; j < listOfFile.length; j++) { + + if (listOfFile[j].isDirectory()) { + readtests(listOfFile[j].getPath()); + } else if (listOfFile[j].getName().endsWith(".pdf") || listOfFile[j].getName().endsWith(".xml") || listOfFile[j].getName().endsWith(".xtx")) { + readfile(listOfFile[j]); + } + } + } else if (folder.getName().endsWith(".pdf") || folder.getName().endsWith(".xml") || folder.getName().endsWith(".xtx")) { + readfile(folder); + + } + return tests; + } + + private String[] splitcontent(String content) { + + int nbofpart = content.length() / maxlength; + String[] part = new String[nbofpart + 1]; + int lower = 0; + int upper = 0; + int i; + for (i = 0; i < nbofpart; i++) { + upper += maxlength; + part[i] = content.substring(lower, upper); + lower = upper; + } + + if (upper <= content.length() - 1) { + + lower = upper; + + upper = content.length(); + + part[i] = (content.substring(lower, upper)); + } + return part; + } + +} diff --git a/src/fr/imag/Scidetect/Logger/Log.java b/src/fr/imag/forge/Scidetect/Logger/Log.java similarity index 98% rename from src/fr/imag/Scidetect/Logger/Log.java rename to src/fr/imag/forge/Scidetect/Logger/Log.java index cdbf868..cc45a49 100644 --- a/src/fr/imag/Scidetect/Logger/Log.java +++ b/src/fr/imag/forge/Scidetect/Logger/Log.java @@ -14,7 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -package fr.imag.Scidetect.Logger; +package fr.imag.forge.Scidetect.Logger; import java.io.File; import java.io.FileNotFoundException; diff --git a/src/fr/imag/forge/Scidetect/Logger/Log.java~ b/src/fr/imag/forge/Scidetect/Logger/Log.java~ new file mode 100644 index 0000000..6612c76 --- /dev/null +++ b/src/fr/imag/forge/Scidetect/Logger/Log.java~ @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +package fr.imag.Scidetect.Logger; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintWriter; +import java.util.HashMap; + +/** + * + * @author tien + */ +public class Log { + + public static String loglocation; + public static String detailloglocation; + // public static String testpath; + public static String logtime; + + public void savedetaillog(HashMap> distant) { + + File distantout = new File(detailloglocation + logtime + ".xls"); + //File distantout = new File(testpath+"/alldistant.xls"); + PrintWriter out; + try { + out = new PrintWriter(distantout); + + for (String key : distant.keySet()) { + for (String key2 : distant.get(key).keySet()) { + out.println(key + "\t" + key2 + "\t" + + distant.get(key).get(key2)); + } + } + out.close(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public void savelog(String conclusion) { + File distantout ; + if (!loglocation.equals("logs/")) { + distantout = new File(loglocation); + } else { + distantout = new File(loglocation+ logtime + ".xls"); + } + + PrintWriter out; + try { + + out = new PrintWriter(distantout); + + out.write(conclusion); + + out.close(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } +} diff --git a/src/fr/imag/Scidetect/TextExtractor/Xmlextractor.java b/src/fr/imag/forge/Scidetect/TextExtractor/Xmlextractor.java similarity index 98% rename from src/fr/imag/Scidetect/TextExtractor/Xmlextractor.java rename to src/fr/imag/forge/Scidetect/TextExtractor/Xmlextractor.java index b7c6bba..6aa4858 100644 --- a/src/fr/imag/Scidetect/TextExtractor/Xmlextractor.java +++ b/src/fr/imag/forge/Scidetect/TextExtractor/Xmlextractor.java @@ -14,7 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -package fr.imag.Scidetect.TextExtractor; +package fr.imag.forge.Scidetect.TextExtractor; import static java.awt.SystemColor.text; import java.io.File; diff --git a/src/fr/imag/forge/Scidetect/TextExtractor/Xmlextractor.java~ b/src/fr/imag/forge/Scidetect/TextExtractor/Xmlextractor.java~ new file mode 100644 index 0000000..ad1a1cd --- /dev/null +++ b/src/fr/imag/forge/Scidetect/TextExtractor/Xmlextractor.java~ @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +package fr.imag.Scidetect.TextExtractor; + +import static java.awt.SystemColor.text; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.logging.Level; +import java.util.logging.Logger; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +/** + * + * @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + */ +public class Xmlextractor { + + private String text = ""; + + private void printNote(NodeList nodeList) { + + for (int count = 0; count < nodeList.getLength(); count++) { + + Node tempNode = nodeList.item(count); + + // make sure it's element node. + if (tempNode.getNodeType() == Node.ELEMENT_NODE) { + if (tempNode.getNodeName() == "Para" || tempNode.getNodeName() == "PDFTextExtract") { + text += tempNode.getTextContent(); + } + if (tempNode.hasChildNodes()) { + // loop again if has child nodes + printNote(tempNode.getChildNodes()); + } + } + } + //return text; + } + + public String xmlextract(File xml) throws IOException { + try { + DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); + + org.w3c.dom.Document doc = dBuilder.parse(xml); + + //System.out.println(doc.getDocumentElement().getNodeName()); + if (doc.hasChildNodes()) { + + printNote(doc.getChildNodes()); + + } + + } catch (ParserConfigurationException ex) { + Logger.getLogger(Xmlextractor.class.getName()).log(Level.SEVERE, null, ex); + } catch (SAXException ex) { + Logger.getLogger(Xmlextractor.class.getName()).log(Level.SEVERE, null, ex); + } catch (IOException ex) { + Logger.getLogger(Xmlextractor.class.getName()).log(Level.SEVERE, null, ex); + } + File totxt = new File(xml.getPath() + .substring(0, xml.getPath().lastIndexOf('.')) + ".txt"); + // File txt 
= new File(xml.getParent()+xml.getName().substring(0,xml.getName().lastIndexOf("."))+".txt"); + PrintWriter out = new PrintWriter(new FileWriter(totxt)); + out.println(text); + out.close(); + normalizer a = new normalizer(); + String content = a.normalize(totxt); + + return content; + } +} diff --git a/src/fr/imag/Scidetect/TextExtractor/commandexecutor.java b/src/fr/imag/forge/Scidetect/TextExtractor/commandexecutor.java similarity index 97% rename from src/fr/imag/Scidetect/TextExtractor/commandexecutor.java rename to src/fr/imag/forge/Scidetect/TextExtractor/commandexecutor.java index 7ca3c39..39e6a0c 100644 --- a/src/fr/imag/Scidetect/TextExtractor/commandexecutor.java +++ b/src/fr/imag/forge/Scidetect/TextExtractor/commandexecutor.java @@ -14,7 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -package fr.imag.Scidetect.TextExtractor; +package fr.imag.forge.Scidetect.TextExtractor; import java.io.BufferedReader; import java.io.IOException; diff --git a/src/fr/imag/forge/Scidetect/TextExtractor/commandexecutor.java~ b/src/fr/imag/forge/Scidetect/TextExtractor/commandexecutor.java~ new file mode 100644 index 0000000..879b91f --- /dev/null +++ b/src/fr/imag/forge/Scidetect/TextExtractor/commandexecutor.java~ @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +package fr.imag.Scidetect.TextExtractor; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +/** + * + * @author tien + */ +public class commandexecutor { + public static String execute(String command){ + StringBuilder sb = new StringBuilder(); + String[] commands = new String[]{"/bin/sh","-c", command}; + try { + Process proc = new ProcessBuilder(commands).start(); + BufferedReader stdInput = new BufferedReader(new + InputStreamReader(proc.getInputStream())); + + BufferedReader stdError = new BufferedReader(new + InputStreamReader(proc.getErrorStream())); + + String s = null; + while ((s = stdInput.readLine()) != null) { + sb.append(s); + sb.append("\n"); + } + + while ((s = stdError.readLine()) != null) { + sb.append(s); + sb.append("\n"); + } + } catch (IOException e) { + e.printStackTrace(); + } + return "done"; + } +} diff --git a/src/fr/imag/Scidetect/TextExtractor/normalizer.java b/src/fr/imag/forge/Scidetect/TextExtractor/normalizer.java similarity index 97% rename from src/fr/imag/Scidetect/TextExtractor/normalizer.java rename to src/fr/imag/forge/Scidetect/TextExtractor/normalizer.java index 56b459e..2f54b08 100644 --- a/src/fr/imag/Scidetect/TextExtractor/normalizer.java +++ b/src/fr/imag/forge/Scidetect/TextExtractor/normalizer.java @@ -14,7 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -package fr.imag.Scidetect.TextExtractor; +package fr.imag.forge.Scidetect.TextExtractor; import java.io.BufferedReader; import java.io.File; diff --git a/src/fr/imag/forge/Scidetect/TextExtractor/normalizer.java~ b/src/fr/imag/forge/Scidetect/TextExtractor/normalizer.java~ new file mode 100644 index 0000000..a9ecfef --- /dev/null +++ b/src/fr/imag/forge/Scidetect/TextExtractor/normalizer.java~ @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
/**
 * Reduces a text file to upper-case words separated by single spaces.
 *
 * @author tien
 */
public class normalizer {

    /**
     * Normalizes the text stored in {@code txt}, overwrites the file with the
     * normalized text (followed by a line separator) and returns that text.
     *
     * <p>Steps, in order: all lines are joined, each one prefixed with a
     * space; the text is upper-cased; hyphens become spaces; every character
     * that is not {@code A-Z} or a space is removed; runs of spaces collapse
     * to one. Because each line is prefixed with a space, the result for a
     * file with at least one line starts with a single space. An empty file
     * yields the empty string.
     *
     * @param txt the text file to normalize; it is rewritten in place
     * @return the normalized text
     * @throws IOException if the file cannot be read or written
     */
    public String normalize(File txt) throws IOException {
        StringBuilder joined = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new FileReader(txt))) {
            String line;
            while ((line = br.readLine()) != null) {
                joined.append(' ').append(line);
            }
        }

        // Locale.ROOT keeps the mapping i -> I stable. With the default locale
        // a Turkish environment maps i to a dotted capital I, which the A-Z
        // filter below would then delete.
        String content = joined.toString().toUpperCase(java.util.Locale.ROOT);
        content = content.replace('-', ' ');          // split hyphenated words
        // Note: this also drops tabs and other non-space whitespace inside a line.
        content = content.replaceAll("[^A-Z ]", "");  // keep letters and spaces only
        content = content.replaceAll("\\s+", " ");    // collapse runs of spaces

        try (PrintWriter out = new PrintWriter(txt)) {
            out.println(content);
            // PrintWriter never throws on write errors; surface them explicitly.
            if (out.checkError()) {
                throw new IOException("could not write normalized text to " + txt.getPath());
            }
        }
        return content;
    }

}
*/ -package fr.imag.Scidetect.TextExtractor; +package fr.imag.forge.Scidetect.TextExtractor; import java.io.BufferedWriter; import java.io.File; diff --git a/src/fr/imag/forge/Scidetect/TextExtractor/pdfextractor.java~ b/src/fr/imag/forge/Scidetect/TextExtractor/pdfextractor.java~ new file mode 100644 index 0000000..cea9d96 --- /dev/null +++ b/src/fr/imag/forge/Scidetect/TextExtractor/pdfextractor.java~ @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +package fr.imag.Scidetect.TextExtractor; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.text.Normalizer; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.util.PDFTextStripper; + +/** + * + * @author tien + */ +public class pdfextractor { + + public String pdfextract(File pdf) throws IOException { + + PDFTextStripper stripper = new PDFTextStripper(); + PDDocument pd; + BufferedWriter wr; + System.out + .println("converting: " + pdf.getPath()); + File totxt = new File(pdf.getPath() + .substring(0, pdf.getPath().lastIndexOf('.')) + ".txt"); + + try { + pd = PDDocument.load(pdf.getPath()); + wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(totxt))); + stripper.writeText(pd, wr); + if (pd != null) { + pd.close(); + } + // I use close() to flush the stream. + wr.close(); + } catch (Exception e) { + // TODO: handle exception + } + //this seems to be faster but it seems like the app server does not support pdftotext + //commandexecutor cm = new commandexecutor(); + //cm.execute("pdftotext "+ listOfFile[j].getPath()); + + // ok now I have the txt file; lets normalize it + normalizer a = new normalizer(); + String content = a.normalize(totxt); + return content; + } + +} diff --git a/src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java b/src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java new file mode 100644 index 0000000..a115e12 --- /dev/null +++ b/src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any 
later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package fr.imag.forge.Scidetect.scigenchecker_local; + +import fr.imag.forge.Scidetect.Checker.Classifier; +import fr.imag.forge.Scidetect.Checker.DistantCalculator; +import fr.imag.forge.Scidetect.Checker.Indexer; +import fr.imag.forge.Scidetect.Checker.Reader; +import fr.imag.forge.Scidetect.Logger.Log; +import fr.imag.forge.Scidetect.TextExtractor.pdfextractor; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; + +/** + * + * @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + */ +public class ScigenChecker_Local { + + // private String loglocation; + // private String detailloglocation; + private String testpath; + //private String logtime; + private HashMap> samples = new HashMap>(); + private HashMap> tests = new HashMap>(); + private String SamplesFolder; + private HashMap> distant = new HashMap>(); + private Boolean savedetaillog = false; + + private void readconfig() throws FileNotFoundException, IOException { + File conf = new File("config.txt"); + BufferedReader br = new BufferedReader(new FileReader(conf)); + String line; + while ((line = br.readLine()) != null) { + if (!line.startsWith("#")) { + // System.out.println(line); + String[] b = line.split("\t"); + if (b[0].equals("samples")) { + SamplesFolder = b[1]; + } + if (b[0].equals("Default_log_folder")) { + + Log.loglocation = b[1]; + // 
System.out.println(loglocation); + } + if (b[0].equals("Default_detail_log_folder")) { + Log.detailloglocation = b[1]; + //System.out.println(detailloglocation); + } + } + } + + } + + private void compute() throws IOException { + if (testpath != null) { + DateFormat dateFormat = new SimpleDateFormat("HH:mm dd.MM.yyyy"); + Date date = new Date(); + Log.logtime = dateFormat.format(date); + try { + Reader reader = new Reader(); + + reader.readconfig(); + samples = reader.readsamples(SamplesFolder); + tests = reader.readtests(testpath); + + } catch (Exception e) { + + e.printStackTrace(); + } + + DistantCalculator dc = new DistantCalculator(); + distant = dc.caldistant(samples, tests); + Classifier cl = new Classifier(); + String conclusion = cl.classify(distant); + System.out.println(conclusion); + Log log = new Log(); + log.savelog(conclusion); + if (savedetaillog) { + log.savedetaillog(distant); + } + } else { + System.out.println("can not read path to test folder"); + } + } + + public void readargs(String[] args) { + + for (int i = 0; i < args.length; i += 1) { + // System.out.println(args[i]); + if (args[i].equals("-l")) { + Log.loglocation = args[i + 1]; + } + if (args[i].equals("-c")) { + testpath = args[i + 1]; + } + if (args[i].equals("-d")) { + savedetaillog = true; + } + } + } + + /** + * @param args the command line arguments + */ + public static void main(String[] args) throws IOException { + + ScigenChecker_Local a = new ScigenChecker_Local(); + a.readconfig(); + a.readargs(args); + a.compute(); + + } + +} diff --git a/src/fr/imag/Scidetect/scigenchecker_local/ScigenChecker_Local.java b/src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java~ similarity index 98% rename from src/fr/imag/Scidetect/scigenchecker_local/ScigenChecker_Local.java rename to src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java~ index fd944ba..9f1223e 100644 --- a/src/fr/imag/Scidetect/scigenchecker_local/ScigenChecker_Local.java +++ 
b/src/fr/imag/forge/Scidetect/scigenchecker_local/ScigenChecker_Local.java~ @@ -1,5 +1,6 @@ /* * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH + * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,7 +37,7 @@ import java.util.HashMap; /** * - * @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr + * @author tien */ public class ScigenChecker_Local { -- GitLab