Commit 45f7cbea authored by Tien's avatar Tien

rename packages

parent a555af2b
......@@ -14,7 +14,7 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.Scidetect.Checker;
package fr.imag.forge.scidetect.Checker;
import java.io.BufferedReader;
import java.io.File;
......@@ -24,8 +24,8 @@ import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashMap;
import fr.imag.forge.Scidetect.Checker.Utils.DistancesSet;
import fr.imag.forge.Scidetect.Checker.Utils.ThresholdsSet;
import fr.imag.forge.scidetect.Checker.Utils.DistancesSet;
import fr.imag.forge.scidetect.Checker.Utils.ThresholdsSet;
/**
* Classifier tags input files as being of a certain class. Examples of classes are SCIgen, Mathgen,...
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
* author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package fr.imag.Scidetect.Checker;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashMap;
/**
*
* @author tien
*/
/**
 * Tags each test document with the class of its nearest sample, using per-class
 * distance thresholds read from {@code config.txt} in the working directory.
 *
 * <p>The class of a sample is the name of the directory that directly contains
 * its index file.
 *
 * @author tien
 */
public class Classifier {

    /** Configuration file, resolved against the current working directory. */
    private static final String CONFIG_FILE = "config.txt";
    /** Prefix of the configuration lines that define thresholds. */
    private static final String THRESHOLD_PREFIX = "Threshold_";
    /** Threshold entry used for classes that have no entry of their own. */
    private static final String DEFAULT_TYPE = "Default";

    /**
     * Thresholds per class name: element 0 is the bound below which a document
     * "is a" member of the class, element 1 the bound below which it is only
     * suspected to be one.
     */
    HashMap<String, Double[]> Threshold = new HashMap<String, Double[]>();

    /**
     * Loads every {@code Threshold_<class>\t<bound0>\t<bound1>} line of
     * {@code config.txt} into {@link #Threshold}; all other lines are ignored.
     *
     * @throws FileNotFoundException if {@code config.txt} cannot be opened
     * @throws IOException if reading the file fails
     */
    public void readconfig() throws FileNotFoundException, IOException {
        BufferedReader br = new BufferedReader(new FileReader(new File(CONFIG_FILE)));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (!line.startsWith(THRESHOLD_PREFIX)) {
                    continue;
                }
                String[] fields = line.split("\t");
                Double[] bounds = new Double[2];
                bounds[0] = Double.parseDouble(fields[1]);
                bounds[1] = Double.parseDouble(fields[2]);
                // The key is the class name, i.e. the text after "Threshold_".
                Threshold.put(fields[0].substring(THRESHOLD_PREFIX.length()), bounds);
            }
        } finally {
            // Always release the file handle, even when a line is malformed.
            br.close();
        }
    }

    /**
     * Classifies every test document.
     *
     * @param distant for each test key, the distance to every sample (keyed by
     *                the sample's index-file path)
     * @return one line per verdict, formatted as
     *         {@code <test key>\t<verdict>\t<distance>\t<nearest sample path>\n};
     *         a test for which no verdict can be produced yields
     *         {@code <test key>\tcant classify\t1\tnull\n}
     * @throws IOException if the configuration cannot be read
     * @throws IllegalStateException if a class has no threshold and no
     *         {@code Threshold_Default} entry is configured
     */
    public String classify(HashMap<String, HashMap<String, Double>> distant) throws IOException {
        readconfig();
        StringBuilder conclusion = new StringBuilder();
        for (String key : distant.keySet()) {
            String nearest = find_NN(distant.get(key));
            // No samples at all means there is nothing to compare against.
            String verdicts = nearest.isEmpty() ? "" : checkdistant(nearest);
            if (verdicts.isEmpty()) {
                conclusion.append(key).append("\t").append("cant classify\t1\tnull\n");
            } else {
                for (String verdict : verdicts.split("\n")) {
                    conclusion.append(key).append("\t").append(verdict).append("\n");
                }
            }
        }
        return conclusion.toString();
    }

    /**
     * Compares each per-class nearest-neighbour row against that class's
     * thresholds; falls back to {@link #findmindistant(String)} when no class
     * matches.
     *
     * @param result rows of {@code <class>\t<distance>\t<sample path>} separated by newlines
     * @return the verdict rows, possibly empty
     */
    private String checkdistant(String result) {
        StringBuilder conclusion = new StringBuilder();
        for (String row : result.split("\n")) {
            String[] nn = row.split("\t");
            String type = nn[0];
            Double[] threshold = Threshold.containsKey(type)
                    ? Threshold.get(type)
                    : Threshold.get(DEFAULT_TYPE);
            if (threshold == null) {
                throw new IllegalStateException("No threshold configured for '" + type
                        + "' and no " + THRESHOLD_PREFIX + DEFAULT_TYPE + " entry in " + CONFIG_FILE);
            }
            double distance = Double.parseDouble(nn[1]);
            if (distance < threshold[0]) {
                conclusion.append("is a ").append(type).append("\t")
                        .append(nn[1]).append("\t").append(nn[2]).append("\n");
            } else if (distance < threshold[1]) {
                conclusion.append("is suppected ").append(type).append("\t")
                        .append(nn[1]).append("\t").append(nn[2]).append("\n");
            }
        }
        if (conclusion.length() == 0) {
            return findmindistant(result);
        }
        return conclusion.toString();
    }

    /**
     * Reports the closest sample overall as evidence that the document is genuine.
     *
     * @param result rows of {@code <class>\t<distance>\t<sample path>} separated by newlines
     * @return a single "is Genuine" row, or the empty string when no distance is below 1.0
     */
    private String findmindistant(String result) {
        double minDistance = 1.0;
        String conclusion = "";
        for (String row : result.split("\n")) {
            String[] nn = row.split("\t");
            double distance = Double.parseDouble(nn[1]);
            if (distance < minDistance) {
                minDistance = distance;
                conclusion = "is Genuine \t" + nn[1] + "\t" + nn[2] + "\n";
            }
        }
        return conclusion;
    }

    /**
     * Returns the name of the directory that directly contains the given index
     * file, or the empty string when the path has no parent directory.
     */
    private String gettype(String indexpath) {
        // getParentFile().getName() is separator-agnostic, unlike searching for "/".
        File parent = new File(indexpath).getParentFile();
        return parent == null ? "" : parent.getName();
    }

    /**
     * Finds, for each sample class, the sample nearest to one test document.
     *
     * @param distantto distance from the test document to every sample, keyed by sample path
     * @return rows of {@code <class>\t<distance>\t<sample path>\n}, one per class
     */
    private String find_NN(HashMap<String, Double> distantto) {
        HashMap<String, Double> nearestDistance = new HashMap<String, Double>();
        HashMap<String, String> nearestSample = new HashMap<String, String>();
        for (String samplePath : distantto.keySet()) {
            String type = gettype(samplePath);
            Double distance = distantto.get(samplePath);
            Double best = nearestDistance.get(type);
            if (best == null || distance < best) {
                nearestDistance.put(type, distance);
                nearestSample.put(type, samplePath);
            }
        }
        StringBuilder result = new StringBuilder();
        for (String type : nearestDistance.keySet()) {
            result.append(type).append("\t").append(nearestDistance.get(type))
                    .append("\t").append(nearestSample.get(type)).append("\n");
        }
        return result.toString();
    }
}
......@@ -14,13 +14,13 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.Scidetect.Checker;
package fr.imag.forge.scidetect.Checker;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import fr.imag.forge.Scidetect.Checker.Utils.DistancesSet;
import fr.imag.forge.scidetect.Checker.Utils.DistancesSet;
import fr.imag.forge.scidetect.Corpus.Corpus;
/**
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
* author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package fr.imag.Scidetect.Checker;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
/**
*
* @author tien
*/
/**
 * Computes the textual distance between every test document and every sample
 * document, each document being given as a token-to-count map.
 *
 * @author tien
 */
public class DistantCalculator {

    /** Results of all {@link #caldistant} calls made on this instance so far. */
    private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>();

    /**
     * Computes the distance from each test to each sample and records it.
     *
     * @param samples token counts of the sample documents, keyed by sample id
     * @param tests   token counts of the test documents, keyed by test id
     * @return for each test id, a map from sample id to distance; this is the
     *         instance-wide map, so results of earlier calls are still present
     */
    public HashMap<String, HashMap<String, Double>> caldistant(HashMap<String, HashMap<String, Integer>> samples, HashMap<String, HashMap<String, Integer>> tests) {
        for (String testKey : tests.keySet()) {
            HashMap<String, Double> distantto = new HashMap<String, Double>();
            for (String sampleKey : samples.keySet()) {
                distantto.put(sampleKey, cal_textdistant(tests.get(testKey), samples.get(sampleKey)));
            }
            distant.put(testKey, distantto);
        }
        return distant;
    }

    /**
     * Distance between two token-count profiles. The counts of the longer text
     * are scaled down to the length of the shorter one, then the absolute
     * differences are summed and divided by twice the shorter length, so two
     * texts with the same relative frequencies give 0.0 and two texts sharing
     * no token give 1.0.
     *
     * <p>A text with no tokens has no profile to compare; the distance is then
     * defined as 1.0 rather than the NaN a 0/0 division would produce, because
     * NaN defeats every {@code <} comparison made on the result.
     */
    private double cal_textdistant(HashMap<String, Integer> text1,
            HashMap<String, Integer> text2) {
        int na = totalCount(text1);
        int nb = totalCount(text2);
        if (na == 0 || nb == 0) {
            return 1.0;
        }

        // Only the longer text is scaled; the shorter one keeps factor 1.
        double scaleA = na <= nb ? 1.0 : nb / (double) na;
        double scaleB = na <= nb ? na / (double) nb : 1.0;

        Set<String> allkeys = new HashSet<String>();
        allkeys.addAll(text1.keySet());
        allkeys.addAll(text2.keySet());

        double sum = 0.0;
        for (String key : allkeys) {
            sum += Math.abs(countOf(text1, key) * scaleA - countOf(text2, key) * scaleB);
        }
        return sum / (2.0 * Math.min(na, nb));
    }

    /** Total number of tokens in a text, i.e. the sum of all its counts. */
    private static int totalCount(HashMap<String, Integer> text) {
        int total = 0;
        for (Integer count : text.values()) {
            total += count;
        }
        return total;
    }

    /** Count of one token in a text, 0 when the token does not occur. */
    private static int countOf(HashMap<String, Integer> text, String key) {
        Integer count = text.get(key);
        return count == null ? 0 : count;
    }
}
......@@ -14,7 +14,7 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.Scidetect.Checker;
package fr.imag.forge.scidetect.Checker;
import java.io.File;
import java.io.FileNotFoundException;
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
* author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package fr.imag.Scidetect.Checker;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.HashMap;
/**
*
* @author tien
*/
/**
 * Writes a token-frequency index file next to a document.
 *
 * @author tien
 */
public class Indexer {

    /**
     * Counts the space-separated tokens of {@code content} and writes them, one
     * {@code <token>\t<count>} line each, to {@code INDEX-<base name>.txt} in the
     * directory of {@code textfile}. The base name is the file name without its
     * last extension; a name without any extension is used as is. An existing
     * index file is overwritten.
     *
     * @param content  text to index; split on single spaces only, so consecutive
     *                 spaces produce empty tokens that are counted as well
     * @param textfile the document the text came from; only its name and parent
     *                 directory are used
     * @throws FileNotFoundException if the index file cannot be created
     */
    public void index(String content, File textfile) throws FileNotFoundException {
        String name = textfile.getName();
        int dot = name.lastIndexOf('.');
        String baseName = dot >= 0 ? name.substring(0, dot) : name;
        // File(File, String) accepts a null parent, so a bare file name is
        // indexed in the working directory rather than under a "null" folder.
        File indexout = new File(textfile.getParentFile(), "INDEX-" + baseName + ".txt");

        HashMap<String, Integer> counter = new HashMap<String, Integer>();
        for (String word : content.split(" ")) {
            Integer seen = counter.get(word);
            counter.put(word, seen == null ? 1 : seen + 1);
        }

        PrintWriter out = new PrintWriter(indexout);
        try {
            for (String key : counter.keySet()) {
                out.println(key + "\t" + counter.get(key));
            }
        } finally {
            out.close();
        }
    }
}
......@@ -14,11 +14,11 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.Scidetect.Checker;
package fr.imag.forge.scidetect.Checker;
import com.sun.corba.se.spi.transport.CorbaAcceptor;
import fr.imag.forge.Scidetect.TextExtractor.Xmlextractor;
import fr.imag.forge.Scidetect.TextExtractor.pdfextractor;
import fr.imag.forge.scidetect.TextExtractor.Xmlextractor;
import fr.imag.forge.scidetect.TextExtractor.pdfextractor;
import fr.imag.forge.scidetect.Corpus.Corpus;
import fr.imag.forge.scidetect.Corpus.ProcessText;
import fr.imag.forge.scidetect.Corpus.Text;
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
* author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package fr.imag.Scidetect.Checker;
import fr.imag.Scidetect.TextExtractor.Xmlextractor;
import fr.imag.Scidetect.TextExtractor.pdfextractor;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
/**
*
* @author tien
*/
/**
 * Loads the token-frequency indexes of sample and test documents, creating the
 * index files first where needed.
 *
 * <p>{@link #readconfig()} must be called before any document is read: it
 * provides the samples folder used to tell samples from tests.
 *
 * @author tien
 */
public class Reader {

    /** Token counts of sample documents, keyed by index-file path. */
    private HashMap<String, HashMap<String, Integer>> samples = new HashMap<String, HashMap<String, Integer>>();
    /** Token counts of test documents, keyed by index-file path. */
    private HashMap<String, HashMap<String, Integer>> tests = new HashMap<String, HashMap<String, Integer>>();
    /** Value of the {@code samples} configuration entry. */
    private String SamplesFolder;
    /** Value of the {@code Max_length} configuration entry; 0 until configured. */
    private int maxlength;

    /**
     * Reads the tab-separated {@code samples} and {@code Max_length} entries of
     * {@code config.txt} (working directory). Lines starting with {@code #} and
     * all other entries are ignored.
     *
     * @throws FileNotFoundException if {@code config.txt} cannot be opened
     * @throws IOException if reading the file fails
     */
    public void readconfig() throws FileNotFoundException, IOException {
        BufferedReader br = new BufferedReader(new FileReader(new File("config.txt")));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.startsWith("#")) {
                    continue;
                }
                String[] fields = line.split("\t");
                if (fields[0].equals("samples")) {
                    SamplesFolder = fields[1];
                }
                // other config should be read over here
                if (fields[0].equals("Max_length")) {
                    maxlength = Integer.parseInt(fields[1]);
                }
            }
        } finally {
            br.close();
        }
    }

    /**
     * Recursively loads the index of every {@code .pdf} file under a folder,
     * extracting and indexing the PDF first when no index file exists beside it.
     *
     * @param foldername folder to scan
     * @return the samples loaded so far by this instance
     * @throws FileNotFoundException if {@code foldername} cannot be listed
     * @throws IOException if a document or index cannot be read or written
     */
    public HashMap<String, HashMap<String, Integer>> readsamples(String foldername) throws IOException {
        File[] entries = new File(foldername).listFiles();
        if (entries == null) {
            throw new FileNotFoundException("Cannot list samples folder: " + foldername);
        }
        for (File entry : entries) {
            if (entry.isDirectory()) {
                readsamples(entry.getPath());
            } else if (entry.getName().endsWith(".pdf")) {
                String indexPath = siblingPath(entry, "INDEX-" + baseName(entry) + ".txt");
                // Test for the exact file; a substring match on the directory
                // listing would also accept names that merely contain it.
                if (!new File(indexPath).isFile()) {
                    String content = new pdfextractor().pdfextract(entry);
                    new Indexer().index(content, entry);
                }
                readindexfile(indexPath);
            }
        }
        return samples;
    }

    /**
     * Parses one index file ({@code <token>\t<count>} per line) and stores it
     * under its path, as a sample when the path contains the configured samples
     * folder and as a test otherwise.
     */
    private void readindexfile(String path) throws IOException {
        if (SamplesFolder == null) {
            throw new IllegalStateException(
                    "No 'samples' folder configured; call readconfig() before reading documents");
        }
        HashMap<String, Integer> counts = new HashMap<String, Integer>();
        BufferedReader br = new BufferedReader(new FileReader(new File(path)));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                String[] fields = line.split("\t");
                counts.put(fields[0], Integer.parseInt(fields[1]));
            }
        } finally {
            br.close();
        }
        if (path.contains(SamplesFolder)) {
            samples.put(path, counts);
        } else {
            tests.put(path, counts);
        }
    }

    /**
     * Extracts, indexes and loads one test document. Content of
     * {@code Max_length} characters or more is cut into parts; each part is
     * written to {@code <base>_part<i>.txt} beside the document and indexed
     * separately. When no positive {@code Max_length} is configured the content
     * is never split.
     */
    private void readfile(File pdf) throws IOException {
        String content = "";
        String name = pdf.getName();
        if (name.endsWith(".pdf")) {
            content = new pdfextractor().pdfextract(pdf);
        } else if (name.endsWith(".xml") || name.endsWith(".xtx")) {
            content = new Xmlextractor().xmlextract(pdf);
        }

        String base = baseName(pdf);
        if (maxlength <= 0 || content.length() < maxlength) {
            new Indexer().index(content, pdf);
            readindexfile(siblingPath(pdf, "INDEX-" + base + ".txt"));
            return;
        }

        String[] part = splitcontent(content);
        for (int i = 0; i < part.length; i++) {
            String partName = base + "_part" + i + ".txt";
            File partFile = new File(siblingPath(pdf, partName));
            PrintWriter out = new PrintWriter(new FileWriter(partFile));
            try {
                out.println(part[i]);
            } finally {
                out.close();
            }
            new Indexer().index(part[i], partFile);
            readindexfile(siblingPath(partFile, "INDEX-" + partName));
        }
    }

    /**
     * Loads a single test document, or recursively every {@code .pdf},
     * {@code .xml} and {@code .xtx} file under a folder.
     *
     * @param testpath file or folder to read; anything else is ignored
     * @return the tests loaded so far by this instance
     * @throws IOException if a folder cannot be listed or a document or index
     *         cannot be read or written
     */
    public HashMap<String, HashMap<String, Integer>> readtests(String testpath) throws IOException {
        File target = new File(testpath);
        if (target.isDirectory()) {
            File[] entries = target.listFiles();
            if (entries == null) {
                throw new IOException("Cannot list test folder: " + testpath);
            }
            for (File entry : entries) {
                if (entry.isDirectory()) {
                    readtests(entry.getPath());
                } else if (isTestDocument(entry)) {
                    readfile(entry);
                }
            }
        } else if (isTestDocument(target)) {
            readfile(target);
        }
        return tests;
    }

    /**
     * Cuts the content into consecutive pieces of {@code maxlength} characters;
     * the last piece holds the remainder and may be shorter. Every element of
     * the returned array is non-null.
     */
    private String[] splitcontent(String content) {
        if (maxlength <= 0) {
            return new String[] { content };
        }
        List<String> parts = new ArrayList<String>();
        for (int lower = 0; lower < content.length(); lower += maxlength) {
            int upper = Math.min(lower + maxlength, content.length());
            parts.add(content.substring(lower, upper));
        }
        return parts.toArray(new String[parts.size()]);
    }

    /** True for the file types accepted as test documents. */
    private static boolean isTestDocument(File file) {
        String name = file.getName();
        return name.endsWith(".pdf") || name.endsWith(".xml") || name.endsWith(".xtx");
    }

    /** File name without its last extension; the whole name when it has none. */
    private static String baseName(File file) {
        String name = file.getName();
        int dot = name.lastIndexOf('.');
        return dot >= 0 ? name.substring(0, dot) : name;
    }

    /**
     * Path of a file called {@code name} in the same directory as {@code file}.
     * The "/" join is kept because these paths are used as map keys; a file
     * without parent directory yields the bare name.
     */
    private static String siblingPath(File file, String name) {
        String parent = file.getParent();
        return parent == null ? name : parent + "/" + name;
    }
}
package fr.imag.forge.Scidetect.Checker.Utils;
package fr.imag.forge.scidetect.Checker.Utils;
import java.util.HashMap;
......
package fr.imag.forge.Scidetect.Checker.Utils;
package fr.imag.forge.scidetect.Checker.Utils;
import java.io.BufferedReader;
import java.io.File;
......
......@@ -16,9 +16,9 @@
*/
package fr.imag.forge.scidetect.Corpus;
import fr.imag.forge.Scidetect.Checker.Indexer;
import fr.imag.forge.Scidetect.TextExtractor.Xmlextractor;
import fr.imag.forge.Scidetect.TextExtractor.pdfextractor;
import fr.imag.forge.scidetect.Checker.Indexer;
import fr.imag.forge.scidetect.TextExtractor.Xmlextractor;
import fr.imag.forge.scidetect.TextExtractor.pdfextractor;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
......
......@@ -14,7 +14,7 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.Scidetect.Logger;
package fr.imag.forge.scidetect.Logger;
import java.io.File;
import java.io.FileNotFoundException;
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
* author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package fr.imag.Scidetect.Logger;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.HashMap;
/**
*
* @author tien
*/
public class Log {
public static String loglocation;