Commit a555af2b authored by Tien

separate text and corpus class

parent 90fbf481
......@@ -84,7 +84,7 @@ public class Classifier {
}
/**
* Check whether the distance is below, between, or above the two thresholds.
* @param result a string composed having for each classe the value of its NN
* @param result a string composed having for each classes the value of its NN
* @return
*/
private String checkdistant(String result) {
......
......@@ -21,6 +21,7 @@ import java.util.HashSet;
import java.util.Set;
import fr.imag.forge.Scidetect.Checker.Utils.DistancesSet;
import fr.imag.forge.scidetect.Corpus.Corpus;
/**
*
......@@ -31,7 +32,7 @@ public class DistantCalculator {
//private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>();
private DistancesSet distant = new DistancesSet();
public DistancesSet caldistant(HashMap<String, HashMap<String, Integer>> samples, HashMap<String, HashMap<String, Integer>> tests) {
public DistancesSet caldistant(Corpus samples, Corpus tests) {
for (String key : tests.keySet()) {
//HashMap<String, Double> distantto = new HashMap<String, Double>();
for (String key2 : samples.keySet()) {
......
......@@ -16,8 +16,12 @@
*/
package fr.imag.forge.Scidetect.Checker;
import com.sun.corba.se.spi.transport.CorbaAcceptor;
import fr.imag.forge.Scidetect.TextExtractor.Xmlextractor;
import fr.imag.forge.Scidetect.TextExtractor.pdfextractor;
import fr.imag.forge.scidetect.Corpus.Corpus;
import fr.imag.forge.scidetect.Corpus.ProcessText;
import fr.imag.forge.scidetect.Corpus.Text;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
......@@ -36,8 +40,10 @@ import java.util.List;
*/
public class Reader {
private HashMap<String, HashMap<String, Integer>> samples = new HashMap<String, HashMap<String, Integer>>();
private HashMap<String, HashMap<String, Integer>> tests = new HashMap<String, HashMap<String, Integer>>();
//private HashMap<String, HashMap<String, Integer>> samples = new HashMap<String, HashMap<String, Integer>>();
// private HashMap<String, HashMap<String, Integer>> tests = new HashMap<String, HashMap<String, Integer>>();
private Corpus samples = new Corpus();
private Corpus test = new Corpus();
private String SamplesFolder;
private int maxlength;
......@@ -56,149 +62,51 @@ public class Reader {
//other config should be read over here
if (b[0].equals("Max_length")) {
maxlength = Integer.parseInt(b[1]);
ProcessText.maxlength = Integer.parseInt(b[1]);
}
}
}
}
public HashMap<String, HashMap<String, Integer>> readsamples(String foldername) throws IOException {
public Corpus readsamples(String foldername) throws IOException {
File folder = new File(foldername);
File[] listOfFile = folder.listFiles();
for (int j = 0; j < listOfFile.length; j++) {
if (listOfFile[j].isDirectory()) {
readsamples(listOfFile[j].getPath());
} else if (listOfFile[j].getName().endsWith(".pdf") || listOfFile[j].getName().endsWith(".txt") && !listOfFile[j].getName().startsWith("INDEX-")) {
// find if there is already index for it
String indexname = "INDEX-"
+ listOfFile[j].getName().substring(0,
listOfFile[j].getName().lastIndexOf("."))
+ ".txt";
if (Arrays.asList(listOfFile).toString().contains(indexname)) {
// System.out.println("lets read from index file");
readindexfile(listOfFile[j].getParent() + "/" + indexname);
} else if (listOfFile[j].getName().endsWith(".pdf")) {
pdfextractor a = new pdfextractor();
String content = a.pdfextract(listOfFile[j]);
Indexer b = new Indexer();
b.index(content, listOfFile[j]);
readindexfile(listOfFile[j].getParent() + "/" + indexname);
} else if (listOfFile[j].getName().endsWith(".pdf") || listOfFile[j].getName().endsWith(".xml") ||listOfFile[j].getName().endsWith(".xtx")||(listOfFile[j].getName().endsWith(".txt") && !listOfFile[j].getName().startsWith("INDEX-"))) {
ArrayList<Text> text = new ArrayList<Text>();
ProcessText textprocessor = new ProcessText();
text = textprocessor.newtext(listOfFile[j], listOfFile);
for (int i = 0; i < text.size(); i++) {
samples.put(text.get(i));
}
}
}
return samples;
}
private void readindexfile(String path) throws IOException {
File index = new File(path);
BufferedReader br;
br = new BufferedReader(new FileReader(index));
String line;
HashMap<String, Integer> a = new HashMap<String, Integer>();
while ((line = br.readLine()) != null) {
String[] b = line.split("\t");
a.put(b[0], Integer.parseInt(b[1]));
}
br.close();
if (path.contains(SamplesFolder)) {
samples.put(path, a);
} else {
tests.put(path, a);
}
}
private void readfile(File pdf) throws IOException {
String content = "";
if (pdf.getName().endsWith(".pdf")) {
pdfextractor a = new pdfextractor();
content = a.pdfextract(pdf);
} else if (pdf.getName().endsWith(".xml") || pdf.getName().endsWith(".xtx")) {
Xmlextractor a = new Xmlextractor();
content = a.xmlextract(pdf);
}
//lets deal with long file over here
//split content and the index part by part
if (content.length() < maxlength) {
String indexname = "INDEX-"
+ pdf.getName().substring(0,
pdf.getName().lastIndexOf("."))
+ ".txt";
Indexer b = new Indexer();
b.index(content, pdf);
readindexfile(pdf.getParent() + "/" + indexname);
} else {
String[] part = splitcontent(content);
for (int i = 0; i < part.length; i++) {
String indexname = "INDEX-"
+ pdf.getName().substring(0,
pdf.getName().lastIndexOf("."))
+ "_part" + i + ".txt";
String filename = pdf.getName().substring(0,
pdf.getName().lastIndexOf("."))
+ "_part" + i + ".txt";
Indexer b = new Indexer();
File a = new File(pdf.getParent() + "/" + filename);
PrintWriter out = new PrintWriter(new FileWriter(a));
out.println(part[i]);
//System.out.println(text);
out.close();
b.index(part[i], a);
readindexfile(a.getParent() + "/" + indexname);
return samples;
}
}
}
public HashMap<String, HashMap<String, Integer>> readtests(String testpath) throws IOException {
File folder = new File(testpath);
if (folder.isDirectory()) {
File[] listOfFile = folder.listFiles();
for (int j = 0; j < listOfFile.length; j++) {
if (listOfFile[j].isDirectory()) {
readtests(listOfFile[j].getPath());
} else if (listOfFile[j].getName().endsWith(".pdf") || listOfFile[j].getName().endsWith(".xml") || listOfFile[j].getName().endsWith(".xtx")) {
readfile(listOfFile[j]);
public Corpus readtests(String foldername) throws IOException {
File folder = new File(foldername);
File[] listOfFile = folder.listFiles();
for (int j = 0; j < listOfFile.length; j++) {
if (listOfFile[j].isDirectory()) {
readtests(listOfFile[j].getPath());
} else if (listOfFile[j].getName().endsWith(".pdf") || listOfFile[j].getName().endsWith(".xml") ||listOfFile[j].getName().endsWith(".xtx")) {
ArrayList<Text> text = new ArrayList<Text>();
ProcessText textprocessor = new ProcessText();
text = textprocessor.newtext(listOfFile[j], listOfFile);
for (int i = 0; i < text.size(); i++) {
test.put(text.get(i));
}
}
} else if (folder.getName().endsWith(".pdf") || folder.getName().endsWith(".xml") || folder.getName().endsWith(".xtx")) {
readfile(folder);
}
return tests;
}
private String[] splitcontent(String content) {
int nbofpart = content.length() / maxlength;
String[] part = new String[nbofpart + 1];
int lower = 0;
int upper = 0;
int i;
for (i = 0; i < nbofpart; i++) {
upper += maxlength;
part[i] = content.substring(lower, upper);
lower = upper;
}
return test;
if (upper <= content.length() - 1) {
lower = upper;
upper = content.length();
part[i] = (content.substring(lower, upper));
}
return part;
}
}
......@@ -23,6 +23,7 @@ import fr.imag.forge.Scidetect.Checker.Reader;
import fr.imag.forge.Scidetect.Checker.Utils.DistancesSet;
import fr.imag.forge.Scidetect.Logger.Log;
import fr.imag.forge.Scidetect.TextExtractor.pdfextractor;
import fr.imag.forge.scidetect.Corpus.Corpus;
import java.io.BufferedReader;
import java.io.File;
......@@ -46,8 +47,8 @@ public class ScigenChecker_Local {
// private String detailloglocation;
private String testpath;
//private String logtime;
private HashMap<String, HashMap<String, Integer>> samples = new HashMap<String, HashMap<String, Integer>>();
private HashMap<String, HashMap<String, Integer>> tests = new HashMap<String, HashMap<String, Integer>>();
private Corpus samples = new Corpus();
private Corpus tests = new Corpus();
private String SamplesFolder;
//private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>();
DistancesSet distant = new DistancesSet();
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.scidetect.Corpus;
import java.util.HashMap;
import java.util.Set;
/**
 * A set of {@link Text} objects, keyed by the name each text reports through
 * {@code getname()}.
 *
 * @author tien
 */
public class Corpus {

    /** Texts of this corpus, indexed by their name. */
    private final HashMap<String, Text> corpus = new HashMap<String, Text>();

    /**
     * Adds a text to the corpus under its own name. A text previously stored
     * under the same name is replaced.
     *
     * @param a the text to store
     */
    public void put(Text a) {
        corpus.put(a.getname(), a);
    }

    /**
     * Gives access to the backing map (not a copy), so changes made through
     * the returned map are visible in this corpus.
     *
     * @return the map from text name to text
     */
    public HashMap<String, Text> getcorpus() {
        return corpus;
    }

    /**
     * @return the names of all texts currently stored
     */
    public Set<String> keySet() {
        return corpus.keySet();
    }

    /**
     * Looks up the index of the text stored under the given name.
     *
     * @param name name of the text
     * @return the index of that text, or {@code null} when no text with this
     * name is stored (same contract as {@code HashMap.get}, instead of
     * failing with a NullPointerException)
     */
    public HashMap<String, Integer> get(String name) {
        Text found = corpus.get(name);
        if (found == null) {
            return null;
        }
        return found.getindex();
    }
}
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.scidetect.Corpus;
import fr.imag.forge.Scidetect.Checker.Indexer;
import fr.imag.forge.Scidetect.TextExtractor.Xmlextractor;
import fr.imag.forge.Scidetect.TextExtractor.pdfextractor;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Manages the text files of the corpus: locates or builds the index of a
 * source file and wraps each index in a {@link Text}.
 *
 * @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
 */
public class ProcessText {

    /**
     * Maximum number of characters indexed as one text; longer content is cut
     * into parts. A value of zero or less disables splitting.
     */
    public static int maxlength;

    /** Texts produced so far by this instance; returned by {@link #newtext}. */
    ArrayList<Text> text = new ArrayList<Text>();

    /**
     * Processes a file (pdf, xml, xtx) into one or more texts. If an index
     * file named {@code INDEX-<basename>.txt} is already present in the
     * directory listing it is read directly; otherwise the content is
     * extracted, indexed (in several parts when it reaches
     * {@link #maxlength}) and the resulting index file(s) are read.
     *
     * @param original the file to process
     * @param listOfFile listing of the directory containing {@code original};
     * may be {@code null}, which is treated as "no index present"
     * @return the texts accumulated by this instance, including the new ones
     * @throws IOException if an index or part file cannot be read or written
     */
    public ArrayList<Text> newtext(File original, File[] listOfFile) throws IOException {
        String basename = basename(original);
        String indexname = "INDEX-" + basename + ".txt";

        // Reuse an existing index instead of extracting the content again.
        if (containsfilenamed(listOfFile, indexname)) {
            readindexfile(original.getParent() + "/" + indexname);
            return text;
        }

        String content = extractcontent(original);

        // Short content (or splitting disabled) is indexed in one piece.
        if (maxlength <= 0 || content.length() < maxlength) {
            Indexer indexer = new Indexer();
            indexer.index(content, original);
            readindexfile(original.getParent() + "/" + indexname);
            return text;
        }

        // Long content: write each part to its own file and index it separately.
        String[] part = splitcontent(content);
        for (int i = 0; i < part.length; i++) {
            String filename = basename + "_part" + i + ".txt";
            String indexnameparti = "INDEX-" + filename;
            File partfile = new File(original.getParent() + "/" + filename);
            writepart(partfile, part[i]);
            Indexer indexer = new Indexer();
            indexer.index(part[i], partfile);
            readindexfile(partfile.getParent() + "/" + indexnameparti);
        }
        return text;
    }

    /**
     * Returns the file name without its last extension, or the whole name
     * when it contains no dot.
     */
    private String basename(File file) {
        String name = file.getName();
        int dot = name.lastIndexOf(".");
        return dot < 0 ? name : name.substring(0, dot);
    }

    /**
     * Tells whether the listing holds a file whose name is exactly
     * {@code wanted}. An exact comparison avoids the false positives of a
     * substring search over the stringified paths.
     */
    private boolean containsfilenamed(File[] listOfFile, String wanted) {
        if (listOfFile == null) {
            return false;
        }
        for (File candidate : listOfFile) {
            if (candidate != null && wanted.equals(candidate.getName())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts the textual content of a pdf, xml or xtx file. Any other
     * extension, or a pdf that cannot be found, yields an empty string.
     */
    private String extractcontent(File original) throws IOException {
        String content = "";
        String name = original.getName();
        if (name.endsWith(".pdf")) {
            try {
                pdfextractor extractor = new pdfextractor();
                content = extractor.pdfextract(original);
            } catch (FileNotFoundException ex) {
                // Best effort: log and continue with empty content.
                Logger.getLogger(ProcessText.class.getName()).log(Level.SEVERE, null, ex);
            }
        } else if (name.endsWith(".xml") || name.endsWith(".xtx")) {
            Xmlextractor extractor = new Xmlextractor();
            content = extractor.xmlextract(original);
        }
        // NOTE(review): other extensions (e.g. .txt) end up indexed with empty
        // content here - confirm whether that is intended.
        return content;
    }

    /** Writes one part of a split content to its own file, always closing the writer. */
    private void writepart(File target, String content) throws IOException {
        PrintWriter out = new PrintWriter(new FileWriter(target));
        try {
            out.println(content);
        } finally {
            out.close();
        }
    }

    /**
     * Reads an index file made of {@code key<TAB>integer} lines, registers a
     * new {@link Text} named after {@code path} holding that index, and
     * returns the index.
     *
     * @param path path of the index file
     * @return the index read from the file
     * @throws IOException if the file cannot be read
     */
    private HashMap<String, Integer> readindexfile(String path) throws IOException {
        File index = new File(path);
        HashMap<String, Integer> a = new HashMap<String, Integer>();
        BufferedReader br = new BufferedReader(new FileReader(index));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                String[] b = line.split("\t");
                a.put(b[0], Integer.parseInt(b[1]));
            }
        } finally {
            // Close the reader even when a line fails to parse.
            br.close();
        }
        Text c = new Text();
        c.setindex(a);
        c.setname(path);
        text.add(c);
        return a;
    }

    /**
     * Cuts the content into consecutive chunks of at most {@link #maxlength}
     * characters. The returned array never contains {@code null}: no empty
     * trailing slot is produced when the length is an exact multiple of
     * {@link #maxlength}.
     *
     * @param content the content to split
     * @return the chunks in order; a single-element array holding the whole
     * content when splitting is disabled or the content is empty
     */
    private String[] splitcontent(String content) {
        int length = content.length();
        if (maxlength <= 0 || length == 0) {
            return new String[]{content};
        }
        int nbofpart = length / maxlength + (length % maxlength == 0 ? 0 : 1);
        String[] part = new String[nbofpart];
        int lower = 0;
        for (int i = 0; i < nbofpart; i++) {
            int upper = lower + Math.min(maxlength, length - lower);
            part[i] = content.substring(lower, upper);
            lower = upper;
        }
        return part;
    }
}
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.scidetect.Corpus;
import java.util.HashMap;
/**
 * One text of a corpus: a name together with an index map from String keys
 * to Integer values (presumably term to occurrence count - confirm against
 * the indexer).
 *
 * @author tien
 */
public class Text {

    private String name = "";
    private String cleantext = "";
    private HashMap<String, Integer> index = new HashMap<String, Integer>();

    /**
     * @return the name of this text; the empty string until one is set
     */
    public String getname() {
        return this.name;
    }

    /**
     * @param a the new name of this text
     */
    public void setname(String a) {
        this.name = a;
    }

    /**
     * @return the index currently attached to this text (the stored map
     * itself, not a copy); an empty map until one is set
     */
    public HashMap<String, Integer> getindex() {
        return this.index;
    }

    /**
     * Replaces the index of this text; the given map is stored as is.
     *
     * @param a the new index
     */
    public void setindex(HashMap<String, Integer> a) {
        this.index = a;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment