/* * Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH * author Nguyen Minh Tien - minh-tien.nguyen@imag.fr * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package fr.imag.Scidetect.Checker; import fr.imag.Scidetect.TextExtractor.Xmlextractor; import fr.imag.Scidetect.TextExtractor.pdfextractor; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; /** * * @author tien */ public class Reader { private HashMap> samples = new HashMap>(); private HashMap> tests = new HashMap>(); private String SamplesFolder; private int maxlength; public void readconfig() throws FileNotFoundException, IOException { File conf = new File("config.txt"); BufferedReader br = new BufferedReader(new FileReader(conf)); String line; while ((line = br.readLine()) != null) { if (!line.startsWith("#")) { // System.out.println(line); String[] b = line.split("\t"); if (b[0].equals("samples")) { SamplesFolder = b[1]; } //other config should be read over here if (b[0].equals("Max_length")) { maxlength = Integer.parseInt(b[1]); } } } } public HashMap> readsamples(String foldername) throws IOException { File folder = new File(foldername); File[] listOfFile = folder.listFiles(); for (int j = 0; j < listOfFile.length; j++) { if (listOfFile[j].isDirectory()) { readsamples(listOfFile[j].getPath()); } else if (listOfFile[j].getName().endsWith(".pdf")) { // find if there is already index for it String indexname = "INDEX-" + listOfFile[j].getName().substring(0, listOfFile[j].getName().lastIndexOf(".")) + ".txt"; if (Arrays.asList(listOfFile).toString().contains(indexname)) { // System.out.println("lets read from index file"); readindexfile(listOfFile[j].getParent() + "/" + indexname); } else { pdfextractor a = new pdfextractor(); String content = a.pdfextract(listOfFile[j]); Indexer b = new Indexer(); b.index(content, listOfFile[j]); readindexfile(listOfFile[j].getParent() + "/" + indexname); } } } return samples; } private void readindexfile(String path) throws IOException { File index = new File(path); BufferedReader br; br = new BufferedReader(new FileReader(index)); String line; HashMap a = new HashMap(); while ((line = br.readLine()) != null) { String[] b = line.split("\t"); a.put(b[0], Integer.parseInt(b[1])); } br.close(); if (path.contains(SamplesFolder)) { samples.put(path, a); } else { tests.put(path, a); } } private void readfile(File pdf) throws IOException { String content = ""; if (pdf.getName().endsWith(".pdf")) { pdfextractor a = new pdfextractor(); content = a.pdfextract(pdf); } else if (pdf.getName().endsWith(".xml") || pdf.getName().endsWith(".xtx")) { Xmlextractor a = new Xmlextractor(); content = a.xmlextract(pdf); } //lets deal with long file over here //split content and the index part by part if (content.length() < maxlength) { String indexname = "INDEX-" + pdf.getName().substring(0, pdf.getName().lastIndexOf(".")) + ".txt"; Indexer b = new Indexer(); b.index(content, pdf); readindexfile(pdf.getParent() + "/" + indexname); } else { String[] part = splitcontent(content); for (int i = 0; i < part.length; i++) { String indexname = "INDEX-" + pdf.getName().substring(0, pdf.getName().lastIndexOf(".")) + "_part" + i + ".txt"; String filename = pdf.getName().substring(0, pdf.getName().lastIndexOf(".")) + "_part" + i + ".txt"; Indexer b = new Indexer(); File a = new File(pdf.getParent() + "/" + filename); PrintWriter out = new PrintWriter(new FileWriter(a)); out.println(part[i]); //System.out.println(text); out.close(); b.index(part[i], a); readindexfile(a.getParent() + "/" + indexname); } } } public HashMap> readtests(String testpath) throws IOException { File folder = new File(testpath); if (folder.isDirectory()) { File[] listOfFile = folder.listFiles(); for (int j = 0; j < listOfFile.length; j++) { if (listOfFile[j].isDirectory()) { readtests(listOfFile[j].getPath()); } else if (listOfFile[j].getName().endsWith(".pdf") || listOfFile[j].getName().endsWith(".xml") || listOfFile[j].getName().endsWith(".xtx")) { readfile(listOfFile[j]); } } } else if (folder.getName().endsWith(".pdf") || folder.getName().endsWith(".xml") || folder.getName().endsWith(".xtx")) { readfile(folder); } return tests; } private String[] splitcontent(String content) { int nbofpart = content.length() / maxlength; String[] part = new String[nbofpart + 1]; int lower = 0; int upper = 0; int i; for (i = 0; i < nbofpart; i++) { upper += maxlength; part[i] = content.substring(lower, upper); lower = upper; } if (upper <= content.length() - 1) { lower = upper; upper = content.length(); part[i] = (content.substring(lower, upper)); } return part; } }