Commit b337c6ba authored by Tien's avatar Tien

added min length

parent fe5b30e1
File deleted
<?xml version="1.0" encoding="UTF-8"?>
<!-- You may freely edit this file. See commented blocks below for -->
<!-- some examples of how to customize the build. -->
<!-- (If you delete it and reopen the project it will be recreated.) -->
<!-- By default, only the Clean and Build commands use this build script. -->
<!-- Commands such as Run, Debug, and Test only use this build script if -->
<!-- the Compile on Save feature is turned off for the project. -->
<!-- You can turn off the Compile on Save (or Deploy on Save) setting -->
<!-- in the project's Project Properties dialog box.-->
<project name="ScigenChecker_Local" default="default" basedir=".">
<description>Builds, tests, and runs the project ScigenChecker_Local.</description>
<!-- Imports the main build logic; the targets and properties mentioned -->
<!-- in the comment below are defined in this imported file. -->
<import file="nbproject/build-impl.xml"/>
<!--
There exist several targets which are by default empty and which can be
used for execution of your tasks. These targets are usually executed
before and after some main targets. They are:
-pre-init: called before initialization of project properties
-post-init: called after initialization of project properties
-pre-compile: called before javac compilation
-post-compile: called after javac compilation
-pre-compile-single: called before javac compilation of single file
-post-compile-single: called after javac compilation of single file
-pre-compile-test: called before javac compilation of JUnit tests
-post-compile-test: called after javac compilation of JUnit tests
-pre-compile-test-single: called before javac compilation of single JUnit test
-post-compile-test-single: called after javac compilation of single JUnit test
-pre-jar: called before JAR building
-post-jar: called after JAR building
-post-clean: called after cleaning build products
(Targets beginning with '-' are not intended to be called on their own.)
Example of inserting an obfuscator after compilation could look like this:
<target name="-post-compile">
<obfuscate>
<fileset dir="${build.classes.dir}"/>
</obfuscate>
</target>
For list of available properties check the imported
nbproject/build-impl.xml file.
Another way to customize the build is by overriding existing main targets.
The targets of interest are:
-init-macrodef-javac: defines macro for javac compilation
-init-macrodef-junit: defines macro for junit execution
-init-macrodef-debug: defines macro for class debugging
-init-macrodef-java: defines macro for class execution
-do-jar: JAR building
run: execution of project
-javadoc-build: Javadoc generation
test-report: JUnit report generation
An example of overriding the target for project execution could look like this:
<target name="run" depends="ScigenChecker_Local-impl.jar">
<exec dir="bin" executable="launcher.exe">
<arg file="${dist.jar}"/>
</exec>
</target>
Notice that the overridden target depends on the jar target and not only on
the compile target as the regular run target does. Again, for a list of available
properties which you can use, check the target you are overriding in the
nbproject/build-impl.xml file.
-->
</project>
......@@ -29,3 +29,4 @@ Default_detail_log_folder detaillogs/
#if an article has more characters than this length,
#it will be split into smaller parts
Max_length 30000
Min_length 10000
......@@ -37,7 +37,7 @@ public class Classifier {
private ThresholdsSet SetOfThresholds;
/**
* Build a new classifier, thresholds are read in the configuration file
*
*/
public Classifier() {
this.SetOfThresholds = new ThresholdsSet();
......@@ -86,10 +86,10 @@ public class Classifier {
*
* @param result a string composed having for each classes the value of its
* NN
* @return a string composed of the classes and the distances to the nearest neighbor in each class.
* @return
*/
private String checkdistant(String result) {
String conclusion = "";
String conclution = "";
String[] eachtype = result.split("\n");
for (int i = 0; i < eachtype.length; i++) {
......@@ -105,16 +105,16 @@ public class Classifier {
}
//check distant with threshold
if (Double.parseDouble(eachNN[1]) < threshold[0]) {
conclusion += "is a " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n";
conclution += "is a " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n";
} else if (Double.parseDouble(eachNN[1]) < threshold[1]) {
conclusion += "is suppected " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n";
conclution += "is suppected " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n";
}
}
if (conclusion == "") {
conclusion = findmindistant(result);
if (conclution == "") {
conclution = findmindistant(result);
}
return conclusion;
return conclution;
}
/**
......
......@@ -24,7 +24,7 @@ import fr.imag.forge.scidetect.Checker.Utils.DistancesSet;
import fr.imag.forge.scidetect.Corpus.Corpus;
/**
* Compute distances between two sets of texts
*
* @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*/
public class DistantCalculator {
......@@ -33,10 +33,10 @@ public class DistantCalculator {
private DistancesSet distant = new DistancesSet();
/**
* Compute distances between each text of a corpus and the samples
* Calculate the distance between two text corpora
* @param samples corpus
* @param tests corpus
* @return DistancesSet distances from text in test to text in sample
* @return DistancesSet from test to sample
*/
public DistancesSet caldistant(Corpus samples, Corpus tests) {
for (String key : tests.keySet()) {
......@@ -53,13 +53,9 @@ public class DistantCalculator {
return distant;
}
/**
* Compute the distance between 2 texts index
* @param text1
* @param text2
* @return the distance between text 1 and text 2.
*/
/**
* Calculate the distance between two text indexes
*/
private double cal_textdistant(HashMap<String, Integer> text1,
HashMap<String, Integer> text2) {
double nboftoken = 0.0;
......
......@@ -22,7 +22,7 @@ import java.io.PrintWriter;
import java.util.HashMap;
/**
* Index texts (i.e. for each word, compute its number of occurrences)
*
* @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*/
public class Indexer {
......@@ -30,8 +30,8 @@ public class Indexer {
private Object content;
/**
* Index a text (count how many time each word is appearing)
* and writes results in a file INDEX-filename.txt
* Index a text (count how many times each word appears)
* and write the result to a file named INDEX-filename.txt
* @param content
* @param textfile
* @throws FileNotFoundException
......
......@@ -40,7 +40,7 @@ public class Reader {
private Corpus samples = new Corpus();
private Corpus test = new Corpus();
private String SamplesFolder;
private int maxlength;
/**
*Read config file
......@@ -61,9 +61,13 @@ public class Reader {
}
//other config should be read over here
if (b[0].equals("Max_length")) {
maxlength = Integer.parseInt(b[1]);
//maxlength = Integer.parseInt(b[1]);
TextProcessor.maxlength = Integer.parseInt(b[1]);
}
if (b[0].equals("Min_length")) {
//maxlength = Integer.parseInt(b[1]);
TextProcessor.minlength = Integer.parseInt(b[1]);
}
}
}
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.scidetect.Checker.Utils;
import java.util.HashMap;
......@@ -34,8 +18,8 @@ public class DistancesSet extends HashMap<String, HashMap<String, Double>>{
/**
* Get the value of the distance between A and B
* @param A text
* @param B text
* @param A
* @param B
* @return the distance between A and B
*/
public Double getDist(String A, String B){
......@@ -44,9 +28,9 @@ public class DistancesSet extends HashMap<String, HashMap<String, Double>>{
/**
* Set the distance between A and B to the value d
* @param A text
* @param B text
* @param d distance
* @param A
* @param B
* @param d
*/
public void setDist(String A, String B, Double d){
if (this.get(A) == null) {this.put(A,new HashMap<String, Double>());}
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.scidetect.Checker.Utils;
......@@ -34,6 +18,8 @@ public class ThresholdsSet extends HashMap<String, Double[]> {
/**
* Initialize the thresholds Set by reading the configuration file
* @throws FileNotFoundException
* @throws IOException
*/
public void Init() {
try{
......
......@@ -43,6 +43,7 @@ public class TextProcessor {
*
*/
public static int maxlength;
public static int minlength;
ArrayList<Text> text = new ArrayList<Text>();
/**
......@@ -81,7 +82,12 @@ public class TextProcessor {
}
//lets deal with long file over here
//split content and the index part by part
if (content.length() < maxlength) {
if (content.length() < minlength) {
Indexer b = new Indexer();
b.index(" ", original);
readindexfile(original.getParent() + "/" + indexname);
} else if (content.length() < maxlength) {
Indexer b = new Indexer();
b.index(content, original);
......@@ -156,9 +162,9 @@ public class TextProcessor {
lower = upper;
upper = content.length();
if (content.substring(lower, upper).length() < maxlength / 3) {
if (content.substring(lower, upper).length() < minlength) {
part[i - 1] = part[i - 1] + content.substring(lower, upper);
part = Arrays.copyOf(part, part.length - 1);
part = Arrays.copyOf(part, part.length - 1);
} else {
part[i] = (content.substring(lower, upper));
}
......
......@@ -22,33 +22,37 @@ import java.io.PrintWriter;
import java.util.HashMap;
/**
* Manages the log files, where final results are written.
* Manages the log files, where final results are written.
*
* @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*/
public class Log {
/**
* The location of the log File
* The location of the log File
*/
public static String loglocation;
/**
* The location of the detail log file
* The location of the detail log file
*/
public static String detailloglocation;
/**
* Time when the log was created
* Time when the log was created
*/
public static String logtime;
/**
* Write in a log file all the computed distances.
* Write in a log file all the computed distances.
*
* @param distant The distances set
* @param distant The distances set
*/
public void savedetaillog(HashMap<String, HashMap<String, Double>> distant) {
File dloglocation = new File(detailloglocation);
if (!dloglocation.exists()) {
dloglocation.mkdir();
}
File distantout = new File(detailloglocation + logtime + ".xls");
//File distantout = new File(testpath+"/alldistant.xls");
PrintWriter out;
......@@ -69,28 +73,32 @@ public class Log {
}
/**
* Write in a log file the classification decision
* Write in a log file the classification decision
*
* @param conclusion contains the txt to be written in the log
* @param conclusion contains the txt to be written in the log
*/
public void savelog(String conclusion) {
File distantout ;
File distantout;
if (!loglocation.equals("logs/")) {
distantout = new File(loglocation);
} else {
distantout = new File(loglocation+ logtime + ".xls");
File location = new File(loglocation);
if (!location.exists()) {
location.mkdir();
}
distantout = new File(loglocation + logtime + ".xls");
}
PrintWriter out;
try {
out = new PrintWriter(distantout);
out.write(conclusion);
out.close();
} catch (FileNotFoundException e) {
System.out.println("***** Scidetect : Output file error \n");
System.out.println("***** Scidetect : Output file error \n");
System.out.println("***** Most probably the specified file is a Dir \n");
//e.printStackTrace();
}
......
......@@ -38,6 +38,7 @@ public class normalizer {
* @return contains all the normalized text.
* @throws java.io.IOException
*/
public String normalize(File txt) throws IOException {
BufferedReader br;
br = new BufferedReader(new FileReader(txt));
......
......@@ -27,7 +27,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
* Extract raw txt from a pdf File
* extract raw txt from a pdf File
* @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*/
public class pdfextractor {
......
......@@ -160,7 +160,7 @@ The following are needed :
\begin{itemize}
\item The configuration file ({\tt config.txt})
\item The samples directory ({\tt data})
\item The log directories ({\tt logs} and {\tt detaillogs}), which are required by the stand-alone Java program.
\end{itemize}
\section{Usage}
......@@ -244,15 +244,17 @@ INDEX-011.txt is Genuine 0.60918242 data/samples/Scigen/INDEX-scigen41.txt
INDEX-013.txt is Genuine 0.61375975 data/samples/Scigen/INDEX-scigen25.txt
\end{lstlisting}
\subsection{Max text length}
\subsection{Max-Min text length}
\begin{lstlisting}[language=bash]
# Max_length is the maximum size of a text
Max_length 30000
Min_length 10000
\end{lstlisting}
This set the max length in character (including white space char) for a text to be eligible for classification. This parameter is used in order to avoid miss classification: when an article is too long, this cause the characteristic of the article to becomes too generic and very long paper may be misclassified (without splitting misclassification rate: ??).
The default value is set at 30000 characters (about 15 pages). A longer text will be split into several part which are tested individually.
This sets the maximum (and minimum) length, in characters (including white space characters), for a text to be eligible for classification. The maximum is used to avoid misclassification: when an article is too long, its characteristics become too generic, and a very long paper may be misclassified (misclassification rate without splitting: 0.13\%, i.e. 42 misclassifications out of 31577 samples). When an article is shorter than the minimum length, it will be marked as ``can't classify''.
The default value for the max length is set at 30000 characters (about 10 pages); a longer text will be split into several parts, which are tested individually. The default min length is set at 10000 characters.
\section{Make use of detail logging}
\label{detaillog}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment