Commit 6a14f2f9 authored by Tien's avatar Tien

Merge branch 'master' of…

Merge branch 'master' of git+ssh://scm.forge.imag.fr/var/lib/gforge/chroot/scmrepos/git/scidetect/scidetect
parents 573a63ac bd425d31
......@@ -8,6 +8,7 @@ JAVAC = javac
JAVADOC = javadoc -d doc
JAVAFLAGS = -O -d classes -encoding utf-8
JAVACLASSPATH = -cp lib/pdfbox-app-1.8.8.jar
SRC = src/fr/imag/forge/scidetect/Checker/Utils/*.java src/fr/imag/forge/scidetect/Checker/*.java src/fr/imag/forge/scidetect/Corpus/*.java src/fr/imag/forge/scidetect/Logger/*.java src/fr/imag/forge/scidetect/SciDetect_local/*.java src/fr/imag/forge/scidetect/TextExtractor/*.java
default: all
......@@ -15,10 +16,10 @@ all: classes doc jar run
classes:
mkdir -p classes
$(JAVAC) $(JAVAFLAGS) $(JAVACLASSPATH) src/fr/imag/forge/scidetect/*/*.java
$(JAVAC) $(JAVAFLAGS) $(JAVACLASSPATH) $(SRC)
doc:
$(JAVADOC) src/*/*/*/*/*/*
$(JAVADOC) $(SRC)
jar:
cd classes ; jar -cfvm ../SciDetect_Local`date +%Y-%m-%d`.jar ../manifest.mf *; cd ..
......
Manifest-Version: 1.0
Class-Path: lib/pdfbox-app-1.8.8.jar
X-COMMENT: Main-Class will be added automatically by build
Main-Class: fr.imag.forge.Scidetect.scigenchecker_local.ScigenChecker_Local
Main-Class: fr.imag.forge.scidetect.SciDetect_local.SciDetect_Local
......@@ -37,7 +37,7 @@ public class Classifier {
private ThresholdsSet SetOfThresholds;
/**
*
* Build a new classifier, thresholds are read in the configuration file
*/
public Classifier() {
this.SetOfThresholds = new ThresholdsSet();
......@@ -86,10 +86,10 @@ public class Classifier {
*
* @param result a string giving, for each class, the value of its
* nearest neighbour (NN)
* @return
* @return a string composed of the classes and the distances to the nearest neighbor in each class.
*/
private String checkdistant(String result) {
String conclution = "";
String conclusion = "";
String[] eachtype = result.split("\n");
for (int i = 0; i < eachtype.length; i++) {
......@@ -105,16 +105,16 @@ public class Classifier {
}
//check distant with threshold
if (Double.parseDouble(eachNN[1]) < threshold[0]) {
conclution += "is a " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n";
conclusion += "is a " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n";
} else if (Double.parseDouble(eachNN[1]) < threshold[1]) {
conclution += "is suppected " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n";
conclusion += "is suppected " + eachNN[0] + "\t" + eachNN[1] + "\t" + eachNN[2] + "\n";
}
}
if (conclution == "") {
conclution = findmindistant(result);
if (conclusion == "") {
conclusion = findmindistant(result);
}
return conclution;
return conclusion;
}
/**
......
......@@ -24,7 +24,7 @@ import fr.imag.forge.scidetect.Checker.Utils.DistancesSet;
import fr.imag.forge.scidetect.Corpus.Corpus;
/**
*
* Compute distances between two sets of texts
* @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*/
public class DistantCalculator {
......@@ -33,10 +33,10 @@ public class DistantCalculator {
private DistancesSet distant = new DistancesSet();
/**
*calculate the distant between 2 text corpus
* Compute distances between each text of a corpus and the samples
* @param samples corpus
* @param tests corpus
* @return DistancesSet from test to sample
* @return DistancesSet distances from text in test to text in sample
*/
public DistancesSet caldistant(Corpus samples, Corpus tests) {
for (String key : tests.keySet()) {
......@@ -53,9 +53,13 @@ public class DistantCalculator {
return distant;
}
/**
*Calculate distant between 2 text index
*/
/**
* Compute the distance between two text indexes
* @param text1
* @param text2
* @return the distance between text 1 and text 2.
*/
private double cal_textdistant(HashMap<String, Integer> text1,
HashMap<String, Integer> text2) {
double nboftoken = 0.0;
......
......@@ -22,7 +22,7 @@ import java.io.PrintWriter;
import java.util.HashMap;
/**
*
* Index texts (i.e. for each word, compute its number of occurrences)
* @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*/
public class Indexer {
......@@ -30,8 +30,8 @@ public class Indexer {
private Object content;
/**
* Index a text (count how many time each word appeared)
* and write to file under then name INDEX-filename.txt
* Index a text (count how many times each word appears)
* and write the results to a file named INDEX-filename.txt
* @param content
* @param textfile
* @throws FileNotFoundException
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.scidetect.Checker.Utils;
import java.util.HashMap;
......@@ -18,8 +34,8 @@ public class DistancesSet extends HashMap<String, HashMap<String, Double>>{
/**
* Get the value of the distance between A and B
* @param A
* @param B
* @param A text
* @param B text
* @return the distance between A and B
*/
public Double getDist(String A, String B){
......@@ -28,9 +44,9 @@ public class DistancesSet extends HashMap<String, HashMap<String, Double>>{
/**
* Set the distance between A and B to the value d
* @param A
* @param B
* @param d
* @param A text
* @param B text
* @param d distance
*/
public void setDist(String A, String B, Double d){
if (this.get(A) == null) {this.put(A,new HashMap<String, Double>());}
......
/*
* Copyright (C) 2015 UNIVERSITE JOSEPH FOURIER (Grenoble 1)/ Springer-Verlag GmbH
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.imag.forge.scidetect.Checker.Utils;
......@@ -18,8 +34,6 @@ public class ThresholdsSet extends HashMap<String, Double[]> {
/**
* Initialize the thresholds Set by reading the configuration file
* @throws FileNotFoundException
* @throws IOException
*/
public void Init() {
try{
......
......@@ -27,7 +27,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
* extract raw txt from a pdf File
* Extract raw txt from a pdf File
* @author Nguyen Minh Tien - minh-tien.nguyen@imag.fr
*/
public class pdfextractor {
......
......@@ -6,6 +6,7 @@
\usepackage{url}
\usepackage{hyperref}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
%%%%% Package for Customizing
\usepackage{listings}
\usepackage{caption}
......@@ -27,7 +28,7 @@
\fancyhf{}
\fancyhead[LE,RO]{\leftmark}
\fancyhead[RE,LO]{}
\fancyfoot[RE,LO]{SciDetect Documentation}
\fancyfoot[RE,LO]{SciDetect{\small\texttrademark} Documentation}
\fancyfoot[LE,RO]{\thepage}
\renewcommand{\headrulewidth}{0.5pt}
\renewcommand{\footrulewidth}{0.5pt}
......@@ -96,7 +97,7 @@ language=sh,breaklines=true,breakatwhitespace=true}
\begin{document}
\title{SciDetect Documentation
\title{SciDetect{\huge\texttrademark} Documentation
\\
\url{http://scidetect.forge.imag.fr}}
......@@ -139,20 +140,31 @@ Version & Date & Author & Comment \\ [0.5ex]
\newpage
\section{Installation Requirements}
A runnable program for the SciDetect software is available in:
\section{Installation-Requirements-Quick start}
\paragraph{Installation}
A stand-alone Java program, the documentation and the source code are available at the following URL:
\url{http://scidetect.forge.imag.fr}
\paragraph{Requirements}
The stand-alone Java program requires {\tt Java SE 6} or higher. It also uses an additional library for PDF conversion (which should be included in the {\tt lib/} directory).
\paragraph{Quick start}
The runnable program for the SciDetect software is packaged inside:
\begin{lstlisting}
SciDetect_Local.jar
\end{lstlisting}
It can be used as a stand-alone Java program. The program component requires {\tt Java SE 6} or higher, with an additional libraries for pdf converter(included in {\tt lib/}).
Furthermore the configuration file ({\tt config.txt}) and directories for log files ({\tt logs} and {\tt detaillogs}) are required by the stand-alone Java program.
The following are needed :
\begin{itemize}
\item The configuration file ({\tt config.txt})
\item The samples directory ({\tt data})
\item The directories for log files ({\tt logs} and {\tt detaillogs}), which are required by the stand-alone Java program.
\end{itemize}
\section{Usage}
\subsection{Command line client-side}
\subsection{Command line client}
The SciDetect program is included in a runnable JAR file. The program is started by invoking:
\begin{lstlisting}
......@@ -162,8 +174,8 @@ SciDetect program is included in a runnable JAR file. The program is started by
Where {\tt <parameters>} stands for a combination of one or more of
the following command line options:
\begin{itemize}
\item[] {\tt -c <path\_to\_check>} gives the path to the directory (or file) that need to be checked;
\item[] {\tt -l <log\_filename>} gives path and name of the log file (defaults to {\tt /logs/start\_time.xls});
\item[] {\tt -c <path\_to\_check>} gives the path to the directory containing the files to be checked;
\item[] {\tt -l <log\_filename>} gives the name of the log file (defaults to {\tt /logs/start\_time.xls});
\item[] {\tt -d} Save detail log (optional, default false).
\item[] {\tt -h} Show usage.
\end{itemize}
......@@ -185,7 +197,7 @@ A configuration file ({\tt config.txt}) should be accessible by the program. It
# Where samples can be found
samples data/samples
\end{lstlisting}
This is used to set the directory where samples of texts produced by known generator can be found. This directory contains one directory per \emph{classes}. One directory contains examples that are representative of its class. In a standard release, the {\tt data/samples} directory contains four subdirectories with texts generated by the following generator:
This is used to set the directory where samples of texts produced by known generators can be found. This directory contains one subdirectory per \emph{class} (i.e. per known generator). Each subdirectory contains examples that are representative of its class. In a standard release, the {\tt data/samples} directory contains four subdirectories with texts generated by the following generators:
\begin{itemize}
\item \url{http://thatsmathematics.com/mathgen/} (dir {\tt data/samples/Mathgen});
\item \url{https://bitbucket.org/birkenfeld/scigen-physics} (dir {\tt data/samples/Physgen});
......@@ -195,8 +207,8 @@ This is used to set the directory where samples of texts produced by known gener
New subdirectories can be added. This can be done for two purposes:
\begin{enumerate}
\item add a corpus that represents fairly enough a particular field. By setting appropriate threshold, this will flag papers that appeared to be too far from that field.
\item In case new a generator appears, new samples (pdf) can be added in a new subdirectory (in {\tt data/samples}) containing a representative corpora of the new class.
\item Adding a corpus that represents a particular field fairly well. By setting appropriate thresholds, this will flag papers that appear to be too far from that field.
\item When a new generator appears, new samples (pdf) can be added in a new subdirectory (in {\tt data/samples}) containing a representative corpus of the new class.
\end{enumerate}
\subsection{Threshold configuration}
......@@ -205,7 +217,7 @@ New subdirectories can be added. This can be done for two purpose:
Threshold_Scigen 0.48 0.56
\end{lstlisting}
A line starting with {\tt Threshold\_Dirname} is used to define thresholds needed to take decisions to assigned tested texts the class for which examples can be found in the directory {\tt Dirname}. There should have one line (i.e. two Thresholds) per classe. These values are 2 real numbers between 0 and 1. The smallest one is use to take the decision to assigned the tested paper (almost certainly) to the class. The second one is used as a threshold for suspicion for containing parts of generated text.
A line starting with {\tt Threshold\_Dirname} is used to define thresholds. Thresholds are needed to decide whether to assign a tested text to a class. Examples of each class can be found in the directory {\tt Dirname}. There should be one line (i.e. two thresholds) per class. These values are two real numbers between 0 and 1. The smaller one is used to decide to assign the tested paper (almost certainly) to the class. The second one is used as a threshold for suspecting that the paper contains parts of generated text.
The previous example (concerning Scigen class) has the following meaning. Given distances from the tested text to its nearest neighbour in the set of samples (i.e. texts found in the Scigen dir):
\begin{itemize}
......@@ -214,7 +226,7 @@ The previous example (concerning Scigen class) has the following meaning. Given
\item If the distance is less than 0.48, there is a very high chance that this is an automatic Scigen generated article.
\end{itemize}
If new samples are added to the sample folder, the threshold configuration should also be added, if not the default-threshold values are used (0.48 and 0.56).
If new samples are added to the sample folder (i.e. in a new directory), the corresponding threshold configuration should also be added; otherwise, the default threshold values are used (0.48 and 0.56).
\subsection{Path for log files}
\begin{lstlisting}[language=bash]
......@@ -240,10 +252,10 @@ Max_length 30000
\end{lstlisting}
This sets the maximum length in characters (including white-space characters) for a text to be eligible for classification. This parameter is used in order to avoid misclassification: when an article is too long, its characteristics become too generic, and a very long paper may be misclassified (misclassification rate without splitting: ??).
The default value is set at 30000 characters (about 15 pages). A longer text will be splitter into several part which are tested individually.
The default value is set to 30000 characters (about 15 pages). A longer text will be split into several parts, which are tested individually.
\section{Make use of detail logging}
\label{deteaillog}
\label{detaillog}
The detail log (parameter -d) stores all the distances from the text under test to all other samples in the sample set (i.e. all texts in all directories found at {\tt /data/sample}).
This can be used to get a more detailed look at the results.
......@@ -279,7 +291,53 @@ INDEX-053.txt data/samples/Mathgen/INDEX-mathgen16.txt 0.682802115990133
INDEX-053.txt data/samples/Physgen/INDEX-physgen10.txt 0.5261174636174665
\end{lstlisting}
\section{Tuning/Setting Thresholds}
Thresholds for the currently known generators have been set empirically according to the tests presented in this section.
\paragraph{SCIgen}
\url{http://pdos.csail.mit.edu/scigen/} (dir {\tt data/samples/SCIgen}).
\begin{figure}[htb]
\begin{center}
%\includegraphics[width=0.65\linewidth]{}
\caption{Distribution of distances to the \emph{Scigen} nearest neighbour. In blue: a set of \emph{non-scigen} papers. In red: a set of \emph{scigen} papers.}
\end{center}
\end{figure}
\paragraph{Mathgen}
\url{http://thatsmathematics.com/mathgen/} (dir {\tt data/samples/Mathgen});
\begin{figure}[htb]
\begin{center}
\includegraphics[width=0.65\linewidth]{fig/hist_math.pdf}
\caption{Distribution of distances to the \emph{mathgen} nearest neighbour. In blue: a set of \emph{non-mathgen} papers. In red: a set of \emph{mathgen} papers.}
\end{center}
\end{figure}
\paragraph{scigen-physics}
\url{https://bitbucket.org/birkenfeld/scigen-physics} (dir {\tt data/samples/Physgen}).
\begin{figure}[htb]
\begin{center}
\includegraphics[width=0.65\linewidth]{fig/hist_phy.pdf}
\caption{Distribution of distances to the \emph{scigen-physics} nearest neighbour. In blue: a set of \emph{non-scigen-physics} papers. In red: a set of \emph{scigen-physics} papers.}
\end{center}
\end{figure}
\paragraph{propgen}
\url{http://www.nadovich.com/chris/randprop/} (dir {\tt data/samples/Propgen}).
\begin{figure}[htb]
\begin{center}
\includegraphics[width=0.65\linewidth]{fig/hist_prop.pdf}
\caption{Distribution of distances to the \emph{randprop} nearest neighbour. In blue: a set of \emph{non-randprop} papers. In red: a set of \emph{randprop} papers.}
\end{center}
\end{figure}
\end{document}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment