diff --git a/tex/README.pdf b/tex/README.pdf index 90337adce37c1f80a2d2c18087edee452af189e3..a01c43285ddf2fc82be0644d9c286c9e20e2b107 100644 Binary files a/tex/README.pdf and b/tex/README.pdf differ diff --git a/tex/README.tex b/tex/README.tex index bf828d019af2cfa09373ecac30b8087fb368fd17..193a3fe574ba647f5f495eed2dc28e0f6d947e3f 100644 --- a/tex/README.tex +++ b/tex/README.tex @@ -1,27 +1,52 @@ -\documentclass[10pt,a4paper,titlepage]{article} +\documentclass[11pt,a4paper,titlepage]{article} %%%%% Packages -\usepackage{fullpage} \usepackage{url} +\usepackage{hyperref} \usepackage[T1]{fontenc} +%%%%% Package for Customizing \usepackage{listings} \usepackage{caption} \usepackage{titling} - +\usepackage{titlesec} +\usepackage{titletoc} +\usepackage{geometry} +\geometry{ + top=2.5cm, + inner=2cm, + outer=2cm, + bottom=2.5cm, + headheight=2ex, + headsep=0.5cm, + } +%%%%%%% Heading and footer +\usepackage{fancyhdr} +\pagestyle{fancy} +\fancyhf{} +\fancyhead[LE,RO]{\leftmark} +\fancyhead[RE,LO]{} +\fancyfoot[RE,LO]{SciDetect Documentation} +\fancyfoot[LE,RO]{\thepage} +\renewcommand{\headrulewidth}{0.5pt} +\renewcommand{\footrulewidth}{0.5pt} +%%%%%%% Interlignes +\usepackage{setspace} +\onehalfspacing %%%%% Customizing Caption \captionsetup{labelsep=space,justification=justified,singlelinecheck=off} %%%%% Customizing itemize -\def\labelitemi{} +\def\labelitemi{--} %%%%% Customizing Title Page \pretitle{ \begin{flushleft}\Huge \rule{\linewidth}{0.5mm} \vskip 0.5em +\begin{textsf} } - \posttitle{ +\end{textsf} \rule{\linewidth}{0.5mm} \par\end{flushleft} \vskip 20em @@ -34,7 +59,41 @@ \end{flushleft} \begin{flushright}\large\scshape} \postdate{\par\end{flushright}} + +%%%%% Customizing Section +\usepackage{etoolbox} +\preto\section{\clearpage} +%\newcommand{\sectionbreak}{\newpage} +\titleformat +{\section} % command +[display] +{\scshape\Large\bfseries} % format +{} % label \thesection +{1ex} % sep +{ +\vskip 5mm + \rule{\textwidth}{0.5mm} + \vskip 5mm + %\hspace{1ex} + 
%\centering +} % before-code +[ +%\vspace{-0.5ex}% +\vskip 3mm +\rule{\textwidth}{0.5mm} +\vskip 6mm +] %%%%% +%%%%% Customizing Listing +\usepackage{color} +\definecolor{lightgray}{rgb}{0.95,0.95,0.95} +\definecolor{commentgreen}{rgb}{0,0.6,0} +\renewcommand\lstlistingname{} +\lstset{backgroundcolor=\color{lightgray},frame=single, +%keywordstyle=\color{blue},commentstyle=\color{commentgreen}, +language=sh,breaklines=true,breakatwhitespace=true} + + \begin{document} \title{SciDetect Documentation} @@ -61,7 +120,7 @@ Cyril Labb\'e \\ \begin{table}[ht] \caption*{Revision History} -\begin{tabular}{|c|c|c|c|} +\begin{tabular}{|l|c|c|r|} \hline \hline Version & Date & Author & Comment \\ [0.5ex] @@ -80,32 +139,35 @@ Version & Date & Author & Comment \\ [0.5ex] \section{Installation Requirements} -A runnable program for the SciDetect software is implemented in: -\begin{lstlisting}[language=bash] +A runnable program for the SciDetect software is available in: + +\begin{lstlisting} ScigenChecker_Local.jar \end{lstlisting} -It can be used as a stand-alone Java program. The program component requires {\tt Java SE 6} or higher, with an additional libraries for pdf converter(included in {\tt lib/}); Furthermore the configuration file ({\tt config.txt}) and directories for log files ({\tt logs} and {\tt detaillogs}) are required by the client. +It can be used as a stand-alone Java program. The program component requires {\tt Java SE 6} or higher, with additional libraries for the PDF converter (included in {\tt lib/}). + +Furthermore, the configuration file ({\tt config.txt}) and directories for log files ({\tt logs} and {\tt detaillogs}) are required by the stand-alone Java program. \section{Usage} \subsection{Command line client-side} SciDetect program is included in a runnable JAR file. 
The program is started by invoking: -\begin{lstlisting}[language=bash] +\begin{lstlisting} $java -jar ScigenChecker_Local.jar \end{lstlisting} Where {\tt } stands for a combination of one or more of the following command line options: \begin{itemize} -\item {\tt -c } gives the path to the directory (or file) that need to be checked; -\item {\tt -l } gives path and name of the log file (defaults to {\tt /logs/start\_time.xls}); -\item {\tt -d} Save detail log (optional, default false). +\item[] {\tt -c } gives the path to the directory (or file) that needs to be checked; +\item[] {\tt -l } gives path and name of the log file (defaults to {\tt /logs/start\_time.xls}); +\item[] {\tt -d} Save detail log (optional, default false). \end{itemize} Typical use: -\begin{lstlisting}[language=bash] -$Java -jar ScigenChecker_Local.jar -c /tien/Test_demo -l /tien/Test_log.xls -d +\begin{lstlisting}[language=sh] +$java -jar ScigenChecker_Local.jar -c /tien/Test_demo -l /tien/Test_log.xls -d \end{lstlisting} \subsection{Supported file types} @@ -118,6 +180,7 @@ A configuration file ({\tt config.txt}) should be accessible by the program. It \subsection{Path to sample folder} \begin{lstlisting}[language=bash] +# Where samples can be found samples data/samples \end{lstlisting} This is used to set the directory where samples of texts produced by known generator can be found. This directory contains one directory per \emph{classes}. One directory contains examples that are representative of its class. In a standard release, the {\tt data/samples} directory contains four subdirectories with texts generated by the following generator: @@ -136,6 +199,7 @@ New subdirectories can be added. 
This can be done for two purpose: \subsection{Threshold configuration} \begin{lstlisting}[language=bash] +# Defining Thresholds for Scigen Threshold_Scigen 0.48 0.56 \end{lstlisting} @@ -152,15 +216,24 @@ If new samples are added to the sample folder, the threshold configuration shoul \subsection{Path for log files} \begin{lstlisting}[language=bash] +# Set the default path for log files Default_log_folder logs/ Default_detail_log_folder detaillogs/ \end{lstlisting} -These lines are use to set the default log folder and a default detail log folder (see section ?? for more information). In case the path to a log file is not set (no -l parameter), the log file will be saved in the default log folder under the name: {\tt time\_date.xls} (e/g: 09:46 25.02.2015.xls means the check was started at 9:46 on 25/2/2015). +These lines are used to set the default log folder and a default detail log folder (see section~\ref{detaillog} for more information). In case the path to a log file is not set (no -l parameter), the log file will be saved in the default log folder under the name: {\tt time\_date.xls} (e.g.\ 09:46 25.02.2015.xls means the check was started at 9:46 on 25/2/2015). + +\begin{lstlisting}[basicstyle=\ttfamily\scriptsize] +INDEX-53.txt is a Scigen 0.34236384 data/samples/Scigen/INDEX-scigen25.txt +INDEX-53.txt is a Physgen 0.47908222 data/samples/Physgen/INDEX-physgen7.txt +INDEX-011.txt is Genuine 0.60918242 data/samples/Scigen/INDEX-scigen41.txt +INDEX-013.txt is Genuine 0.61375975 data/samples/Scigen/INDEX-scigen25.txt +\end{lstlisting} \subsection{Max text length} \begin{lstlisting}[language=bash] +# Max_length is the maximum size of a text Max_length 30000 \end{lstlisting} This set the max length in character (including white space char) for a text to be eligible for classification. 
This parameter is used in order to avoid miss classification: when an article is too long, this cause the characteristic of the article to becomes too generic and very long paper may be misclassified (without splitting misclassification rate: ??). @@ -168,12 +241,43 @@ This set the max length in character (including white space char) for a text to The default value is set at 30000 characters (about 15 pages). A longer text will be splitter into several part which are tested individually. \section{Make use of detail logging} - +\label{detaillog} The detail log (parameter -d) stores all the distances from the text under test to all other samples in the sample set (i.e. all texts in all directories found at {\tt /data/sample}). This can be use to get a more detail look at the results. For example: an article returned with a distant to the nearest neighbour that barely pass the threshold. Turning on the detail log for that article and checking the results may help the decision. %if it is just a rare incident or the distances to other samples are also suspicious and have a better estimation. 
+\begin{lstlisting}[basicstyle=\ttfamily\scriptsize] +INDEX-053.txt data/samples/Mathgen/INDEX-mathgen55.txt 0.6821885795569994 +INDEX-053.txt data/samples/Mathgen/INDEX-mathgen63.txt 0.6608131367167517 +INDEX-053.txt data/samples/Scigen/INDEX-scigen36.txt 0.39296257670516693 +INDEX-053.txt data/samples/Mathgen/INDEX-mathgen9.txt 0.6679829987841077 +INDEX-053.txt data/samples/Scigen/INDEX-scigen0.txt 0.35342658461094817 +INDEX-053.txt data/samples/Mathgen/INDEX-mathgen47.txt 0.660816573503142 +INDEX-053.txt data/samples/Scigen/INDEX-scigen52.txt 0.3808927385660057 +INDEX-053.txt data/samples/Mathgen/INDEX-mathgen71.txt 0.6897595647595604 +INDEX-053.txt data/samples/Scigen/INDEX-scigen28.txt 0.38955875898790254 +INDEX-053.txt data/samples/Scigen/INDEX-scigen60.txt 0.39994884474379633 +INDEX-053.txt data/samples/Mathgen/INDEX-mathgen39.txt 0.6868800914402744 +INDEX-053.txt data/samples/Physgen/INDEX-physgen81.txt 0.5303053819516341 +INDEX-053.txt data/samples/Propgen/INDEX-17-html.txt 0.7981193467108959 +INDEX-053.txt data/samples/Physgen/INDEX-physgen65.txt 0.510647010647008 +INDEX-053.txt data/samples/Propgen/INDEX-53-html.txt 0.7880669668830156 +INDEX-053.txt data/samples/Physgen/INDEX-physgen5.txt 0.5160079114941755 +INDEX-053.txt data/samples/Physgen/INDEX-physgen73.txt 0.5115960731657623 +INDEX-053.txt data/samples/Physgen/INDEX-physgen49.txt 0.5055891144600811 +INDEX-053.txt data/samples/Propgen/INDEX-86-html.txt 0.7643301386956208 +INDEX-053.txt data/samples/Physgen/INDEX-physgen96.txt 0.5069873754844876 +INDEX-053.txt data/samples/Propgen/INDEX-45-html.txt 0.7918353315721742 +INDEX-053.txt data/samples/Scigen/INDEX-scigen21.txt 0.38484926003355824 +INDEX-053.txt data/samples/Mathgen/INDEX-mathgen78.txt 0.6692076400040969 +INDEX-053.txt data/samples/Propgen/INDEX-0-html.txt 0.7876861141791592 +INDEX-053.txt data/samples/Mathgen/INDEX-mathgen16.txt 0.682802115990133 +INDEX-053.txt data/samples/Physgen/INDEX-physgen10.txt 0.5261174636174665 +\end{lstlisting} + 
+ + \end{document}