Commit 9940f8ac authored by Cyril Labbe's avatar Cyril Labbe

Doc

parent b63617ba
\documentclass[11pt,a4paper,titlepage]{article}
%%%%% Packages
\usepackage{url}
\usepackage{hyperref}
......@@ -295,17 +294,25 @@ INDEX-053.txt data/samples/Physgen/INDEX-physgen10.txt 0.5261174636174665
\section{Tuning/Setting Thresholds}
Thresholds for the current known generator have been empirically set according to tests presented in this section.
Thresholds for the currently known generators have been empirically set according to tests presented in this section. These tests involve the computation of the intertextual distance presented in~\cite{FakeDuplication}.
For each generator (Scigen, scigen-physics, Mathgen and propgen) a set of 400 texts is used (i.e., 1600 texts in total). For each text the distance to its nearest neighbor in the sample set is computed. The sample is composed of an extra 100 texts per generator (i.e., 400 additional texts). The nearest neighbor is always of the same nature as the tested text, and columns 1--4 of Table~\ref{table1} show statistical information about the observed distances.
A set of 8200 genuine papers is also used. For each genuine text the distance to its nearest fake in the sample set is computed. The sample is still composed of the same 400 texts (100 per generator). For each of the 8200 genuine papers, the nearest fake neighbor is in one of the generated sample groups.
Column~5 of Table~\ref{table1} shows that, for a genuine paper, the minimal distance to the nearest fake is always greater than the maximal distance of a fake to its nearest neighbor.
In each test, 400 samples of a specific generator were used along with 8200 samples of genuine paper with the following statistical information:
\begin{table}[ht]
\caption{Mean, standard deviation and median of the distances between papers and their nearest neighbor.}
\label{table1}
\begin{tabular}{|l|c|c|c|c|c|}
%\hline
\hline
& Scigen & Physgen & Mathgen & Propgen & Genuine \\ \hline
Mean distant to NN& 0.3516657439 & 0.3531284423 & 0.2284321961 & 0.1425688279 & 0.6909262827\\
Standard deviation & 0.0144390015 & 0.0120515277 & 0.0143429504 & 0.0157618469 & 0.1170299643\\
Median & 0.3518156669 & 0.3527417268 & 0.2272039843 & 0.140403222 & 0.6424112226\\
& Scigen & scigen-physics & Mathgen & Propgen & Genuine \\ \hline
Mean distance to NN& 0.35 & 0.35 & 0.22 & 0.14 & 0.69\\
Standard deviation & 0.014 & 0.012 & 0.014 & 0.015 & 0.117\\
Median & 0.35 & 0.35 & 0.22 & 0.14 & 0.64\\
\hline
\end{tabular}
\end{table}
......@@ -313,36 +320,39 @@ Median & 0.3518156669 & 0.3527417268 & 0.2272039843 & 0.140403222 & 0.642411222
\paragraph{SCIgen}
\paragraph{Scigen}
\url{http://pdos.csail.mit.edu/scigen/} (dir {\tt data/samples/SCIgen}).
(\url{http://pdos.csail.mit.edu/scigen/}, dir {\tt data/samples/SCIgen}). Figure~\ref{scigen} shows the observed distribution for texts having a Scigen text as nearest fake neighbor.
\begin{figure}[htb]
\begin{center}
\includegraphics[width=0.65\linewidth]{fig/hist_sci.pdf}
\caption{Distribution of distances to the \emph{Scigen} nearest neighbour. In blue for a set of \emph{non-scigen} paper. In red for a set of \emph{scigen} paper}
\caption{Distribution of distances to the \emph{Scigen} nearest neighbor. In blue for a set of \emph{non-scigen} papers. In red for a set of \emph{scigen} papers.}
\label{scigen}
\end{center}
\end{figure}
\paragraph{scigen-physics}
\url{https://bitbucket.org/birkenfeld/scigen-physics} (dir {\tt data/samples/Physgen});
\url{https://bitbucket.org/birkenfeld/scigen-physics} (dir {\tt data/samples/Physgen}). Figure~\ref{phygen} shows the observed distribution for texts having a scigen-physics text as nearest fake neighbor.
\begin{figure}[htb]
\begin{center}
\includegraphics[width=0.65\linewidth]{fig/hist_phy.pdf}
\caption{Distribution of distances to the \emph{scigen-physics} nearest neighbour. In blue for a set of \emph{non-scigen-physics} paper. In red for a set of \emph{scigen-physics} paper}
\caption{Distribution of distances to the \emph{scigen-physics} nearest neighbor. In blue for a set of \emph{non-scigen-physics} papers. In red for a set of \emph{scigen-physics} papers.}
\label{phygen}
\end{center}
\end{figure}
\paragraph{Mathgen}
\url{http://thatsmathematics.com/mathgen/} (dir {\tt data/samples/Mathgen});
\url{http://thatsmathematics.com/mathgen/} (dir {\tt data/samples/Mathgen}). Figure~\ref{mathgen} shows the observed distribution for texts having a Mathgen text as nearest fake neighbor.
\begin{figure}[htb]
\begin{center}
\includegraphics[width=0.65\linewidth]{fig/hist_math.pdf}
\caption{Distribution of distances to the \emph{mathgen} nearest neighbour. In blue for a set of \emph{non-mathgen} paper. In red for a set of \emph{mathgen} paper}
\caption{Distribution of distances to the \emph{mathgen} nearest neighbor. In blue for a set of \emph{non-mathgen} papers. In red for a set of \emph{mathgen} papers.}
\label{mathgen}
\end{center}
\end{figure}
......@@ -350,14 +360,18 @@ Median & 0.3518156669 & 0.3527417268 & 0.2272039843 & 0.140403222 & 0.642411222
\paragraph{propgen}
\url{http://www.nadovich.com/chris/randprop/} (dir {\tt data/samples/Propgen});
\url{http://www.nadovich.com/chris/randprop/} (dir {\tt data/samples/Propgen}). Figure~\ref{propgen} shows the observed distribution for texts having a randprop text as nearest fake neighbor.
\begin{figure}[htb]
\begin{center}
\includegraphics[width=0.65\linewidth]{fig/hist_prop.pdf}
\caption{Distribution of distances to the \emph{randprop} nearest neighbour. In blue for a set of \emph{non-randprop} paper. In red for a set of \emph{randprop} paper}
\caption{Distribution of distances to the \emph{randprop} nearest neighbor. In blue for a set of \emph{non-randprop} papers. In red for a set of \emph{randprop} papers.}
\label{propgen}
\end{center}
\end{figure}
\begin{thebibliography}{1}
\bibitem[1]{FakeDuplication} Cyril Labbé, Dominique Labbé. \emph{Duplicate and fake publications in the scientific literature: how many SCIgen papers in computer science?} Scientometrics 94, no.~1 (2013): 379--396 (\url{http://hal.archives-ouvertes.fr/hal-00641906v2/document}).
%\bibitem[label2]{cite_key2} bibliographic information
\end{thebibliography}
\end{document}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment