Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Due to inactivity, this project is scheduled to be deleted on 2035-04-24.
Why is this scheduled?
Open sidebar
Cyril Labbe
scidetect
Commits
45f7cbea
Commit
45f7cbea
authored
Mar 10, 2015
by
Tien
Browse files
rename pakages
parent
a555af2b
Changes
23
Hide whitespace changes
Inline
Side-by-side
src/fr/imag/forge/
S
cidetect/Checker/Classifier.java
→
src/fr/imag/forge/
s
cidetect/Checker/Classifier.java
View file @
45f7cbea
...
...
@@ -14,7 +14,7 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.
S
cidetect.Checker
;
package
fr.imag.forge.
s
cidetect.Checker
;
import
java.io.BufferedReader
;
import
java.io.File
;
...
...
@@ -24,8 +24,8 @@ import java.io.IOException;
import
java.io.PrintWriter
;
import
java.util.HashMap
;
import
fr.imag.forge.
S
cidetect.Checker.Utils.DistancesSet
;
import
fr.imag.forge.
S
cidetect.Checker.Utils.ThresholdsSet
;
import
fr.imag.forge.
s
cidetect.Checker.Utils.DistancesSet
;
import
fr.imag.forge.
s
cidetect.Checker.Utils.ThresholdsSet
;
/**
* Classifier tags input files as being of a certain class. Examples of classes are SCIgen, Mathgen,...
...
...
src/fr/imag/forge/scidetect/Checker/Classifier.java~
0 → 100644
View file @
45f7cbea
/*
*
Copyright
(
C
)
2015
UNIVERSITE
JOSEPH
FOURIER
(
Grenoble
1
)/
Springer
-
Verlag
GmbH
*
author
Nguyen
Minh
Tien
-
minh
-
tien
.
nguyen
@
imag
.
fr
*
*
This
program
is
free
software
;
you
can
redistribute
it
and
/
or
*
modify
it
under
the
terms
of
the
GNU
General
Public
License
*
as
published
by
the
Free
Software
Foundation
;
either
version
2
*
of
the
License
,
or
(
at
your
option
)
any
later
version
.
*
*
This
program
is
distributed
in
the
hope
that
it
will
be
useful
,
*
but
WITHOUT
ANY
WARRANTY
;
without
even
the
implied
warranty
of
*
MERCHANTABILITY
or
FITNESS
FOR
A
PARTICULAR
PURPOSE
.
See
the
*
GNU
General
Public
License
for
more
details
.
*
*
You
should
have
received
a
copy
of
the
GNU
General
Public
License
*
along
with
this
program
;
if
not
,
write
to
the
Free
Software
*
Foundation
,
Inc
.,
59
Temple
Place
-
Suite
330
,
Boston
,
MA
02111
-
1307
,
USA
.
*/
package
fr
.
imag
.
Scidetect
.
Checker
;
import
java
.
io
.
BufferedReader
;
import
java
.
io
.
File
;
import
java
.
io
.
FileNotFoundException
;
import
java
.
io
.
FileReader
;
import
java
.
io
.
IOException
;
import
java
.
io
.
PrintWriter
;
import
java
.
util
.
HashMap
;
/**
 * Labels each test document by comparing its distance to the nearest sample of
 * every class against per-class thresholds read from {@code config.txt}.
 *
 * @author tien
 */
public class Classifier {

    /**
     * Thresholds keyed by class name (the config key with its "Threshold_"
     * prefix removed). Index 0 is the bound below which a document "is a"
     * member of the class, index 1 the bound below which it is only suspected.
     */
    HashMap<String, Double[]> Threshold = new HashMap<String, Double[]>();

    /**
     * Reads every {@code Threshold_<name>} line of {@code config.txt} into
     * {@link #Threshold}. Such a line holds three tab-separated fields: the
     * key and two decimal bounds.
     *
     * @throws FileNotFoundException if {@code config.txt} cannot be opened
     * @throws IOException if reading the file fails
     */
    public void readconfig() throws FileNotFoundException, IOException {
        File conf = new File("config.txt");
        BufferedReader br = new BufferedReader(new FileReader(conf));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.startsWith("Threshold_")) {
                    String[] b = line.split("\t");
                    Double[] temp = new Double[2];
                    temp[0] = Double.parseDouble(b[1]);
                    temp[1] = Double.parseDouble(b[2]);
                    // Drop the "Threshold_" prefix so the map is keyed by class name.
                    Threshold.put(b[0].substring("Threshold_".length()), temp);
                }
            }
        } finally {
            // Release the file handle even when a line fails to parse.
            br.close();
        }
    }

    /**
     * Classifies every test document.
     *
     * @param distant for each test document key, the distance to every sample
     *                (sample path -> distance)
     * @return one or more lines per test document, each
     *         {@code key TAB verdict TAB distance TAB nearest-sample-path}; a
     *         document with no usable verdict yields
     *         {@code key TAB "cant classify" TAB 1 TAB null}
     * @throws IOException if {@code config.txt} cannot be read
     */
    public String classify(HashMap<String, HashMap<String, Double>> distant) throws IOException {
        readconfig();
        StringBuilder conclusion = new StringBuilder();
        for (String key : distant.keySet()) { // for each file in the test
            String result = find_NN(distant.get(key));
            String[] a = checkdistant(result).split("\n");
            if (a[0].length() == 0) {
                conclusion.append(key).append("\t").append("cant classify\t1\tnull\n");
            } else {
                for (int i = 0; i < a.length; i++) {
                    conclusion.append(key).append("\t").append(a[i]).append("\n");
                }
            }
        }
        return conclusion.toString();
    }

    /**
     * Turns the nearest-neighbour lines produced by {@link #find_NN} into
     * verdict lines by comparing each distance with the class thresholds.
     * Falls back to {@link #findmindistant} when no class matches.
     *
     * @param result lines of {@code type TAB distance TAB path}
     * @return verdict lines, or an empty string when nothing can be concluded
     * @throws IllegalStateException if neither a class-specific nor a
     *         "Default" threshold has been configured
     */
    private String checkdistant(String result) {
        if (result.length() == 0) {
            // No neighbours at all: let the caller report "cant classify".
            return "";
        }
        StringBuilder conclution = new StringBuilder();
        String[] eachtype = result.split("\n");
        for (int i = 0; i < eachtype.length; i++) {
            String[] eachNN = eachtype[i].split("\t");
            // get threshold for the corresponding type
            Double[] threshold = Threshold.containsKey(eachNN[0])
                    ? Threshold.get(eachNN[0])
                    : Threshold.get("Default");
            if (threshold == null) {
                throw new IllegalStateException("No threshold configured for '" + eachNN[0]
                        + "' and no Threshold_Default entry in config.txt");
            }
            // check distant with threshold
            double distance = Double.parseDouble(eachNN[1]);
            if (distance < threshold[0]) {
                conclution.append("is a ").append(eachNN[0]).append("\t")
                        .append(eachNN[1]).append("\t").append(eachNN[2]).append("\n");
            } else if (distance < threshold[1]) {
                conclution.append("is suppected ").append(eachNN[0]).append("\t")
                        .append(eachNN[1]).append("\t").append(eachNN[2]).append("\n");
            }
        }
        if (conclution.length() == 0) {
            return findmindistant(result);
        }
        return conclution.toString();
    }

    /**
     * Builds the "is Genuine" verdict from the closest neighbour whose
     * distance is strictly below 1.0.
     *
     * @param result lines of {@code type TAB distance TAB path}
     * @return a single verdict line, or an empty string if no distance is below 1.0
     */
    private String findmindistant(String result) {
        if (result.length() == 0) {
            return "";
        }
        double mindistant = 1.0;
        String conclu = "";
        String[] eachtype = result.split("\n");
        for (int i = 0; i < eachtype.length; i++) {
            String[] eachNN = eachtype[i].split("\t");
            double distance = Double.parseDouble(eachNN[1]);
            if (distance < mindistant) {
                mindistant = distance;
                conclu = "is Genuine\t" + eachNN[1] + "\t" + eachNN[2] + "\n";
            }
        }
        return conclu;
    }

    /**
     * Derives the class name of a sample from the name of the directory that
     * contains its index file.
     *
     * @param indexpath path of a sample index file
     * @return the parent directory's name, or an empty string when the path
     *         has no parent directory
     */
    private String gettype(String indexpath) {
        File parent = new File(indexpath).getParentFile();
        return (parent == null) ? "" : parent.getName();
    }

    /**
     * Finds, for every class, the sample closest to one test document.
     *
     * @param distantto sample path -> distance for one test document
     * @return one line per class: {@code type TAB distance TAB nearest-sample-path}
     */
    private String find_NN(HashMap<String, Double> distantto) {
        HashMap<String, Double> distotype = new HashMap<String, Double>();
        HashMap<String, String> NNname = new HashMap<String, String>();
        for (String key : distantto.keySet()) {
            String type = gettype(key);
            Double candidate = distantto.get(key);
            if (!distotype.containsKey(type) || candidate < distotype.get(type)) {
                distotype.put(type, candidate);
                NNname.put(type, key);
            }
        }
        // it returns the path to the NN
        StringBuilder result = new StringBuilder();
        for (String key : distotype.keySet()) {
            result.append(key).append("\t").append(distotype.get(key)).append("\t")
                    .append(NNname.get(key)).append("\n");
        }
        return result.toString();
    }
}
src/fr/imag/forge/
S
cidetect/Checker/DistantCalculator.java
→
src/fr/imag/forge/
s
cidetect/Checker/DistantCalculator.java
View file @
45f7cbea
...
...
@@ -14,13 +14,13 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.
S
cidetect.Checker
;
package
fr.imag.forge.
s
cidetect.Checker
;
import
java.util.HashMap
;
import
java.util.HashSet
;
import
java.util.Set
;
import
fr.imag.forge.
S
cidetect.Checker.Utils.DistancesSet
;
import
fr.imag.forge.
s
cidetect.Checker.Utils.DistancesSet
;
import
fr.imag.forge.scidetect.Corpus.Corpus
;
/**
...
...
src/fr/imag/forge/scidetect/Checker/DistantCalculator.java~
0 → 100644
View file @
45f7cbea
/*
*
Copyright
(
C
)
2015
UNIVERSITE
JOSEPH
FOURIER
(
Grenoble
1
)/
Springer
-
Verlag
GmbH
*
author
Nguyen
Minh
Tien
-
minh
-
tien
.
nguyen
@
imag
.
fr
*
*
This
program
is
free
software
;
you
can
redistribute
it
and
/
or
*
modify
it
under
the
terms
of
the
GNU
General
Public
License
*
as
published
by
the
Free
Software
Foundation
;
either
version
2
*
of
the
License
,
or
(
at
your
option
)
any
later
version
.
*
*
This
program
is
distributed
in
the
hope
that
it
will
be
useful
,
*
but
WITHOUT
ANY
WARRANTY
;
without
even
the
implied
warranty
of
*
MERCHANTABILITY
or
FITNESS
FOR
A
PARTICULAR
PURPOSE
.
See
the
*
GNU
General
Public
License
for
more
details
.
*
*
You
should
have
received
a
copy
of
the
GNU
General
Public
License
*
along
with
this
program
;
if
not
,
write
to
the
Free
Software
*
Foundation
,
Inc
.,
59
Temple
Place
-
Suite
330
,
Boston
,
MA
02111
-
1307
,
USA
.
*/
package
fr
.
imag
.
Scidetect
.
Checker
;
import
java
.
util
.
HashMap
;
import
java
.
util
.
HashSet
;
import
java
.
util
.
Set
;
/**
 * Computes distances between the word-count profiles of test texts and
 * sample texts.
 *
 * @author tien
 */
public class DistantCalculator {

    /**
     * Result table: test key -> (sample key -> distance). The same map is
     * returned by, and filled across, every call to {@link #caldistant}.
     */
    private HashMap<String, HashMap<String, Double>> distant = new HashMap<String, HashMap<String, Double>>();

    /**
     * Computes the distance from every test text to every sample text.
     *
     * @param samples sample key -> (token -> occurrence count)
     * @param tests   test key -> (token -> occurrence count)
     * @return test key -> (sample key -> distance); this is the instance's
     *         internal table, so entries from earlier calls are still present
     */
    public HashMap<String, HashMap<String, Double>> caldistant(
            HashMap<String, HashMap<String, Integer>> samples,
            HashMap<String, HashMap<String, Integer>> tests) {
        for (String key : tests.keySet()) {
            HashMap<String, Integer> testCounts = tests.get(key);
            HashMap<String, Double> distantto = new HashMap<String, Double>();
            for (String key2 : samples.keySet()) {
                distantto.put(key2, cal_textdistant(testCounts, samples.get(key2)));
            }
            distant.put(key, distantto);
        }
        return distant;
    }

    /**
     * Distance between two token-count profiles: the counts of the longer
     * text are scaled down to the length of the shorter one, the absolute
     * per-token differences are summed, and the sum is divided by twice the
     * shorter length. Identical profiles give 0.0 and profiles with no token
     * in common give 1.0.
     *
     * <p>When a text has no tokens the formula would divide by zero. In that
     * case two empty texts are treated as identical (0.0) and an empty text
     * against a non-empty one as having nothing in common (1.0).
     *
     * @param text1 token -> count of the first text
     * @param text2 token -> count of the second text
     * @return the distance between the two profiles
     */
    private double cal_textdistant(HashMap<String, Integer> text1, HashMap<String, Integer> text2) {
        // get the nb of token in each text
        int na = totalCount(text1);
        int nb = totalCount(text2);
        if (na == 0 || nb == 0) {
            return (na == nb) ? 0.0 : 1.0;
        }

        Set<String> allkeys = new HashSet<String>();
        allkeys.addAll(text1.keySet());
        allkeys.addAll(text2.keySet());

        // reduce proportion for text of different length
        double sum = 0.0;
        if (na <= nb) {
            double scale = na / (double) nb;
            for (String key : allkeys) {
                sum += Math.abs(countOf(text1, key) - (double) countOf(text2, key) * scale);
            }
            // 2.0 keeps the doubling in floating point so it cannot overflow int.
            return sum / (2.0 * na);
        }
        double scale = nb / (double) na;
        for (String key : allkeys) {
            sum += Math.abs(countOf(text1, key) * scale - (double) countOf(text2, key));
        }
        return sum / (2.0 * nb);
    }

    /** Sums all occurrence counts of a profile. */
    private static int totalCount(HashMap<String, Integer> text) {
        int total = 0;
        for (Integer count : text.values()) {
            total += count;
        }
        return total;
    }

    /** Returns the count stored for {@code key}, or 0 when the token is absent. */
    private static int countOf(HashMap<String, Integer> text, String key) {
        return text.containsKey(key) ? text.get(key) : 0;
    }
}
src/fr/imag/forge/
S
cidetect/Checker/Indexer.java
→
src/fr/imag/forge/
s
cidetect/Checker/Indexer.java
View file @
45f7cbea
...
...
@@ -14,7 +14,7 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.
S
cidetect.Checker
;
package
fr.imag.forge.
s
cidetect.Checker
;
import
java.io.File
;
import
java.io.FileNotFoundException
;
...
...
src/fr/imag/forge/scidetect/Checker/Indexer.java~
0 → 100644
View file @
45f7cbea
/*
*
Copyright
(
C
)
2015
UNIVERSITE
JOSEPH
FOURIER
(
Grenoble
1
)/
Springer
-
Verlag
GmbH
*
author
Nguyen
Minh
Tien
-
minh
-
tien
.
nguyen
@
imag
.
fr
*
*
This
program
is
free
software
;
you
can
redistribute
it
and
/
or
*
modify
it
under
the
terms
of
the
GNU
General
Public
License
*
as
published
by
the
Free
Software
Foundation
;
either
version
2
*
of
the
License
,
or
(
at
your
option
)
any
later
version
.
*
*
This
program
is
distributed
in
the
hope
that
it
will
be
useful
,
*
but
WITHOUT
ANY
WARRANTY
;
without
even
the
implied
warranty
of
*
MERCHANTABILITY
or
FITNESS
FOR
A
PARTICULAR
PURPOSE
.
See
the
*
GNU
General
Public
License
for
more
details
.
*
*
You
should
have
received
a
copy
of
the
GNU
General
Public
License
*
along
with
this
program
;
if
not
,
write
to
the
Free
Software
*
Foundation
,
Inc
.,
59
Temple
Place
-
Suite
330
,
Boston
,
MA
02111
-
1307
,
USA
.
*/
package
fr
.
imag
.
Scidetect
.
Checker
;
import
java
.
io
.
File
;
import
java
.
io
.
FileNotFoundException
;
import
java
.
io
.
PrintWriter
;
import
java
.
util
.
HashMap
;
/**
 * Writes a word-count index file next to a text's source file.
 *
 * @author tien
 */
public class Indexer {

    /**
     * Counts how often each space-separated token occurs in {@code content}
     * and writes one {@code token TAB count} line per distinct token to
     * {@code INDEX-<basename>.txt} in the directory of {@code textfile}.
     * The base name is the file name without its last extension; a name
     * without any "." is used unchanged. A file given without a parent
     * directory is indexed relative to the working directory.
     *
     * @param content  the extracted text to index
     * @param textfile the file the text came from; only its name and parent
     *                 directory are used
     * @throws FileNotFoundException if the index file cannot be created
     */
    public void index(String content, File textfile) throws FileNotFoundException {
        String name = textfile.getName();
        int dot = name.lastIndexOf(".");
        String filename = (dot < 0 ? name : name.substring(0, dot)) + ".txt";

        HashMap<String, Integer> counter = new HashMap<String, Integer>();
        for (String word : content.split(" ")) {
            Integer seen = counter.get(word);
            counter.put(word, seen == null ? 1 : seen + 1);
        }

        // File(parent, child) accepts a null parent, unlike string concatenation,
        // which would produce a path starting with "null/".
        File indexout = new File(textfile.getParentFile(), "INDEX-" + filename);
        PrintWriter out = new PrintWriter(indexout);
        try {
            for (String key : counter.keySet()) {
                out.println(key + "\t" + counter.get(key));
            }
        } finally {
            out.close();
        }
    }
}
src/fr/imag/forge/
S
cidetect/Checker/Reader.java
→
src/fr/imag/forge/
s
cidetect/Checker/Reader.java
View file @
45f7cbea
...
...
@@ -14,11 +14,11 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package
fr.imag.forge.
S
cidetect.Checker
;
package
fr.imag.forge.
s
cidetect.Checker
;
import
com.sun.corba.se.spi.transport.CorbaAcceptor
;
import
fr.imag.forge.
S
cidetect.TextExtractor.Xmlextractor
;
import
fr.imag.forge.
S
cidetect.TextExtractor.pdfextractor
;
import
fr.imag.forge.
s
cidetect.TextExtractor.Xmlextractor
;
import
fr.imag.forge.
s
cidetect.TextExtractor.pdfextractor
;
import
fr.imag.forge.scidetect.Corpus.Corpus
;
import
fr.imag.forge.scidetect.Corpus.ProcessText
;
import
fr.imag.forge.scidetect.Corpus.Text
;
...
...
src/fr/imag/forge/scidetect/Checker/Reader.java~
0 → 100644
View file @
45f7cbea
/*
*
Copyright
(
C
)
2015
UNIVERSITE
JOSEPH
FOURIER
(
Grenoble
1
)/
Springer
-
Verlag
GmbH
*
author
Nguyen
Minh
Tien
-
minh
-
tien
.
nguyen
@
imag
.
fr
*
*
This
program
is
free
software
;
you
can
redistribute
it
and
/
or
*
modify
it
under
the
terms
of
the
GNU
General
Public
License
*
as
published
by
the
Free
Software
Foundation
;
either
version
2
*
of
the
License
,
or
(
at
your
option
)
any
later
version
.
*
*
This
program
is
distributed
in
the
hope
that
it
will
be
useful
,
*
but
WITHOUT
ANY
WARRANTY
;
without
even
the
implied
warranty
of
*
MERCHANTABILITY
or
FITNESS
FOR
A
PARTICULAR
PURPOSE
.
See
the
*
GNU
General
Public
License
for
more
details
.
*
*
You
should
have
received
a
copy
of
the
GNU
General
Public
License
*
along
with
this
program
;
if
not
,
write
to
the
Free
Software
*
Foundation
,
Inc
.,
59
Temple
Place
-
Suite
330
,
Boston
,
MA
02111
-
1307
,
USA
.
*/
package
fr
.
imag
.
Scidetect
.
Checker
;
import
fr
.
imag
.
Scidetect
.
TextExtractor
.
Xmlextractor
;
import
fr
.
imag
.
Scidetect
.
TextExtractor
.
pdfextractor
;
import
java
.
io
.
BufferedReader
;
import
java
.
io
.
File
;
import
java
.
io
.
FileNotFoundException
;
import
java
.
io
.
FileReader
;
import
java
.
io
.
FileWriter
;
import
java
.
io
.
IOException
;
import
java
.
io
.
PrintWriter
;
import
java
.
util
.
ArrayList
;
import
java
.
util
.
Arrays
;
import
java
.
util
.
HashMap
;
import
java
.
util
.
List
;
/**
*
*
@
author
tien
*/
public
class
Reader
{
// Token counts of every index file classified as a sample, keyed by the index file's path.
private HashMap<String, HashMap<String, Integer>> samples = new HashMap<String, HashMap<String, Integer>>();
// Token counts of every other index file that was read, keyed by the index file's path.
private HashMap<String, HashMap<String, Integer>> tests = new HashMap<String, HashMap<String, Integer>>();
// Value of the "samples" entry of config.txt; an index path containing it is stored in `samples`.
private String SamplesFolder;
// Value of the "Max_length" entry of config.txt; content at least this long is split into parts.
private int maxlength;
public
void
readconfig
()
throws
FileNotFoundException
,
IOException
{
File
conf
=
new
File
(
"config.txt"
);
BufferedReader
br
=
new
BufferedReader
(
new
FileReader
(
conf
));
String
line
;
while
((
line
=
br
.
readLine
())
!= null) {
if
(
!line.startsWith("#")) {
//
System
.
out
.
println
(
line
);
String
[]
b
=
line
.
split
(
"
\t
"
);
if
(
b
[
0
].
equals
(
"samples"
))
{
SamplesFolder
=
b
[
1
];
}
//
other
config
should
be
read
over
here
if
(
b
[
0
].
equals
(
"Max_length"
))
{
maxlength
=
Integer
.
parseInt
(
b
[
1
]);
}
}
}
}
public
HashMap
<
String
,
HashMap
<
String
,
Integer
>>
readsamples
(
String
foldername
)
throws
IOException
{
File
folder
=
new
File
(
foldername
);
File
[]
listOfFile
=
folder
.
listFiles
();
for
(
int
j
=
0
;
j
<
listOfFile
.
length
;
j
++)
{
if
(
listOfFile
[
j
].
isDirectory
())
{
readsamples
(
listOfFile
[
j
].
getPath
());
}
else
if
(
listOfFile
[
j
].
getName
().
endsWith
(
".pdf"
))
{
//
find
if
there
is
already
index
for
it
String
indexname
=
"INDEX-"
+
listOfFile
[
j
].
getName
().
substring
(
0
,
listOfFile
[
j
].
getName
().
lastIndexOf
(
"."
))
+
".txt"
;
if
(
Arrays
.
asList
(
listOfFile
).
toString
().
contains
(
indexname
))
{
//
System
.
out
.
println
(
"lets read from index file"
);
readindexfile
(
listOfFile
[
j
].
getParent
()
+
"/"
+
indexname
);
}
else
{
pdfextractor
a
=
new
pdfextractor
();
String
content
=
a
.
pdfextract
(
listOfFile
[
j
]);
Indexer
b
=
new
Indexer
();
b
.
index
(
content
,
listOfFile
[
j
]);
readindexfile
(
listOfFile
[
j
].
getParent
()
+
"/"
+
indexname
);
}
}
}
return
samples
;
}
private
void
readindexfile
(
String
path
)
throws
IOException
{
File
index
=
new
File
(
path
);
BufferedReader
br
;
br
=
new
BufferedReader
(
new
FileReader
(
index
));
String
line
;
HashMap
<
String
,
Integer
>
a
=
new
HashMap
<
String
,
Integer
>();
while
((
line
=
br
.
readLine
())
!= null) {
String
[]
b
=
line
.
split
(
"
\t
"
);
a
.
put
(
b
[
0
],
Integer
.
parseInt
(
b
[
1
]));
}
br
.
close
();
if
(
path
.
contains
(
SamplesFolder
))
{
samples
.
put
(
path
,
a
);
}
else
{
tests
.
put
(
path
,
a
);
}
}
private
void
readfile
(
File
pdf
)
throws
IOException
{
String
content
=
""
;
if
(
pdf
.
getName
().
endsWith
(
".pdf"
))
{
pdfextractor
a
=
new
pdfextractor
();
content
=
a
.
pdfextract
(
pdf
);
}
else
if
(
pdf
.
getName
().
endsWith
(
".xml"
)
||
pdf
.
getName
().
endsWith
(
".xtx"
))
{
Xmlextractor
a
=
new
Xmlextractor
();
content
=
a
.
xmlextract
(
pdf
);
}
//
lets
deal
with
long
file
over
here
//
split
content
and
the
index
part
by
part
if
(
content
.
length
()
<
maxlength
)
{
String
indexname
=
"INDEX-"
+
pdf
.
getName
().
substring
(
0
,
pdf
.
getName
().
lastIndexOf
(
"."
))
+
".txt"
;
Indexer
b
=
new
Indexer
();
b
.
index
(
content
,
pdf
);
readindexfile
(
pdf
.
getParent
()
+
"/"
+
indexname
);
}
else
{
String
[]
part
=
splitcontent
(
content
);
for
(
int
i
=
0
;
i
<
part
.
length
;
i
++)
{
String
indexname
=
"INDEX-"
+
pdf
.
getName
().
substring
(
0
,
pdf
.
getName
().
lastIndexOf
(
"."
))
+
"_part"
+
i
+
".txt"
;
String
filename
=
pdf
.
getName
().
substring
(
0
,
pdf
.
getName
().
lastIndexOf
(
"."
))
+
"_part"
+
i
+
".txt"
;
Indexer
b
=
new
Indexer
();
File
a
=
new
File
(
pdf
.
getParent
()
+
"/"
+
filename
);
PrintWriter
out
=
new
PrintWriter
(
new
FileWriter
(
a
));
out
.
println
(
part
[
i
]);
//
System
.
out
.
println
(
text
);
out
.
close
();
b
.
index
(
part
[
i
],
a
);
readindexfile
(
a
.
getParent
()
+
"/"
+
indexname
);
}
}
}
/**
 * Loads the documents to test. A directory is scanned recursively; a single
 * file is read directly. Only ".pdf", ".xml" and ".xtx" files are read.
 *
 * @param testpath a directory or a single document
 * @return the table of test indexes held by this reader
 * @throws IOException if a directory cannot be listed or a document cannot
 *         be processed
 */
public HashMap<String, HashMap<String, Integer>> readtests(String testpath) throws IOException {
    File folder = new File(testpath);
    if (folder.isDirectory()) {
        File[] listOfFile = folder.listFiles();
        if (listOfFile == null) {
            // listFiles() returns null when the directory cannot be read.
            throw new IOException("Cannot list test folder: " + testpath);
        }
        for (int j = 0; j < listOfFile.length; j++) {
            File entry = listOfFile[j];
            if (entry.isDirectory()) {
                readtests(entry.getPath());
            } else {
                String name = entry.getName();
                if (name.endsWith(".pdf") || name.endsWith(".xml") || name.endsWith(".xtx")) {
                    readfile(entry);
                }
            }
        }
    } else {
        String name = folder.getName();
        if (name.endsWith(".pdf") || name.endsWith(".xml") || name.endsWith(".xtx")) {
            readfile(folder);
        }
    }
    return tests;
}
private
String
[]
splitcontent
(
String
content