Commit 90fbf481 authored by Cyril Labbe's avatar Cyril Labbe
Redirection of stderr for org.apache.pdfbox.util.PDFTextStripper

	modified:   src/fr/imag/forge/Scidetect/TextExtractor/
parent 56957cb7
......@@ -21,7 +21,8 @@ import;
import java.text.Normalizer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
......@@ -32,28 +33,37 @@ import org.apache.pdfbox.util.PDFTextStripper;
public class pdfextractor {
* Extracts raw text from a PDF file. The extracted text is written to a file with the same name but a .txt extension.
* The file StdErrPDFExtractor.txt contains stderr messages from org.apache.pdfbox.util.PDFTextStripper.
* @param pdf a pdf File
* @return a string containing the extracted text.
public String pdfextract(File pdf) throws IOException {
PDFTextStripper stripper = new PDFTextStripper();
PDFTextStripper stripper = new PDFTextStripper("UTF-8");
PDDocument pd;
BufferedWriter wr;
.println("converting: " + pdf.getPath());
.println("Converting: " + pdf.getPath());
File totxt = new File(pdf.getPath()
.substring(0, pdf.getPath().lastIndexOf('.')) + ".txt");
try {
pd = PDDocument.load(pdf.getPath());
wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(totxt)));
try {stripper.writeText(pd, wr);}
try {
// dirty redirection of stderr because stripper.writeText is complaining
PrintStream orgStream = System.err;
PrintStream fileStream = new PrintStream(new FileOutputStream("StdErrPDFExtractor.txt",true));
System.err.println("* Sterr txt extraction from file:"+pdf.getPath());
stripper.writeText(pd, wr);
catch (Exception e) {
System.out.println("* Something went wrong during:");
System.out.println(" - txt extraction from pdf:"+pdf);
System.out.println(" - txt extraction from pdf:"+pdf.getPath());
System.out.println("* Continuing anyway...");
if (pd != null) {
