Created
January 31, 2012 23:50
-
-
Save schmmd/1713952 to your computer and use it in GitHub Desktop.
PDFBox
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*** | |
* Bypass the Tika wrapper for PDFs and go directly to PDFBox since | |
* Apache Tika 0.8 has a bug when processing PDFs. This should be fixed | |
* when Apache Tika 0.9 is released. | |
* @param file | |
* @param fileType | |
* @param writer | |
* @throws IOException | |
* @throws ConversionException | |
*/ | |
public static void convertPDFBox(InputStream is, Writer writer) throws IOException, ConversionException { | |
Boolean force = true; | |
// use PDFBox as the Tika wrapper has a bug for PDFs | |
PDDocument document = null; | |
try { | |
document = PDDocument.load(is, force); // force extraction | |
stripper.setForceParsing(force); // continue when errors are encountered | |
stripper.setSortByPosition(false); // text may not be in visual order. | |
stripper.setShouldSeparateByBeads(true); // beads are articles | |
stripper.writeText(document, writer); | |
} | |
finally { | |
try { | |
if (document != null) { | |
document.close(); | |
} | |
} | |
catch (Exception e) { | |
throw new ConversionException(e); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment