LorisBachert/TikaExtractor.java

## TikaExtractor.java
/**
 * Uses Tika's {@link AutoDetectParser} to extract the text of a file.
 * <p>
 * The parse context carries a {@link TesseractOCRConfig} and a
 * {@link PDFParserConfig} with inline-image extraction switched on, and
 * registers the parser itself so that embedded content is parsed
 * recursively.
 *
 * @param file the file whose text content should be extracted
 * @return the text collected by the content handler; may be blank, in which
 *         case a warning is logged
 * @throws Exception if the file cannot be opened, if closing the stream
 *         fails, or - wrapping the original cause - if Tika fails with an
 *         {@link IOException}, {@link SAXException} or {@link TikaException}
 */
@Override
public String extractTextOfDocument(File file) throws Exception {
	Parser parser = new AutoDetectParser();
	Metadata metadata = new Metadata();
	// Integer.MAX_VALUE is passed as the handler's write limit.
	BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);

	TesseractOCRConfig config = new TesseractOCRConfig();
	PDFParserConfig pdfConfig = new PDFParserConfig();
	pdfConfig.setExtractInlineImages(true);

	// To parse images in files those lines are needed
	ParseContext parseContext = new ParseContext();
	parseContext.set(TesseractOCRConfig.class, config);
	parseContext.set(PDFParserConfig.class, pdfConfig);
	// Registering the parser itself makes sure recursive parsing happens.
	parseContext.set(Parser.class, parser);

	// try-with-resources closes the stream on every path. A failure while
	// closing is attached as a suppressed exception instead of replacing a
	// parse failure that is already propagating.
	try (InputStream fileStream = new FileInputStream(file)) {
		// The inner try keeps the wrapping limited to parse failures, so a
		// failure to open the file still propagates unwrapped.
		try {
			parser.parse(fileStream, handler, metadata, parseContext);
			String text = handler.toString();
			if (text.trim().isEmpty()) {
				logger.warn("Could not extract text of '" + file.getName() + "'");
			} else {
				logger.debug("Successfully extracted the text of '" + file.getName() + "'");
			}
			return text;
		} catch (IOException | SAXException | TikaException e) {
			throw new Exception("TIKA was not able to exctract text of file '" + file.getName() + "'", e);
		}
	}
}
	/**
	* Uses Tikas {@link AutoDetectParser} to extract the text of a file.
	*
	* @param document
	* @return The text content of a file
	*/
	@Override
	public String extractTextOfDocument(File file) throws Exception {
	InputStream fileStream = new FileInputStream(file);
	Parser parser = new AutoDetectParser();
	Metadata metadata = new Metadata();
	BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);

	TesseractOCRConfig config = new TesseractOCRConfig();
	PDFParserConfig pdfConfig = new PDFParserConfig();
	pdfConfig.setExtractInlineImages(true);

	// To parse images in files those lines are needed
	ParseContext parseContext = new ParseContext();
	parseContext.set(TesseractOCRConfig.class, config);
	parseContext.set(PDFParserConfig.class, pdfConfig);
	parseContext.set(Parser.class, parser); // need to add this to make sure
	// recursive parsing happens!
	try {
	parser.parse(fileStream, handler, metadata, parseContext);
	String text = handler.toString();
	if (text.trim().isEmpty()) {
	logger.warn("Could not extract text of '" + document.getName() + "'");
	} else {
	logger.debug("Successfully extracted the text of '" + document.getName() + "'");
	}
	return text;
	} catch (IOException \| SAXException \| TikaException e) {
	throw new Exception("TIKA was not able to exctract text of file '" + document.getName() + "'", e);
	} finally {
	try {
	fileStream.close();
	} catch (IOException e) {
	throw new Exception(e);
	}
	}
	}