Skip to content

Instantly share code, notes, and snippets.

@NRBPerdijk
Created April 10, 2020 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NRBPerdijk/b59332173c9598991f8774d98266e57d to your computer and use it in GitHub Desktop.
Save NRBPerdijk/b59332173c9598991f8774d98266e57d to your computer and use it in GitHub Desktop.
package tika.example
import java.io.InputStream
import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ocr.TesseractOCRConfig
import org.apache.tika.parser.pdf.PDFParserConfig
import org.apache.tika.parser.{AutoDetectParser, ParseContext, Parser}
import org.apache.tika.sax.BodyContentHandler
/**
 * Singleton wrapper around an Apache Tika `AutoDetectParser` that is always invoked with one
 * fixed, OCR-oriented `ParseContext`.
 *
 * Every configuration object below is created once, when this object is initialised, and the
 * same instances are reused for each call to [[parse]].
 */
object TikaOCRParser {

  /** PDF parser settings; see `newPdfConfig` for the values applied. */
  private val pdfConfig: PDFParserConfig = newPdfConfig()

  /** Tesseract settings; see `newTesseractConfig` for the values applied. */
  private val tesseractOCRConfig: TesseractOCRConfig = newTesseractConfig()

  /** Auto-detecting parser built from Tika's default configuration. */
  private val parser = new AutoDetectParser(TikaConfig.getDefaultConfig)

  /** Context handed to the parser on every call; must be initialised after the vals above. */
  private val parseContext = newParseContext()

  /**
   * Delegates to the underlying `AutoDetectParser`, supplying the shared parse context.
   *
   * This wrapper neither closes `inputStream` itself nor catches anything: whatever the
   * underlying parser throws propagates to the caller unchanged.
   *
   * @param inputStream the document to parse
   * @param handler     content handler passed straight through to the parser
   * @param metadata    metadata object passed straight through to the parser
   */
  def parse(inputStream: InputStream, handler: BodyContentHandler, metadata: Metadata): Unit =
    parser.parse(inputStream, handler, metadata, parseContext)

  /** Builds the PDF configuration: OCR DPI 100, angle detection on, strategy `OCR_ONLY`. */
  private def newPdfConfig(): PDFParserConfig = {
    val config = new PDFParserConfig()
    config.setOcrDPI(100) //scalastyle:ignore magic.number
    config.setDetectAngles(true)
    config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)
    config
  }

  /** Builds the Tesseract configuration: language `"eng"`, image processing set to `1`. */
  private def newTesseractConfig(): TesseractOCRConfig = {
    val config = new TesseractOCRConfig()
    config.setLanguage("eng")
    config.setEnableImageProcessing(1)
    config
  }

  /**
   * Builds the parse context, registering the parser itself under `Parser` and the two
   * configuration objects under their own classes.
   */
  private def newParseContext(): ParseContext = {
    val context = new ParseContext()
    context.set(classOf[Parser], parser)
    context.set(classOf[PDFParserConfig], pdfConfig)
    context.set(classOf[TesseractOCRConfig], tesseractOCRConfig)
    context
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment