Skip to content

Instantly share code, notes, and snippets.

@paambaati
Created December 19, 2018 11:03
Show Gist options
  • Save paambaati/1f61bdfda50d7536d3edb67345ff1f9c to your computer and use it in GitHub Desktop.
Save paambaati/1f61bdfda50d7536d3edb67345ff1f9c to your computer and use it in GitHub Desktop.
Extracting HTML from PDFs
// Project identity and the Scala compiler version used to build it.
name := "pdftohtml"
version := "0.1"
scalaVersion := "2.12.8"

// Both PDFBox artifacts are pinned to the same release.
val pdfboxVersion = "2.0.13"

libraryDependencies ++= Seq(
  "org.apache.pdfbox" % "pdfbox"       % pdfboxVersion,
  "org.apache.pdfbox" % "pdfbox-tools" % pdfboxVersion
)
package me
import java.io.{File, FileInputStream}
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.tools.PDFText2HTML
/** Command-line entry point that converts a PDF to HTML with PDFBox's
  * `PDFText2HTML` and prints the resulting markup to standard output.
  */
object PDFtoHTML {

  /** Path that is read when no command-line argument is supplied. */
  private val DefaultPdfPath = "/Users/me/Downloads/example.pdf"

  /** Loads a PDF, converts it to HTML and prints the HTML to stdout.
    *
    * @param args optional; `args(0)` is the path of the PDF to convert.
    *             When absent, [[DefaultPdfPath]] is used.
    */
  def main(args: Array[String]): Unit = {
    val pdfPath = args.headOption.getOrElse(DefaultPdfPath)
    val stream = new FileInputStream(new File(pdfPath))
    // Nested try/finally guarantees both resources are released even when
    // loading or conversion throws (scala.util.Using is not available on 2.12).
    try {
      val document = PDDocument.load(stream)
      try {
        val converter = new PDFText2HTML()
        val html = converter.getText(document)
        println(html)
      } finally {
        document.close()
      }
    } finally {
      stream.close()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment