Skip to content

Instantly share code, notes, and snippets.

@marcgeld
Created April 21, 2019 21:49
Show Gist options
  • Save marcgeld/e6056f9b5e96d525c0bed218b42615fe to your computer and use it in GitHub Desktop.
Save marcgeld/e6056f9b5e96d525c0bed218b42615fe to your computer and use it in GitHub Desktop.
Extract the images from a PDF file into a multipage TIFF (Tesseract OCR accepts a multipage TIFF as input, but not a PDF file containing images)
#!/usr/bin/env groovy
// Java 9 or later (…for the TIFF ImageIO Plugin)
@Grab(group='ch.qos.logback', module='logback-classic', version='1.2.3')
@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.15')
@Grab(group='commons-io', module='commons-io', version='2.6')
import org.apache.pdfbox.pdfwriter.*
import org.apache.pdfbox.pdmodel.*
import org.apache.pdfbox.pdmodel.font.*
import org.apache.pdfbox.pdmodel.edit.*
import org.apache.pdfbox.pdmodel.graphics.*
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
import java.awt.image.BufferedImage
import javax.imageio.IIOImage
import javax.imageio.ImageIO
import javax.imageio.stream.ImageOutputStream
import javax.imageio.ImageWriter
import javax.imageio.ImageWriteParam
import org.apache.pdfbox.cos.COSName
import java.nio.*
// Name of the generated script class; used in the CLI usage line.
def appName = getClass().getName()

// Mix Commons IO's FilenameUtils into String so that path strings gain
// helper methods such as .removeExtension().
String.metaClass.mixin(org.apache.commons.io.FilenameUtils)

// System properties for colour handling, applied before any PDF is opened.
// NOTE(review): presumably these follow the PDFBox rendering recommendations;
// confirm they still have an effect on the JVM version in use.
def colourSettings = [
    'sun.java2d.cmm'                                       : 'sun.java2d.cmm.kcms.KcmsServiceProvider',
    'org.apache.pdfbox.rendering.UsePureJavaCMYKConversion': 'true'
]
colourSettings.each { propertyName, propertyValue ->
    System.setProperty(propertyName, propertyValue)
}
// Command-line definition. Exactly one of --file / --dir has to be supplied.
def cli = new CliBuilder(
    usage: "${appName} [<options>]",
    header: 'Options:',
    footer: 'Use with file OR dir path'
)
cli.with {
    f(longOpt: 'file', 'filepath', args: 1, required: false)
    d(longOpt: 'dir', 'path to directory', args: 1, required: false)
    h(longOpt: 'help', 'Print help', required: false)
}

def opt = cli.parse(args)

// A falsy result means the arguments could not be parsed; -h asks for help.
if (!opt || opt.h) {
    cli.usage()
    return
}

// Reject "both" as well as "neither": with no -f and no -d there is nothing
// to process, and the file lookup further down would fail on a missing path.
boolean fileGiven = opt.f
boolean dirGiven = opt.d
if (fileGiven == dirGiven) {
    cli.usage()
    return
}
// Collect the PDF files to convert: every *.pdf below --dir (recursively),
// or the single file named by --file. Entries are absolute File objects.
def found = []
if (opt.d) {
    def searchRoot = new File(opt.d).getAbsoluteFile()
    if (searchRoot.isDirectory()) {
        // The dot is escaped so that only a literal ".pdf" extension matches
        // (the unescaped form also accepted names such as "notapdf");
        // (?i) additionally accepts upper-case extensions like ".PDF".
        searchRoot.traverse(type: groovy.io.FileType.FILES, nameFilter: ~/(?i).*\.pdf/) { pdf ->
            found << pdf.getAbsoluteFile()
        }
    } else {
        // traverse() throws on a missing or non-directory path; report it instead.
        println "Error: directory ${searchRoot} not found or not a directory"
    }
} else if (opt.f) {
    found << new File(opt.f).getAbsoluteFile()
}
// Convert every collected PDF: each image XObject found directly in a page's
// resources is appended as one page of "<pdf path without extension>.tiff".
// Unreadable inputs are reported and skipped; I/O errors propagate to the caller.
found.each { f ->
    if (!(f.exists() && f.canRead())) {
        println "Error: file ${f} not found or not readable"
        return // inside each{} this only skips to the next file
    }
    println "Processing: file ${f}"

    // Resolve the TIFF writer first so a missing plugin fails with a clear
    // message before any output file is created.
    def tiffWriters = ImageIO.getImageWritersByFormatName("TIFF")
    if (!tiffWriters.hasNext()) {
        throw new IllegalStateException("No TIFF ImageIO writer available (Java 9 or later is required)")
    }
    ImageWriter writer = tiffWriters.next()

    def outFile = "${f.getPath().removeExtension()}.tiff"

    // Load the PDF before opening the output, so an unreadable PDF does not
    // leave an empty .tiff behind. Every resource is released in a finally
    // block, innermost first.
    PDDocument doc = null
    OutputStream fileStream = null
    ImageOutputStream outputStream = null
    try {
        doc = PDDocument.load(f)
        fileStream = new FileOutputStream(outFile)
        outputStream = ImageIO.createImageOutputStream(fileStream)
        writer.setOutput(outputStream)

        ImageWriteParam params = writer.getDefaultWriteParam()
        params.setCompressionMode(ImageWriteParam.MODE_EXPLICIT)
        // Compression: None, PackBits, ZLib, Deflate, LZW, JPEG and CCITT variants allowed
        params.setCompressionType("Deflate")

        writer.prepareWriteSequence(null)
        for (PDPage page : doc.getDocumentCatalog().getPages()) {
            PDResources pdResources = page.getResources()
            if (pdResources == null) {
                continue // page without a resource dictionary: nothing to extract
            }
            for (COSName xObjCosName : pdResources.getXObjectNames()) {
                PDXObject pdxObj = pdResources.getXObject(xObjCosName)
                if (pdxObj instanceof PDImageXObject) {
                    BufferedImage bufferedImage = ((PDImageXObject) pdxObj).getImage()
                    writer.writeToSequence(new IIOImage(bufferedImage, null, null), params)
                }
            }
        }
        writer.endWriteSequence()
    } finally {
        try {
            writer.dispose()
        } finally {
            try {
                // Closing the ImageOutputStream pushes its buffered data to
                // the file stream; without it the TIFF may be incomplete.
                if (outputStream != null) {
                    outputStream.close()
                }
            } finally {
                try {
                    if (fileStream != null) {
                        fileStream.close()
                    }
                } finally {
                    if (doc != null) {
                        doc.close()
                    }
                }
            }
        }
    }
    println("Created outfile: ${outFile}")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment