Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Remove text layer from PDF using Apache PDFBox in a Groovy script
#!/usr/bin/env groovy
@Grab('org.apache.pdfbox:pdfbox:1.8.8')
import org.apache.pdfbox.cos.COSArray
import org.apache.pdfbox.cos.COSString
import org.apache.pdfbox.pdfparser.PDFStreamParser
import org.apache.pdfbox.pdfwriter.ContentStreamWriter
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.PDPage
import org.apache.pdfbox.pdmodel.common.PDStream
import org.apache.pdfbox.util.PDFOperator
// Removes watermark text from every page of a PDF and saves the result to a new file.
//
// Usage: <script> <input.pdf> <output.pdf> [watermark]
//   args[0]  path of the PDF to read
//   args[1]  path the filtered PDF is written to
//   args[2]  optional text to search for; defaults to 'www.it-ebooks.info'
//
// Only strings shown with the 'Tj' operator are examined. Text drawn with other
// operators (for example 'TJ', which takes an array operand) is left untouched.
if (args.length < 2) {
    System.err.println('usage: <script> <input.pdf> <output.pdf> [watermark]')
    System.exit(1)
}
def watermark = args.length > 2 ? args[2] : 'www.it-ebooks.info'

def doc = PDDocument.load(args[0])
try {
    doc.documentCatalog.allPages.each { PDPage page ->
        def contents = page.contents
        // A page may have no content stream; skip it rather than hand null to the parser.
        if (contents == null) {
            return
        }

        def parser = new PDFStreamParser(contents)
        parser.parse()
        List parsed = parser.tokens

        // Indices of tokens to drop: every matching 'Tj' operator plus the string operand before it.
        // A Set keeps the membership test below constant-time.
        Set<Integer> dropped = new HashSet<Integer>()
        parsed.eachWithIndex { token, int idx ->
            if (!(token instanceof PDFOperator) || ((PDFOperator) token).operation != 'Tj') {
                return
            }
            // Guard against a 'Tj' with no preceding token or a non-string operand
            // instead of failing with an index or cast error on a malformed stream.
            def operand = idx > 0 ? parsed.get(idx - 1) : null
            if (operand instanceof COSString && ((COSString) operand).string.contains(watermark)) {
                dropped.add(idx - 1)
                dropped.add(idx)
            }
        }

        // Rebuild the token list without the flagged operand/operator pairs.
        List kept = []
        parsed.eachWithIndex { token, int idx ->
            if (!dropped.contains(idx)) {
                kept << token
            }
        }

        // Write the remaining tokens into a fresh stream and swap it in as the page content.
        def newContents = new PDStream(doc)
        def writer = new ContentStreamWriter(newContents.createOutputStream())
        writer.writeTokens(kept)
        newContents.addCompression()
        page.setContents(newContents)
    }
    doc.save(args[1])
} finally {
    // Always release the document, even when parsing or saving fails.
    doc.close()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment