Skip to content

Instantly share code, notes, and snippets.

@rvprasad
Last active May 24, 2023 01:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rvprasad/f4cf1253561f7c39691c715a1dd71d9d to your computer and use it in GitHub Desktop.
Save rvprasad/f4cf1253561f7c39691c715a1dd71d9d to your computer and use it in GitHub Desktop.
Extracts comments associated with highlights in a PDF document along with page numbers
/*
* Copyright (c) 2018, Venkatesh-Prasad Ranganath
*
* Licensed under BSD 3-clause License
*
* Author: Venkatesh-Prasad Ranganath
*
* Adapted from https://stackoverflow.com/questions/33253757/java-apache-pdfbox-extract-highlighted-text
*/
// Execute by issuing "groovy extractCommentsWithPageNum.groovy -i <input.pdf> -o <output file>" from the terminal
@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.6')
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripperByArea
import org.apache.pdfbox.cos.COSName
import java.awt.geom.Rectangle2D
import groovy.cli.commons.CliBuilder
// Command-line interface: the input PDF (-i/--in) and the output file
// (-o/--out) are both mandatory.
def cli = new CliBuilder(usage:"groovy extractCommentsWithPageNum.groovy -i <in> -o <out>")
// Help text describing the record format this script actually writes.
cli.header = 'Each output record has the form PAGE<n>,"<highlighted text>" --- <comment>, ' +
    'where <n> is the zero-based page index. New lines in the highlighted text are ' +
    'replaced by spaces and carriage returns in comments are replaced by new lines.'
cli.i(longOpt:'in', args:1, argName:'in', required:true,
    'PDF file to extract content from')
cli.o(longOpt:'out', args:1, argName:'out', required:true,
    'Output file')
def options = cli.parse(args)
// Stop quietly when the arguments could not be parsed (parse() yielded a falsy result).
if (!options) {
    return
}
// Writes one record per Highlight annotation of the input PDF to the output file:
//   PAGE<zero-based page index>,"<highlighted text>" --- <annotation comment>
//
// The PDF is loaded before the output file is opened so that an unreadable
// input does not leave an empty output file behind, and withCloseable()
// guarantees the document is closed even if extraction fails part-way through.
PDDocument.load(new File(options.i)).withCloseable { doc ->
    new File(options.o).withWriter { writer ->
        // eachWithIndex supplies the zero-based page position directly, which
        // avoids searching the page tree again for every page.
        doc.getDocumentCatalog().getPages().eachWithIndex { page, pageNum ->
            // PDF coordinates have their origin at the bottom-left, whereas the
            // stripper regions are given relative to the top of the page.
            final pageHeight = page.getMediaBox().getHeight()
            page.getAnnotations().findAll { it.getSubtype() == "Highlight" }.each { annotation ->
                final quadArray = annotation.getCOSObject().getDictionaryObject(
                    COSName.getPDFName("QuadPoints"))
                // Each highlighted run is described by 8 numbers (4 corner points).
                // A missing/malformed QuadPoints entry yields no regions, and an
                // incomplete trailing group is dropped instead of indexing past
                // the end of the array.
                final quads = (quadArray instanceof org.apache.pdfbox.cos.COSArray) ?
                    quadArray.toFloatArray().toList().collate(8, false) : []
                final stripper = new PDFTextStripperByArea()
                final text = quads.collect { quad ->
                    // NOTE(review): assumes quad[0..1] is the upper-left corner and
                    // quad[2..3] / quad[4..5] are the upper-right / lower-left
                    // corners - confirm against the producing PDF viewer.
                    float ulx = quad[0] - 1 // presumably nudged left so the first glyph falls inside
                    float uly = pageHeight - quad[1]
                    float width = quad[2] - quad[4]
                    float height = quad[3] - quad[5]
                    // Re-using the region name replaces the previous rectangle, so
                    // each extraction returns the text of the current quad only.
                    stripper.addRegion('highlighted', new Rectangle2D.Float(ulx, uly, width, height))
                    stripper.extractRegions(page)
                    stripper.getTextForRegion('highlighted')
                }.join('')
                final comment = annotation.getContents() ?: ''
                // Flatten the highlighted text onto one line; \r?\n also covers
                // platforms whose line separator is \r\n.
                final flatText = text.replaceAll(/\r?\n/, ' ').trim()
                final flatComment = comment.trim().replace('\r', '\n')
                writer.println("PAGE${pageNum},\"${flatText}\" --- ${flatComment}\n\n")
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment