rvprasad/extractCommentsWithPageNum.groovy

## extractCommentsWithPageNum.groovy
/*
 * Copyright (c) 2018, Venkatesh-Prasad Ranganath
 *
 * Licensed under BSD 3-clause License
 *
 * Author: Venkatesh-Prasad Ranganath
 *
 * Adapted from https://stackoverflow.com/questions/33253757/java-apache-pdfbox-extract-highlighted-text
 */

// Execute by issuing "groovy extractCommentsWithPageNum.groovy -i <in.pdf> -o <out.txt>" from the terminal

@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.6')

import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripperByArea
import org.apache.pdfbox.cos.COSName

import java.awt.geom.Rectangle2D
import groovy.cli.commons.CliBuilder

// Command-line interface.  Both the input PDF (-i/--in) and the output file
// (-o/--out) are mandatory.
def cli = new CliBuilder(usage:'groovy extractCommentsWithPageNum.groovy -i <in> -o <out>')
// The help header describes the record format that the extraction step of this
// script actually writes.
cli.header = 'Each output record has the form PAGE<n>,"<highlighted text>" --- <comment> ' +
    'where <n> is the zero-based page index.  New lines in highlighted text ' +
    'are replaced by spaces and carriage returns in comments by new lines.'
cli.i(longOpt:'in', args:1, argName:'in', required:true,
    'PDF file to extract content from')
cli.o(longOpt:'out', args:1, argName:'out', required:true,
    'Output file')
def options = cli.parse(args)
// parse() yields a falsy result when the command line is rejected (e.g. a
// required option is missing); there is nothing left to do in that case.
if (!options) {
    return
}

// Extracts every highlight annotation of the input PDF and writes one record
// per highlight to the output file, in the form
//     PAGE<n>,"<highlighted text>" --- <comment>
// where <n> is the zero-based position of the page in the document.
new File(options.o).withWriter { writer ->
    final doc = PDDocument.load(new File(options.i))
    // Release the document even when extraction fails part-way through.
    try {
        // eachWithIndex hands over the page position directly, which avoids a
        // page-tree lookup per page and keeps pageNum a local closure parameter
        // instead of an undeclared script-binding variable.
        doc.getDocumentCatalog().getPages().eachWithIndex { page, pageNum ->
            final pageHeight = page.getMediaBox().getHeight()
            page.getAnnotations().findAll { it.getSubtype() == "Highlight" }
                .collect { annotation ->
                    final stripperByArea = new PDFTextStripperByArea()
                    final quadPoints = annotation.getCOSObject().getDictionaryObject(
                            COSName.getPDFName("QuadPoints"))
                    // A highlight without QuadPoints contributes no text rather
                    // than aborting the run; intdiv keeps the bound integral and
                    // ignores a trailing partial quad.
                    final quadCount = quadPoints == null ? 0 : quadPoints.size().intdiv(8)
                    // Regroup the flat coordinate array into one 8-number list per quad.
                    final quads = (0..<quadCount).collect { i ->
                        (0..<8).collect { j -> quadPoints[i * 8 + j].floatValue() }
                    }
                    def text = quads.collect { qp ->
                        // The y coordinate is flipped against the page height
                        // (region coordinates appear to be measured from the top
                        // of the page - TODO confirm against PDFBox docs).
                        // NOTE(review): width/height assume the quad's point order
                        // makes qp[2]-qp[4] and qp[3]-qp[5] positive - confirm.
                        float ulx = qp[0] - 1
                        float uly = pageHeight - qp[1]
                        float width = qp[2] - qp[4]
                        float height = qp[3] - qp[5]
                        final rect = new Rectangle2D.Float(ulx, uly, width, height)
                        // The same region name is reused for every quad; its text
                        // is read back immediately after each extraction.
                        stripperByArea.addRegion('highlighted', rect)
                        stripperByArea.extractRegions(page)
                        stripperByArea.getTextForRegion('highlighted')
                    }.join('')
                    def comment = annotation.getContents() ?: ''
                    "PAGE$pageNum,\"${text.replace('\n', ' ').trim()}\" --- ${comment.trim().replace('\r', '\n')}\n\n"
                }.each(writer.&println)
        }
    } finally {
        doc.close()
    }
}
	/*
	* Copyright (c) 2018, Venkatesh-Prasad Ranganath
	*
	* Licensed under BSD 3-clause License
	*
	* Author: Venkatesh-Prasad Ranganath
	*
	* Adapted from https://stackoverflow.com/questions/33253757/java-apache-pdfbox-extract-highlighted-text
	*/

	// Execute by issuing "groovy extractCommentsWithPageNum.groovy -i <in.pdf> -o <out.txt>" from the terminal

	@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.6')

	import org.apache.pdfbox.pdmodel.PDDocument
	import org.apache.pdfbox.text.PDFTextStripperByArea
	import org.apache.pdfbox.cos.COSName

	import java.awt.geom.Rectangle2D
	import groovy.cli.commons.CliBuilder

	// Command-line interface.  Both the input PDF (-i/--in) and the output file
	// (-o/--out) are mandatory.
	def cli = new CliBuilder(usage:'groovy extractCommentsWithPageNum.groovy -i <in> -o <out>')
	// The help header describes the record format that the extraction step of this
	// script actually writes.
	cli.header = 'Each output record has the form PAGE<n>,"<highlighted text>" --- <comment> ' +
		'where <n> is the zero-based page index.  New lines in highlighted text ' +
		'are replaced by spaces and carriage returns in comments by new lines.'
	cli.i(longOpt:'in', args:1, argName:'in', required:true,
		'PDF file to extract content from')
	cli.o(longOpt:'out', args:1, argName:'out', required:true,
		'Output file')
	def options = cli.parse(args)
	// parse() yields a falsy result when the command line is rejected (e.g. a
	// required option is missing); there is nothing left to do in that case.
	if (!options) {
		return
	}

	// Extracts every highlight annotation of the input PDF and writes one record
	// per highlight to the output file, in the form
	//     PAGE<n>,"<highlighted text>" --- <comment>
	// where <n> is the zero-based position of the page in the document.
	new File(options.o).withWriter { writer ->
		final doc = PDDocument.load(new File(options.i))
		// Release the document even when extraction fails part-way through.
		try {
			// eachWithIndex hands over the page position directly, which avoids a
			// page-tree lookup per page and keeps pageNum a local closure parameter
			// instead of an undeclared script-binding variable.
			doc.getDocumentCatalog().getPages().eachWithIndex { page, pageNum ->
				final pageHeight = page.getMediaBox().getHeight()
				page.getAnnotations().findAll { it.getSubtype() == "Highlight" }
					.collect { annotation ->
						final stripperByArea = new PDFTextStripperByArea()
						final quadPoints = annotation.getCOSObject().getDictionaryObject(
								COSName.getPDFName("QuadPoints"))
						// A highlight without QuadPoints contributes no text rather
						// than aborting the run; intdiv keeps the bound integral and
						// ignores a trailing partial quad.
						final quadCount = quadPoints == null ? 0 : quadPoints.size().intdiv(8)
						// Regroup the flat coordinate array into one 8-number list per quad.
						final quads = (0..<quadCount).collect { i ->
							(0..<8).collect { j -> quadPoints[i * 8 + j].floatValue() }
						}
						def text = quads.collect { qp ->
							// The y coordinate is flipped against the page height
							// (region coordinates appear to be measured from the top
							// of the page - TODO confirm against PDFBox docs).
							// NOTE(review): width/height assume the quad's point order
							// makes qp[2]-qp[4] and qp[3]-qp[5] positive - confirm.
							float ulx = qp[0] - 1
							float uly = pageHeight - qp[1]
							float width = qp[2] - qp[4]
							float height = qp[3] - qp[5]
							final rect = new Rectangle2D.Float(ulx, uly, width, height)
							// The same region name is reused for every quad; its text
							// is read back immediately after each extraction.
							stripperByArea.addRegion('highlighted', rect)
							stripperByArea.extractRegions(page)
							stripperByArea.getTextForRegion('highlighted')
						}.join('')
						def comment = annotation.getContents() ?: ''
						"PAGE$pageNum,\"${text.replace('\n', ' ').trim()}\" --- ${comment.trim().replace('\r', '\n')}\n\n"
					}.each(writer.&println)
			}
		} finally {
			doc.close()
		}
	}