Skip to content

Instantly share code, notes, and snippets.

@ameisehaufen
Created March 18, 2021 00:38
Show Gist options
  • Save ameisehaufen/9bb44c182a270673d1e2f1fffd615e5a to your computer and use it in GitHub Desktop.
Save ameisehaufen/9bb44c182a270673d1e2f1fffd615e5a to your computer and use it in GitHub Desktop.
Groovy script to extract PDF text from highlight annotations using the PDFBox library. Useful in Freeplane scripts.
//@ExecutionModes({ON_ALL_SELECTED_NODES})
// Author: Otavio Camargo @ameisehaufen
@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.22')
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
// Collects the text lying under every Highlight annotation of a PDF file and
// logs one line per highlight (page number, annotation note, extracted text).
// The extracted strings are also accumulated in `highlightedTexts`.
//
// `logger` is not defined in this script; it is expected to be supplied by the
// host environment (the gist description mentions Freeplane scripts).
String pdfFilePath = 'temp.pdf'
PDDocument pdfDoc = PDDocument.load(new File(pdfFilePath))
List<String> highlightedTexts = new ArrayList<>()
try {
    int pageNum = 0
    for (PDPage pdfpage : pdfDoc.getPages()) {
        pageNum++
        // The page height is used below to measure each region's top edge from
        // the top of the page (page height minus the quad's upper-left y).
        PDRectangle pageSize = pdfpage.getMediaBox()

        for (PDAnnotation annot : pdfpage.getAnnotations()) {
            // Only Highlight annotations are processed; literal-first equals
            // also copes with a missing subtype.
            if (!'Highlight'.equals(annot.getSubtype())) {
                continue
            }
            String annotNote = annot.getContents() // note text attached to the annotation

            // Each group of 8 numbers in /QuadPoints describes one highlighted
            // quadrilateral. Skip annotations without a complete quad instead
            // of failing or recording a null entry.
            COSArray quadsArray = annot.getCOSObject().getCOSArray(COSName.getPDFName("QuadPoints"))
            if (quadsArray == null || quadsArray.size() < 8) {
                continue
            }
            float[] quads = quadsArray.toFloatArray()

            PDFTextStripperByArea stripper = new PDFTextStripperByArea()
            StringBuilder highlighted = new StringBuilder()
            for (int k = 0; k + 8 <= quads.length; k += 8) {
                // Points are read in the order: upper-left, upper-right,
                // lower-left, lower-right (x, y each). The lower-right point
                // is not needed for the bounding rectangle.
                float upperLeftX = quads[k]
                float upperLeftY = quads[k + 1]
                float upperRightX = quads[k + 2]
                float upperRightY = quads[k + 3]
                float lowerLeftX = quads[k + 4]
                float lowerLeftY = quads[k + 5]

                float x = upperLeftX - 1 // start one unit to the left of the quad
                float y = pageSize.getHeight() - upperLeftY
                float width = upperRightX - lowerLeftX
                float height = upperRightY - lowerLeftY

                stripper.addRegion("highlightedRegion", new Rectangle2D.Float(x, y, width, height))
                stripper.extractRegions(pdfpage)
                // Collapse runs of newlines, tabs and spaces into single spaces.
                highlighted.append(stripper.getTextForRegion("highlightedRegion").replaceAll("[\\n\\t ]+", " "))
            }

            String str = highlighted.toString()
            highlightedTexts.add(str)
            // NOTE(review): "Sessão" and "Nota" both print annotNote, as in the
            // original; the intended "Sessão" value is not visible here — confirm.
            String logMsg = ">>>>>>>>>>Pagina: " + pageNum + ", Sessão: " + annotNote + ", Nota: " + annotNote + ", Texto sublinhado: " + str
            logger.info(logMsg)
        }
    }
} finally {
    // Always release the document, even when extraction throws.
    pdfDoc.close()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment