Created
March 18, 2021 00:38
-
-
Save ameisehaufen/9bb44c182a270673d1e2f1fffd615e5a to your computer and use it in GitHub Desktop.
Groovy script that extracts the text under highlight annotations in a PDF using the Apache PDFBox library. Useful in Freeplane scripts.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//@ExecutionModes({ON_ALL_SELECTED_NODES}) | |
// Author: Otavio Camargo @ameisehaufen | |
@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.22') | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.pdmodel.PDPage; | |
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; | |
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; | |
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText; | |
import org.apache.pdfbox.text.PDFTextStripperByArea; | |
import org.apache.pdfbox.pdmodel.common.PDRectangle; | |
import java.awt.geom.Rectangle2D; | |
import org.apache.pdfbox.cos.COSArray; | |
import org.apache.pdfbox.cos.COSName; | |
import org.apache.pdfbox.pdmodel.PDDocumentCatalog; | |
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; | |
// PDDocument document = new PDDocument(); | |
// Extracts the text underneath every Highlight annotation of a PDF file.
// For each highlight, the text of all its quads is concatenated, stored in
// highlightedTexts and written to the log together with the page number and
// the annotation's note contents.
String pdfFilePath = 'temp.pdf';
PDDocument pdfDoc = PDDocument.load(new File(pdfFilePath));
List<String> highlightedTexts = new ArrayList<>();
try {
    int pageNum = 0;
    for (PDPage pdfpage : pdfDoc.getPages()) {
        pageNum++;
        // QuadPoints are given in PDF user space (y grows upwards), whereas the
        // stripper regions use Java coordinates (y == 0 at the top), so the
        // y coordinate is flipped using the media box height.
        float pageHeight = pdfpage.getMediaBox().getHeight();

        for (PDAnnotation annot : pdfpage.getAnnotations()) {
            // Only highlight annotations are of interest (other subtypes, e.g. Text, are skipped).
            if (!'Highlight'.equals(annot.getSubtype())) {
                continue;
            }

            // A highlight without a /QuadPoints entry has no area to extract text from.
            COSArray quadsArray = annot.getCOSObject().getCOSArray(COSName.getPDFName("QuadPoints"));
            if (quadsArray == null) {
                continue;
            }

            // Each quad is 8 numbers, read here in the order:
            // upper-left, upper-right, lower-left, lower-right (x, y pairs).
            float[] quads = quadsArray.toFloatArray();
            int quadCount = quads.length.intdiv(8);
            if (quadCount == 0) {
                continue;
            }

            // Register one region per quad so the page content is parsed only
            // once per annotation instead of once per quad.
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            List<String> regionNames = new ArrayList<>();
            for (int q = 0; q < quadCount; q++) {
                int base = q * 8;
                float upperLeftX = quads[base];
                float upperLeftY = quads[base + 1];
                float upperRightX = quads[base + 2];
                float upperRightY = quads[base + 3];
                float lowerLeftX = quads[base + 4];
                float lowerLeftY = quads[base + 5];

                float x = upperLeftX - 1;                  // start one unit left of the quad's upper-left corner
                float y = pageHeight - upperLeftY;         // flip to top-left origin
                float width = upperRightX - lowerLeftX;    // upperRightX - lowerLeftX
                float height = upperRightY - lowerLeftY;   // upperRightY - lowerLeftY

                String regionName = "highlightedRegion" + q;
                stripper.addRegion(regionName, new Rectangle2D.Float(x, y, width, height));
                regionNames.add(regionName);
            }
            stripper.extractRegions(pdfpage);

            // Collapse runs of newlines, tabs and spaces to a single space and
            // join the quads in their original order.
            StringBuilder highlighted = new StringBuilder();
            for (String regionName : regionNames) {
                highlighted.append(stripper.getTextForRegion(regionName).replaceAll("[\\n\\t ]+", " "));
            }
            String str = highlighted.toString();
            highlightedTexts.add(str);

            // Text typed into the annotation's note.
            String annotNote = annot.getContents();
            // NOTE(review): both "Sessão" and "Nota" print the annotation contents,
            // exactly as before; confirm what "Sessão" was meant to show.
            String logMsg = ">>>>>>>>>>Pagina: " + pageNum + ", Sessão: " + annotNote + ", Nota: " + annotNote + ", Texto sublinhado: " + str;
            logger.info(logMsg);
        }
    }
} finally {
    // Always release the document, even when extraction fails half-way.
    pdfDoc.close();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment