Skip to content

Instantly share code, notes, and snippets.

@aristotll
Forked from i000313/PDFAnnotations.java
Created April 9, 2016 15:25
Show Gist options
  • Save aristotll/8f368302d7eb86717d72c74f68712dc2 to your computer and use it in GitHub Desktop.
Save aristotll/8f368302d7eb86717d72c74f68712dc2 to your computer and use it in GitHub Desktop.
Java: Class to get PDF annotations using the itextpdf lib. #java, #pdf, #annotations, #pdf-annotations, #itextpdf
package com.itextpdf;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfArray;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfString;
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener;
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.RegionTextRenderFilter;
import com.itextpdf.text.pdf.parser.RenderFilter;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import java.io.IOException;
/**
* Class for getting PDF annotations using the itextpdf lib (tested whith itextpdf-5.4.3).
* At the moment, this class allows to get highlighted text, underlined text, and
* text in comments.
* <br/>
* Document management — Portable document format — Part 1: PDF 1.7
* http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf
*
* @author PSantos
*/
public class PDFAnnotations {
public static void main(String args[]) {
try {
// Reads and parses a PDF document
PdfReader reader = new PdfReader("C:\\Users\\PSantos\\Desktop\\lixo\\exemplo.pdf");
// For each PDF page
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
// Get a page a PDF page
PdfDictionary page = reader.getPageN(i);
// Get all the annotations of page i
PdfArray annotsArray = page.getAsArray(PdfName.ANNOTS);
// If page does not have annotations
if (page.getAsArray(PdfName.ANNOTS) == null) {
continue;
}
// For each annotation
for (int j = 0; j < annotsArray.size(); ++j) {
// For current annotation
PdfDictionary curAnnot = annotsArray.getAsDict(j);
// Check the annotation subtype and print its text if not null
writeAnnotation(curAnnot, reader, i);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Check the annotation subtype and print its text.
*
* @param annot annotation to write.
* @param reader pdf document containing the annotation.
* @param pageNumber pdf page number containing the annotation.
* @throws IOException
*/
public static void writeAnnotation(PdfDictionary annot, PdfReader reader, int pageNumber) throws IOException {
if(annot == null) {
return;
}
// System.out.print(annot.get(PdfName.SUBTYPE));
// System.out.print(" -> Rect: " + annot.get(PdfName.RECT));
PdfString text = null;
boolean mayHaveTextAnnotated = false;
// Highlights with comments (balloons) and Highliths
if (PdfName.HIGHLIGHT.equals(annot.get(PdfName.SUBTYPE))) {
// Only Highlights with comments may have text
text = (PdfString) annot.get(PdfName.CONTENTS);
mayHaveTextAnnotated = true;
} else if (PdfName.UNDERLINE.equals(annot.get(PdfName.SUBTYPE))) {
text = annot.getAsString(PdfName.CONTENTS);
mayHaveTextAnnotated = true;
// A comment (a balloon with a comment)
} else if (PdfName.TEXT.equals(annot.get(PdfName.SUBTYPE))) {
text = annot.getAsString(PdfName.CONTENTS);
} else {
text = annot.getAsString(PdfName.CONTENTS);
}
if(text != null) {
System.out.println(" -> " + text);
}
if(mayHaveTextAnnotated) {
PdfArray rectangle = (PdfArray) annot.get(PdfName.RECT); // ex: [82.1569, 757.575, 124.395, 769.305]
String textHighlighted = getTextFromRectangle(rectangle, reader, pageNumber);
if(textHighlighted != null) {
System.out.println(" Annotated text -> " + textHighlighted);
}
}
}
/**
* Extracts the text {@code rectangle}, located on page {@code pageNumber}
* of the pdf {@code reader}.
* <p>The text extracted by this method is not perfect. Usually extracts an
* unnecessary (not desirable) extra character. For instance, for an annotations
* like "[This is an annotated] text", the method will extract
* "This is an annotated t". </p> The extra character may appear before or
* after the annotated text.
*
* @param reader
* @param pageNumber
* @param rectangle ex: [82.1569, 757.575, 124.395, 769.305]
* @return the extracted text or null.
* @throws IOException
*/
public static String getTextFromRectangle(PdfArray rectangle,
PdfReader reader, int pageNumber) throws IOException {
if(rectangle == null) {
return null;
}
// Get the retangle coodinates
float llx = rectangle.getAsNumber(0).floatValue();
float lly = rectangle.getAsNumber(1).floatValue();
float urx = rectangle.getAsNumber(2).floatValue();
float ury = rectangle.getAsNumber(3).floatValue();
Rectangle rect = new Rectangle(llx, lly, urx, ury);
RenderFilter filter = new RegionTextRenderFilter(rect);
TextExtractionStrategy strategy =
new FilteredTextRenderListener(
new LocationTextExtractionStrategy(), filter);
return PdfTextExtractor.getTextFromPage(reader, pageNumber, strategy);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment