Last active January 8, 2024 03:08
Java: Class to get PDF annotations using the itextpdf lib. #java, #pdf, #annotations, #pdf-annotations, #itextpdf
package com.itextpdf;
import java.io.IOException;

import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfArray;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfString;
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener;
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.RegionTextRenderFilter;
import com.itextpdf.text.pdf.parser.RenderFilter;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
* Class for getting PDF annotations using the itextpdf lib (tested whith itextpdf-5.4.3).
* At the moment, this class allows to get highlighted text, underlined text, and
* text in comments.
* <br/>
* Document management — Portable document format — Part 1: PDF 1.7
* @author PSantos
public class PDFAnnotations {
public static void main(String args[]) {
try {
// Reads and parses a PDF document
PdfReader reader = new PdfReader("C:\\Users\\PSantos\\Desktop\\lixo\\exemplo.pdf");
// For each PDF page
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
// Get a page a PDF page
PdfDictionary page = reader.getPageN(i);
// Get all the annotations of page i
PdfArray annotsArray = page.getAsArray(PdfName.ANNOTS);
// If page does not have annotations
if (page.getAsArray(PdfName.ANNOTS) == null) {
// For each annotation
for (int j = 0; j < annotsArray.size(); ++j) {
// For current annotation
PdfDictionary curAnnot = annotsArray.getAsDict(j);
// Check the annotation subtype and print its text if not null
writeAnnotation(curAnnot, reader, i);
} catch (Exception e) {
* Check the annotation subtype and print its text.
* @param annot annotation to write.
* @param reader pdf document containing the annotation.
* @param pageNumber pdf page number containing the annotation.
* @throws IOException
public static void writeAnnotation(PdfDictionary annot, PdfReader reader, int pageNumber) throws IOException {
if(annot == null) {
// System.out.print(annot.get(PdfName.SUBTYPE));
// System.out.print(" -> Rect: " + annot.get(PdfName.RECT));
PdfString text = null;
boolean mayHaveTextAnnotated = false;
// Highlights with comments (balloons) and Highliths
if (PdfName.HIGHLIGHT.equals(annot.get(PdfName.SUBTYPE))) {
// Only Highlights with comments may have text
text = (PdfString) annot.get(PdfName.CONTENTS);
mayHaveTextAnnotated = true;
} else if (PdfName.UNDERLINE.equals(annot.get(PdfName.SUBTYPE))) {
text = annot.getAsString(PdfName.CONTENTS);
mayHaveTextAnnotated = true;
// A comment (a balloon with a comment)
} else if (PdfName.TEXT.equals(annot.get(PdfName.SUBTYPE))) {
text = annot.getAsString(PdfName.CONTENTS);
} else {
text = annot.getAsString(PdfName.CONTENTS);
if(text != null) {
System.out.println(" -> " + text);
if(mayHaveTextAnnotated) {
PdfArray rectangle = (PdfArray) annot.get(PdfName.RECT); // ex: [82.1569, 757.575, 124.395, 769.305]
String textHighlighted = getTextFromRectangle(rectangle, reader, pageNumber);
if(textHighlighted != null) {
System.out.println(" Annotated text -> " + textHighlighted);
* Extracts the text {@code rectangle}, located on page {@code pageNumber}
* of the pdf {@code reader}.
* <p>The text extracted by this method is not perfect. Usually extracts an
* unnecessary (not desirable) extra character. For instance, for an annotations
* like "[This is an annotated] text", the method will extract
* "This is an annotated t". </p> The extra character may appear before or
* after the annotated text.
* @param reader
* @param pageNumber
* @param rectangle ex: [82.1569, 757.575, 124.395, 769.305]
* @return the extracted text or null.
* @throws IOException
public static String getTextFromRectangle(PdfArray rectangle,
PdfReader reader, int pageNumber) throws IOException {
if(rectangle == null) {
return null;
// Get the retangle coodinates
float llx = rectangle.getAsNumber(0).floatValue();
float lly = rectangle.getAsNumber(1).floatValue();
float urx = rectangle.getAsNumber(2).floatValue();
float ury = rectangle.getAsNumber(3).floatValue();
Rectangle rect = new Rectangle(llx, lly, urx, ury);
RenderFilter filter = new RegionTextRenderFilter(rect);
TextExtractionStrategy strategy =
new FilteredTextRenderListener(
new LocationTextExtractionStrategy(), filter);
return PdfTextExtractor.getTextFromPage(reader, pageNumber, strategy);
Copy link

May I ask if there is a way to deal with non-ASCII words in FreeText annotations?
I have read several words that were typed into a PDF; the ASCII part is returned correctly, but the Unicode part is a mess.
Decoding it as UTF-16 gives a better result: ~70% of the words are correct, but many words are still weird.

It seems that PDF stores a paragraph of FreeText using multiple encoding schemes?

I call text.getEncoding() and get nothing.


Copy link

i000313 commented Jan 26, 2021

Hi. Sorry, but I can't help you because I never used this code.

Copy link

rmd13 commented Jan 26, 2021

Hi, I have solved it. I had mixed up signed and unsigned.


Copy link

i000313 commented Feb 23, 2021

Glad to know. Thank you for letting me know.

