Skip to content

Instantly share code, notes, and snippets.

@joelkuiper
Last active October 21, 2020 07:25
Show Gist options
  • Save joelkuiper/331a399961941989fec8 to your computer and use it in GitHub Desktop.
Save joelkuiper/331a399961941989fec8 to your computer and use it in GitHub Desktop.
An example class for adding highlights to PDFs based on a Pattern or String
/*
* Copyright 2014 Joël Kuiper
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.examples.pdmodel;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDGamma;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
/**
* This class implements the methods highlight and highlightDefault which will add a highlight to the PDF based on a
* Pattern or String. The idea is to extend the PDFTextStripper and override the methods that write to the Output to
* instead write to a TextCache that keeps data on the position of the TextPositions. From this information we can then
* derive bounding boxes (and quads) that can be used to write the annotations. See the main method for example usage
*
* @author Joël Kuiper <me@joelkuiper.eu>
*
*/
public class TextHighlight extends PDFTextStripper
{
// Largest difference in Y position, relative to the first character of the current box, at which a character
// is still treated as being on the same line (used by getTextBoundingBoxes).
private float verticalTolerance = 0;
// Factor each TextPosition's height is multiplied by when the box height is computed (used by
// getTextBoundingBoxes).
private float heightModifier = (float) 1.250;
/**
 * Internal holder for a single regex hit: the matched text together with the cached TextPositions of its
 * characters.
 */
private class Match
{
    /** The text that was matched. */
    public final String str;
    /** Cached positions covering the match; entries may be null (separators are cached without a position). */
    public final List<TextPosition> positions;

    /**
     * @param matchedText the matched text
     * @param matchedPositions the cached positions belonging to the matched text
     */
    public Match(final String matchedText, final List<TextPosition> matchedPositions)
    {
        str = matchedText;
        positions = matchedPositions;
    }
}
/**
 * Internal per-page store of the extracted text and, for every character of that text, the TextPosition it
 * was appended with. Pages are keyed by their 1-based page number. Text and positions grow in lock-step, so
 * an index into a page's text is also an index into that page's position list; this is what allows bounding
 * boxes to be derived for a regex match.
 */
private class TextCache
{
    private final Map<Integer, StringBuilder> texts = new HashMap<Integer, StringBuilder>();
    private final Map<Integer, ArrayList<TextPosition>> positions = new HashMap<Integer, ArrayList<TextPosition>>();

    /** Returns the text buffer of the page, creating and registering an empty one on first access. */
    private StringBuilder obtainStringBuilder(final Integer pageNo)
    {
        final StringBuilder existing = texts.get(pageNo);
        if (existing != null)
        {
            return existing;
        }
        final StringBuilder created = new StringBuilder();
        texts.put(pageNo, created);
        return created;
    }

    /** Returns the position list of the page, creating and registering an empty one on first access. */
    private ArrayList<TextPosition> obtainTextPositions(final Integer pageNo)
    {
        final ArrayList<TextPosition> existing = positions.get(pageNo);
        if (existing != null)
        {
            return existing;
        }
        final ArrayList<TextPosition> created = new ArrayList<TextPosition>();
        positions.put(pageNo, created);
        return created;
    }

    /** Returns the text cached so far for the page (empty if nothing was cached). */
    public String getText(final Integer pageNo)
    {
        final StringBuilder pageText = obtainStringBuilder(pageNo);
        return pageText.toString();
    }

    /** Returns the live position list of the page, one entry per cached character. */
    public List<TextPosition> getTextPositions(final Integer pageNo)
    {
        return obtainTextPositions(pageNo);
    }

    /**
     * Appends the string to the text of the page currently being processed and records the given position
     * once for every character of the string, keeping text and positions index-aligned.
     *
     * @param str the text to cache
     * @param pos the position to associate with each character of str; may be null
     */
    public void append(final String str, final TextPosition pos)
    {
        final int pageNo = getCurrentPageNo();
        final ArrayList<TextPosition> pagePositions = obtainTextPositions(pageNo);
        final StringBuilder pageText = obtainStringBuilder(pageNo);
        for (final char c : str.toCharArray())
        {
            pageText.append(c);
            pagePositions.add(pos);
        }
    }

    /**
     * Finds every occurrence of the pattern in the cached text of a page.
     *
     * @param pageNo 1-based page number
     * @param pattern the pattern to search for
     * @return one Match per occurrence, each holding the matched text and the sub-list of cached positions
     *         between the match's start and end offsets
     */
    public List<Match> match(final Integer pageNo, final Pattern pattern)
    {
        final List<Match> found = new ArrayList<Match>();
        final Matcher hits = pattern.matcher(getText(pageNo));
        while (hits.find())
        {
            final int from = hits.start();
            final int to = hits.end();
            final List<TextPosition> span = getTextPositions(pageNo).subList(from, to);
            found.add(new Match(hits.group(), span));
        }
        return found;
    }
}
// Per-page text/position cache; created by initialize and set back to null by resetEngine.
private TextCache textCache;
// Colour applied by highlightDefault; while null, getDefaultColor supplies a built-in colour instead.
private PDGamma defaultColor;
/**
* Instantiate a new object by delegating to the PDFTextStripper constructor with the given encoding. No
* document is processed here: initialize(PDDocument) must be called before any highlight method.
*
* @param encoding The encoding name handed to the superclass constructor.
* @throws IOException Declared by this constructor; presumably raised by the superclass when its
* configuration cannot be read (not visible in this file).
*/
public TextHighlight(final String encoding) throws IOException
{
super(encoding);
}
/**
 * Computes a series of bounding boxes (PDRectangle) from a list of TextPositions. Consecutive positions are
 * merged into one box while their Y position stays within the vertical tolerance of the first character of
 * that box; as soon as the tolerance is exceeded the current box is emitted and a new one is started.
 * Null entries (separators cached without a position) are skipped.
 *
 * While merging, the right and top edges only ever grow, so a box always covers every character assigned to
 * it, even when a later character is narrower or shorter than an earlier one.
 *
 * @param positions the positions to enclose; may be null or empty
 * @return the bounding boxes in reading order; empty when there is no non-null position
 */
public List<PDRectangle> getTextBoundingBoxes(final List<TextPosition> positions)
{
    final List<PDRectangle> boundingBoxes = new ArrayList<PDRectangle>();
    if (positions == null)
    {
        return boundingBoxes;
    }
    float lowerLeftX = 0, lowerLeftY = 0, upperRightX = 0, upperRightY = 0;
    // Tracks whether a box is being built; a flag cannot be confused with real coordinates the way a
    // magic coordinate value can.
    boolean boxOpen = false;
    for (final TextPosition position : positions)
    {
        if (position == null)
        {
            continue;
        }
        final Matrix textPos = position.getTextPos();
        final float x = textPos.getXPosition();
        final float y = textPos.getYPosition();
        final float right = x + position.getWidth();
        final float top = y + position.getHeight() * getHeightModifier();
        if (boxOpen && Math.abs(y - lowerLeftY) <= getVerticalTolerance())
        {
            // we are still on the same line: extend the box, never shrink it
            upperRightX = Math.max(upperRightX, right);
            upperRightY = Math.max(upperRightY, top);
            continue;
        }
        if (boxOpen)
        {
            // the line changed: emit the finished box
            boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
        }
        // first character of a new box
        lowerLeftX = x;
        lowerLeftY = y;
        upperRightX = right;
        upperRightY = top;
        boxOpen = true;
    }
    if (boxOpen)
    {
        boundingBoxes.add(boundingBox(lowerLeftX, lowerLeftY, upperRightX, upperRightY));
    }
    return boundingBoxes;
}
/**
 * Builds a PDRectangle from its lower-left and upper-right corner coordinates.
 */
private PDRectangle boundingBox(final float lowerLeftX, final float lowerLeftY,
        final float upperRightX, final float upperRightY)
{
    final PDRectangle rect = new PDRectangle();
    // lower-left corner
    rect.setLowerLeftX(lowerLeftX);
    rect.setLowerLeftY(lowerLeftY);
    // upper-right corner
    rect.setUpperRightX(upperRightX);
    rect.setUpperRightY(upperRightY);
    return rect;
}
/**
 * Convenience overload of {@link #highlightDefault(Pattern)} that first compiles the given string into a
 * regular expression.
 * Note: every page is processed, but a match cannot span multiple pages.
 * Note: it will not work for top-bottom text (such as Chinese).
 *
 * @param pattern String that will be compiled to a regex Pattern
 * @return the annotations that were added, for further modification
 * @throws IOException if adding the annotations fails
 */
public List<PDAnnotationTextMarkup> highlightDefault(final String pattern) throws IOException
{
    final Pattern compiled = Pattern.compile(pattern);
    return highlightDefault(compiled);
}
/**
 * Adds highlight annotations for every match of the pattern and gives each one the default look: constant
 * opacity 0.8, the colour from {@link #getDefaultColor()} and the printed flag set.
 * Note: every page is processed, but a match cannot span multiple pages.
 * Note: it will not work for top-bottom text (such as Chinese).
 *
 * @param pattern Pattern (regex)
 * @return the annotations that were added, for further modification
 * @throws IOException if adding the annotations fails
 */
public List<PDAnnotationTextMarkup> highlightDefault(final Pattern pattern) throws IOException
{
    final List<PDAnnotationTextMarkup> added = highlight(pattern, PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
    for (int i = 0; i < added.size(); i++)
    {
        final PDAnnotationTextMarkup markup = added.get(i);
        markup.setConstantOpacity(0.8f);
        // the colour is looked up once per annotation, exactly as before
        markup.setColour(getDefaultColor());
        markup.setPrinted(true);
    }
    return added;
}
/**
 * Convenience overload of {@link #highlight(Pattern, String)} that first compiles the given string into a
 * regular expression.
 *
 * @param pattern String that will be compiled to a regex Pattern
 * @param subType the text markup sub type of the annotations to create
 * @return the annotations that were added
 * @throws IOException if adding the annotations fails
 */
public List<PDAnnotationTextMarkup> highlight(final String pattern, final String subType)
        throws IOException
{
    final Pattern compiled = Pattern.compile(pattern);
    return highlight(compiled, subType);
}
/**
 * Adds one text markup annotation of the given sub type for every match of the pattern in the cached text of
 * the pages between the configured start and end page. Each annotation gets one quad per line of the match,
 * a rectangle enclosing all of those quads, and the matched text as its contents.
 *
 * @param pattern the regular expression to search for
 * @param subType the text markup sub type, e.g. PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT
 * @return the annotations that were added, in page and match order
 * @throws IllegalArgumentException if initialize(PDDocument) has not been called yet
 * @throws IOException if the annotations of a page cannot be obtained
 */
public List<PDAnnotationTextMarkup> highlight(final Pattern pattern, final String subType)
        throws IOException
{
    if (textCache == null || document == null)
    {
        throw new IllegalArgumentException("TextCache was not initialized");
    }
    final List<PDPage> pages = document.getDocumentCatalog().getAllPages();
    final List<PDAnnotationTextMarkup> newAnnotations = new ArrayList<PDAnnotationTextMarkup>();
    // Clamp the range so a start page below 1 cannot produce a negative list index.
    final int firstIndex = Math.max(getStartPage() - 1, 0);
    final int endIndex = Math.min(getEndPage(), pages.size());
    for (int pageIndex = firstIndex; pageIndex < endIndex; pageIndex++)
    {
        // the text cache is keyed on the 1-based page number
        final List<Match> matches = textCache.match(pageIndex + 1, pattern);
        if (matches.isEmpty())
        {
            continue;
        }
        // only pages that actually receive an annotation have their annotation list requested
        final List<PDAnnotation> annotations = pages.get(pageIndex).getAnnotations();
        for (final Match match : matches)
        {
            final List<PDRectangle> textBoundingBoxes = getTextBoundingBoxes(match.positions);
            if (textBoundingBoxes.isEmpty())
            {
                continue;
            }
            final PDAnnotationTextMarkup annotation = new PDAnnotationTextMarkup(subType);
            // The rectangle must cover every line of the match; using only the first box would leave the
            // quads of the remaining lines outside the annotation rectangle.
            annotation.setRectangle(enclosingRectangle(textBoundingBoxes));
            annotation.setQuadPoints(this.getQuads(textBoundingBoxes));
            annotation.setContents(match.str);
            annotations.add(annotation);
            newAnnotations.add(annotation);
        }
    }
    return newAnnotations;
}

/**
 * Returns the smallest rectangle that contains every rectangle of the given non-empty list.
 */
private PDRectangle enclosingRectangle(final List<PDRectangle> boxes)
{
    final PDRectangle first = boxes.get(0);
    float lowerLeftX = first.getLowerLeftX();
    float lowerLeftY = first.getLowerLeftY();
    float upperRightX = first.getUpperRightX();
    float upperRightY = first.getUpperRightY();
    for (final PDRectangle box : boxes)
    {
        lowerLeftX = Math.min(lowerLeftX, box.getLowerLeftX());
        lowerLeftY = Math.min(lowerLeftY, box.getLowerLeftY());
        upperRightX = Math.max(upperRightX, box.getUpperRightX());
        upperRightY = Math.max(upperRightY, box.getUpperRightY());
    }
    final PDRectangle enclosing = new PDRectangle();
    enclosing.setLowerLeftX(lowerLeftX);
    enclosing.setLowerLeftY(lowerLeftY);
    enclosing.setUpperRightX(upperRightX);
    enclosing.setUpperRightY(upperRightY);
    return enclosing;
}
/**
 * Concatenates the quads of consecutive PDRectangles into a single array.
 *
 * @param rects the rectangles to convert
 * @return an array of length 8 * rects.size() holding the result of {@link #getQuads(PDRectangle)} for each
 *         rectangle, in list order
 */
public float[] getQuads(final List<PDRectangle> rects)
{
    final float[] all = new float[8 * rects.size()];
    int offset = 0;
    for (final PDRectangle rect : rects)
    {
        final float[] single = getQuads(rect);
        System.arraycopy(single, 0, all, offset, single.length);
        offset += 8;
    }
    return all;
}
/**
 * Computes a float array of size eight with the four corners of the PDRectangle.
 *
 * @param rect the rectangle to convert
 * @return the corners as x/y pairs in the order top left, top right, bottom left, bottom right
 */
public float[] getQuads(final PDRectangle rect)
{
    final float left = rect.getLowerLeftX();
    final float top = rect.getUpperRightY();
    final float right = rect.getUpperRightX();
    final float bottom = rect.getLowerLeftY();
    return new float[] {
        left, top, // x1, y1: top left
        right, top, // x2, y2: top right
        left, bottom, // x3, y3: bottom left
        right, bottom // x4, y4: bottom right
    };
}
/**
 * Sets the colour used by the highlightDefault methods.
 *
 * @param color the new default colour; null restores the built-in colour of {@link #getDefaultColor()}
 */
public void setDefaultColor(final PDGamma color)
{
    this.defaultColor = color;
}
/**
 * Returns the colour used by the highlightDefault methods.
 *
 * @return the colour set through {@link #setDefaultColor(PDGamma)}, or a freshly created built-in colour
 *         when none has been set
 */
public PDGamma getDefaultColor()
{
    if (defaultColor == null)
    {
        // Built-in fallback, previously annotated as #fbe85a.
        // NOTE(review): 0.9843 and 0.9098 match 0xfb and 0xe8, but 0.3879 corresponds to roughly 0x63,
        // not 0x5a (0.3529) - confirm which blue component is intended.
        final PDGamma fallback = new PDGamma();
        fallback.setR((float) 0.9843);
        fallback.setG((float) 0.9098);
        fallback.setB((float) 0.3879);
        return fallback;
    }
    return defaultColor;
}
/**
 * Returns the vertical tolerance: the largest difference in Y position at which a character is still
 * considered to be on the same line when bounding boxes are computed.
 *
 * @return the current vertical tolerance
 */
public float getVerticalTolerance()
{
    return this.verticalTolerance;
}
/**
 * Sets the vertical tolerance, see {@link #getVerticalTolerance()}.
 *
 * @param tolerance the new vertical tolerance
 */
public void setVerticalTolerance(final float tolerance)
{
    this.verticalTolerance = tolerance;
}
/**
 * Returns the height modifier: the factor each character's height is multiplied by when bounding boxes are
 * computed, which allows the annotations to be made taller or shorter.
 *
 * @return the current height modifier
 */
public float getHeightModifier()
{
    return this.heightModifier;
}
/**
* Sets the height modifier, see {@link #getHeightModifier()}.
*
* @param heightModifier the new factor applied to each character's height
*/
public void setHeightModifier(final float heightModifier)
{
this.heightModifier = heightModifier;
}
/*
* The following methods drive or override the text extraction machinery of the PDFTextStripper
*/
/**
* Processes the given document and fills a fresh text cache with the text and TextPositions of its pages.
* This must be called before any of the highlight methods, which refuse to run without a text cache.
*
* @param pdf the document to process; it is also remembered as the document that highlights are added to
* @throws IOException if processing the document fails
*/
public void initialize(final PDDocument pdf) throws IOException
{
// resetEngine() sets textCache to null, so the new cache has to be created after it
resetEngine();
document = pdf;
textCache = new TextCache();
if (getAddMoreFormatting())
{
// with extra formatting enabled, paragraph/page/article boundaries all use the line separator
setParagraphEnd(getLineSeparator());
setPageStart(getLineSeparator());
setArticleStart(getLineSeparator());
setArticleEnd(getLineSeparator());
}
startDocument(pdf);
processPages(pdf.getDocumentCatalog().getAllPages());
endDocument(pdf);
}
/**
* Resets the superclass state and discards the text cache. Afterwards the highlight methods throw until
* initialize(PDDocument) has been called again.
*/
@Override
public void resetEngine()
{
super.resetEngine();
textCache = null;
}
/**
 * Start a new article: appends the article start string to the text cache without a position.
 *
 * @param isltr true if primary direction of text is left to right (not used here).
 * @throws IOException Declared by the overridden method; not thrown by this implementation.
 */
@Override
protected void startArticle(final boolean isltr) throws IOException
{
    textCache.append(getArticleStart(), null);
}
/**
 * End an article: appends the article end string to the text cache without a position.
 *
 * @throws IOException Declared by the overridden method; not thrown by this implementation.
 */
@Override
protected void endArticle() throws IOException
{
    textCache.append(getArticleEnd(), null);
}
/**
* Start a new page. Intentionally empty: nothing is added to the text cache here.
*
* @param page The page we are about to process (unused).
*
* @throws IOException Declared by the overridden method; not thrown by this implementation.
*/
@Override
protected void startPage(final PDPage page) throws IOException
{
// intentionally does nothing
}
/**
* End a page. Intentionally empty: nothing is added to the text cache here.
*
* @param page The page that is being ended (unused).
*
* @throws IOException Declared by the overridden method; not thrown by this implementation.
*/
@Override
protected void endPage(final PDPage page) throws IOException
{
// intentionally does nothing
}
/**
 * Appends the page separator string to the text cache without a position. The method name is spelled this
 * way because it overrides the superclass method of the same name.
 */
@Override
protected void writePageSeperator()
{
    textCache.append(getPageSeparator(), null);
}
/**
 * Appends the line separator string to the text cache without a position.
 */
@Override
protected void writeLineSeparator()
{
    textCache.append(getLineSeparator(), null);
}
/**
 * Appends the word separator string to the text cache without a position.
 */
@Override
protected void writeWordSeparator()
{
    textCache.append(getWordSeparator(), null);
}
/**
 * Appends the character string of the TextPosition to the text cache, associating the TextPosition with
 * every character of that string.
 *
 * @param text The TextPosition whose characters are cached.
 */
@Override
protected void writeCharacters(final TextPosition text)
{
    textCache.append(text.getCharacter(), text);
}
/**
 * Caches a string by passing each of its TextPositions to {@link #writeCharacters(TextPosition)}, in list
 * order. The <code>text</code> argument itself is ignored.
 *
 * @param text The text belonging to the positions (ignored).
 * @param textPositions The TextPositions belonging to the text.
 */
@Override
protected void writeString(final String text, final List<TextPosition> textPositions)
{
    final int count = textPositions.size();
    for (int i = 0; i < count; i++)
    {
        writeCharacters(textPositions.get(i));
    }
}
// True between writeParagraphStart and the next writeParagraphEnd; lets writeParagraphStart close a
// paragraph that is still open.
private boolean inParagraph;
/**
* Separates two paragraphs in the text cache by ending the current paragraph and starting a new one.
*/
@Override
protected void writeParagraphSeparator()
{
writeParagraphEnd();
writeParagraphStart();
}
/**
 * Appends the paragraph start string to the text cache without a position. A paragraph that is still open
 * is ended first.
 */
@Override
protected void writeParagraphStart()
{
    if (inParagraph)
    {
        // close the paragraph that was left open before starting the next one
        writeParagraphEnd();
        inParagraph = false;
    }
    textCache.append(getParagraphStart(), null);
    inParagraph = true;
}
/**
 * Appends the paragraph end string to the text cache without a position and marks the paragraph as closed.
 */
@Override
protected void writeParagraphEnd()
{
    textCache.append(getParagraphEnd(), null);
    inParagraph = false;
}
/**
 * Appends the page start string to the text cache without a position.
 */
@Override
protected void writePageStart()
{
    textCache.append(getPageStart(), null);
}
/**
 * Appends the page end string to the text cache without a position.
 */
@Override
protected void writePageEnd()
{
    textCache.append(getPageEnd(), null);
}
/**
* Not supported by this class, which collects text into its internal cache through initialize(PDDocument)
* instead of returning it.
*
* @param doc ignored
* @return never returns normally
* @throws IllegalArgumentException always
*/
@Override
public String getText(final PDDocument doc) throws IOException
{
throw new IllegalArgumentException("Not applicable for TextHighlight");
}
/**
* Not supported by this class, which collects text into its internal cache through initialize(PDDocument)
* instead of returning it.
*
* @param doc ignored
* @return never returns normally
* @throws IllegalArgumentException always
*/
@Override
public String getText(final COSDocument doc) throws IOException
{
throw new IllegalArgumentException("Not applicable for TextHighlight");
}
/**
* Not supported by this class, which collects text into its internal cache through initialize(PDDocument)
* instead of writing it to a Writer.
*
* @param doc ignored
* @param outputStream ignored
* @throws IllegalArgumentException always
*/
@Override
public void writeText(final COSDocument doc, final Writer outputStream) throws IOException
{
throw new IllegalArgumentException("Not applicable for TextHighlight");
}
/**
* Not supported by this class, which collects text into its internal cache through initialize(PDDocument)
* instead of writing it to a Writer.
*
* @param doc ignored
* @param outputStream ignored
* @throws IllegalArgumentException always
*/
@Override
public void writeText(final PDDocument doc, final Writer outputStream) throws IOException
{
throw new IllegalArgumentException("Not applicable for TextHighlight");
}
/**
 * Example usage: highlights every match of a regular expression in a PDF with the default colour and saves
 * the result to a new file.
 *
 * @param args exactly three values: the input PDF, the output PDF and the pattern
 * @throws Exception if reading, processing or saving the document fails
 */
public static void main(final String args[]) throws Exception
{
    if (args.length != 3)
    {
        usage();
        // Stop here: continuing would index into the missing arguments.
        return;
    }
    final File file = new File(args[0]);
    if (!file.isFile())
    {
        System.err.println("File " + args[0] + " does not exist.");
        return;
    }
    PDDocument pdDoc = null;
    final FileInputStream input = new FileInputStream(file);
    try
    {
        final PDFParser parser = new PDFParser(input);
        parser.parse();
        pdDoc = new PDDocument(parser.getDocument());
        final TextHighlight pdfHighlight = new TextHighlight("UTF-8");
        // depends on what you want to match, but this creates a long string without newlines
        pdfHighlight.setLineSeparator(" ");
        pdfHighlight.initialize(pdDoc);
        pdfHighlight.highlightDefault(args[2]);
        pdDoc.save(args[1]);
    }
    finally
    {
        // Cleanup stays best-effort, but now also runs when the work above throws. A failure to close is
        // reported and must not replace the outcome of the processing itself.
        try
        {
            if (pdDoc != null)
            {
                pdDoc.close();
            }
        }
        catch (final IOException e)
        {
            e.printStackTrace();
        }
        try
        {
            input.close();
        }
        catch (final IOException e)
        {
            e.printStackTrace();
        }
    }
}
/**
* Prints the expected command line arguments of the main method to standard error.
*/
private static void usage()
{
System.err.println("Usage: <input-pdf> <output-pdf> <pattern>");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment