Skip to content

Instantly share code, notes, and snippets.

Created October 14, 2017 22:09
Show Gist options
  • Save beldaz/8ed6e7473bd228fcee8d4a3e4525be11 to your computer and use it in GitHub Desktop.
Save beldaz/8ed6e7473bd228fcee8d4a3e4525be11 to your computer and use it in GitHub Desktop.
Class to extract tabular PDF text using PDFBox
/*
 * Copyright 2017 Beldaz (https://github.com/beldaz)
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.awt.Rectangle;
import java.awt.Shape;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
* Class to extract tabular data from a PDF.
* Works by making a first pass of the page to group all nearby text items
* together, and then inferring a 2D grid from these regions. Each table cell
* is then extracted using a PDFTextStripperByArea object.
* Works best when
* headers are included in the detected region, to ensure representative text
* in every column.
* Based upon DrawPrintTextLocations PDFBox example
* (
* @author Beldaz
public class PDFTableStripper extends PDFTextStripper
* This will print the documents data, for each table cell.
* @param args The command line arguments.
* @throws IOException If there is an error parsing the document.
public static void main(String[] args) throws IOException
try (PDDocument document = PDDocument.load(new File(args[0])))
final double res = 72; // PDF units are at 72 DPI
PDFTableStripper stripper = new PDFTableStripper();
// Choose a region in which to extract a table (here a 6"wide, 9" high rectangle offset 1" from top left of page)
stripper.setRegion(new Rectangle((int) Math.round(1.0*res), (int) Math.round(1*res), (int) Math.round(6*res), (int) Math.round(9.0*res)));
// Repeat for each page of PDF
for (int page = 0; page < document.getNumberOfPages(); ++page)
System.out.println("Page " + page);
PDPage pdPage = document.getPage(page);
for(int c=0; c<stripper.getColumns(); ++c) {
System.out.println("Column " + c);
for(int r=0; r<stripper.getRows(); ++r) {
System.out.println("Row " + r);
System.out.println(stripper.getText(r, c));
* Used in methods derived from DrawPrintTextLocations
private AffineTransform flipAT;
private AffineTransform rotateAT;
* Regions updated by calls to writeString
private Set<Rectangle2D> boxes;
// Border to allow when finding intersections
private double dx = 1.0; // This value works for me, feel free to tweak (or add setter)
private double dy = 0.000; // Rows of text tend to overlap, so need to extend
* Region in which to find table (otherwise whole page)
private Rectangle2D regionArea;
* Number of rows in inferred table
private int nRows=0;
* Number of columns in inferred table
private int nCols=0;
* This is the object that does the text extraction
private PDFTextStripperByArea regionStripper;
* 1D intervals - used for calculateTableRegions()
* @author Beldaz
public static class Interval {
double start;
double end;
public Interval(double start, double end) {
this.start=start; this.end = end;
public void add(Interval col) {
start = col.start;
end = col.end;
public static void addTo(Interval x, LinkedList<Interval> columns) {
int p = 0;
Iterator<Interval> it = columns.iterator();
// Find where x should go
while(it.hasNext()) {
Interval col =;
if(x.end>=col.start) {
if(x.start<=col.end) { // overlaps
while(it.hasNext()) {
Interval col =;
columns.add(p, x);
* Instantiate a new PDFTableStripper object.
* @param document
* @throws IOException If there is an error loading the properties.
public PDFTableStripper() throws IOException
regionStripper = new PDFTextStripperByArea();
regionStripper.setSortByPosition( true );
* Define the region to group text by.
* @param rect The rectangle area to retrieve the text from.
public void setRegion(Rectangle2D rect )
regionArea = rect;
public int getRows()
return nRows;
public int getColumns()
return nCols;
* Get the text for the region, this should be called after extractTable().
* @return The text that was identified in that region.
public String getText(int row, int col)
return regionStripper.getTextForRegion("el"+col+"x"+row);
public void extractTable(PDPage pdPage) throws IOException
boxes = new HashSet<Rectangle2D>();
// flip y-axis
flipAT = new AffineTransform();
flipAT.translate(0, pdPage.getBBox().getHeight());
flipAT.scale(1, -1);
// page may be rotated
rotateAT = new AffineTransform();
int rotation = pdPage.getRotation();
if (rotation != 0)
PDRectangle mediaBox = pdPage.getMediaBox();
switch (rotation)
case 90:
rotateAT.translate(mediaBox.getHeight(), 0);
case 270:
rotateAT.translate(0, mediaBox.getWidth());
case 180:
rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
// Trigger processing of the document so that writeString is called.
try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
super.output = dummy;
Rectangle2D[][] regions = calculateTableRegions();
// System.err.println("Drawing " + nCols + "x" + nRows + "="+ nRows*nCols + " regions");
for(int i=0; i<nCols; ++i) {
for(int j=0; j<nRows; ++j) {
final Rectangle2D region = regions[i][j];
regionStripper.addRegion("el"+i+"x"+j, region);
* Infer a rectangular grid of regions from the boxes field.
* @return 2D array of table regions (as Rectangle2D objects). Note that
* some of these regions may have no content.
private Rectangle2D[][] calculateTableRegions() {
// Build up a list of all table regions, based upon the populated
// regions of boxes field. Treats the horizontal and vertical extents
// of each box as distinct
LinkedList<Interval> columns = new LinkedList<Interval>();
LinkedList<Interval> rows = new LinkedList<Interval>();
for(Rectangle2D box: boxes) {
Interval x = new Interval(box.getMinX(), box.getMaxX());
Interval y = new Interval(box.getMinY(), box.getMaxY());
Interval.addTo(x, columns);
Interval.addTo(y, rows);
nRows = rows.size();
nCols = columns.size();
Rectangle2D[][] regions = new Rectangle2D[nCols][nRows];
int i=0;
// Label regions from top left, rather than the transformed orientation
for(Interval column: columns) {
int j=0;
for(Interval row: rows) {
regions[nCols-i-1][nRows-j-1] = new Rectangle2D.Double(column.start, row.start, column.end - column.start, row.end - row.start);
return regions;
* Register each character's bounding box, updating boxes field to maintain
* a list of all distinct groups of characters.
* Overrides the default functionality of PDFTextStripper.
* Most of this is taken from, with extra steps
* at end of main loop
protected void writeString(String string, List<TextPosition> textPositions) throws IOException
for (TextPosition text : textPositions)
// glyph space -> user space
// note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
AffineTransform at = text.getTextMatrix().createAffineTransform();
PDFont font = text.getFont();
BoundingBox bbox = font.getBoundingBox();
// advance width, bbox height (glyph space)
float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
Rectangle2D.Float rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());
if (font instanceof PDType3Font)
// bbox and font matrix are unscaled
// bbox and font matrix are already scaled to 1000
at.scale(1/1000f, 1/1000f);
Shape s = at.createTransformedShape(rect);
s = flipAT.createTransformedShape(s);
s = rotateAT.createTransformedShape(s);
// Merge character's bounding box with boxes field
Rectangle2D bounds = s.getBounds2D();
// Pad sides to detect almost touching boxes
Rectangle2D hitbox = bounds.getBounds2D();
hitbox.add(bounds.getMinX() - dx , bounds.getMinY() - dy);
hitbox.add(bounds.getMaxX() + dx , bounds.getMaxY() + dy);
// Find all overlapping boxes
List<Rectangle2D> intersectList = new ArrayList<Rectangle2D>();
for(Rectangle2D box: boxes) {
if(box.intersects(hitbox)) {
// Combine all touching boxes and update
// (NOTE: Potentially this could leave some overlapping boxes un-merged,
// but it's sufficient for now and get's fixed up in calculateTableRegions)
for(Rectangle2D box: intersectList) {
* This method does nothing in this derived class, because beads and regions are incompatible. Beads are
* ignored when stripping by area.
* @param aShouldSeparateByBeads The new grouping of beads.
public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
* Adapted from PDFTextStripperByArea
* {@inheritDoc}
protected void processTextPosition( TextPosition text )
if(regionArea!=null && !regionArea.contains( text.getX(), text.getY() ) ) {
// skip character
} else {
super.processTextPosition( text );
Copy link

Thank you so much for sharing this! Worked for my use case perfectly when PDFTextStripper wasn't getting the job done. You are the best!

Copy link

Great God!
Please take my knees!

Copy link

uhm Im confused on how to use this D;

Copy link

Great program. Worked flawlessly.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment