Skip to content

Instantly share code, notes, and snippets.

@jhpage
Created February 16, 2024 07:03
Show Gist options
  • Save jhpage/9521bf7c349417d9790c8ae960f34184 to your computer and use it in GitHub Desktop.
Save jhpage/9521bf7c349417d9790c8ae960f34184 to your computer and use it in GitHub Desktop.
Show text locations from a PDF file
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for the parse-app sample: compiles against Apache PDFBox and, during the
  package phase, assembles a single executable jar that bundles all dependencies.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.pdfcompose.app</groupId>
  <artifactId>parse-app</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>parse-app</name>
  <url>https://pdfcompose.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.29</version>
    </dependency>
  </dependencies>

  <build>
    <!-- Default versions and configuration; applied only to plugins that are actually used by the build. -->
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-assembly-plugin</artifactId>
          <configuration>
            <descriptorRefs>
              <descriptorRef>jar-with-dependencies</descriptorRef>
            </descriptorRefs>
            <archive>
              <manifest>
                <!--
                  NOTE(review): this must be the fully-qualified name of the class that declares
                  main(), otherwise the assembled jar will not start with "java -jar".
                  Confirm it matches the application's actual entry class.
                -->
                <mainClass>com.pdfcompose.app.sample</mainClass>
              </manifest>
            </archive>
          </configuration>
          <executions>
            <execution>
              <id>make-my-jar-with-dependencies</id>
              <phase>package</phase>
              <goals>
                <goal>single</goal>
              </goals>
            </execution>
          </executions>
        </plugin>
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
    <plugins>
      <!--
        The assembly plugin is not part of the default lifecycle, so it has to be declared here
        for the package-phase execution configured under pluginManagement to take effect.
      -->
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
      </plugin>
    </plugins>
  </build>
</project>
import java.io.File;
import java.io.ByteArrayOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.awt.Color;
import java.awt.Shape;
import javax.imageio.ImageIO;
import java.util.List;
import java.awt.BasicStroke;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.awt.geom.Rectangle2D;
import java.awt.Graphics2D;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.text.TextPosition;
/**
 * Renders every page of a PDF to a PNG image and draws rectangles over it that show where
 * the text stripper located each piece of text.
 *
 * <p>For each text position two rectangles are drawn: a red one using the heuristic height
 * reported by the text position, and a blue one using the font bounding box. The bounds of the
 * blue rectangle are also printed to standard output. Thread beads of the page are drawn in
 * green. The result is written next to the input file as {@code <name>-marked-<page>.png}.
 */
public class ShowTextLocation extends PDFTextStripper
{
    /** Factor passed to the page renderer and applied to the overlay graphics. */
    static final int SCALE = 4;

    /** Path of the input PDF; the output image names are derived from it. */
    private final String filename;

    // Per-page state, set up by stripPage() before writeText() triggers writeString().
    private AffineTransform flipAT;
    private AffineTransform rotateAT;
    private AffineTransform transAT;
    private Graphics2D g2d;

    /**
     * Creates a stripper bound to an already loaded document.
     *
     * @param document the document whose pages will be rendered and marked
     * @param filename path of the PDF file, used to build the output image names
     * @throws java.io.IOException if the superclass constructor fails
     */
    public ShowTextLocation(PDDocument document, String filename) throws java.io.IOException
    {
        this.document = document;
        this.filename = filename;
    }

    /**
     * Draws the red and blue marker rectangles for every text position handed over by the
     * stripper and prints the bounds of the blue rectangle.
     *
     * @param string the text of the current run (not used for drawing)
     * @param textPositions the positions of the individual glyphs of the run
     * @throws java.io.IOException if font metrics cannot be read
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws java.io.IOException
    {
        for (TextPosition text : textPositions)
        {
            // glyph space -> user space
            // note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
            AffineTransform at = text.getTextMatrix().createAffineTransform();

            // in red:
            // show rectangles with the "height" (not a real height, but used for text extraction
            // heuristics, it is 1/2 of the bounding box height and starts at y=0)
            Rectangle2D.Float rect = new Rectangle2D.Float(0, 0,
                    text.getWidthDirAdj() / text.getTextMatrix().getScalingFactorX(),
                    text.getHeightDir() / text.getTextMatrix().getScalingFactorY());
            Shape s = at.createTransformedShape(rect);
            s = flipAT.createTransformedShape(s);
            s = rotateAT.createTransformedShape(s);
            g2d.setColor(Color.red);
            g2d.draw(s);

            // in blue:
            // show rectangle with the real vertical bounds, based on the font bounding box y values
            // usually, the height is identical to what you see when marking text in Adobe Reader
            PDFont font = text.getFont();
            BoundingBox bbox = font.getBoundingBox();

            // advance width, bbox height (glyph space)
            float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
            rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());

            if (font instanceof PDType3Font)
            {
                // bbox and font matrix are unscaled
                at.concatenate(font.getFontMatrix().createAffineTransform());
            }
            else
            {
                // bbox and font matrix are already scaled to 1000
                at.scale(1 / 1000f, 1 / 1000f);
            }
            s = at.createTransformedShape(rect);
            s = flipAT.createTransformedShape(s);
            s = rotateAT.createTransformedShape(s);
            g2d.setColor(Color.blue);
            g2d.draw(s);

            Rectangle2D bounds = s.getBounds2D();
            System.out.printf("[%s] (%f, %f, %f %f)\n", text.toString(),
                    bounds.getX(), bounds.getY(), bounds.getWidth(), bounds.getHeight());
        }
    }

    /**
     * Renders one page, overlays the text and bead markers and writes the image to disk.
     *
     * @param page zero-based page index
     * @throws java.io.IOException if rendering, text extraction or writing the image fails
     */
    private void stripPage(int page) throws java.io.IOException
    {
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        BufferedImage image = pdfRenderer.renderImage(page, SCALE);

        PDPage pdPage = document.getPage(page);
        PDRectangle cropBox = pdPage.getCropBox();

        // flip y-axis
        flipAT = new AffineTransform();
        flipAT.translate(0, pdPage.getBBox().getHeight());
        flipAT.scale(1, -1);

        // page may be rotated
        rotateAT = new AffineTransform();
        int rotation = pdPage.getRotation();
        if (rotation != 0)
        {
            PDRectangle mediaBox = pdPage.getMediaBox();
            switch (rotation)
            {
                case 90:
                    rotateAT.translate(mediaBox.getHeight(), 0);
                    break;
                case 270:
                    rotateAT.translate(0, mediaBox.getWidth());
                    break;
                case 180:
                    rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
                    break;
                default:
                    break;
            }
            rotateAT.rotate(Math.toRadians(rotation));
        }

        // cropbox
        transAT = AffineTransform.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY());

        g2d = image.createGraphics();
        try
        {
            g2d.setStroke(new BasicStroke(0.1f));
            g2d.scale(SCALE, SCALE);

            // the stripper uses one-based page numbers
            setStartPage(page + 1);
            setEndPage(page + 1);

            // the extracted text itself is discarded; writeText() is only run so that
            // writeString() gets called and draws the markers
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            writeText(document, dummy);

            drawBeads(pdPage);
        }
        finally
        {
            // release the graphics context even if extraction or drawing failed
            g2d.dispose();
        }

        File target = new File(markedImageName(filename, page + 1));
        if (!ImageIO.write(image, "png", target))
        {
            // ImageIO.write reports a missing writer through its return value, not an exception
            throw new java.io.IOException("No image writer available to write " + target);
        }
    }

    /** Draws the outline of every non-null thread bead of the page in green. */
    private void drawBeads(PDPage pdPage) throws java.io.IOException
    {
        g2d.setStroke(new BasicStroke(0.4f));
        List<PDThreadBead> pageArticles = pdPage.getThreadBeads();
        for (PDThreadBead bead : pageArticles)
        {
            if (bead == null)
            {
                continue;
            }
            PDRectangle r = bead.getRectangle();
            Shape s = r.toGeneralPath().createTransformedShape(transAT);
            s = flipAT.createTransformedShape(s);
            s = rotateAT.createTransformedShape(s);
            g2d.setColor(Color.green);
            g2d.draw(s);
        }
    }

    /**
     * Builds the output image name by replacing the extension of {@code filename} with
     * {@code -marked-<pageNumber>.png}. A name without an extension is used unchanged as the
     * base, and a dot that belongs to a directory component is not treated as an extension.
     */
    private static String markedImageName(String filename, int pageNumber)
    {
        int dot = filename.lastIndexOf('.');
        int separator = Math.max(filename.lastIndexOf('/'), filename.lastIndexOf(File.separatorChar));
        String base = dot > separator ? filename.substring(0, dot) : filename;
        return base + "-marked-" + pageNumber + ".png";
    }

    /**
     * Command line entry point: marks every page of the given PDF file.
     *
     * @param args exactly one argument, the path of the PDF file
     * @throws Exception if the document cannot be loaded or a page cannot be processed
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length != 1)
        {
            System.err.println("usage: java ShowTextLocation pdf-file");
            System.exit(1);
        }

        // try-with-resources closes the document on every exit path
        try (PDDocument document = PDDocument.load(new File(args[0])))
        {
            ShowTextLocation stripper = new ShowTextLocation(document, args[0]);
            stripper.setSortByPosition(true);
            for (int page = 0; page < document.getNumberOfPages(); ++page)
            {
                stripper.stripPage(page);
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment