Created
February 16, 2024 07:03
-
-
Save jhpage/9521bf7c349417d9790c8ae960f34184 to your computer and use it in GitHub Desktop.
Show text locations from a PDF file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for parse-app.
  Declares Apache PDFBox as the only dependency and, during the package phase,
  assembles a jar-with-dependencies whose manifest names the main class.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.pdfcompose.app</groupId>
  <artifactId>parse-app</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>parse-app</name>
  <url>https://pdfcompose.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.29</version>
    </dependency>
  </dependencies>

  <build>
    <!--
      pluginManagement only supplies versions and default configuration. A plugin
      listed here takes part in the build only if it is bound by the default
      lifecycle or is referenced in the <plugins> section further down.
    -->
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-assembly-plugin</artifactId>
          <!-- Pinned like every other plugin here so the build does not depend on a Maven default. -->
          <version>3.3.0</version>
          <configuration>
            <descriptorRefs>
              <descriptorRef>jar-with-dependencies</descriptorRef>
            </descriptorRefs>
            <archive>
              <manifest>
                <!-- NOTE(review): must be the fully qualified name of the class that declares main(); confirm it matches the sources. -->
                <mainClass>com.pdfcompose.app.sample</mainClass>
              </manifest>
            </archive>
          </configuration>
          <executions>
            <execution>
              <id>make-my-jar-with-dependencies</id>
              <phase>package</phase>
              <goals>
                <goal>single</goal>
              </goals>
            </execution>
          </executions>
        </plugin>
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>

    <plugins>
      <!--
        The assembly plugin is not part of the default lifecycle, so it has to be
        referenced here. This picks up the managed version, configuration and
        package-phase execution declared in pluginManagement above.
      -->
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
      </plugin>
    </plugins>
  </build>
</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File; | |
import java.io.ByteArrayOutputStream; | |
import java.io.OutputStreamWriter; | |
import java.io.Writer; | |
import java.awt.Color; | |
import java.awt.Shape; | |
import javax.imageio.ImageIO; | |
import java.util.List; | |
import java.awt.BasicStroke; | |
import java.awt.geom.AffineTransform; | |
import java.awt.image.BufferedImage; | |
import java.awt.geom.Rectangle2D; | |
import java.awt.Graphics2D; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.pdmodel.PDPage; | |
import org.apache.pdfbox.pdmodel.common.PDRectangle; | |
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; | |
import org.apache.pdfbox.text.PDFTextStripper; | |
import org.apache.pdfbox.rendering.PDFRenderer; | |
import org.apache.pdfbox.pdmodel.font.PDFont; | |
import org.apache.pdfbox.pdmodel.font.PDType3Font; | |
import org.apache.fontbox.util.BoundingBox; | |
import org.apache.pdfbox.text.TextPosition; | |
public class ShowTextLocation extends PDFTextStripper | |
{ | |
private AffineTransform flipAT; | |
private AffineTransform rotateAT; | |
private AffineTransform transAT; | |
private final String filename; | |
static final int SCALE = 4; | |
private Graphics2D g2d; | |
public ShowTextLocation(PDDocument document, String filename) throws java.io.IOException | |
{ | |
this.document = document; | |
this.filename = filename; | |
} | |
@Override | |
protected void writeString(String string, List<TextPosition> textPositions) throws java.io.IOException | |
{ | |
for (TextPosition text : textPositions) { | |
// glyph space -> user space | |
// note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix | |
AffineTransform at = text.getTextMatrix().createAffineTransform(); | |
// in red: | |
// show rectangles with the "height" (not a real height, but used for text extraction | |
// heuristics, it is 1/2 of the bounding box height and starts at y=0) | |
Rectangle2D.Float rect = new Rectangle2D.Float(0, 0, | |
text.getWidthDirAdj() / text.getTextMatrix().getScalingFactorX(), | |
text.getHeightDir() / text.getTextMatrix().getScalingFactorY()); | |
Shape s = at.createTransformedShape(rect); | |
s = flipAT.createTransformedShape(s); | |
s = rotateAT.createTransformedShape(s); | |
g2d.setColor(Color.red); | |
g2d.draw(s); | |
// in blue: | |
// show rectangle with the real vertical bounds, based on the font bounding box y values | |
// usually, the height is identical to what you see when marking text in Adobe Reader | |
PDFont font = text.getFont(); | |
BoundingBox bbox = font.getBoundingBox(); | |
// advance width, bbox height (glyph space) | |
float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars | |
rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight()); | |
if (font instanceof PDType3Font) { | |
// bbox and font matrix are unscaled | |
at.concatenate(font.getFontMatrix().createAffineTransform()); | |
} else { | |
// bbox and font matrix are already scaled to 1000 | |
at.scale(1/1000f, 1/1000f); | |
} | |
s = at.createTransformedShape(rect); | |
s = flipAT.createTransformedShape(s); | |
s = rotateAT.createTransformedShape(s); | |
g2d.setColor(Color.blue); | |
g2d.draw(s); | |
Rectangle2D bounds = s.getBounds2D(); | |
System.out.printf("[%s] (%f, %f, %f %f)\n", text.toString(), | |
bounds.getX(), bounds.getY(), bounds.getWidth(), bounds.getHeight()); | |
} | |
} | |
private void stripPage(int page) throws java.io.IOException | |
{ | |
PDFRenderer pdfRenderer = new PDFRenderer(document); | |
BufferedImage image = pdfRenderer.renderImage(page, SCALE); | |
PDPage pdPage = document.getPage(page); | |
PDRectangle cropBox = pdPage.getCropBox(); | |
// flip y-axis | |
flipAT = new AffineTransform(); | |
flipAT.translate(0, pdPage.getBBox().getHeight()); | |
flipAT.scale(1, -1); | |
// page may be rotated | |
rotateAT = new AffineTransform(); | |
int rotation = pdPage.getRotation(); | |
if (rotation != 0) | |
{ | |
PDRectangle mediaBox = pdPage.getMediaBox(); | |
switch (rotation) | |
{ | |
case 90: | |
rotateAT.translate(mediaBox.getHeight(), 0); | |
break; | |
case 270: | |
rotateAT.translate(0, mediaBox.getWidth()); | |
break; | |
case 180: | |
rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight()); | |
break; | |
default: | |
break; | |
} | |
rotateAT.rotate(Math.toRadians(rotation)); | |
} | |
// cropbox | |
transAT = AffineTransform.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY()); | |
g2d = image.createGraphics(); | |
g2d.setStroke(new BasicStroke(0.1f)); | |
g2d.scale(SCALE, SCALE); | |
setStartPage(page + 1); | |
setEndPage(page + 1); | |
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()); | |
writeText(document, dummy); | |
// beads in green | |
g2d.setStroke(new BasicStroke(0.4f)); | |
List<PDThreadBead> pageArticles = pdPage.getThreadBeads(); | |
for (PDThreadBead bead : pageArticles) | |
{ | |
if (bead == null) | |
{ | |
continue; | |
} | |
PDRectangle r = bead.getRectangle(); | |
Shape s = r.toGeneralPath().createTransformedShape(transAT); | |
s = flipAT.createTransformedShape(s); | |
s = rotateAT.createTransformedShape(s); | |
g2d.setColor(Color.green); | |
g2d.draw(s); | |
} | |
g2d.dispose(); | |
String imageFilename = filename; | |
int pt = imageFilename.lastIndexOf('.'); | |
imageFilename = imageFilename.substring(0, pt) + "-marked-" + (page + 1) + ".png"; | |
ImageIO.write(image, "png", new File(imageFilename)); | |
} | |
public static void main(String[] args) throws Exception | |
{ | |
if (args.length != 1) { | |
System.err.println("usage: java ShowTextLocation pdf-file"); | |
System.exit(1); | |
} | |
PDDocument document = null; | |
try { | |
document = PDDocument.load(new File(args[0])); | |
ShowTextLocation stripper = new ShowTextLocation(document, args[0]); | |
stripper.setSortByPosition(true); | |
for (int page = 0; page < document.getNumberOfPages(); ++page) { | |
stripper.stripPage(page); | |
} | |
} finally { | |
if (document != null) document.close(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment