Skip to content

Instantly share code, notes, and snippets.

@fkuehnel
Created March 11, 2012 22:30
Show Gist options
  • Save fkuehnel/2018466 to your computer and use it in GitHub Desktop.
Save fkuehnel/2018466 to your computer and use it in GitHub Desktop.
PDFBox extension to extract previously tagged math areas from a PDF document
package org.apache.pdfbox;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.util.PDFTextStripperByArea;
import java.awt.geom.Rectangle2D;
import java.util.List;
/**
 * Command-line tool that prints, for every annotation of subtype "MathML"
 * in a PDF, the annotation's contents together with the page text found
 * inside the annotation's rectangle.
 *
 * <p>Arguments: {@code <input-pdf> [y-offset] [add-height]}. The two optional
 * floats shift the extraction rectangle vertically and grow its height.
 */
public class ExtractMath
{
    // ExtractMath is a namespace
    private ExtractMath() {}

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the PDF path; optional {@code args[1]} is a
     *             vertical offset and optional {@code args[2]} an extra height,
     *             both applied to every extraction rectangle
     * @throws Exception if loading, decrypting or text extraction fails
     */
    public static void main(String[] args) throws Exception
    {
        int argc = args.length;
        if (argc < 1) {
            usage();
            return;
        }

        // Parse the numeric options before touching the PDF so that a typo
        // is reported cleanly instead of after the document has been opened.
        float yOffset = 0.0f;
        float addHeight = 0.0f;
        try {
            if (argc > 1) {
                yOffset = Float.parseFloat(args[1]);
            }
            if (argc > 2) {
                addHeight = Float.parseFloat(args[2]);
            }
        } catch (NumberFormatException e) {
            System.err.println("Error: y-offset and add-height must be numbers.");
            usage();
            System.exit(1);
        }

        PDDocument document = null;
        try {
            document = PDDocument.load(args[0]);
            if (document.isEncrypted()) {
                try {
                    // Try the empty password
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    System.exit(1);
                }
            }

            List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
            int pageIdx = 0;
            for (PDPage page : allPages) {
                // A fresh stripper per page, so region names can restart at formula0.
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition(true);

                PDRectangle cropBox = page.findCropBox();
                System.out.println("Crop box for page " + pageIdx + ": " + cropBox);
                PDRectangle mediaBox = page.findMediaBox();
                System.out.println("Media box for this page: " + pageIdx + ": " + mediaBox);

                List<PDAnnotation> pageAnnotations = page.getAnnotations();

                // First pass: register one region per MathML annotation.
                int formulaIdx = 0;
                for (PDAnnotation annot : pageAnnotations) {
                    if ("MathML".equals(annot.getSubtype())) {
                        PDRectangle pdRect = annot.getRectangle();
                        float h = pdRect.getHeight() + addHeight;
                        float w = pdRect.getWidth();
                        // Measure y from the top of the media box instead of from
                        // the bottom; presumably this matches the coordinate
                        // system the stripper expects for regions (confirm
                        // against the PDFBox version in use).
                        float y = mediaBox.getHeight() - pdRect.getLowerLeftY() - h + yOffset;
                        Rectangle2D rect = new Rectangle2D.Float();
                        rect.setRect(pdRect.getLowerLeftX() - 0.01f, y + 0.01f, w, h);
                        System.out.println("define region: " + rect);
                        stripper.addRegion("formula" + formulaIdx, rect);
                        formulaIdx += 1;
                    }
                }

                stripper.extractRegions(page);

                // Second pass: same filter and order as above, so formulaIdx
                // lines up with the region names registered in the first pass.
                formulaIdx = 0;
                for (PDAnnotation annot : pageAnnotations) {
                    if ("MathML".equals(annot.getSubtype())) {
                        System.out.println("MathML formula:");
                        System.out.println(annot.getContents());
                        PDRectangle pdRect = annot.getRectangle();
                        System.out.println("Text in the area: " + pdRect);
                        System.out.println(stripper.getTextForRegion("formula" + formulaIdx));
                        formulaIdx += 1;
                    }
                }

                // Advance the counter so the box messages report the right page.
                pageIdx += 1;
            }
        }
        finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * This will print the usage for this program.
     */
    private static void usage()
    {
        System.err.println( "Usage: java " + ExtractMath.class.getName()
                + " <input-pdf> [y-offset] [add-height]" );
    }
}
@fkuehnel
Copy link
Author

This file needs to be placed inside the src/main/java/org/apache/pdfbox folder. Additionally, PDFBox.java needs to be modified to include the new ExtractMath command:

        try {
            if (command.equals("ConvertColorspace")) {
                ConvertColorspace.main(arguments);
            } else if (command.equals("Decrypt")) {
                Decrypt.main(arguments);
            } else if (command.equals("Encrypt")) {
                Encrypt.main(arguments);
            } else if (command.equals("ExtractText")) {
                ExtractText.main(arguments);
            } else if (command.equals("ExtractMath")) {
                ExtractMath.main(arguments);

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment