Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save JoelGeraci-Datalogics/d6458078556d75e19433 to your computer and use it in GitHub Desktop.
Save JoelGeraci-Datalogics/d6458078556d75e19433 to your computer and use it in GitHub Desktop.
This sample extracts the text from a PDF file and uses a very simple heuristic to detect headings. When a heading is detected, it adds a bookmark. The heuristic was designed for this input file but can be easily modified and applied to any PDF file.
/*
* Copyright Datalogics, Inc. 2015
*/
package pdfjt.cookbook.document;
import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.ByteWriter;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.core.fontset.PDFFontSet;
import com.adobe.pdfjt.core.types.ASCoordinate;
import com.adobe.pdfjt.graphicsDOM.GraphicsState;
import com.adobe.pdfjt.pdf.content.processor.GState;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmark;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkRoot;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkUtils;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFDestination;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.fontresources.PDFFontSetUtil;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.textextraction.ParagraphIterator;
import com.adobe.pdfjt.services.textextraction.TextExtractionOptions;
import com.adobe.pdfjt.services.textextraction.Word;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import pdfjt.util.SampleFileServices;
import pdfjt.util.SampleFontLoaderUtil;
/**
* This sample extracts the text from a PDF file and uses a very simple
* heuristic to detect headings. When a heading is detected, it adds a bookmark.
* The heuristic was designed for this input file but can be easily modified and
* applied to any PDF file.
*/
public class AddBookmarksBasedOnFontSize {
    private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/AcrobatDC_PDFCreationSettings.pdf";
    private static final String outputDir = "cookbook/Document/output/";

    /*
     * These are the font sizes that we'll be looking for when creating
     * bookmarks. We'll create three levels of nested bookmarks using these
     * sizes and fonts as a guide.
     */
    private static final double h1FontSizeMin = 21;
    private static final double h1FontSizeMax = 24;
    private static final double h2FontSizeMin = 18;
    private static final double h2FontSizeMax = 20;
    private static final double h3FontSizeMin = 14;
    private static final double h3FontSizeMax = 16;
    private static final String h1FontName = "MinionPro-Bold";
    private static final String h2FontName = "MyriadPro-Bold";
    private static final String h3FontName = "MyriadPro-Bold";

    /*
     * Labels returned by detectHeadingLevel() and tested in main(). Keeping
     * them in one place avoids typos between the producer and the consumer.
     */
    private static final String HEADING_1 = "H1";
    private static final String HEADING_2 = "H2";
    private static final String HEADING_3 = "H3";

    /**
     * Downloads the input PDF, walks its paragraphs, adds a bookmark for
     * every paragraph that looks like a heading and saves the result as
     * "Bookmarked.pdf" in the output directory.
     *
     * @param args
     *            unused
     * @throws Exception
     *             if the download, text extraction or save fails
     */
    public static void main(String[] args) throws Exception {
        /*
         * Read in PDF input file
         */
        URLConnection connection = new URL(inputPDFURL).openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        connection.connect();
        InputStream fis = connection.getInputStream();
        try {
            ByteReader byteReader = new InputStreamByteReader(fis);
            PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
            /*
             * We also need to set up the root of the bookmark tree to be able
             * to add to it.
             */
            PDFBookmarkRoot pdfBookmarkRoot = PDFBookmarkRoot.newSkeletonInstance(pdfDocument);
            /*
             * The text extractor needs to understand what fonts the document
             * is using to be able to map the glyphs and get at the text.
             */
            PDFFontSet sysFontSet = SampleFontLoaderUtil.loadSampleFontSet();
            PDFFontSet fontset = PDFFontSetUtil.buildWorkingFontSet(pdfDocument, sysFontSet, pdfDocument.getDocumentLocale(), null);
            /*
             * Now we can set up the ReadingOrderTextExtractor class.
             */
            TextExtractionOptions textExtractionOptions = TextExtractionOptions.newInstance();
            textExtractionOptions.setUseStructure(true);
            textExtractionOptions.setIgnoreArtifacts(true);
            ReadingOrderTextExtractor readingOrderTextExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument, fontset,
                    textExtractionOptions);
            PDFBookmark currentH1 = null;
            PDFBookmark currentH2 = null;
            System.out.println("Creating Bookmarks...");
            /*
             * Extract the text from the PDF file paragraph by paragraph using
             * the document structure. Each paragraph will contain one or more
             * List objects, "sentences", that contain a List of Word objects.
             */
            ParagraphIterator paragraphIterator = readingOrderTextExtractor.getParagraphIterator();
            while (paragraphIterator.hasNext()) {
                // Remove non-printing Word objects
                List<List<Word>> paragraph = cleanParagraph(paragraphIterator.next());
                /*
                 * Determine if this paragraph is a heading and if so, what
                 * level. A non-null result guarantees that the paragraph has
                 * exactly one sentence with at least one word in it.
                 */
                String headingLevel = detectHeadingLevel(paragraph);
                if (headingLevel == null) {
                    continue;
                }
                // Create a String from the Paragraph to use as label for the
                // bookmark
                String bookmarkLabel = paragraphListToString(paragraph, " ");
                Word firstWord = paragraph.get(0).get(0);
                /*
                 * Determine the destination page number of the bookmark based
                 * on the page number of the first word in the first (and
                 * only) Sentence. One is subtracted before the value is used
                 * as a page index.
                 */
                int pdfPageNumber = firstWord.getPageNumber() - 1;
                /*
                 * Get the destination PDFPage for the page number we got
                 * above.
                 */
                PDFPage pdfPage = pdfDocument.requirePages().getPage(pdfPageNumber);
                /*
                 * Get the top/left coordinates of the first word in the first
                 * (and only) Sentence. This is used in the bookmark
                 * destination to set where the viewer scrolls the page to. We
                 * only need the y coordinate as you'll see below.
                 */
                ASCoordinate asCoordinate = firstWord.topLeft();
                /*
                 * Create a new bookmark destination that will take the user
                 * to the correct page, zoom the PDF to fit to the width of
                 * the window it is displaying in and align the window to the
                 * top of the word we are bookmarking.
                 */
                PDFDestination pdfDestination = PDFDestination.newDestFitH(pdfDocument, pdfPage, asCoordinate.y());
                /*
                 * Create a new bookmark with a label that matches the
                 * heading.
                 */
                PDFBookmark pdfBookmark = PDFBookmark.newInstance(pdfDocument, bookmarkLabel);
                /*
                 * Now set the destination of the bookmark to the one we
                 * created above, then append the bookmark to its parent after
                 * any other bookmarks that might be there already. Headings
                 * get added to the root.
                 */
                pdfBookmark.setDestination(pdfDestination);
                // Strings are compared with equals(); "==" only tests
                // reference identity.
                if (HEADING_1.equals(headingLevel)) {
                    System.out.println(bookmarkLabel);
                    PDFBookmarkUtils.appendLastKid(pdfBookmark, pdfBookmarkRoot);
                    /*
                     * We've found a new top level heading so change which
                     * parent H2 headings get added to. The previous H2
                     * belongs to the previous H1, so forget it; otherwise an
                     * H3 that directly follows this H1 would be filed under
                     * the wrong chapter.
                     */
                    currentH1 = pdfBookmark;
                    currentH2 = null;
                } else if (HEADING_2.equals(headingLevel)) {
                    System.out.println(" " + bookmarkLabel);
                    // If no H1 has been seen yet there is no parent to attach
                    // to, so the bookmark goes to the root instead.
                    if (currentH1 != null) {
                        PDFBookmarkUtils.appendLastKid(pdfBookmark, currentH1);
                    } else {
                        PDFBookmarkUtils.appendLastKid(pdfBookmark, pdfBookmarkRoot);
                    }
                    /*
                     * We've found a new H2 so change which parent H3 headings
                     * get added to.
                     */
                    currentH2 = pdfBookmark;
                } else if (HEADING_3.equals(headingLevel)) {
                    System.out.println(" " + bookmarkLabel);
                    // Attach to the nearest ancestor that exists: the current
                    // H2, else the current H1, else the root.
                    if (currentH2 != null) {
                        PDFBookmarkUtils.appendLastKid(pdfBookmark, currentH2);
                    } else if (currentH1 != null) {
                        PDFBookmarkUtils.appendLastKid(pdfBookmark, currentH1);
                    } else {
                        PDFBookmarkUtils.appendLastKid(pdfBookmark, pdfBookmarkRoot);
                    }
                }
            }
            /*
             * Add the PDFBookmarkRoot object and its tree to the document
             * catalog.
             */
            pdfDocument.requireCatalog().setBookmarkRoot(pdfBookmarkRoot);
            // Save the file.
            String outputFileName = "Bookmarked.pdf";
            ByteWriter outputFile = SampleFileServices.getRAFByteWriter(outputDir + outputFileName);
            pdfDocument.save(outputFile, PDFSaveFullOptions.newInstance());
            System.out.println("Created: " + outputFileName);
        } finally {
            // Always release the network stream, even when processing fails.
            fis.close();
        }
    }

    /**
     * Remove non-printing Word objects from a Paragraph. The number of
     * sentences is preserved; a sentence may end up empty.
     *
     * @param paragraph
     *            the List<List<Word>> derived from the each Paragraph in the
     *            ReadingOrderTextExtractor's ParagraphIterator
     * @return List<List<Word>>
     */
    private static List<List<Word>> cleanParagraph(List<List<Word>> paragraph) throws Exception {
        List<List<Word>> cleanParagraph = new ArrayList<List<Word>>(paragraph.size());
        for (List<Word> sentence : paragraph) {
            cleanParagraph.add(cleanSentence(sentence));
        }
        return cleanParagraph;
    }

    /**
     * Remove non-printing Word objects from a Sentence. A Word is kept only
     * when its bounding quads are non-null.
     *
     * @param sentence
     *            the List<Word> derived from the each "sentence" in the
     *            ReadingOrderTextExtractor's ParagraphIterator
     * @return List<Word>, possibly empty
     */
    private static List<Word> cleanSentence(List<Word> sentence) throws Exception {
        List<Word> cleanSentence = new ArrayList<Word>();
        for (Word word : sentence) {
            if (word.getBoundingQuads() != null) {
                cleanSentence.add(word);
            }
        }
        return cleanSentence;
    }

    /**
     * Detect if a paragraph of text meets the criteria that matches a heading.
     * Criteria are captured in the "if" statements.
     *
     * @param paragraph
     *            the List<List<Word>> derived from the
     *            ReadingOrderTextExtractor's ParagraphIterator
     * @return A String (H1, H2, or H3) or null if criteria are not met. A
     *         non-null result implies the paragraph holds exactly one
     *         sentence containing at least one word.
     */
    private static String detectHeadingLevel(List<List<Word>> paragraph) throws Exception {
        /*
         * Heading paragraphs are all only one sentence long so we test that
         * first. Cleaning can leave a sentence with no words at all, in which
         * case there is nothing to inspect and it cannot be a heading.
         */
        if (paragraph == null || paragraph.size() != 1 || paragraph.get(0).isEmpty()) {
            return null;
        }
        Word firstWord = paragraph.get(0).get(0);
        // Assume the entire paragraph is the same height
        double paragraphHeight = firstWord.getUarray().get(0).getVerticalFontSize();
        // Assume that the entire paragraph is the same font.
        String paragraphFont = firstWord.getUarray().get(0).getFont().toString();
        /*
         * Test for each combination of font and font size.
         */
        if (paragraphFont.contains(h1FontName) && paragraphHeight >= h1FontSizeMin && paragraphHeight <= h1FontSizeMax) {
            return HEADING_1;
        }
        if (paragraphFont.contains(h2FontName) && paragraphHeight >= h2FontSizeMin && paragraphHeight <= h2FontSizeMax) {
            return HEADING_2;
        }
        if (paragraphFont.contains(h3FontName) && paragraphHeight >= h3FontSizeMin && paragraphHeight <= h3FontSizeMax) {
            return HEADING_3;
        }
        return null;
    }

    /**
     * Convert a paragraph in the form of List<List<Word>> to a string. Only
     * the first sentence is used, which is the whole paragraph for headings.
     *
     * @param paragraph
     *            the List<List<Word>> derived from the
     *            ReadingOrderTextExtractor's ParagraphIterator
     * @param separator
     *            the text to insert between the words
     * @return The words of the first sentence joined by the separator with
     *         leading and trailing whitespace removed, or an empty String if
     *         the paragraph has no sentences
     */
    private static String paragraphListToString(List<List<Word>> paragraph, String separator) {
        if (paragraph == null || paragraph.isEmpty()) {
            return "";
        }
        StringBuilder paragraphString = new StringBuilder();
        boolean first = true;
        for (Word word : paragraph.get(0)) {
            // The separator goes between words only, never in front of the
            // first one, so non-whitespace separators work as well.
            if (!first) {
                paragraphString.append(separator);
            }
            paragraphString.append(word.toString());
            first = false;
        }
        return paragraphString.toString().trim();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment