Last active
June 10, 2016 21:33
-
-
Save JoelGeraci-Datalogics/d6458078556d75e19433 to your computer and use it in GitHub Desktop.
This sample extracts the text from a PDF file and uses a very simple heuristic to detect headings. When a heading is detected, it adds a bookmark. The heuristic was designed for this input file but can be easily modified and applied to any PDF file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Copyright Datalogics, Inc. 2015
 */
package pdfjt.cookbook.document; | |
import com.adobe.internal.io.ByteReader; | |
import com.adobe.internal.io.ByteWriter; | |
import com.adobe.internal.io.InputStreamByteReader; | |
import com.adobe.pdfjt.core.fontset.PDFFontSet; | |
import com.adobe.pdfjt.core.types.ASCoordinate; | |
import com.adobe.pdfjt.graphicsDOM.GraphicsState; | |
import com.adobe.pdfjt.pdf.content.processor.GState; | |
import com.adobe.pdfjt.pdf.document.PDFDocument; | |
import com.adobe.pdfjt.pdf.document.PDFOpenOptions; | |
import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions; | |
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmark; | |
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkRoot; | |
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkUtils; | |
import com.adobe.pdfjt.pdf.interactive.navigation.PDFDestination; | |
import com.adobe.pdfjt.pdf.page.PDFPage; | |
import com.adobe.pdfjt.services.fontresources.PDFFontSetUtil; | |
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor; | |
import com.adobe.pdfjt.services.textextraction.ParagraphIterator; | |
import com.adobe.pdfjt.services.textextraction.TextExtractionOptions; | |
import com.adobe.pdfjt.services.textextraction.Word; | |
import java.io.InputStream; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.ArrayList; | |
import java.util.List; | |
import pdfjt.util.SampleFileServices; | |
import pdfjt.util.SampleFontLoaderUtil; | |
/** | |
* This sample extracts the text from a PDF file and uses a very simple | |
* heuristic to detect headings. When a heading is detected, it adds a bookmark. | |
* The heuristic was designed for this input file but can be easily modified and | |
* applied to any PDF file. | |
*/ | |
public class AddBookmarksBasedOnFontSize { | |
private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/AcrobatDC_PDFCreationSettings.pdf"; | |
private static final String outputDir = "cookbook/Document/output/"; | |
/* | |
* These are the font sizes that we'll be looking for when creating | |
* bookmarks. We'll create three levels of nested bookmarks using these | |
* sizes and fonts as a guide. | |
*/ | |
private static final double h1FontSizeMin = 21; | |
private static final double h1FontSizeMax = 24; | |
private static final double h2FontSizeMin = 18; | |
private static final double h2FontSizeMax = 20; | |
private static final double h3FontSizeMin = 14; | |
private static final double h3FontSizeMax = 16; | |
private static final String h1FontName = "MinionPro-Bold"; | |
private static final String h2FontName = "MyriadPro-Bold"; | |
private static final String h3FontName = "MyriadPro-Bold"; | |
public static void main(String[] args) throws Exception { | |
/* | |
* Read in PDF input file | |
*/ | |
URLConnection connection = new URL(inputPDFURL).openConnection(); | |
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); | |
connection.connect(); | |
InputStream fis = connection.getInputStream(); | |
ByteReader byteReader = new InputStreamByteReader(fis); | |
PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance()); | |
/* | |
* We also need to set up the root of the bookmark tree to be able to | |
* add to it. | |
*/ | |
PDFBookmarkRoot pdfBookmarkRoot = PDFBookmarkRoot.newSkeletonInstance(pdfDocument); | |
/* | |
* The text extractor needs to understand what fonts the document is | |
* using to be able to map the glyphs and get at the text. | |
*/ | |
PDFFontSet sysFontSet = SampleFontLoaderUtil.loadSampleFontSet(); | |
PDFFontSet fontset = PDFFontSetUtil.buildWorkingFontSet(pdfDocument, sysFontSet, pdfDocument.getDocumentLocale(), null); | |
/* | |
* Now we can set up the ReadingOrderTextExtractor class. | |
*/ | |
TextExtractionOptions textExtractionOptions = TextExtractionOptions.newInstance(); | |
textExtractionOptions.setUseStructure(true); | |
textExtractionOptions.setIgnoreArtifacts(true); | |
ReadingOrderTextExtractor readingOrderTextExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument, fontset, | |
textExtractionOptions); | |
PDFBookmark currentH1 = null; | |
PDFBookmark currentH2 = null; | |
System.out.println("Creating Bookmarks..."); | |
/* | |
* Extract the text from the PDF file paragraph by paragraph using the | |
* document structure. Each paragraph will contain one or more List | |
* objects, "sentences", that contain a List of Word objects. | |
*/ | |
ParagraphIterator paragraphIterator = readingOrderTextExtractor.getParagraphIterator(); | |
while (paragraphIterator.hasNext()) { | |
List<List<Word>> paragraph = paragraphIterator.next(); | |
// Remove non-printing Word objects | |
paragraph = cleanParagraph(paragraph); | |
/* | |
* Determine if this paragraph is a heading and if so, what level. | |
*/ | |
String headingLevel = detectHeadingLevel(paragraph); | |
if (headingLevel != null) { | |
// Create a String from the Paragraph to use as label for the | |
// bookmark | |
String bookmarkLabel = paragraphListToString(paragraph, " "); | |
/* | |
* Determine the destination page number of the bookmark based | |
* on the page number of the first word in the first (and only) | |
* Sentence. | |
*/ | |
int pdfPageNumber = paragraph.get(0).get(0).getPageNumber() - 1; | |
/* | |
* Get the destination PDFPage for the page number we got above. | |
*/ | |
PDFPage pdfPage = pdfDocument.requirePages().getPage(pdfPageNumber); | |
/* | |
* Get the top/left coordinates of the first word in the first | |
* (and only) Sentence. This is used in the bookmark destination | |
* to set where the viewer scrolls the page to. We only need the | |
* y coordinate as you'll see below. | |
*/ | |
ASCoordinate asCoordinate = paragraph.get(0).get(0).topLeft(); | |
/* | |
* Create a new bookmark destination that will take the user to | |
* the correct page, zoom the PDF to fit to the width | |
* of the window it is displaying in and align the window to the | |
* top of the word we are bookmarking. | |
*/ | |
PDFDestination pdfDestination = PDFDestination.newDestFitH(pdfDocument, pdfPage, asCoordinate.y()); | |
/* | |
* Create a new bookmark with a label that matches the heading. | |
*/ | |
PDFBookmark pdfBookmark = PDFBookmark.newInstance(pdfDocument, bookmarkLabel); | |
/* | |
* Now set the destination of the bookmark to the one we created | |
* above, then append the bookmark to it's parent after any other | |
* bookmarks that might be there already. Headings get added to | |
* the root. | |
*/ | |
pdfBookmark.setDestination(pdfDestination); | |
if (headingLevel == "H1") { | |
System.out.println(bookmarkLabel); | |
PDFBookmarkUtils.appendLastKid(pdfBookmark, pdfBookmarkRoot); | |
/* | |
* We've found a new top level heading so change which | |
* parent H2 headings get added to. | |
*/ | |
currentH1 = pdfBookmark; | |
} | |
if (headingLevel == "H2") { | |
System.out.println(" " + bookmarkLabel); | |
PDFBookmarkUtils.appendLastKid(pdfBookmark, currentH1); | |
/* | |
* We've found a new H2 so change which parent H3 | |
* headings get added to. | |
*/ | |
currentH2 = pdfBookmark; | |
} | |
if (headingLevel == "H3") { | |
System.out.println(" " + bookmarkLabel); | |
PDFBookmarkUtils.appendLastKid(pdfBookmark, currentH2); | |
} | |
} | |
} | |
/* | |
* Add the PDFBookmarkRoot object and it's tree to the document catalog. | |
*/ | |
pdfDocument.requireCatalog().setBookmarkRoot(pdfBookmarkRoot); | |
// Save the file. | |
String outputFileName = "Bookmarked.pdf"; | |
ByteWriter outputFile = SampleFileServices.getRAFByteWriter(outputDir + outputFileName); | |
pdfDocument.save(outputFile, PDFSaveFullOptions.newInstance()); | |
System.out.println("Created: " + outputFileName); | |
} | |
/** | |
* Remove non-printing Word objects from a Paragraph | |
* | |
* @param paragraph | |
* the List<List<Word>> derived from the each Paragraph in the | |
* ReadingOrderTextExtractor's ParagraphIterator | |
* @return List<List<Word>> | |
*/ | |
private static List<List<Word>> cleanParagraph(List<List<Word>> paragraph) throws Exception { | |
List<List<Word>> cleanParagraph = new ArrayList<List<Word>>(); | |
for (List<Word> sentence : paragraph) { | |
List<Word> cleanSentence = cleanSentence(sentence); | |
cleanParagraph.add(cleanSentence); | |
} | |
return cleanParagraph; | |
} | |
/** | |
* Remove non-printing Word objects from a Sentence | |
* | |
* @param sentence | |
* the List<Word> derived from the each "sentence" in the | |
* ReadingOrderTextExtractor's ParagraphIterator | |
* @return List<Word> | |
*/ | |
private static List<Word> cleanSentence(List<Word> sentence) throws Exception { | |
List<Word> cleanSentence = new ArrayList<Word>(); | |
for (Word word : sentence) { | |
if (word.getBoundingQuads() != null) { | |
cleanSentence.add(word); | |
} | |
} | |
return cleanSentence; | |
} | |
/** | |
* Detect if a paragraph of text meets the criteria that matches a heading. | |
* Criteria are captured in the "if" statements. | |
* | |
* @param paragraph | |
* the List<List<Word>> derived from the | |
* ReadingOrderTextExtractor's ParagraphIterator | |
* @return A String (H1, H2, or H3) or null if criteria are not met. | |
*/ | |
private static String detectHeadingLevel(List<List<Word>> paragraph) throws Exception { | |
// Assume the entire paragraph is the same height | |
double paragraphHeight = paragraph.get(0).get(0).getUarray().get(0).getVerticalFontSize(); | |
// Assume that the entire paragraph is the same font. | |
String paragraphFont = paragraph.get(0).get(0).getUarray().get(0).getFont().toString(); | |
/* | |
* Test for each combination of font and font size. Heading paragraphs | |
* are all only one sentence long so we test that first. | |
*/ | |
if (paragraph.size() == 1) { | |
if (paragraphFont.contains(h1FontName) && paragraphHeight >= h1FontSizeMin && paragraphHeight <= h1FontSizeMax) { | |
return "H1"; | |
} else if (paragraphFont.contains(h2FontName) && paragraphHeight >= h2FontSizeMin && paragraphHeight <= h2FontSizeMax) { | |
return "H2"; | |
} else if (paragraphFont.contains(h3FontName) && paragraphHeight >= h3FontSizeMin && paragraphHeight <= h3FontSizeMax) { | |
return "H3"; | |
} else { | |
return null; | |
} | |
} else { | |
return null; | |
} | |
} | |
/** | |
* Convert a paragraph in the form of List<List<Word>> to a string. | |
* | |
* @param paragraph | |
* the List<List<Word>> derived from the | |
* ReadingOrderTextExtractor's ParagraphIterator | |
* @param separator | |
* the character to insert between the words | |
* @return The paragraph List<List<Word>> as a String | |
*/ | |
private static String paragraphListToString(List<List<Word>> paragraph, String separator) { | |
String paragraphString = ""; | |
for (Word word : paragraph.get(0)) { | |
paragraphString = paragraphString + separator + word.toString(); | |
} | |
paragraphString = paragraphString.trim(); | |
return paragraphString; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment