JoelGeraci-Datalogics/AddBookmarksBasedOnFontSize.java

## AddBookmarksBasedOnFontSize.java
/*
 * Copyright Datalogics, Inc. 2015
 */

package pdfjt.cookbook.document;

import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.ByteWriter;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.core.fontset.PDFFontSet;
import com.adobe.pdfjt.core.types.ASCoordinate;
import com.adobe.pdfjt.graphicsDOM.GraphicsState;
import com.adobe.pdfjt.pdf.content.processor.GState;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmark;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkRoot;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkUtils;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFDestination;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.fontresources.PDFFontSetUtil;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.textextraction.ParagraphIterator;
import com.adobe.pdfjt.services.textextraction.TextExtractionOptions;
import com.adobe.pdfjt.services.textextraction.Word;

import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import pdfjt.util.SampleFileServices;
import pdfjt.util.SampleFontLoaderUtil;

/**
 * This sample extracts the text from a PDF file and uses a very simple
 * heuristic to detect headings. When a heading is detected, it adds a bookmark.
 * The heuristic was designed for this input file but can be easily modified and
 * applied to any PDF file.
 */
public class AddBookmarksBasedOnFontSize {

    private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/AcrobatDC_PDFCreationSettings.pdf";
    private static final String outputDir = "cookbook/Document/output/";
    /*
     * These are the font sizes that we'll be looking for when creating
     * bookmarks. We'll create three levels of nested bookmarks using these
     * sizes and fonts as a guide.
     */
    private static final double h1FontSizeMin = 21;
    private static final double h1FontSizeMax = 24;
    private static final double h2FontSizeMin = 18;
    private static final double h2FontSizeMax = 20;
    private static final double h3FontSizeMin = 14;
    private static final double h3FontSizeMax = 16;

    private static final String h1FontName = "MinionPro-Bold";
    private static final String h2FontName = "MyriadPro-Bold";
    private static final String h3FontName = "MyriadPro-Bold";

    public static void main(String[] args) throws Exception {
        /*
         * Read in PDF input file
         */
        URLConnection connection = new URL(inputPDFURL).openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        connection.connect();
        InputStream fis = connection.getInputStream();
        ByteReader byteReader = new InputStreamByteReader(fis);
        PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
        /*
         * We also need to set up the root of the bookmark tree to be able to
         * add to it.
         */
        PDFBookmarkRoot pdfBookmarkRoot = PDFBookmarkRoot.newSkeletonInstance(pdfDocument);
        /*
         * The text extractor needs to understand what fonts the document is
         * using to be able to map the glyphs and get at the text.
         */
        PDFFontSet sysFontSet = SampleFontLoaderUtil.loadSampleFontSet();
        PDFFontSet fontset = PDFFontSetUtil.buildWorkingFontSet(pdfDocument, sysFontSet, pdfDocument.getDocumentLocale(), null);
        /*
         * Now we can set up the ReadingOrderTextExtractor class.
         */
        TextExtractionOptions textExtractionOptions = TextExtractionOptions.newInstance();
        textExtractionOptions.setUseStructure(true);
        textExtractionOptions.setIgnoreArtifacts(true);
        ReadingOrderTextExtractor readingOrderTextExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument, fontset,
                textExtractionOptions);

        PDFBookmark currentH1 = null;
        PDFBookmark currentH2 = null;

        System.out.println("Creating Bookmarks...");
        /*
         * Extract the text from the PDF file paragraph by paragraph using the
         * document structure. Each paragraph will contain one or more List
         * objects, "sentences", that contain a List of Word objects.
         */
        ParagraphIterator paragraphIterator = readingOrderTextExtractor.getParagraphIterator();
        while (paragraphIterator.hasNext()) {
            List<List<Word>> paragraph = paragraphIterator.next();
            // Remove non-printing Word objects
            paragraph = cleanParagraph(paragraph);
            /*
             * Determine if this paragraph is a heading and if so, what level.
             */
            String headingLevel = detectHeadingLevel(paragraph);
            if (headingLevel != null) {
                // Create a String from the Paragraph to use as label for the
                // bookmark
                String bookmarkLabel = paragraphListToString(paragraph, " ");
                /*
                 * Determine the destination page number of the bookmark based
                 * on the page number of the first word in the first (and only)
                 * Sentence.
                 */
                int pdfPageNumber = paragraph.get(0).get(0).getPageNumber() - 1;
                /*
                 * Get the destination PDFPage for the page number we got above.
                 */
                PDFPage pdfPage = pdfDocument.requirePages().getPage(pdfPageNumber);
                /*
                 * Get the top/left coordinates of the first word in the first
                 * (and only) Sentence. This is used in the bookmark destination
                 * to set where the viewer scrolls the page to. We only need the
                 * y coordinate as you'll see below.
                 */
                ASCoordinate asCoordinate = paragraph.get(0).get(0).topLeft();
                /*
                 * Create a new bookmark destination that will take the user to
                 * the correct page, zoom the PDF to fit to the width
                 * of the window it is displaying in and align the window to the
                 * top of the word we are bookmarking.
                 */
                PDFDestination pdfDestination = PDFDestination.newDestFitH(pdfDocument, pdfPage, asCoordinate.y());
                /*
                 * Create a new bookmark with a label that matches the heading.
                 */
                PDFBookmark pdfBookmark = PDFBookmark.newInstance(pdfDocument, bookmarkLabel);
                /*
                 * Now set the destination of the bookmark to the one we created
                 * above, then append the bookmark to it's parent after any other
                 * bookmarks that might be there already. Headings get added to
                 * the root.
                 */
                pdfBookmark.setDestination(pdfDestination);
                if (headingLevel == "H1") {
                    System.out.println(bookmarkLabel);
                    PDFBookmarkUtils.appendLastKid(pdfBookmark, pdfBookmarkRoot);
                    /*
                     * We've found a new top level heading so change which
                     * parent H2 headings get added to.
                     */
                    currentH1 = pdfBookmark;
                }
                if (headingLevel == "H2") {
                    System.out.println("     " + bookmarkLabel);
                    PDFBookmarkUtils.appendLastKid(pdfBookmark, currentH1);
                    /*
                     * We've found a new H2 so change which parent H3
                     * headings get added to.
                     */
                    currentH2 = pdfBookmark;
                }
                if (headingLevel == "H3") {
                    System.out.println("          " + bookmarkLabel);
                    PDFBookmarkUtils.appendLastKid(pdfBookmark, currentH2);
                }
            }
        }
        /*
         * Add the PDFBookmarkRoot object and it's tree to the document catalog.
         */
        pdfDocument.requireCatalog().setBookmarkRoot(pdfBookmarkRoot);
        // Save the file.
        String outputFileName = "Bookmarked.pdf";
        ByteWriter outputFile = SampleFileServices.getRAFByteWriter(outputDir + outputFileName);
        pdfDocument.save(outputFile, PDFSaveFullOptions.newInstance());
        System.out.println("Created: " + outputFileName);
    }

    /**
     * Remove non-printing Word objects from a Paragraph
     *
     * @param paragraph
     *            the List<List<Word>> derived from the each Paragraph in the
     *            ReadingOrderTextExtractor's ParagraphIterator
     * @return List<List<Word>>
     */
    private static List<List<Word>> cleanParagraph(List<List<Word>> paragraph) throws Exception {
        List<List<Word>> cleanParagraph = new ArrayList<List<Word>>();
        for (List<Word> sentence : paragraph) {
            List<Word> cleanSentence = cleanSentence(sentence);
            cleanParagraph.add(cleanSentence);
        }
        return cleanParagraph;
    }

    /**
     * Remove non-printing Word objects from a Sentence
     *
     * @param sentence
     *            the List<Word> derived from the each "sentence" in the
     *            ReadingOrderTextExtractor's ParagraphIterator
     * @return List<Word>
     */
    private static List<Word> cleanSentence(List<Word> sentence) throws Exception {
        List<Word> cleanSentence = new ArrayList<Word>();
        for (Word word : sentence) {
            if (word.getBoundingQuads() != null) {
                cleanSentence.add(word);
            }
        }
        return cleanSentence;
    }

    /**
     * Detect if a paragraph of text meets the criteria that matches a heading.
     * Criteria are captured in the "if" statements.
     *
     * @param paragraph
     *            the List<List<Word>> derived from the
     *            ReadingOrderTextExtractor's ParagraphIterator
     * @return A String (H1, H2, or H3) or null if criteria are not met.
     */
    private static String detectHeadingLevel(List<List<Word>> paragraph) throws Exception {
        // Assume the entire paragraph is the same height
        double paragraphHeight = paragraph.get(0).get(0).getUarray().get(0).getVerticalFontSize();

        // Assume that the entire paragraph is the same font.
        String paragraphFont = paragraph.get(0).get(0).getUarray().get(0).getFont().toString();
        /*
         * Test for each combination of font and font size. Heading paragraphs
         * are all only one sentence long so we test that first.
         */
        if (paragraph.size() == 1) {
            if (paragraphFont.contains(h1FontName) && paragraphHeight >= h1FontSizeMin && paragraphHeight <= h1FontSizeMax) {
                return "H1";
            } else if (paragraphFont.contains(h2FontName) && paragraphHeight >= h2FontSizeMin && paragraphHeight <= h2FontSizeMax) {
                return "H2";
            } else if (paragraphFont.contains(h3FontName) && paragraphHeight >= h3FontSizeMin && paragraphHeight <= h3FontSizeMax) {
                return "H3";
            } else {
                return null;
            }
        } else {
            return null;
        }
    }

    /**
     * Convert a paragraph in the form of List<List<Word>> to a string.
     *
     * @param paragraph
     *            the List<List<Word>> derived from the
     *            ReadingOrderTextExtractor's ParagraphIterator
     * @param separator
     *            the character to insert between the words
     * @return The paragraph List<List<Word>> as a String
     */
    private static String paragraphListToString(List<List<Word>> paragraph, String separator) {
        String paragraphString = "";
        for (Word word : paragraph.get(0)) {
            paragraphString = paragraphString + separator + word.toString();
        }
        paragraphString = paragraphString.trim();
        return paragraphString;
    }

}
	/*
	* Copyright Datalogics, Inc. 2015
	*/

	package pdfjt.cookbook.document;

	import com.adobe.internal.io.ByteReader;
	import com.adobe.internal.io.ByteWriter;
	import com.adobe.internal.io.InputStreamByteReader;
	import com.adobe.pdfjt.core.fontset.PDFFontSet;
	import com.adobe.pdfjt.core.types.ASCoordinate;
	import com.adobe.pdfjt.graphicsDOM.GraphicsState;
	import com.adobe.pdfjt.pdf.content.processor.GState;
	import com.adobe.pdfjt.pdf.document.PDFDocument;
	import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
	import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions;
	import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmark;
	import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkRoot;
	import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkUtils;
	import com.adobe.pdfjt.pdf.interactive.navigation.PDFDestination;
	import com.adobe.pdfjt.pdf.page.PDFPage;
	import com.adobe.pdfjt.services.fontresources.PDFFontSetUtil;
	import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
	import com.adobe.pdfjt.services.textextraction.ParagraphIterator;
	import com.adobe.pdfjt.services.textextraction.TextExtractionOptions;
	import com.adobe.pdfjt.services.textextraction.Word;

	import java.io.InputStream;
	import java.net.URL;
	import java.net.URLConnection;
	import java.util.ArrayList;
	import java.util.List;

	import pdfjt.util.SampleFileServices;
	import pdfjt.util.SampleFontLoaderUtil;

	/**
	* This sample extracts the text from a PDF file and uses a very simple
	* heuristic to detect headings. When a heading is detected, it adds a bookmark.
	* The heuristic was designed for this input file but can be easily modified and
	* applied to any PDF file.
	*/
	public class AddBookmarksBasedOnFontSize {

	private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/AcrobatDC_PDFCreationSettings.pdf";
	private static final String outputDir = "cookbook/Document/output/";
	/*
	* These are the font sizes that we'll be looking for when creating
	* bookmarks. We'll create three levels of nested bookmarks using these
	* sizes and fonts as a guide.
	*/
	private static final double h1FontSizeMin = 21;
	private static final double h1FontSizeMax = 24;
	private static final double h2FontSizeMin = 18;
	private static final double h2FontSizeMax = 20;
	private static final double h3FontSizeMin = 14;
	private static final double h3FontSizeMax = 16;

	private static final String h1FontName = "MinionPro-Bold";
	private static final String h2FontName = "MyriadPro-Bold";
	private static final String h3FontName = "MyriadPro-Bold";

	public static void main(String[] args) throws Exception {
	/*
	* Read in PDF input file
	*/
	URLConnection connection = new URL(inputPDFURL).openConnection();
	connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
	connection.connect();
	InputStream fis = connection.getInputStream();
	ByteReader byteReader = new InputStreamByteReader(fis);
	PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
	/*
	* We also need to set up the root of the bookmark tree to be able to
	* add to it.
	*/
	PDFBookmarkRoot pdfBookmarkRoot = PDFBookmarkRoot.newSkeletonInstance(pdfDocument);
	/*
	* The text extractor needs to understand what fonts the document is
	* using to be able to map the glyphs and get at the text.
	*/
	PDFFontSet sysFontSet = SampleFontLoaderUtil.loadSampleFontSet();
	PDFFontSet fontset = PDFFontSetUtil.buildWorkingFontSet(pdfDocument, sysFontSet, pdfDocument.getDocumentLocale(), null);
	/*
	* Now we can set up the ReadingOrderTextExtractor class.
	*/
	TextExtractionOptions textExtractionOptions = TextExtractionOptions.newInstance();
	textExtractionOptions.setUseStructure(true);
	textExtractionOptions.setIgnoreArtifacts(true);
	ReadingOrderTextExtractor readingOrderTextExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument, fontset,
	textExtractionOptions);

	PDFBookmark currentH1 = null;
	PDFBookmark currentH2 = null;

	System.out.println("Creating Bookmarks...");
	/*
	* Extract the text from the PDF file paragraph by paragraph using the
	* document structure. Each paragraph will contain one or more List
	* objects, "sentences", that contain a List of Word objects.
	*/
	ParagraphIterator paragraphIterator = readingOrderTextExtractor.getParagraphIterator();
	while (paragraphIterator.hasNext()) {
	List<List<Word>> paragraph = paragraphIterator.next();
	// Remove non-printing Word objects
	paragraph = cleanParagraph(paragraph);
	/*
	* Determine if this paragraph is a heading and if so, what level.
	*/
	String headingLevel = detectHeadingLevel(paragraph);
	if (headingLevel != null) {
	// Create a String from the Paragraph to use as label for the
	// bookmark
	String bookmarkLabel = paragraphListToString(paragraph, " ");
	/*
	* Determine the destination page number of the bookmark based
	* on the page number of the first word in the first (and only)
	* Sentence.
	*/
	int pdfPageNumber = paragraph.get(0).get(0).getPageNumber() - 1;
	/*
	* Get the destination PDFPage for the page number we got above.
	*/
	PDFPage pdfPage = pdfDocument.requirePages().getPage(pdfPageNumber);
	/*
	* Get the top/left coordinates of the first word in the first
	* (and only) Sentence. This is used in the bookmark destination
	* to set where the viewer scrolls the page to. We only need the
	* y coordinate as you'll see below.
	*/
	ASCoordinate asCoordinate = paragraph.get(0).get(0).topLeft();
	/*
	* Create a new bookmark destination that will take the user to
	* the correct page, zoom the PDF to fit to the width
	* of the window it is displaying in and align the window to the
	* top of the word we are bookmarking.
	*/
	PDFDestination pdfDestination = PDFDestination.newDestFitH(pdfDocument, pdfPage, asCoordinate.y());
	/*
	* Create a new bookmark with a label that matches the heading.
	*/
	PDFBookmark pdfBookmark = PDFBookmark.newInstance(pdfDocument, bookmarkLabel);
	/*
	* Now set the destination of the bookmark to the one we created
	* above, then append the bookmark to it's parent after any other
	* bookmarks that might be there already. Headings get added to
	* the root.
	*/
	pdfBookmark.setDestination(pdfDestination);
	if (headingLevel == "H1") {
	System.out.println(bookmarkLabel);
	PDFBookmarkUtils.appendLastKid(pdfBookmark, pdfBookmarkRoot);
	/*
	* We've found a new top level heading so change which
	* parent H2 headings get added to.
	*/
	currentH1 = pdfBookmark;
	}
	if (headingLevel == "H2") {
	System.out.println(" " + bookmarkLabel);
	PDFBookmarkUtils.appendLastKid(pdfBookmark, currentH1);
	/*
	* We've found a new H2 so change which parent H3
	* headings get added to.
	*/
	currentH2 = pdfBookmark;
	}
	if (headingLevel == "H3") {
	System.out.println(" " + bookmarkLabel);
	PDFBookmarkUtils.appendLastKid(pdfBookmark, currentH2);
	}
	}
	}
	/*
	* Add the PDFBookmarkRoot object and it's tree to the document catalog.
	*/
	pdfDocument.requireCatalog().setBookmarkRoot(pdfBookmarkRoot);
	// Save the file.
	String outputFileName = "Bookmarked.pdf";
	ByteWriter outputFile = SampleFileServices.getRAFByteWriter(outputDir + outputFileName);
	pdfDocument.save(outputFile, PDFSaveFullOptions.newInstance());
	System.out.println("Created: " + outputFileName);
	}

	/**
	* Remove non-printing Word objects from a Paragraph
	*
	* @param paragraph
	* the List<List<Word>> derived from the each Paragraph in the
	* ReadingOrderTextExtractor's ParagraphIterator
	* @return List<List<Word>>
	*/
	private static List<List<Word>> cleanParagraph(List<List<Word>> paragraph) throws Exception {
	List<List<Word>> cleanParagraph = new ArrayList<List<Word>>();
	for (List<Word> sentence : paragraph) {
	List<Word> cleanSentence = cleanSentence(sentence);
	cleanParagraph.add(cleanSentence);
	}
	return cleanParagraph;
	}

	/**
	* Remove non-printing Word objects from a Sentence
	*
	* @param sentence
	* the List<Word> derived from the each "sentence" in the
	* ReadingOrderTextExtractor's ParagraphIterator
	* @return List<Word>
	*/
	private static List<Word> cleanSentence(List<Word> sentence) throws Exception {
	List<Word> cleanSentence = new ArrayList<Word>();
	for (Word word : sentence) {
	if (word.getBoundingQuads() != null) {
	cleanSentence.add(word);
	}
	}
	return cleanSentence;
	}

	/**
	* Detect if a paragraph of text meets the criteria that matches a heading.
	* Criteria are captured in the "if" statements.
	*
	* @param paragraph
	* the List<List<Word>> derived from the
	* ReadingOrderTextExtractor's ParagraphIterator
	* @return A String (H1, H2, or H3) or null if criteria are not met.
	*/
	private static String detectHeadingLevel(List<List<Word>> paragraph) throws Exception {
	// Assume the entire paragraph is the same height
	double paragraphHeight = paragraph.get(0).get(0).getUarray().get(0).getVerticalFontSize();

	// Assume that the entire paragraph is the same font.
	String paragraphFont = paragraph.get(0).get(0).getUarray().get(0).getFont().toString();
	/*
	* Test for each combination of font and font size. Heading paragraphs
	* are all only one sentence long so we test that first.
	*/
	if (paragraph.size() == 1) {
	if (paragraphFont.contains(h1FontName) && paragraphHeight >= h1FontSizeMin && paragraphHeight <= h1FontSizeMax) {
	return "H1";
	} else if (paragraphFont.contains(h2FontName) && paragraphHeight >= h2FontSizeMin && paragraphHeight <= h2FontSizeMax) {
	return "H2";
	} else if (paragraphFont.contains(h3FontName) && paragraphHeight >= h3FontSizeMin && paragraphHeight <= h3FontSizeMax) {
	return "H3";
	} else {
	return null;
	}
	} else {
	return null;
	}
	}

	/**
	* Convert a paragraph in the form of List<List<Word>> to a string.
	*
	* @param paragraph
	* the List<List<Word>> derived from the
	* ReadingOrderTextExtractor's ParagraphIterator
	* @param separator
	* the character to insert between the words
	* @return The paragraph List<List<Word>> as a String
	*/
	private static String paragraphListToString(List<List<Word>> paragraph, String separator) {
	String paragraphString = "";
	for (Word word : paragraph.get(0)) {
	paragraphString = paragraphString + separator + word.toString();
	}
	paragraphString = paragraphString.trim();
	return paragraphString;
	}

	}