JoelGeraci-Datalogics/SplitDocumentBasedOnBookmarks.java

## SplitDocumentBasedOnBookmarks.java
/*
 * Copyright Datalogics, Inc. 2015
 */

package pdfjt.cookbook.document;

import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.ByteWriter;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmark;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkRoot;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.manipulations.PMMOptions;
import com.adobe.pdfjt.services.manipulations.PMMService;

import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import pdfjt.util.SampleFileServices;

/**
 * Splits a document based on bookmarks.
 *
 * What you need to know first:
 *
 * The Document Outline consists of a tree-structured hierarchy of outline
 * nodes, commonly called bookmarks, which serve as a visual table of contents
 * to display the document's structure to the user. For the sake of consistency
 * with the Java Toolkit API, we'll use the term "bookmark" to refer to an
 * outline item from here on.
 *
 * The nodes at each level of the hierarchy form a linked list (not a Java
 * LinkedList Object), chained together through their Prev and Next entries and
 * accessed through the First and Last entries in the parent node.
 */
public class SplitDocumentBasedOnBookmarks {

    private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/AcrobatDC_PDFCreationSettings.pdf";
    private static final String outputDir = "cookbook/Document/output/split/";

    /**
     * Downloads the sample PDF and splits it into one output file per
     * top-level bookmark. Nothing is written when the document has no bookmark
     * root or no top-level bookmarks.
     *
     * @param args
     *            unused
     * @throws Exception
     *             if downloading, parsing, extracting or saving fails
     */
    public static void main(String[] args) throws Exception {
        /*
         * Read in PDF input file
         */
        URLConnection connection = new URL(inputPDFURL).openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        connection.connect();
        InputStream fis = connection.getInputStream();
        try {
            ByteReader byteReader = new InputStreamByteReader(fis);
            PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
            PDFBookmarkRoot pdfBookmarkRoot = pdfDocument.requireCatalog().getBookmarkRoot();
            if (pdfBookmarkRoot != null) {
                splitAtBookmarks(pdfDocument, collectFirstLevelBookmarks(pdfBookmarkRoot));
            }
        } finally {
            // Release the network stream once all output files have been written.
            fis.close();
        }
    }

    /**
     * Copies the top level of the bookmark tree into a list so that each
     * bookmark and the one following it can be looked up by position.
     *
     * The walk stops at the first missing sibling, so the returned list never
     * contains null and may be empty.
     *
     * @param pdfBookmarkRoot
     *            the root of the document's bookmark tree
     * @return the top-level bookmarks in document order
     */
    private static List<PDFBookmark> collectFirstLevelBookmarks(PDFBookmarkRoot pdfBookmarkRoot) throws Exception {
        List<PDFBookmark> firstLevelBookmarks = new ArrayList<PDFBookmark>();
        int firstLevelBookmarkCount = pdfBookmarkRoot.getNumKids();
        PDFBookmark pdfBookmark = pdfBookmarkRoot.getFirstKid();
        while (pdfBookmark != null && firstLevelBookmarks.size() < firstLevelBookmarkCount) {
            firstLevelBookmarks.add(pdfBookmark);
            pdfBookmark = pdfBookmark.getNext();
        }
        return firstLevelBookmarks;
    }

    /**
     * Writes one document per top-level bookmark. The first section always
     * starts at page 0, every other section starts at its bookmark's
     * destination page, and each section ends just before the destination page
     * of the next bookmark (the last section runs to the end of the document).
     *
     * @param pdfDocument
     *            the source document
     * @param firstLevelBookmarks
     *            the top-level bookmarks in document order; may be empty
     */
    private static void splitAtBookmarks(PDFDocument pdfDocument, List<PDFBookmark> firstLevelBookmarks) throws Exception {
        int lastIndex = firstLevelBookmarks.size() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            PDFBookmark pdfBookmark = firstLevelBookmarks.get(i);

            // Pages ahead of the first bookmark's destination belong to the first section.
            int startPageNum = 0;
            if (i > 0) {
                startPageNum = pdfBookmark.getDestination().getPage().getIndex();
            }

            // NOTE(review): the start page is read with getIndex() and the end
            // page with getPageIndex(), exactly as before - confirm both return
            // the same zero-based page index.
            int endPageNum;
            if (i < lastIndex) {
                endPageNum = firstLevelBookmarks.get(i + 1).getDestination().getPage().getPageIndex();
            } else {
                endPageNum = pdfDocument.requirePages().getNumPages();
            }

            String outputFileName = (i + 1) + "_" + pdfBookmark.getTitle() + ".pdf";
            extractToNewDocument(pdfDocument, startPageNum, endPageNum, outputFileName);
        }
    }

    /**
     * Extract a range of pages from the one document to create another and
     * save it under the output directory. Ranges that contain no pages are
     * skipped with a message.
     *
     * @param pdfDocument
     *            the source document
     * @param startPageNum
     *            the first zero based page in the range to extract.
     * @param endPageNum
     *            the zero based page just after the range to extract
     *            (exclusive); endPageNum - startPageNum pages are extracted.
     * @param outputFileName
     *            the filename of the extracted file.
     */
    private static void extractToNewDocument(PDFDocument pdfDocument, int startPageNum, int endPageNum,  String outputFileName) throws Exception {
        int pageCount = endPageNum - startPageNum;
        if (pageCount <= 0) {
            // Adjacent bookmarks that point at the same page leave nothing to extract.
            System.out.println("Skipped: " + outputFileName + " - no pages in range " + startPageNum + " to " + endPageNum);
            return;
        }
        PMMService pmmService = new PMMService(pdfDocument);
        PDFPage startPage = pdfDocument.requireCatalog().getPages().getPage(startPageNum);
        PDFDocument outputDocument = pmmService.extractPages(startPage, pageCount, PMMOptions.newInstanceAll(), PDFOpenOptions.newInstance());
        ByteWriter outputFile = SampleFileServices.getRAFByteWriter(outputDir + outputFileName);
        outputDocument.save(outputFile, PDFSaveFullOptions.newInstance());
        System.out.println("Created: " + outputFileName+" - "+outputDocument.getFileSize());
    }

}
	/*
	* Copyright Datalogics, Inc. 2015
	*/

	package pdfjt.cookbook.document;

	import com.adobe.internal.io.ByteReader;
	import com.adobe.internal.io.ByteWriter;
	import com.adobe.internal.io.InputStreamByteReader;
	import com.adobe.pdfjt.pdf.document.PDFDocument;
	import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
	import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions;
	import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmark;
	import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkRoot;
	import com.adobe.pdfjt.pdf.page.PDFPage;
	import com.adobe.pdfjt.services.manipulations.PMMOptions;
	import com.adobe.pdfjt.services.manipulations.PMMService;

	import java.io.InputStream;
	import java.net.URL;
	import java.net.URLConnection;
	import java.util.ArrayList;
	import java.util.List;

	import pdfjt.util.SampleFileServices;

	/**
	* Splits a document based on bookmarks.
	*
	* What you need to know first:
	*
	* The Document Outline consists of a tree-structured hierarchy of outline
	* nodes, commonly called bookmarks, which serve as a visual table of contents
	* to display the document's structure to the user. For the sake of consistency
	* with the Java Toolkit API, we'll use the term "bookmark" to refer to an
	* outline item from here on.
	*
	* The nodes at each level of the hierarchy form a linked list (not a Java
	* LinkedList Object), chained together through their Prev and Next entries and
	* accessed through the First and Last entries in the parent node.
	*/
	public class SplitDocumentBasedOnBookmarks {

		private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/AcrobatDC_PDFCreationSettings.pdf";
		private static final String outputDir = "cookbook/Document/output/split/";

		/**
		 * Downloads the sample PDF and splits it into one output file per
		 * top-level bookmark. Nothing is written when the document has no
		 * bookmark root or no top-level bookmarks.
		 *
		 * @param args
		 *            unused
		 * @throws Exception
		 *             if downloading, parsing, extracting or saving fails
		 */
		public static void main(String[] args) throws Exception {
			/*
			 * Read in PDF input file
			 */
			URLConnection connection = new URL(inputPDFURL).openConnection();
			connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
			connection.connect();
			InputStream fis = connection.getInputStream();
			try {
				ByteReader byteReader = new InputStreamByteReader(fis);
				PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
				PDFBookmarkRoot pdfBookmarkRoot = pdfDocument.requireCatalog().getBookmarkRoot();
				if (pdfBookmarkRoot != null) {
					splitAtBookmarks(pdfDocument, collectFirstLevelBookmarks(pdfBookmarkRoot));
				}
			} finally {
				// Release the network stream once all output files have been written.
				fis.close();
			}
		}

		/**
		 * Copies the top level of the bookmark tree into a list so that each
		 * bookmark and the one following it can be looked up by position.
		 *
		 * The walk stops at the first missing sibling, so the returned list
		 * never contains null and may be empty.
		 *
		 * @param pdfBookmarkRoot
		 *            the root of the document's bookmark tree
		 * @return the top-level bookmarks in document order
		 */
		private static List<PDFBookmark> collectFirstLevelBookmarks(PDFBookmarkRoot pdfBookmarkRoot) throws Exception {
			List<PDFBookmark> firstLevelBookmarks = new ArrayList<PDFBookmark>();
			int firstLevelBookmarkCount = pdfBookmarkRoot.getNumKids();
			PDFBookmark pdfBookmark = pdfBookmarkRoot.getFirstKid();
			while (pdfBookmark != null && firstLevelBookmarks.size() < firstLevelBookmarkCount) {
				firstLevelBookmarks.add(pdfBookmark);
				pdfBookmark = pdfBookmark.getNext();
			}
			return firstLevelBookmarks;
		}

		/**
		 * Writes one document per top-level bookmark. The first section always
		 * starts at page 0, every other section starts at its bookmark's
		 * destination page, and each section ends just before the destination
		 * page of the next bookmark (the last section runs to the end of the
		 * document).
		 *
		 * @param pdfDocument
		 *            the source document
		 * @param firstLevelBookmarks
		 *            the top-level bookmarks in document order; may be empty
		 */
		private static void splitAtBookmarks(PDFDocument pdfDocument, List<PDFBookmark> firstLevelBookmarks) throws Exception {
			int lastIndex = firstLevelBookmarks.size() - 1;
			for (int i = 0; i <= lastIndex; i++) {
				PDFBookmark pdfBookmark = firstLevelBookmarks.get(i);

				// Pages ahead of the first bookmark's destination belong to the first section.
				int startPageNum = 0;
				if (i > 0) {
					startPageNum = pdfBookmark.getDestination().getPage().getIndex();
				}

				// NOTE(review): the start page is read with getIndex() and the
				// end page with getPageIndex(), exactly as before - confirm both
				// return the same zero-based page index.
				int endPageNum;
				if (i < lastIndex) {
					endPageNum = firstLevelBookmarks.get(i + 1).getDestination().getPage().getPageIndex();
				} else {
					endPageNum = pdfDocument.requirePages().getNumPages();
				}

				String outputFileName = (i + 1) + "_" + pdfBookmark.getTitle() + ".pdf";
				extractToNewDocument(pdfDocument, startPageNum, endPageNum, outputFileName);
			}
		}

		/**
		 * Extract a range of pages from the one document to create another and
		 * save it under the output directory. Ranges that contain no pages are
		 * skipped with a message.
		 *
		 * @param pdfDocument
		 *            the source document
		 * @param startPageNum
		 *            the first zero based page in the range to extract.
		 * @param endPageNum
		 *            the zero based page just after the range to extract
		 *            (exclusive); endPageNum - startPageNum pages are extracted.
		 * @param outputFileName
		 *            the filename of the extracted file.
		 */
		private static void extractToNewDocument(PDFDocument pdfDocument, int startPageNum, int endPageNum, String outputFileName) throws Exception {
			int pageCount = endPageNum - startPageNum;
			if (pageCount <= 0) {
				// Adjacent bookmarks that point at the same page leave nothing to extract.
				System.out.println("Skipped: " + outputFileName + " - no pages in range " + startPageNum + " to " + endPageNum);
				return;
			}
			PMMService pmmService = new PMMService(pdfDocument);
			PDFPage startPage = pdfDocument.requireCatalog().getPages().getPage(startPageNum);
			PDFDocument outputDocument = pmmService.extractPages(startPage, pageCount, PMMOptions.newInstanceAll(), PDFOpenOptions.newInstance());
			ByteWriter outputFile = SampleFileServices.getRAFByteWriter(outputDir + outputFileName);
			outputDocument.save(outputFile, PDFSaveFullOptions.newInstance());
			System.out.println("Created: " + outputFileName+" - "+outputDocument.getFileSize());
		}

	}