Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save JoelGeraci-Datalogics/c0ee1c860386430c0427 to your computer and use it in GitHub Desktop.
Save JoelGeraci-Datalogics/c0ee1c860386430c0427 to your computer and use it in GitHub Desktop.
Splits a PDF document based on bookmark destinations
/*
* Copyright Datalogics, Inc. 2015
*/
package pdfjt.cookbook.document;
import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.ByteWriter;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmark;
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkRoot;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.manipulations.PMMOptions;
import com.adobe.pdfjt.services.manipulations.PMMService;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import pdfjt.util.SampleFileServices;
/**
* Splits a document based on bookmarks.
*
* What you need to know first:
*
* The Document Outline consists of a tree-structured hierarchy of outline
* nodes, commonly called bookmarks, which serve as a visual table of contents
* to display the document's structure to the user. For the sake of consistency
* with the Java Toolkit API, we'll use the term "bookmark" to refer to an
* outline item from here on.
*
* The nodes at each level of the hierarchy form a linked list (not a Java
* LinkedList Object), chained together through their Prev and Next entries and
* accessed through the First and Last entries in the parent node.
*/
public class SplitDocumentBasedOnBookmarks {
    private static final String INPUT_PDF_URL = "http://dev.datalogics.com/cookbook/document/AcrobatDC_PDFCreationSettings.pdf";
    private static final String OUTPUT_DIR = "cookbook/Document/output/split/";

    /**
     * Downloads the sample PDF and writes one output PDF per top-level
     * bookmark into the output directory.
     *
     * @param args
     * unused.
     * @throws Exception
     * if the download, the PDF parsing, or any of the extractions fails.
     */
    public static void main(String[] args) throws Exception {
        /*
         * Read in PDF input file
         */
        URLConnection connection = new URL(INPUT_PDF_URL).openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        connection.connect();
        InputStream inputStream = connection.getInputStream();
        try {
            ByteReader byteReader = new InputStreamByteReader(inputStream);
            PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
            try {
                splitAtFirstLevelBookmarks(pdfDocument);
            } finally {
                // Release the source document even when an extraction fails.
                pdfDocument.close();
            }
        } finally {
            // The network stream is ours to close; the original sample leaked it.
            inputStream.close();
        }
    }

    /**
     * Writes one output file per top-level bookmark. Section i runs from the
     * destination page of bookmark i up to (but not including) the destination
     * page of bookmark i+1. The first section always starts at page 0 and the
     * last section always runs to the end of the document. A document with no
     * bookmark root or no top-level bookmarks produces no output.
     *
     * @param pdfDocument
     * the source document
     */
    private static void splitAtFirstLevelBookmarks(PDFDocument pdfDocument) throws Exception {
        PDFBookmarkRoot pdfBookmarkRoot = pdfDocument.requireCatalog().getBookmarkRoot();
        if (pdfBookmarkRoot == null) {
            return;
        }
        /*
         * To split the document based on bookmarks we need to know the
         * destination of a particular bookmark and the destination of the
         * following bookmark. To make this task easier, we read the top
         * level of the bookmark tree and populate an ArrayList.
         */
        List<PDFBookmark> firstLevelBookmarks = collectFirstLevelBookmarks(pdfBookmarkRoot);
        int bookmarkCount = firstLevelBookmarks.size();
        for (int i = 0; i < bookmarkCount; i++) {
            PDFBookmark pdfBookmark = firstLevelBookmarks.get(i);
            boolean isLast = (i + 1 == bookmarkCount);
            // Pages that precede the first bookmark's destination are folded
            // into the first section, so it always starts at page 0.
            int startPageNum = (i == 0) ? 0 : destinationPageIndex(pdfBookmark);
            // The end of a section is exclusive: either the next bookmark's
            // destination page or, for the last section, the page count.
            int endPageNum = isLast
                    ? pdfDocument.requirePages().getNumPages()
                    : destinationPageIndex(firstLevelBookmarks.get(i + 1));
            String outputFileName = (i + 1) + "_" + toSafeFileName(pdfBookmark.getTitle()) + ".pdf";
            extractToNewDocument(pdfDocument, startPageNum, endPageNum, outputFileName);
        }
    }

    /**
     * Collects the top-level bookmarks in document order by following the
     * sibling chain from the root's first kid. The walk stops at the kid count
     * reported by the root or at the end of the chain, whichever comes first,
     * so the returned list never contains null.
     *
     * @param pdfBookmarkRoot
     * the root of the document outline
     * @return the top-level bookmarks; empty when the root has no kids.
     */
    private static List<PDFBookmark> collectFirstLevelBookmarks(PDFBookmarkRoot pdfBookmarkRoot) throws Exception {
        List<PDFBookmark> bookmarks = new ArrayList<PDFBookmark>();
        int declaredCount = pdfBookmarkRoot.getNumKids();
        PDFBookmark pdfBookmark = pdfBookmarkRoot.getFirstKid();
        while (pdfBookmark != null && bookmarks.size() < declaredCount) {
            bookmarks.add(pdfBookmark);
            pdfBookmark = pdfBookmark.getNext();
        }
        return bookmarks;
    }

    /**
     * Returns the zero based index of the page a bookmark's destination points
     * at. Every split point is resolved through this one method so that all
     * sections use the same page accessor.
     *
     * @param pdfBookmark
     * the bookmark to resolve
     * @return the zero based page index of the bookmark's destination.
     * @throws IllegalStateException
     * if the bookmark has no destination or the destination has no page.
     */
    private static int destinationPageIndex(PDFBookmark pdfBookmark) throws Exception {
        PDFPage page = (pdfBookmark.getDestination() == null) ? null : pdfBookmark.getDestination().getPage();
        if (page == null) {
            throw new IllegalStateException("Bookmark \"" + pdfBookmark.getTitle() + "\" has no page destination to split at");
        }
        return page.getIndex();
    }

    /**
     * Makes a bookmark title usable as part of a file name. Titles come from
     * the downloaded document, so path separators, characters reserved on
     * common file systems and control characters are each replaced with an
     * underscore. This keeps the output inside the output directory.
     *
     * @param title
     * the bookmark title; may be null.
     * @return the title with unsafe characters replaced; "null" for a null title.
     */
    private static String toSafeFileName(String title) {
        return String.valueOf(title).replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
    }

    /**
     * Extract a range of pages from the one document to create another.
     *
     * @param pdfDocument
     * the source document
     * @param startPageNum
     * the first zero based page in the range to extract.
     * @param endPageNum
     * the zero based page just past the range to extract (exclusive), so
     * that endPageNum - startPageNum pages are extracted.
     * @param outputFileName
     * the filename of the extracted file.
     */
    private static void extractToNewDocument(PDFDocument pdfDocument, int startPageNum, int endPageNum, String outputFileName) throws Exception {
        int pageCount = endPageNum - startPageNum;
        if (pageCount <= 0) {
            // Consecutive bookmarks on the same page (or out of page order)
            // leave nothing to extract for this section.
            System.out.println("Skipped: " + outputFileName + " - empty page range " + startPageNum + ".." + endPageNum);
            return;
        }
        PMMService pmmService = new PMMService(pdfDocument);
        PDFPage startPage = pdfDocument.requireCatalog().getPages().getPage(startPageNum);
        PDFDocument outputDocument = pmmService.extractPages(startPage, pageCount, PMMOptions.newInstanceAll(), PDFOpenOptions.newInstance());
        try {
            // NOTE(review): the writer is not closed here because it is unclear
            // from this file whether save() already closes it - confirm
            // against the toolkit documentation.
            ByteWriter outputFile = SampleFileServices.getRAFByteWriter(OUTPUT_DIR + outputFileName);
            outputDocument.save(outputFile, PDFSaveFullOptions.newInstance());
            System.out.println("Created: " + outputFileName+" - "+outputDocument.getFileSize());
        } finally {
            // One extracted document is created per section; close each one.
            outputDocument.close();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment