Last active
June 10, 2016 21:31
-
-
Save JoelGeraci-Datalogics/c0ee1c860386430c0427 to your computer and use it in GitHub Desktop.
Splits a PDF document based on bookmark destinations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright Datalogics, Inc. 2015 | |
*/ | |
package pdfjt.cookbook.document; | |
import com.adobe.internal.io.ByteReader; | |
import com.adobe.internal.io.ByteWriter; | |
import com.adobe.internal.io.InputStreamByteReader; | |
import com.adobe.pdfjt.pdf.document.PDFDocument; | |
import com.adobe.pdfjt.pdf.document.PDFOpenOptions; | |
import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions; | |
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmark; | |
import com.adobe.pdfjt.pdf.interactive.navigation.PDFBookmarkRoot; | |
import com.adobe.pdfjt.pdf.page.PDFPage; | |
import com.adobe.pdfjt.services.manipulations.PMMOptions; | |
import com.adobe.pdfjt.services.manipulations.PMMService; | |
import java.io.InputStream; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.ArrayList; | |
import java.util.List; | |
import pdfjt.util.SampleFileServices; | |
/** | |
* Splits a document based on bookmarks. | |
* | |
* What you need to know first: | |
* | |
* The Document Outline consists of a tree-structured hierarchy of outline | |
* nodes, commonly called bookmarks, which serve as a visual table of contents | |
* to display the document's structure to the user. For the sake of consistency | |
* with the Java Toolkit API, we'll use the term "bookmark" to refer to an | |
* outline item from here on. | |
* | |
* The nodes at each level of the hierarchy form a linked list (not a Java | |
* LinkedList Object), chained together through their Prev and Next entries and | |
* accessed through the First and Last entries in the parent node. | |
*/ | |
public class SplitDocumentBasedOnBookmarks { | |
private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/AcrobatDC_PDFCreationSettings.pdf"; | |
private static final String outputDir = "cookbook/Document/output/split/"; | |
public static void main(String[] args) throws Exception { | |
/* | |
* Read in PDF input file | |
*/ | |
URLConnection connection = new URL(inputPDFURL).openConnection(); | |
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); | |
connection.connect(); | |
InputStream fis = connection.getInputStream(); | |
ByteReader byteReader = new InputStreamByteReader(fis); | |
PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance()); | |
PDFBookmarkRoot pdfBookmarkRoot = pdfDocument.requireCatalog().getBookmarkRoot(); | |
if (pdfBookmarkRoot != null) { | |
/* | |
* To split the document based on bookmarks we need to know the | |
* destination of a particular bookmark and the destination of the | |
* following bookmark. To make this task easier, we read the top | |
* level of the bookmark tree and populate an ArrayList. | |
*/ | |
List<PDFBookmark> firstLevelBookmarks = new ArrayList<PDFBookmark>(); | |
int firstLevelBookmarkCount = pdfBookmarkRoot.getNumKids(); | |
PDFBookmark pdfBookmark = pdfBookmarkRoot.getFirstKid(); | |
firstLevelBookmarks.add(pdfBookmark); | |
for (int i=1; i < firstLevelBookmarkCount; i++) { | |
pdfBookmark = pdfBookmark.getNext(); | |
firstLevelBookmarks.add(pdfBookmark); | |
} | |
/* | |
* First we extract pages from the first page to the page just | |
* before the destination of the second bookmark. | |
*/ | |
pdfBookmark = pdfBookmarkRoot.getFirstKid(); | |
PDFBookmark nextPDFBookmark = (PDFBookmark) firstLevelBookmarks.get(1); | |
int startPageNum = 0; | |
int endPageNum = nextPDFBookmark.getDestination().getPage().getPageIndex(); | |
String outputFileName = "1_"+pdfBookmark.getTitle()+".pdf"; | |
extractToNewDocument(pdfDocument, startPageNum, endPageNum, outputFileName); | |
/* | |
* Now we can look through the array and extract the pages based on | |
* the destinations of the bookmarks. | |
*/ | |
for (int i=1; i < firstLevelBookmarkCount-1; i++) { | |
pdfBookmark = (PDFBookmark) firstLevelBookmarks.get(i); | |
nextPDFBookmark = (PDFBookmark) firstLevelBookmarks.get(i+1); | |
startPageNum = pdfBookmark.getDestination().getPage().getIndex(); | |
endPageNum = nextPDFBookmark.getDestination().getPage().getPageIndex(); | |
outputFileName = String.valueOf(i+1)+"_"+pdfBookmark.getTitle()+".pdf"; | |
extractToNewDocument(pdfDocument, startPageNum, endPageNum, outputFileName); | |
} | |
/* | |
* Finally, we extract pages from the destination page of the last | |
* bookmark to the last page of the document. | |
*/ | |
pdfBookmark = (PDFBookmark) firstLevelBookmarks.get(firstLevelBookmarkCount-1); | |
startPageNum = pdfBookmark.getDestination().getPage().getIndex(); | |
endPageNum = pdfDocument.requirePages().getNumPages(); | |
outputFileName = firstLevelBookmarkCount+"_"+pdfBookmark.getTitle()+".pdf"; | |
extractToNewDocument(pdfDocument, startPageNum, endPageNum, outputFileName); | |
} | |
} | |
/** | |
* Extract a range of pages from the one document to create another. | |
* | |
* @param pdfDocument | |
* the source document | |
* @param startPageNum | |
* the first zero based page in the range to extract. | |
* @param endPageNum | |
* the last zero based page in the range to extract. | |
* @param outputFileName | |
* the filename of the extracted file. | |
* @return void | |
*/ | |
private static void extractToNewDocument(PDFDocument pdfDocument, int startPageNum, int endPageNum, String outputFileName) throws Exception { | |
PMMService pmmService = new PMMService(pdfDocument); | |
PDFPage startPage = pdfDocument.requireCatalog().getPages().getPage(startPageNum); | |
int pageCount = endPageNum - startPageNum; | |
PDFDocument outputDocument = pmmService.extractPages(startPage, pageCount, PMMOptions.newInstanceAll(), PDFOpenOptions.newInstance()); | |
//System.out.println("Created: " + outputFileName+" - "+outputDocument.getFileSize()); | |
ByteWriter outputFile = SampleFileServices.getRAFByteWriter(outputDir + outputFileName); | |
outputDocument.save(outputFile, PDFSaveFullOptions.newInstance()); | |
System.out.println("Created: " + outputFileName+" - "+outputDocument.getFileSize()); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment