JoelGeraci-Datalogics/ExtractPagesMatchingSearchTerms.java

## ExtractPagesMatchingSearchTerms.java
/*
 * Copyright Datalogics, Inc. 2015
 */

package pdfjt.cookbook.document;

import com.adobe.fontengine.font.Font;
import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.graphics.font.PDFFont;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationEnum;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationHighlight;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.pdf.page.PDFPageTree;
import com.adobe.pdfjt.services.ap.AppearanceService;
import com.adobe.pdfjt.services.ap.spi.APContext;
import com.adobe.pdfjt.services.ap.spi.APResources;
import com.adobe.pdfjt.services.manipulations.PMMOptions;
import com.adobe.pdfjt.services.manipulations.PMMService;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.textextraction.Word;
import com.adobe.pdfjt.services.textextraction.WordsIterator;
import com.datalogics.pdf.document.DocumentHelper;
import com.datalogics.pdf.document.FontSetLoader;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;

/**
 * Searches a PDF for a term, highlights every matching word, then extracts all pages that
 * contain at least one match into a new document.
 *
 * <p>The input PDF is downloaded from a fixed URL and the result is saved as
 * {@code JavaScriptPages.pdf} under the configured output directory.
 */
public class ExtractPagesMatchingSearchTerms {

    private static final String INPUT_PDF_URL = "http://dev.datalogics.com/cookbook/document/PDF32000_2008.pdf";
    private static final String OUTPUT_DIR = "cookbook/Document/output/";
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11";

    /** Lower-case term; a word matches when its lower-cased text contains this string. */
    private static final String SEARCH_TERM = "javascript";

    /**
     * Downloads the input PDF, highlights each word containing the search term, and saves the
     * pages that contain a match as a new PDF.
     *
     * @param args unused
     * @throws Exception if the download, text extraction, appearance generation, page
     *         extraction, or save fails
     */
    public static void main(String[] args) throws Exception {
        // First read in the PDF file
        URLConnection connection = new URL(INPUT_PDF_URL).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.connect();

        // The stream stays open for the whole run because the document is built on top of it;
        // try-with-resources releases the connection even when processing fails.
        try (InputStream pdfStream = connection.getInputStream()) {
            ByteReader byteReader = new InputStreamByteReader(pdfStream);
            PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
            try {
                System.out.println("Pages Extracted:");
                List<PDFPage> pagesToExtract = highlightMatches(pdfDocument);

                // Without a match there is nothing to extract; stop before asking the page
                // service to build a document from an empty page array.
                if (pagesToExtract.isEmpty()) {
                    System.out.println("No pages contain \"" + SEARCH_TERM + "\"; nothing to extract.");
                    return;
                }

                generateHighlightAppearances(pdfDocument);

                // Now extract the pages
                PDFPage[] pages = pagesToExtract.toArray(new PDFPage[pagesToExtract.size()]);
                PMMService pmmService = new PMMService(pdfDocument);
                PDFDocument extractedPages = pmmService.extractPages(pages,
                        PMMOptions.newInstance(PMMOptions.AnnotationsForms), PDFOpenOptions.newInstance());

                // Save and close
                DocumentHelper.saveFullAndClose(extractedPages, OUTPUT_DIR + "JavaScriptPages.pdf");
                System.out.println("Done!");
            } finally {
                // The source document was previously left open; release it on every path.
                pdfDocument.close();
            }
        }
    }

    /**
     * Adds a Highlight annotation over every word that contains the search term and collects
     * the pages those words are on. Each page number is printed once, when the page is first
     * added to the result.
     *
     * @param pdfDocument the document to search and annotate
     * @return the distinct pages with at least one match, in the order they were first matched
     * @throws Exception if text extraction or annotation creation fails
     */
    private static List<PDFPage> highlightMatches(PDFDocument pdfDocument) throws Exception {
        PDFPageTree pdfPageTree = pdfDocument.requirePages();

        // This List will hold the pages that need to be extracted
        List<PDFPage> pagesToExtract = new ArrayList<>();

        ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument,
                FontSetLoader.newInstance().getFontSet());
        WordsIterator wordsIterator = textExtractor.getWordsIterator();
        while (wordsIterator.hasNext()) {
            Word word = wordsIterator.next();
            // Locale.ROOT keeps the comparison independent of the JVM default locale
            // (e.g. a Turkish locale lower-cases 'I' to a dotless i and would miss matches).
            if (!word.toString().toLowerCase(java.util.Locale.ROOT).contains(SEARCH_TERM)) {
                continue;
            }

            /*
             * Create a new Highlight annotation and use the location
             * properties of the word to set the properties of the
             * annotation.
             */
            PDFAnnotationHighlight highlight = PDFAnnotationHighlight.newInstance(pdfDocument);
            highlight.setQuadPoints(wordQuadsToAnnotQuads(word));
            highlight.setColor(new double[] { 1, 0.819611, 0 }); // yellow to match Acrobat Highlights

            // The word's page number is shifted down by one to get the page-tree index.
            PDFPage pdfPage = pdfPageTree.getPage(word.getPageNumber() - 1);
            pdfPage.addAnnotation(highlight);

            // Add the page to the list of pages to extract if it's not already there.
            if (!pagesToExtract.contains(pdfPage)) {
                pagesToExtract.add(pdfPage);
                System.out.println(word.getPageNumber());
            }
        }
        return pagesToExtract;
    }

    /**
     * Generates appearances for the Highlight annotations in the document, using the font set
     * and locale from the document's own open options.
     *
     * @param pdfDocument the document whose Highlight annotations need appearances
     * @throws Exception if appearance generation fails
     */
    private static void generateHighlightAppearances(PDFDocument pdfDocument) throws Exception {
        APResources apResources = new APResources(pdfDocument.getCosDocument().getOptions().getFontSet(),
                pdfDocument.getCosDocument().getOptions().getDocLocale(),
                new HashMap<Font, PDFFont>());
        APContext apContext = new APContext(apResources, true, null);
        apContext.setAnnotationsToBeProcessed(EnumSet.of(PDFAnnotationEnum.Highlight));
        AppearanceService.generateAppearances(pdfDocument, apContext, null);
    }

    /**
     * Builds the eight-number quad-point array for a highlight annotation from a word's first
     * bounding quad. The corners are written in the order p4, p3, p1, p2, each as x then y.
     *
     * <p>NOTE(review): only the first bounding quad is used; a word that reports more than one
     * quad would be highlighted only partially - confirm whether that can occur.
     *
     * @param word the word whose location is converted
     * @return an array of eight coordinates: p4.x, p4.y, p3.x, p3.y, p1.x, p1.y, p2.x, p2.y
     * @throws Exception if the word's bounding quads cannot be read
     */
    public static double[] wordQuadsToAnnotQuads(Word word) throws Exception {
        double[] quadPoints = new double[8];
        quadPoints[0] = word.getBoundingQuads().get(0).p4().x();
        quadPoints[1] = word.getBoundingQuads().get(0).p4().y();
        quadPoints[2] = word.getBoundingQuads().get(0).p3().x();
        quadPoints[3] = word.getBoundingQuads().get(0).p3().y();
        quadPoints[4] = word.getBoundingQuads().get(0).p1().x();
        quadPoints[5] = word.getBoundingQuads().get(0).p1().y();
        quadPoints[6] = word.getBoundingQuads().get(0).p2().x();
        quadPoints[7] = word.getBoundingQuads().get(0).p2().y();
        return quadPoints;
    }

}
	/*
	* Copyright Datalogics, Inc. 2015
	*/

	package pdfjt.cookbook.document;

	import com.adobe.fontengine.font.Font;
	import com.adobe.internal.io.ByteReader;
	import com.adobe.internal.io.InputStreamByteReader;
	import com.adobe.pdfjt.pdf.document.PDFDocument;
	import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
	import com.adobe.pdfjt.pdf.graphics.font.PDFFont;
	import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationEnum;
	import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationHighlight;
	import com.adobe.pdfjt.pdf.page.PDFPage;
	import com.adobe.pdfjt.pdf.page.PDFPageTree;
	import com.adobe.pdfjt.services.ap.AppearanceService;
	import com.adobe.pdfjt.services.ap.spi.APContext;
	import com.adobe.pdfjt.services.ap.spi.APResources;
	import com.adobe.pdfjt.services.manipulations.PMMOptions;
	import com.adobe.pdfjt.services.manipulations.PMMService;
	import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
	import com.adobe.pdfjt.services.textextraction.Word;
	import com.adobe.pdfjt.services.textextraction.WordsIterator;
	import com.datalogics.pdf.document.DocumentHelper;
	import com.datalogics.pdf.document.FontSetLoader;
	import java.io.InputStream;
	import java.net.URL;
	import java.net.URLConnection;
	import java.util.ArrayList;
	import java.util.EnumSet;
	import java.util.HashMap;
	import java.util.List;

	/**
	 * Searches a PDF for a term, highlights every matching word, then extracts all pages that
	 * contain at least one match into a new document.
	 *
	 * <p>The input PDF is downloaded from a fixed URL and the result is saved as
	 * {@code JavaScriptPages.pdf} under the configured output directory.
	 */
	public class ExtractPagesMatchingSearchTerms {

		private static final String INPUT_PDF_URL = "http://dev.datalogics.com/cookbook/document/PDF32000_2008.pdf";
		private static final String OUTPUT_DIR = "cookbook/Document/output/";
		private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11";

		/** Lower-case term; a word matches when its lower-cased text contains this string. */
		private static final String SEARCH_TERM = "javascript";

		/**
		 * Downloads the input PDF, highlights each word containing the search term, and saves the
		 * pages that contain a match as a new PDF.
		 *
		 * @param args unused
		 * @throws Exception if the download, text extraction, appearance generation, page
		 *         extraction, or save fails
		 */
		public static void main(String[] args) throws Exception {
			// First read in the PDF file
			URLConnection connection = new URL(INPUT_PDF_URL).openConnection();
			connection.setRequestProperty("User-Agent", USER_AGENT);
			connection.connect();

			// The stream stays open for the whole run because the document is built on top of it;
			// try-with-resources releases the connection even when processing fails.
			try (InputStream pdfStream = connection.getInputStream()) {
				ByteReader byteReader = new InputStreamByteReader(pdfStream);
				PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
				try {
					System.out.println("Pages Extracted:");
					List<PDFPage> pagesToExtract = highlightMatches(pdfDocument);

					// Without a match there is nothing to extract; stop before asking the page
					// service to build a document from an empty page array.
					if (pagesToExtract.isEmpty()) {
						System.out.println("No pages contain \"" + SEARCH_TERM + "\"; nothing to extract.");
						return;
					}

					generateHighlightAppearances(pdfDocument);

					// Now extract the pages
					PDFPage[] pages = pagesToExtract.toArray(new PDFPage[pagesToExtract.size()]);
					PMMService pmmService = new PMMService(pdfDocument);
					PDFDocument extractedPages = pmmService.extractPages(pages,
							PMMOptions.newInstance(PMMOptions.AnnotationsForms), PDFOpenOptions.newInstance());

					// Save and close
					DocumentHelper.saveFullAndClose(extractedPages, OUTPUT_DIR + "JavaScriptPages.pdf");
					System.out.println("Done!");
				} finally {
					// The source document was previously left open; release it on every path.
					pdfDocument.close();
				}
			}
		}

		/**
		 * Adds a Highlight annotation over every word that contains the search term and collects
		 * the pages those words are on. Each page number is printed once, when the page is first
		 * added to the result.
		 *
		 * @param pdfDocument the document to search and annotate
		 * @return the distinct pages with at least one match, in the order they were first matched
		 * @throws Exception if text extraction or annotation creation fails
		 */
		private static List<PDFPage> highlightMatches(PDFDocument pdfDocument) throws Exception {
			PDFPageTree pdfPageTree = pdfDocument.requirePages();

			// This List will hold the pages that need to be extracted
			List<PDFPage> pagesToExtract = new ArrayList<>();

			ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument,
					FontSetLoader.newInstance().getFontSet());
			WordsIterator wordsIterator = textExtractor.getWordsIterator();
			while (wordsIterator.hasNext()) {
				Word word = wordsIterator.next();
				// Locale.ROOT keeps the comparison independent of the JVM default locale
				// (e.g. a Turkish locale lower-cases 'I' to a dotless i and would miss matches).
				if (!word.toString().toLowerCase(java.util.Locale.ROOT).contains(SEARCH_TERM)) {
					continue;
				}

				/*
				 * Create a new Highlight annotation and use the location
				 * properties of the word to set the properties of the
				 * annotation.
				 */
				PDFAnnotationHighlight highlight = PDFAnnotationHighlight.newInstance(pdfDocument);
				highlight.setQuadPoints(wordQuadsToAnnotQuads(word));
				highlight.setColor(new double[] { 1, 0.819611, 0 }); // yellow to match Acrobat Highlights

				// The word's page number is shifted down by one to get the page-tree index.
				PDFPage pdfPage = pdfPageTree.getPage(word.getPageNumber() - 1);
				pdfPage.addAnnotation(highlight);

				// Add the page to the list of pages to extract if it's not already there.
				if (!pagesToExtract.contains(pdfPage)) {
					pagesToExtract.add(pdfPage);
					System.out.println(word.getPageNumber());
				}
			}
			return pagesToExtract;
		}

		/**
		 * Generates appearances for the Highlight annotations in the document, using the font set
		 * and locale from the document's own open options.
		 *
		 * @param pdfDocument the document whose Highlight annotations need appearances
		 * @throws Exception if appearance generation fails
		 */
		private static void generateHighlightAppearances(PDFDocument pdfDocument) throws Exception {
			APResources apResources = new APResources(pdfDocument.getCosDocument().getOptions().getFontSet(),
					pdfDocument.getCosDocument().getOptions().getDocLocale(),
					new HashMap<Font, PDFFont>());
			APContext apContext = new APContext(apResources, true, null);
			apContext.setAnnotationsToBeProcessed(EnumSet.of(PDFAnnotationEnum.Highlight));
			AppearanceService.generateAppearances(pdfDocument, apContext, null);
		}

		/**
		 * Builds the eight-number quad-point array for a highlight annotation from a word's first
		 * bounding quad. The corners are written in the order p4, p3, p1, p2, each as x then y.
		 *
		 * <p>NOTE(review): only the first bounding quad is used; a word that reports more than one
		 * quad would be highlighted only partially - confirm whether that can occur.
		 *
		 * @param word the word whose location is converted
		 * @return an array of eight coordinates: p4.x, p4.y, p3.x, p3.y, p1.x, p1.y, p2.x, p2.y
		 * @throws Exception if the word's bounding quads cannot be read
		 */
		public static double[] wordQuadsToAnnotQuads(Word word) throws Exception {
			double[] quadPoints = new double[8];
			quadPoints[0] = word.getBoundingQuads().get(0).p4().x();
			quadPoints[1] = word.getBoundingQuads().get(0).p4().y();
			quadPoints[2] = word.getBoundingQuads().get(0).p3().x();
			quadPoints[3] = word.getBoundingQuads().get(0).p3().y();
			quadPoints[4] = word.getBoundingQuads().get(0).p1().x();
			quadPoints[5] = word.getBoundingQuads().get(0).p1().y();
			quadPoints[6] = word.getBoundingQuads().get(0).p2().x();
			quadPoints[7] = word.getBoundingQuads().get(0).p2().y();
			return quadPoints;
		}

	}