JoelGeraci-Datalogics/jsoupWithPDF_Java_Toolkit.java

## jsoupWithPDF_Java_Toolkit.java
/*
 * Copyright Datalogics, Inc. 2015
 */

package pdfjt.cookbook.document;

import java.awt.Color;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import com.adobe.pdfjt.core.types.ASString;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.interactive.action.PDFActionURI;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationLink;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFBorder;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.textextraction.Word;
import com.adobe.pdfjt.services.textextraction.WordsIterator;
import com.datalogics.pdf.document.DocumentHelper;
import com.datalogics.pdf.document.FontSetLoader;
import com.datalogics.pdf.layout.LayoutEngine;
import com.datalogics.pdf.text.Dimension;
import com.datalogics.pdf.text.Heading;
import com.datalogics.pdf.text.Length;
import com.datalogics.pdf.text.Paragraph;
import com.datalogics.pdf.text.Span;

/**
 * This sample reads an HTML file and adds text to a new PDF file.
 */
public class jsoupWithPDF_Java_Toolkit {

    private static final String inputURL = "https://jsoup.org/";
    private static final String outputDir = "cookbook/Document/output/";

    static public void main(String[] args) throws Exception {
        /*
         * We're using JSoup so we'll read the HTML file from their home page
         * and select one of the elements that has some content we want to
         * layout.
         */
        Document doc = Jsoup.connect(inputURL).userAgent("Mozilla").get();
        Element col1 = doc.select(".col1").first();

        // Create a new blank PDF file.
        PDFDocument pdfDocument = PDFDocument.newInstance(PDFOpenOptions.newInstance());
        /*
         * Create a new LayoutEngine object that will actually perform the
         * addition of text to the page.
         */
        try (LayoutEngine layout = new LayoutEngine(pdfDocument)) {
            // Set the font for the entire document
            layout.getStyle().setFontFamily("Helvetica");
            // Work our way through the HTML Element by Element
            for (Element element : col1.children()) {
                if (element.isBlock() && element.text().isEmpty() == false) {
                    String nodeName = element.nodeName().toLowerCase();
                    switch (nodeName) {
                    case "h1":
                    case "h2":
                    case "h3":
                    case "h4":
                        /*
                         * Add the text content of the <Hx> Element to the page
                         * as a Heading object.
                         *
                         */
                        layout.add(new Heading(element.text()));
                        break;
                    case "p":
                        if (element.childNodes().size() == 1) {
                            /*
                             * If there are no children in the <p>, just add the
                             * text to the page as a Paragraph.
                             */
                            layout.add(new Paragraph(element.text()));
                        } else {
                            Paragraph para = new Paragraph();
                            /*
                             * If there are children in the <p> create a
                             * Paragraph object and add Span objects to it. If
                             * they are links, make them blue.
                             */
                            for (Node childNode : element.childNodes()) {
                                if (childNode.nodeName().matches("a")) {
                                    Element nodeAsElement = (Element) childNode;
                                    Span span = new Span(nodeAsElement.text());
                                    span.getStyle().setColor(Color.BLUE);
                                    para.add(span);
                                } else {
                                    if (childNode.nodeName().matches("#text")) {
                                        para.add(new Span(childNode.toString()));
                                    } else {
                                        Element nodeAsElement = (Element) childNode;
                                        para.add(new Span(nodeAsElement.text()));
                                    }
                                }
                            }
                            layout.add(para);
                        }
                        break;
                    case "ul":
                        /*
                         * Talkeetna doesn't do lists yet so we'll fake it and
                         * just append a bullet or number as needed.
                         */
                        for (Element li : element.children()) {
                            Paragraph para = new Paragraph("\u2022  " + li.text());
                            para.getStyle().setTextIndent(new Length(18, Dimension.PT));
                            para.getStyle().setFontSize(new Length(10, Dimension.PT));
                            para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
                            layout.add(para);
                        }
                        Paragraph para = new Paragraph("");
                        para.getStyle().setTextIndent(new Length(18, Dimension.PT));
                        para.getStyle().setFontSize(new Length(10, Dimension.PT));
                        para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
                        layout.add(para);
                        break;
                    case "ol":
                        int i = 1;
                        for (Element li : element.children()) {
                            para = new Paragraph(String.valueOf(i) + "  " + li.text());
                            para.getStyle().setTextIndent(new Length(18, Dimension.PT));
                            para.getStyle().setFontSize(new Length(10, Dimension.PT));
                            para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
                            layout.add(para);
                            i++;
                        }
                        para = new Paragraph("");
                        para.getStyle().setTextIndent(new Length(18, Dimension.PT));
                        para.getStyle().setFontSize(new Length(10, Dimension.PT));
                        para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
                        layout.add(para);
                        break;

                    case "pre":
                        String[] lines = StringUtils.split(element.text(), System.lineSeparator());
                        for (String line : lines) {
                            Paragraph pre = new Paragraph(line);
                            pre.getStyle().setColor(Color.GRAY);
                            pre.getStyle().setFontFamily("Courier");
                            pre.getStyle().setTextIndent(new Length(18, Dimension.PT));
                            pre.getStyle().setFontSize(new Length(10, Dimension.PT));
                            pre.getStyle().setMarginBottom(new Length(0, Dimension.PT));
                            layout.add(pre);
                        }
                        para = new Paragraph("");
                        para.getStyle().setTextIndent(new Length(18, Dimension.PT));
                        para.getStyle().setFontSize(new Length(10, Dimension.PT));
                        para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
                        layout.add(para);
                        break;
                    }
                }
            }
        }
        /*
         * Now collect the words that we just layed out. We'll use the list to locate where the text inside <a> tags are on the page.
         */
        ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument,
                FontSetLoader.newInstance().getFontSet());
        WordsIterator wordsIterator = textExtractor.getWordsIterator();
        List<Word> wordsArray = new ArrayList<Word>();
        List<String> stringsArray = new ArrayList<String>();

        while (wordsIterator.hasNext()) {
            Word word = wordsIterator.next();
            if (word.toString().matches(" ") == false && word.toString().contains("\n") == false) {
                wordsArray.add(word);
                // Strip punctuation
                stringsArray.add(word.toString().replaceAll("[^a-zA-Z0-9 ]", ""));
            }
        }

        /*
         * Select <a> Elements and then locate the text inside them on the page.
         */
        Elements links = col1.select("a[href]");
        int start = 0;
        for (Element link : links) {
            if (link.parent().nodeName().contains("li") == false) {
                if (link.hasText()) {
                    for (String linkWord : link.text().split(" ")) {
                        int position = stringsArray.subList(start, stringsArray.size()).indexOf(linkWord);
                        Word word = wordsArray.get(position + start);
                        /*
                         * Add a link to the page based on the bounding quads of
                         * the Word. Set the destination to be the same as the
                         * href in the <a> tag.
                         */
                        PDFAnnotationLink pdfAnnotationLink = PDFAnnotationLink.newInstance(pdfDocument);
                        pdfAnnotationLink.setRect(word.getBoundingQuads().get(0).p1().x(), word.getBoundingQuads().get(0).p1().y(),
                                word.getBoundingQuads().get(0).p3().x(), word.getBoundingQuads().get(0).p3().y());
                        PDFActionURI pdfActionURI = PDFActionURI.newInstance(pdfDocument);
                        pdfActionURI.setURI(new ASString(link.absUrl("href")));
                        pdfAnnotationLink.setAction(pdfActionURI);
                        PDFBorder pdfBorder = PDFBorder.newInstance(pdfDocument);
                        pdfBorder.setWidth(0);
                        pdfAnnotationLink.setBorder(pdfBorder);
                        PDFPage pdfPage = pdfDocument.requirePages().getPage(word.getPageNumber() - 1);
                        pdfPage.addAnnotation(pdfAnnotationLink);
                        if (position > 0) {
                            position += start;
                            start = position;
                        }
                    }
                }
            }
        }

        // Save and close
        DocumentHelper.saveFullAndClose(pdfDocument, outputDir + "jsoup_Output.pdf");

        // Save the file.
        System.out.println("Done!");
    }

}
	/*
	* Copyright Datalogics, Inc. 2015
	*/

	package pdfjt.cookbook.document;

	import java.awt.Color;
	import java.util.ArrayList;
	import java.util.List;

	import org.apache.commons.lang3.StringUtils;
	import org.jsoup.Jsoup;
	import org.jsoup.nodes.Document;
	import org.jsoup.nodes.Element;
	import org.jsoup.nodes.Node;
	import org.jsoup.select.Elements;

	import com.adobe.pdfjt.core.types.ASString;
	import com.adobe.pdfjt.pdf.document.PDFDocument;
	import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
	import com.adobe.pdfjt.pdf.interactive.action.PDFActionURI;
	import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationLink;
	import com.adobe.pdfjt.pdf.interactive.annotation.PDFBorder;
	import com.adobe.pdfjt.pdf.page.PDFPage;
	import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
	import com.adobe.pdfjt.services.textextraction.Word;
	import com.adobe.pdfjt.services.textextraction.WordsIterator;
	import com.datalogics.pdf.document.DocumentHelper;
	import com.datalogics.pdf.document.FontSetLoader;
	import com.datalogics.pdf.layout.LayoutEngine;
	import com.datalogics.pdf.text.Dimension;
	import com.datalogics.pdf.text.Heading;
	import com.datalogics.pdf.text.Length;
	import com.datalogics.pdf.text.Paragraph;
	import com.datalogics.pdf.text.Span;

	/**
	* This sample reads an HTML file and adds text to a new PDF file.
	*/
	public class jsoupWithPDF_Java_Toolkit {

	private static final String inputURL = "https://jsoup.org/";
	private static final String outputDir = "cookbook/Document/output/";

	static public void main(String[] args) throws Exception {
	/*
	* We're using JSoup so we'll read the HTML file from their home page
	* and select one of the elements that has some content we want to
	* layout.
	*/
	Document doc = Jsoup.connect(inputURL).userAgent("Mozilla").get();
	Element col1 = doc.select(".col1").first();

	// Create a new blank PDF file.
	PDFDocument pdfDocument = PDFDocument.newInstance(PDFOpenOptions.newInstance());
	/*
	* Create a new LayoutEngine object that will actually perform the
	* addition of text to the page.
	*/
	try (LayoutEngine layout = new LayoutEngine(pdfDocument)) {
	// Set the font for the entire document
	layout.getStyle().setFontFamily("Helvetica");
	// Work our way through the HTML Element by Element
	for (Element element : col1.children()) {
	if (element.isBlock() && element.text().isEmpty() == false) {
	String nodeName = element.nodeName().toLowerCase();
	switch (nodeName) {
	case "h1":
	case "h2":
	case "h3":
	case "h4":
	/*
	* Add the text content of the <Hx> Element to the page
	* as a Heading object.
	*
	*/
	layout.add(new Heading(element.text()));
	break;
	case "p":
	if (element.childNodes().size() == 1) {
	/*
	* If there are no children in the <p>, just add the
	* text to the page as a Paragraph.
	*/
	layout.add(new Paragraph(element.text()));
	} else {
	Paragraph para = new Paragraph();
	/*
	* If there are children in the <p> create a
	* Paragraph object and add Span objects to it. If
	* they are links, make them blue.
	*/
	for (Node childNode : element.childNodes()) {
	if (childNode.nodeName().matches("a")) {
	Element nodeAsElement = (Element) childNode;
	Span span = new Span(nodeAsElement.text());
	span.getStyle().setColor(Color.BLUE);
	para.add(span);
	} else {
	if (childNode.nodeName().matches("#text")) {
	para.add(new Span(childNode.toString()));
	} else {
	Element nodeAsElement = (Element) childNode;
	para.add(new Span(nodeAsElement.text()));
	}
	}
	}
	layout.add(para);
	}
	break;
	case "ul":
	/*
	* Talkeetna doesn't do lists yet so we'll fake it and
	* just append a bullet or number as needed.
	*/
	for (Element li : element.children()) {
	Paragraph para = new Paragraph("\u2022 " + li.text());
	para.getStyle().setTextIndent(new Length(18, Dimension.PT));
	para.getStyle().setFontSize(new Length(10, Dimension.PT));
	para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
	layout.add(para);
	}
	Paragraph para = new Paragraph("");
	para.getStyle().setTextIndent(new Length(18, Dimension.PT));
	para.getStyle().setFontSize(new Length(10, Dimension.PT));
	para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
	layout.add(para);
	break;
	case "ol":
	int i = 1;
	for (Element li : element.children()) {
	para = new Paragraph(String.valueOf(i) + " " + li.text());
	para.getStyle().setTextIndent(new Length(18, Dimension.PT));
	para.getStyle().setFontSize(new Length(10, Dimension.PT));
	para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
	layout.add(para);
	i++;
	}
	para = new Paragraph("");
	para.getStyle().setTextIndent(new Length(18, Dimension.PT));
	para.getStyle().setFontSize(new Length(10, Dimension.PT));
	para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
	layout.add(para);
	break;

	case "pre":
	String[] lines = StringUtils.split(element.text(), System.lineSeparator());
	for (String line : lines) {
	Paragraph pre = new Paragraph(line);
	pre.getStyle().setColor(Color.GRAY);
	pre.getStyle().setFontFamily("Courier");
	pre.getStyle().setTextIndent(new Length(18, Dimension.PT));
	pre.getStyle().setFontSize(new Length(10, Dimension.PT));
	pre.getStyle().setMarginBottom(new Length(0, Dimension.PT));
	layout.add(pre);
	}
	para = new Paragraph("");
	para.getStyle().setTextIndent(new Length(18, Dimension.PT));
	para.getStyle().setFontSize(new Length(10, Dimension.PT));
	para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
	layout.add(para);
	break;
	}
	}
	}
	}
	/*
	* Now collect the words that we just layed out. We'll use the list to locate where the text inside <a> tags are on the page.
	*/
	ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument,
	FontSetLoader.newInstance().getFontSet());
	WordsIterator wordsIterator = textExtractor.getWordsIterator();
	List<Word> wordsArray = new ArrayList<Word>();
	List<String> stringsArray = new ArrayList<String>();

	while (wordsIterator.hasNext()) {
	Word word = wordsIterator.next();
	if (word.toString().matches(" ") == false && word.toString().contains("\n") == false) {
	wordsArray.add(word);
	// Strip punctuation
	stringsArray.add(word.toString().replaceAll("[^a-zA-Z0-9 ]", ""));
	}
	}

	/*
	* Select <a> Elements and then locate the text inside them on the page.
	*/
	Elements links = col1.select("a[href]");
	int start = 0;
	for (Element link : links) {
	if (link.parent().nodeName().contains("li") == false) {
	if (link.hasText()) {
	for (String linkWord : link.text().split(" ")) {
	int position = stringsArray.subList(start, stringsArray.size()).indexOf(linkWord);
	Word word = wordsArray.get(position + start);
	/*
	* Add a link to the page based on the bounding quads of
	* the Word. Set the destination to be the same as the
	* href in the <a> tag.
	*/
	PDFAnnotationLink pdfAnnotationLink = PDFAnnotationLink.newInstance(pdfDocument);
	pdfAnnotationLink.setRect(word.getBoundingQuads().get(0).p1().x(), word.getBoundingQuads().get(0).p1().y(),
	word.getBoundingQuads().get(0).p3().x(), word.getBoundingQuads().get(0).p3().y());
	PDFActionURI pdfActionURI = PDFActionURI.newInstance(pdfDocument);
	pdfActionURI.setURI(new ASString(link.absUrl("href")));
	pdfAnnotationLink.setAction(pdfActionURI);
	PDFBorder pdfBorder = PDFBorder.newInstance(pdfDocument);
	pdfBorder.setWidth(0);
	pdfAnnotationLink.setBorder(pdfBorder);
	PDFPage pdfPage = pdfDocument.requirePages().getPage(word.getPageNumber() - 1);
	pdfPage.addAnnotation(pdfAnnotationLink);
	if (position > 0) {
	position += start;
	start = position;
	}
	}
	}
	}
	}

	// Save and close
	DocumentHelper.saveFullAndClose(pdfDocument, outputDir + "jsoup_Output.pdf");

	// Save the file.
	System.out.println("Done!");
	}

	}