Created
June 2, 2016 17:38
-
-
Save JoelGeraci-Datalogics/d7fc3faa1112345201b6ed1e62aeb049 to your computer and use it in GitHub Desktop.
This sample locates words and adds Polygon annotations over the word quads.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright Datalogics, Inc. 2015 | |
*/ | |
package pdfjt.cookbook.document; | |
import com.adobe.fontengine.font.Font; | |
import com.adobe.internal.io.ByteReader; | |
import com.adobe.internal.io.InputStreamByteReader; | |
import com.adobe.pdfjt.core.types.ASName; | |
import com.adobe.pdfjt.core.types.ASQuad; | |
import com.adobe.pdfjt.pdf.document.PDFDocument; | |
import com.adobe.pdfjt.pdf.document.PDFOpenOptions; | |
import com.adobe.pdfjt.pdf.graphics.font.PDFFont; | |
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationEnum; | |
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationPolygon; | |
import com.adobe.pdfjt.pdf.interactive.annotation.PDFBorder; | |
import com.adobe.pdfjt.pdf.page.PDFPage; | |
import com.adobe.pdfjt.services.ap.AppearanceService; | |
import com.adobe.pdfjt.services.ap.spi.APContext; | |
import com.adobe.pdfjt.services.ap.spi.APResources; | |
import com.adobe.pdfjt.services.textextraction.TextExtractor; | |
import com.adobe.pdfjt.services.textextraction.Word; | |
import com.adobe.pdfjt.services.textextraction.WordsIterator; | |
import com.datalogics.pdf.document.DocumentHelper; | |
import com.datalogics.pdf.document.FontSetLoader; | |
import java.io.InputStream; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.EnumSet; | |
import java.util.HashMap; | |
import java.util.List; | |
import org.apache.commons.lang3.ArrayUtils; | |
/** | |
* This sample locates words and adds Polygon annotations over the word quads. | |
*/ | |
public class Quadrilaterals { | |
private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/Quadrilaterals.pdf"; | |
private static final String outputDir = "cookbook/Document/output/"; | |
private static final double red[] = { 1.0, 0, 0 }; //RGB Red | |
static public void main(String[] args) throws Exception { | |
// First read in the PDF file | |
URLConnection connection = new URL(inputPDFURL).openConnection(); | |
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); | |
connection.connect(); | |
InputStream fis = connection.getInputStream(); | |
ByteReader byteReader = new InputStreamByteReader(fis); | |
PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance()); | |
// Then get the first (and only) page in the file. We'll need this object in order to add annotations to it. | |
PDFPage pdfPageOne = pdfDocument.requirePages().getPage(0); | |
TextExtractor textExtractor = TextExtractor.newInstance(pdfDocument, FontSetLoader.newInstance().getFontSet()); | |
WordsIterator wordsIterator = textExtractor.getWordsIterator(); | |
System.out.println("Words Found:"); | |
while (wordsIterator.hasNext()) { | |
/* | |
* A Word object contains Unicode string representing the word | |
* along with a list of bounding boxes as needed to enclose all | |
* of the characters. If the characters of the word are | |
* colinear, then there might only be one bounding box covering | |
* the entire word. If the characters follow a curved path, then | |
* a list of bounding boxes, potentially one for each character, | |
* will be returned. | |
*/ | |
Word word = wordsIterator.next(); | |
System.out.println(word.toString()); | |
List<ASQuad> quads = word.getBoundingQuads(); | |
/* | |
* Now we iterate over each word quad and create a Polygon annotation using the same coordinates. | |
*/ | |
for (ASQuad quad : quads) { | |
double[] verticies = quad.getValues(); | |
ArrayUtils.add(verticies, quad.p1().x()); | |
ArrayUtils.add(verticies, quad.p1().y()); | |
PDFAnnotationPolygon pdfAnnotationPolygon = PDFAnnotationPolygon.newInstance(pdfDocument); | |
pdfAnnotationPolygon.setDictionaryArrayValue(ASName.k_Vertices, verticies); | |
pdfAnnotationPolygon.setColor(red); | |
PDFBorder pdfBorder = PDFBorder.newInstance(pdfDocument); | |
pdfBorder.setWidth(0); | |
pdfAnnotationPolygon.setBorder(pdfBorder); | |
pdfPageOne.addAnnotation(pdfAnnotationPolygon); | |
} | |
} | |
// Now create the appearances of the Polygon annotations | |
APResources apResources = new APResources(pdfDocument.getCosDocument().getOptions().getFontSet(), | |
pdfDocument.getCosDocument().getOptions().getDocLocale(), | |
new HashMap<Font, PDFFont>()); | |
APContext apContext = new APContext(apResources, true, null); | |
apContext.setAnnotationsToBeProcessed(EnumSet.of(PDFAnnotationEnum.Polygon)); | |
AppearanceService.generateAppearances(pdfDocument, apContext, null); | |
// Save and close | |
DocumentHelper.saveFullAndClose(pdfDocument, outputDir+"Quadrilaterals.pdf"); | |
// Save the file. | |
System.out.println("Done!"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment