Skip to content

Instantly share code, notes, and snippets.

@JoelGeraci-Datalogics
Created October 23, 2016 22:26
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save JoelGeraci-Datalogics/c0b57d4f9a8c2a219f88f2f9227271e8 to your computer and use it in GitHub Desktop.
Search and Redact a PDF Using RegEx
/*
* Copyright Datalogics, Inc. 2015
*/
package pdfjt.cookbook.document;
import com.adobe.fontengine.font.Font;
import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.graphics.font.PDFFont;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationEnum;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationRedaction;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.ap.AppearanceService;
import com.adobe.pdfjt.services.ap.spi.APContext;
import com.adobe.pdfjt.services.ap.spi.APResources;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.redaction.RedactionOptions;
import com.adobe.pdfjt.services.redaction.RedactionService;
import com.adobe.pdfjt.services.textextraction.Word;
import com.adobe.pdfjt.services.textextraction.WordsIterator;
import com.datalogics.pdf.document.FontSetLoader;
import com.datalogics.pdf.samples.util.IoUtils;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.regex.Pattern;
/**
 * Searches a PDF for phone numbers and redacts them.
 *
 * <p>The sample downloads the input PDF, walks the words returned by the reading-order text
 * extractor, places a Redaction annotation over every word that matches the phone number pattern,
 * generates the annotation appearances, and finally applies the redactions while writing
 * {@code SearchAndRedactUsingRegEx_Output.pdf}.
 */
public class SearchAndRedactUsingRegEx {
    private static final String INPUT_PDF_URL = "http://dev.datalogics.com/cookbook/document/SearchAndRedactUsingRegEx_Input.pdf";

    /**
     * Phone number pattern: three digits, three digits and four digits, with each group separated
     * by a single hyphen, dot or whitespace character (for example {@code 555-123-4567}).
     * Compiled once so the expression is not re-parsed for every word in the document.
     */
    private static final Pattern PHONE_NUMBER = Pattern.compile("\\d{3}[-\\.\\s]\\d{3}[-\\.\\s]\\d{4}");

    /**
     * Runs the sample.
     *
     * @param args command line arguments; not used
     * @throws Exception if the download, the text extraction, the appearance generation or the
     *         redaction fails
     */
    public static void main(String[] args) throws Exception {
        // First read in the PDF file
        URLConnection connection = new URL(INPUT_PDF_URL).openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        connection.connect();

        // The stream backs the byte reader, so it is kept open for all of the document work and
        // closed afterwards, including when an exception is thrown.
        try (InputStream inputStream = connection.getInputStream()) {
            ByteReader byteReader = new InputStreamByteReader(inputStream);
            PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());

            // Then get the first page in the file (the sample input is described as having only
            // one page). We'll need this object in order to add annotations to it.
            // NOTE(review): every annotation is added to this page regardless of where the word
            // was found - confirm before using this on multi-page documents.
            PDFPage pdfPageOne = pdfDocument.requirePages().getPage(0);

            ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument, FontSetLoader.newInstance().getFontSet());
            WordsIterator wordsIterator = textExtractor.getWordsIterator();

            System.out.println("Phone Numbers Found:");
            while (wordsIterator.hasNext()) {
                Word word = wordsIterator.next();
                String text = word.toString();

                // Only words that match the phone number pattern in full are redacted.
                if (!PHONE_NUMBER.matcher(text).matches()) {
                    continue;
                }
                System.out.println(text);

                /*
                 * Create a new Redaction annotation and use the location
                 * properties of the word to set the properties of the
                 * annotation.
                 */
                PDFAnnotationRedaction pdfAnnotationRedaction = PDFAnnotationRedaction.newInstance(pdfDocument);
                pdfAnnotationRedaction.setQuadPoints(wordQuadsToAnnotQuads(word));
                pdfAnnotationRedaction.setRect(pdfAnnotationRedaction.getRedactionAreaBBox());
                pdfAnnotationRedaction.setColor(new double[] { 1, 0, 0 }); // red
                pdfAnnotationRedaction.setInteriorColor(new double[] { 0, 0, 0 }); // black
                pdfPageOne.addAnnotation(pdfAnnotationRedaction);
            }

            // Now create the appearances of the Redaction annotations
            APResources apResources = new APResources(pdfDocument.getCosDocument().getOptions().getFontSet(),
                    pdfDocument.getCosDocument().getOptions().getDocLocale(),
                    new HashMap<Font, PDFFont>());
            APContext apContext = new APContext(apResources, true, null);
            apContext.setAnnotationsToBeProcessed(EnumSet.of(PDFAnnotationEnum.Redact));
            AppearanceService.generateAppearances(pdfDocument, apContext, null);

            // Apply the redactions
            RedactionService.applyRedaction(pdfDocument,
                    new RedactionOptions(null),
                    IoUtils.newByteWriter(IoUtils.createUrlFromPath("SearchAndRedactUsingRegEx_Output.pdf")));
        }
        System.out.println("Done!");
    }

    /**
     * Converts the first bounding quad of a word into the flat coordinate array passed to
     * {@code PDFAnnotationRedaction.setQuadPoints}.
     *
     * <p>NOTE(review): only the quad at index 0 is used. If a word can report more than one
     * bounding quad, the remaining quads are not covered by the redaction - confirm against the
     * toolkit documentation.
     *
     * @param word the word whose first bounding quad is converted
     * @return an array of eight values: the x and y coordinates of the quad's points
     *         {@code p1}, {@code p2}, {@code p3} and {@code p4}, in that order
     * @throws Exception declared for callers; also note that {@code get(0)} fails if the word
     *         reports no bounding quads
     */
    public static double[] wordQuadsToAnnotQuads(Word word) throws Exception {
        double[] quadPoints = new double[8];
        quadPoints[0] = word.getBoundingQuads().get(0).p1().x();
        quadPoints[1] = word.getBoundingQuads().get(0).p1().y();
        quadPoints[2] = word.getBoundingQuads().get(0).p2().x();
        quadPoints[3] = word.getBoundingQuads().get(0).p2().y();
        quadPoints[4] = word.getBoundingQuads().get(0).p3().x();
        quadPoints[5] = word.getBoundingQuads().get(0).p3().y();
        quadPoints[6] = word.getBoundingQuads().get(0).p4().x();
        quadPoints[7] = word.getBoundingQuads().get(0).p4().y();
        return quadPoints;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment