Last active
June 10, 2016 21:30
-
-
Save JoelGeraci-Datalogics/024976f041520d29f0ab to your computer and use it in GitHub Desktop.
Finds and removes Watermarks and Backgrounds from all pages in a document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Copyright Datalogics, Inc. 2015
 */
package pdfjt.cookbook.document; | |
import com.adobe.internal.io.ByteReader; | |
import com.adobe.internal.io.ByteWriter; | |
import com.adobe.internal.io.InputStreamByteReader; | |
import com.adobe.pdfjt.core.cos.CosDictionary; | |
import com.adobe.pdfjt.core.types.ASName; | |
import com.adobe.pdfjt.pdf.content.Content; | |
import com.adobe.pdfjt.pdf.content.ContentReader; | |
import com.adobe.pdfjt.pdf.content.Instruction; | |
import com.adobe.pdfjt.pdf.content.OperandStack; | |
import com.adobe.pdfjt.pdf.contentmodify.ContentWriter; | |
import com.adobe.pdfjt.pdf.document.PDFContents; | |
import com.adobe.pdfjt.pdf.document.PDFDocument; | |
import com.adobe.pdfjt.pdf.document.PDFOpenOptions; | |
import com.adobe.pdfjt.pdf.document.PDFResources; | |
import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions; | |
import com.adobe.pdfjt.pdf.document.PDFSaveOptions; | |
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCGroup; | |
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCGroupArray; | |
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCGroupList; | |
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCMembership; | |
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCObject; | |
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCProperties; | |
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCUsage; | |
import com.adobe.pdfjt.pdf.graphics.xobject.PDFXObject; | |
import com.adobe.pdfjt.pdf.graphics.xobject.PDFXObjectMap; | |
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotation; | |
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationList; | |
import com.adobe.pdfjt.pdf.page.PDFPage; | |
import com.adobe.pdfjt.pdf.page.PDFPageTree; | |
import java.io.InputStream; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.ArrayList; | |
import java.util.Iterator; | |
import java.util.List; | |
import java.util.Set; | |
import pdfjt.util.SampleFileServices; | |
/**
 * Finds and removes Watermarks and Backgrounds from all pages in a document.
 *
 * What you need to know first:
 *
 * When creating a watermark in Adobe Acrobat and other conforming PDF creators,
 * the dialog box referenced below will create the watermark using one of two
 * methods. The method depends on the state of the bottom checkbox in the center
 * of the dialog:
 *
 * "Keep position and size of watermark text constant when printing on different page sizes"
 *
 * See the image at the link below.
 *
 * <img src=
 * "http://dev.datalogics.com/cookbook/javadoc_img/WatermarkKeepPositionAndSize.png"
 * />
 *
 * When the checkbox is off, Acrobat will use an XObject to add the watermark to
 * the page, which allows it to scale with the page. When on, Acrobat will add a
 * "Watermark" annotation to the page, allowing it to scale independently of the
 * underlying content.
 *
 * This sample looks for and removes watermarks created using both methods.
 */
public class FindAndRemoveWatermarks { | |
private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/PDFJT_Getting_Started_Guide_Watermarked.pdf"; | |
private static final String outputDir = "cookbook/Document/output/"; | |
public static void main(String[] args) throws Exception { | |
/* | |
* Read in PDF input file | |
*/ | |
URLConnection connection = new URL(inputPDFURL).openConnection(); | |
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); | |
connection.connect(); | |
InputStream fis = connection.getInputStream(); | |
ByteReader byteReader = new InputStreamByteReader(fis); | |
PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance()); | |
/* | |
* Watermarks are added at the page level so we need to iterate over the | |
* pages. | |
*/ | |
PDFPageTree pdfPages = pdfDocument.requirePages(); | |
Iterator<PDFPage> pageIterator = pdfPages.iterator(); | |
while (pageIterator.hasNext()) { | |
PDFPage pdfPage = (PDFPage) pageIterator.next(); | |
/* | |
* First look for Watermark Annotations. If we go in reverse order, | |
* we can delete them as we find them. | |
*/ | |
if (pdfPage.hasAnnotations()) { | |
PDFAnnotationList pdfAnnotationList = pdfPage.getAnnotationList(); | |
int annotationListSize = pdfAnnotationList.size(); | |
for (int i = 1; i < annotationListSize + 1; i++) { | |
PDFAnnotation pdfAnnotation = pdfAnnotationList.get(annotationListSize - i); | |
if (pdfAnnotation.getSubtype().asString().matches("Watermark")) { | |
pdfAnnotationList.remove(pdfAnnotation); | |
} | |
} | |
} | |
/* | |
* Next look for the XObjects. | |
*/ | |
pdfPage = findAndRemoveWatermarkOrBackgroundXObjects(pdfPage); | |
} | |
/* | |
* Clean up the Optional Content Properties Dictionary to remove layers | |
* referenced by the XObjects. | |
*/ | |
PDFOCProperties pdfOCProperties = pdfDocument.requireCatalog().getOCProperties(); | |
PDFOCGroupArray pdfOCGroupArray = pdfOCProperties.getOCGs(); | |
int pdfOCGroupArraySize = pdfOCGroupArray.size(); | |
for (int i = 1; i < pdfOCGroupArraySize + 1; i++) { | |
PDFOCGroup pdfOCGroup = pdfOCGroupArray.get(pdfOCGroupArraySize - i); | |
PDFOCUsage pdfOCUsage = pdfOCGroup.getUsage(); | |
if (isUsageForWatermarkOrBackground(pdfOCUsage)) { | |
pdfOCGroupArray.remove(pdfOCGroupArraySize - i); | |
} | |
} | |
/* | |
* If there are no OCGs left, just remove the whole Optional Content | |
* Properties Dictionary. | |
*/ | |
if (pdfOCGroupArray.size() == 0) { | |
pdfDocument.requireCatalog().removeValue(ASName.k_OCProperties); | |
} | |
// Save the file. | |
SampleFileServices.createDir(outputDir); | |
ByteWriter outputFile = SampleFileServices.getRAFByteWriter(outputDir + "WatermarksRemoved.pdf"); | |
PDFSaveOptions pdfSaveOptions = PDFSaveFullOptions.newInstance(); | |
pdfDocument.save(outputFile, pdfSaveOptions); | |
System.out.println("Done!"); | |
} | |
/** | |
* Iterates through the page resources detecting and removing watermarks and | |
* backgrounds. | |
* | |
* @param pdfPage | |
* The page in question. | |
* @return PDFpage | |
*/ | |
private static PDFPage findAndRemoveWatermarkOrBackgroundXObjects(PDFPage pdfPage) throws Exception { | |
PDFResources pdfResources = pdfPage.getResources(); | |
PDFXObjectMap pdfXObjectMap = pdfResources.getXObjectMap(); | |
/* | |
* We need to iterate over the XObjects in the resources but we can't | |
* modify the collection while iterating so we add the name of any | |
* XObjects that we want to remove to an ArrayList that we'll use later. | |
*/ | |
List<ASName> toRemoveList = new ArrayList<ASName>(); | |
Set<ASName> keys = pdfXObjectMap.keySet(); | |
Iterator<ASName> keyIterator = keys.iterator(); | |
while (keyIterator.hasNext()) { | |
ASName key = (ASName) keyIterator.next(); | |
PDFXObject pdfXObject = pdfXObjectMap.get(key); | |
/* | |
* Detect if the XObject is being used as a watermark or background. | |
* See method for further explanation. | |
*/ | |
if (isWatermarkOrBackgroundXObject(pdfXObject) == true) { | |
/* | |
* Add the name of the XObject to the list we created above. | |
*/ | |
toRemoveList.add(key); | |
/* | |
* Remove the instructions to pain the XObject. See method for | |
* further explanation. | |
*/ | |
PDFContents cleanedPageContent = removeXObjectPaintInstructionFromPage(pdfPage, key); | |
pdfPage.setContents(cleanedPageContent); | |
} | |
} | |
/* | |
* Now remove the XObjects we've found to be watermarks or backgrounds. | |
*/ | |
for (ASName removeMe : toRemoveList) { | |
pdfXObjectMap.remove(removeMe); | |
} | |
return pdfPage; | |
} | |
/** | |
* Determines if a PDFXObject is being used as a watermark or background as | |
* specified in the Usage dictionary. | |
* | |
* @param pdfXObject | |
* The XObject in question. | |
* @return boolean | |
*/ | |
private static boolean isWatermarkOrBackgroundXObject(PDFXObject pdfXObject) throws Exception { | |
/* | |
* XObjects can belong to Optional Content Groups (OCG). Assigning a | |
* watermark to an OCG allows the watermark to appear on screen but not | |
* in print or vice versa. We can use the Usage dictionary to determine | |
* if the XObject is a watermark. | |
*/ | |
boolean toReturn = false; | |
PDFOCGroup pdfOCGroup = null; | |
PDFOCUsage pdfOCUsage = null; | |
/* | |
* Find out what OCG the XObject is a member of. | |
*/ | |
if (PDFOCMembership.hasOC(pdfXObject)) { | |
/* | |
* The XObject can belong to a single OCG or be in a membership | |
* group but we can get the usage of either. | |
*/ | |
PDFOCObject pdfOCObject = PDFOCMembership.getOC(pdfXObject); | |
if (pdfOCObject.isOCG()) { | |
pdfOCGroup = PDFOCGroup.getInstance(pdfOCObject.getCosObject()); | |
} | |
if (pdfOCObject.isOCMD()) { | |
PDFOCMembership pdfOCMembership = PDFOCMembership.getInstance(pdfOCObject.getCosObject()); | |
PDFOCGroupList pdfOCGroupList = pdfOCMembership.getOCGs(); | |
pdfOCGroup = pdfOCGroupList.getSingleOCGroup(); | |
} | |
pdfOCUsage = pdfOCGroup.getUsage(); | |
if (pdfOCUsage != null) { | |
/* | |
* We need to dig around in the Usage dictionary to get the | |
* PageElement that the XObject is being used as. See method for | |
* further explanation. | |
*/ | |
toReturn = isUsageForWatermarkOrBackground(pdfOCUsage); | |
} | |
} | |
return toReturn; | |
} | |
/** | |
* Examines the PageElement dictionary in the XObject Usage dictionary to | |
* determine if a PDFXObject is being used as a watermark or background. | |
* | |
* @param pdfOCUsage | |
* The Usage Dictionary of the XObject in question. | |
* @return boolean | |
*/ | |
private static boolean isUsageForWatermarkOrBackground(PDFOCUsage pdfOCUsage) throws Exception { | |
/* | |
* Watermarks can be either a Background (behind the page content) or a | |
* Foreground (over the page content). So if the XObject has either a BG | |
* or FG as it's PageElement subtype, we know it's a watermark. | |
*/ | |
boolean toReturn = false; | |
CosDictionary pageElementDict = pdfOCUsage.getDictionaryDictionaryValue(ASName.k_PageElement); | |
String pageElementSubType = pageElementDict.get(ASName.k_Subtype).getValue().toString(); | |
if (pageElementSubType.matches("BG") || pageElementSubType.matches("FG")) { | |
toReturn = true; | |
} | |
return toReturn; | |
} | |
/** | |
* Creates a new set of Instructions as PDFContents minus drawing | |
* instructions for a particular XObject. | |
* | |
* @param pdfPage | |
* The page in question. | |
* @param key | |
* The ASName of the XObject in the page resources dictionary. | |
* @return PDFContents | |
*/ | |
private static PDFContents removeXObjectPaintInstructionFromPage(PDFPage pdfPage, ASName key) throws Exception { | |
/* | |
* It is not enough to simply remove the XObject from the resources, in | |
* order to prevent page rendering problems, you must also remove | |
* references to the XObject from the page content. | |
* | |
* Any XObject can be painted as part of another content stream by means | |
* of the Do operator. The syntax is the same in all cases, although | |
* details of the operator’s behavior differ depending on the type of | |
* XObject. There is a single operand and it's value is the ASName of | |
* the XObject. So, to locate and remove references to the XObject, we | |
* need to iterate through the page content painting instructions | |
* looking for "Do" operators and examining it's operands looking for | |
* one that matches the XObject's name. | |
* | |
* When we <b>don't</b> find a match, we write that instruction to a new | |
* ContentWriter creating a new set of instructions without references | |
* to the XObject. The results become our new PDFContents for that page. | |
*/ | |
Content pageContent = Content.newInstance(pdfPage); | |
ContentReader contentReader = ContentReader.newInstance(pageContent); | |
ContentWriter contentWriter = ContentWriter.newInstance(pdfPage.getPDFDocument()); | |
while (contentReader.hasNext()) { | |
Instruction instruction = contentReader.next(); | |
if (instruction.getOperator().asString().matches("Do") && instruction.getOperands().getSize() == 1) { | |
OperandStack operandStack = instruction.getOperands(); | |
String operandName = operandStack.peekName().asString(); | |
if (operandName.matches(key.asString()) == false) { | |
contentWriter.write(instruction); | |
} | |
} else { | |
contentWriter.write(instruction); | |
} | |
} | |
Content newContent = contentWriter.close(); | |
return newContent.getContents(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment