Skip to content

Instantly share code, notes, and snippets.

@JoelGeraci-Datalogics
Last active July 27, 2021 17:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save JoelGeraci-Datalogics/fff1ab35719930576b25 to your computer and use it in GitHub Desktop.
Save JoelGeraci-Datalogics/fff1ab35719930576b25 to your computer and use it in GitHub Desktop.
Finds and removes Watermarks and Backgrounds from all pages in a PDF document
/*
* Copyright Datalogics, Inc. 2015
*/
package pdfjt.cookbook.document;
import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.ByteWriter;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.core.cos.CosDictionary;
import com.adobe.pdfjt.core.types.ASName;
import com.adobe.pdfjt.pdf.content.Content;
import com.adobe.pdfjt.pdf.content.ContentReader;
import com.adobe.pdfjt.pdf.content.Instruction;
import com.adobe.pdfjt.pdf.content.OperandStack;
import com.adobe.pdfjt.pdf.contentmodify.ContentWriter;
import com.adobe.pdfjt.pdf.document.PDFContents;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.document.PDFResources;
import com.adobe.pdfjt.pdf.document.PDFSaveFullOptions;
import com.adobe.pdfjt.pdf.document.PDFSaveOptions;
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCGroup;
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCGroupArray;
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCGroupList;
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCMembership;
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCObject;
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCProperties;
import com.adobe.pdfjt.pdf.graphics.optionalcontent.PDFOCUsage;
import com.adobe.pdfjt.pdf.graphics.xobject.PDFXObject;
import com.adobe.pdfjt.pdf.graphics.xobject.PDFXObjectMap;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotation;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationList;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.pdf.page.PDFPageTree;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import pdfjt.util.SampleFileServices;
/**
* Finds and removes Watermarks and Backgrounds from all pages in a document.
*
* What you need to know first:
*
* When creating a watermark in Adobe Acrobat and other conforming PDF creators,
* the dialog box referenced below will create the watermark using one of two
* methods. The method depends on the state of the bottom checkbox in the center
* of the screen:
*
* "Keep position and size of watermark text constant when printing on different page sizes"
*
* See image at the link below.
*
* <img src=
* "http://dev.datalogics.com/cookbook/javadoc_img/WatermarkKeepPositionAndSize.png"
* />
*
* When the checkbox is off, Acrobat will use an XObject to add the watermark to
* the page which allows it to scale with the page. When on, Acrobat will add a
* "Watermark" annotation to the page, allowing it to scale independently of the
* underlying content.
*
* This sample looks for and removes watermarks created using both methods.
*/
public class FindAndRemoveWatermarks {
    private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/PDFJT_Getting_Started_Guide_Watermarked.pdf";
    private static final String outputDir = "cookbook/Document/output/";

    /**
     * Downloads the sample PDF, strips watermark annotations and
     * watermark/background XObjects from every page, removes the optional
     * content groups that described them, and saves the result to
     * {@code outputDir + "WatermarksRemoved.pdf"}.
     *
     * @param args
     *            Not used.
     * @throws Exception
     *             If the download, the PDF processing or the save fails.
     */
    public static void main(String[] args) throws Exception {
        /*
         * Read in PDF input file
         */
        URLConnection connection = new URL(inputPDFURL).openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        connection.connect();
        InputStream fis = connection.getInputStream();
        PDFDocument pdfDocument = null;
        try {
            ByteReader byteReader = new InputStreamByteReader(fis);
            pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
            /*
             * Watermarks are added at the page level so we need to iterate
             * over the pages.
             */
            PDFPageTree pdfPages = pdfDocument.requirePages();
            Iterator<PDFPage> pageIterator = pdfPages.iterator();
            while (pageIterator.hasNext()) {
                PDFPage pdfPage = pageIterator.next();
                /*
                 * First look for Watermark Annotations, then for the XObjects.
                 */
                removeWatermarkAnnotations(pdfPage);
                findAndRemoveWatermarkOrBackgroundXObjects(pdfPage);
            }
            /*
             * Clean up the Optional Content Properties Dictionary to remove
             * layers referenced by the XObjects.
             */
            removeWatermarkOrBackgroundOCGs(pdfDocument);
            // Save the file.
            SampleFileServices.createDir(outputDir);
            // NOTE(review): the writer is not closed here because it is not
            // visible in this file whether save() already closes it - confirm
            // against the toolkit documentation.
            ByteWriter outputFile = SampleFileServices.getRAFByteWriter(outputDir + "WatermarksRemoved.pdf");
            PDFSaveOptions pdfSaveOptions = PDFSaveFullOptions.newInstance();
            pdfDocument.save(outputFile, pdfSaveOptions);
            System.out.println("Done!");
        } finally {
            /*
             * Release the document and the network stream even when
             * processing fails part way through.
             */
            try {
                if (pdfDocument != null) {
                    pdfDocument.close();
                }
            } finally {
                fis.close();
            }
        }
    }

    /**
     * Removes every annotation whose subtype is "Watermark" from the page.
     *
     * @param pdfPage
     *            The page in question.
     */
    private static void removeWatermarkAnnotations(PDFPage pdfPage) throws Exception {
        if (!pdfPage.hasAnnotations()) {
            return;
        }
        PDFAnnotationList pdfAnnotationList = pdfPage.getAnnotationList();
        /*
         * Walk the list from the end so that removing an entry does not
         * shift the entries that are still to be visited.
         */
        for (int i = pdfAnnotationList.size() - 1; i >= 0; i--) {
            PDFAnnotation pdfAnnotation = pdfAnnotationList.get(i);
            if ("Watermark".equals(pdfAnnotation.getSubtype().asString())) {
                pdfAnnotationList.remove(pdfAnnotation);
            }
        }
    }

    /**
     * Removes the optional content groups whose Usage marks them as a
     * watermark or background and, if no groups remain afterwards, removes
     * the whole Optional Content Properties dictionary from the catalog.
     * Documents without optional content are left untouched.
     *
     * @param pdfDocument
     *            The document in question.
     */
    private static void removeWatermarkOrBackgroundOCGs(PDFDocument pdfDocument) throws Exception {
        PDFOCProperties pdfOCProperties = pdfDocument.requireCatalog().getOCProperties();
        if (pdfOCProperties == null) {
            return;
        }
        PDFOCGroupArray pdfOCGroupArray = pdfOCProperties.getOCGs();
        if (pdfOCGroupArray == null) {
            return;
        }
        /*
         * Reverse order again, so entries can be deleted as they are found.
         */
        for (int i = pdfOCGroupArray.size() - 1; i >= 0; i--) {
            PDFOCGroup pdfOCGroup = pdfOCGroupArray.get(i);
            if (isUsageForWatermarkOrBackground(pdfOCGroup.getUsage())) {
                pdfOCGroupArray.remove(i);
            }
        }
        /*
         * If there are no OCGs left, just remove the whole Optional Content
         * Properties Dictionary.
         */
        if (pdfOCGroupArray.size() == 0) {
            pdfDocument.requireCatalog().removeValue(ASName.k_OCProperties);
        }
    }

    /**
     * Iterates through the page resources detecting and removing watermarks and
     * backgrounds. A page without resources or without XObjects is returned
     * unchanged.
     *
     * @param pdfPage
     *            The page in question.
     * @return The same page instance that was passed in.
     */
    private static PDFPage findAndRemoveWatermarkOrBackgroundXObjects(PDFPage pdfPage) throws Exception {
        PDFResources pdfResources = pdfPage.getResources();
        if (pdfResources == null) {
            return pdfPage;
        }
        PDFXObjectMap pdfXObjectMap = pdfResources.getXObjectMap();
        if (pdfXObjectMap == null) {
            return pdfPage;
        }
        /*
         * We need to iterate over the XObjects in the resources but we can't
         * modify the collection while iterating so we add the name of any
         * XObjects that we want to remove to an ArrayList that we'll use later.
         */
        List<ASName> toRemoveList = new ArrayList<ASName>();
        Set<ASName> keys = pdfXObjectMap.keySet();
        for (ASName key : keys) {
            PDFXObject pdfXObject = pdfXObjectMap.get(key);
            /*
             * Detect if the XObject is being used as a watermark or background.
             * See method for further explanation.
             */
            if (isWatermarkOrBackgroundXObject(pdfXObject)) {
                toRemoveList.add(key);
                /*
                 * Remove the instructions to paint the XObject. See method
                 * for further explanation.
                 */
                PDFContents cleanedPageContent = removeXObjectPaintInstructionFromPage(pdfPage, key);
                pdfPage.setContents(cleanedPageContent);
            }
        }
        /*
         * Now remove the XObjects we've found to be watermarks or backgrounds.
         */
        for (ASName removeMe : toRemoveList) {
            pdfXObjectMap.remove(removeMe);
        }
        return pdfPage;
    }

    /**
     * Determines if a PDFXObject is being used as a watermark or background as
     * specified in the Usage dictionary.
     *
     * @param pdfXObject
     *            The XObject in question.
     * @return {@code true} if the XObject belongs to an optional content group
     *         whose Usage marks it as a watermark or background;
     *         {@code false} otherwise, including when no group or no Usage
     *         can be resolved.
     */
    private static boolean isWatermarkOrBackgroundXObject(PDFXObject pdfXObject) throws Exception {
        /*
         * XObjects can belong to Optional Content Groups (OCG). Assigning a
         * watermark to an OCG allows the watermark to appear on screen but not
         * in print or vice versa. We can use the Usage dictionary to determine
         * if the XObject is a watermark.
         */
        if (!PDFOCMembership.hasOC(pdfXObject)) {
            return false;
        }
        PDFOCObject pdfOCObject = PDFOCMembership.getOC(pdfXObject);
        if (pdfOCObject == null) {
            return false;
        }
        /*
         * The XObject can belong to a single OCG or be in a membership group
         * but we can get the usage of either.
         */
        PDFOCGroup pdfOCGroup = null;
        if (pdfOCObject.isOCG()) {
            pdfOCGroup = PDFOCGroup.getInstance(pdfOCObject.getCosObject());
        }
        if (pdfOCObject.isOCMD()) {
            PDFOCMembership pdfOCMembership = PDFOCMembership.getInstance(pdfOCObject.getCosObject());
            PDFOCGroupList pdfOCGroupList = pdfOCMembership.getOCGs();
            if (pdfOCGroupList != null) {
                pdfOCGroup = pdfOCGroupList.getSingleOCGroup();
            }
        }
        if (pdfOCGroup == null) {
            return false;
        }
        /*
         * We need to dig around in the Usage dictionary to get the
         * PageElement that the XObject is being used as. See method for
         * further explanation.
         */
        return isUsageForWatermarkOrBackground(pdfOCGroup.getUsage());
    }

    /**
     * Examines the PageElement dictionary in the XObject Usage dictionary to
     * determine if a PDFXObject is being used as a watermark or background.
     *
     * @param pdfOCUsage
     *            The Usage Dictionary of the XObject in question; may be
     *            {@code null}.
     * @return {@code true} if the PageElement subtype is "BG" or "FG";
     *         {@code false} otherwise, including when the Usage, its
     *         PageElement entry or the Subtype entry is missing.
     */
    private static boolean isUsageForWatermarkOrBackground(PDFOCUsage pdfOCUsage) throws Exception {
        /*
         * Watermarks can be either a Background (behind the page content) or a
         * Foreground (over the page content). So if the XObject has either a BG
         * or FG as its PageElement subtype, we know it's a watermark.
         */
        if (pdfOCUsage == null) {
            return false;
        }
        CosDictionary pageElementDict = pdfOCUsage.getDictionaryDictionaryValue(ASName.k_PageElement);
        if (pageElementDict == null || pageElementDict.get(ASName.k_Subtype) == null) {
            return false;
        }
        String pageElementSubType = pageElementDict.get(ASName.k_Subtype).getValue().toString();
        return "BG".equals(pageElementSubType) || "FG".equals(pageElementSubType);
    }

    /**
     * Creates a new set of Instructions as PDFContents minus drawing
     * instructions for a particular XObject.
     *
     * @param pdfPage
     *            The page in question.
     * @param key
     *            The ASName of the XObject in the page resources dictionary.
     * @return The rewritten page contents, without any "Do" instruction that
     *         paints the named XObject.
     */
    private static PDFContents removeXObjectPaintInstructionFromPage(PDFPage pdfPage, ASName key) throws Exception {
        /*
         * It is not enough to simply remove the XObject from the resources, in
         * order to prevent page rendering problems, you must also remove
         * references to the XObject from the page content.
         *
         * Any XObject can be painted as part of another content stream by means
         * of the Do operator. The syntax is the same in all cases, although
         * details of the operator's behavior differ depending on the type of
         * XObject. There is a single operand and its value is the ASName of
         * the XObject. So, to locate and remove references to the XObject, we
         * need to iterate through the page content painting instructions
         * looking for "Do" operators and examining their operands looking for
         * one that matches the XObject's name.
         *
         * When we <b>don't</b> find a match, we write that instruction to a new
         * ContentWriter creating a new set of instructions without references
         * to the XObject. The results become our new PDFContents for that page.
         */
        String xObjectName = key.asString();
        Content pageContent = Content.newInstance(pdfPage);
        ContentReader contentReader = ContentReader.newInstance(pageContent);
        ContentWriter contentWriter = ContentWriter.newInstance(pdfPage.getPDFDocument());
        while (contentReader.hasNext()) {
            Instruction instruction = contentReader.next();
            boolean paintsTargetXObject = false;
            if ("Do".equals(instruction.getOperator().asString()) && instruction.getOperands().getSize() == 1) {
                OperandStack operandStack = instruction.getOperands();
                /*
                 * Compare names literally. Resource names are not regular
                 * expressions, so a name containing characters such as '+',
                 * '.' or '(' must still match only itself.
                 */
                paintsTargetXObject = xObjectName.equals(operandStack.peekName().asString());
            }
            if (!paintsTargetXObject) {
                contentWriter.write(instruction);
            }
        }
        Content newContent = contentWriter.close();
        return newContent.getContents();
    }
}
@coutPKprintf
Copy link

Hi, I want to find the com.adobe.pdfjt jar, but I can't find it. Can you tell me how to get it?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment