Last active
October 19, 2023 12:08
-
-
Save jribble/beddf7620536939f88db to your computer and use it in GitHub Desktop.
Flatten PDF documents using PDFBox
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package pdfutil; | |
import org.apache.pdfbox.cos.COSArray; | |
import org.apache.pdfbox.cos.COSDictionary; | |
import org.apache.pdfbox.cos.COSName; | |
import org.apache.pdfbox.cos.COSStream; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.pdmodel.PDDocumentCatalog; | |
import org.apache.pdfbox.pdmodel.PDPage; | |
import org.apache.pdfbox.pdmodel.PDResources; | |
import org.apache.pdfbox.pdmodel.common.COSArrayList; | |
import org.apache.pdfbox.pdmodel.common.COSObjectable; | |
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream; | |
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; | |
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget; | |
import org.apache.pdfbox.pdmodel.interactive.form.*; | |
import java.io.ByteArrayOutputStream; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.util.*; | |
public class PDFUtil { | |
public static void flattenPDF (PDDocument doc) throws IOException { | |
// | |
// find the fields and their kids (widgets) on the input document | |
// (each child widget represents an appearance of the field data on the page, there may be multiple appearances) | |
// | |
PDDocumentCatalog catalog = doc.getDocumentCatalog(); | |
PDAcroForm form = catalog.getAcroForm(); | |
List<PDField> tmpfields = form.getFields(); | |
PDResources formresources = form.getDefaultResources(); | |
Map formfonts = formresources.getFonts(); | |
PDAnnotation ann; | |
// | |
// for each input document page convert the field annotations on the page into | |
// content stream | |
// | |
List<PDPage> pages = catalog.getAllPages(); | |
Iterator<PDPage> pageiterator = pages.iterator(); | |
while (pageiterator.hasNext()) { | |
// | |
// get next page from input document | |
// | |
PDPage page = pageiterator.next(); | |
// | |
// add the fonts from the input form to this pages resources | |
// so the field values will display in the proper font | |
// | |
PDResources pageResources = page.getResources(); | |
Map pageFonts = pageResources.getFonts(); | |
pageFonts.putAll(formfonts); | |
pageResources.setFonts(pageFonts); | |
// | |
// Create a content stream for the page for appending | |
// | |
PDPageContentStream contentStream = new PDPageContentStream(doc, page, true, true); | |
// | |
// Find the appearance widgets for all fields on the input page and insert them into content stream of the page | |
// | |
for (PDField tmpfield : tmpfields) { | |
List widgets = tmpfield.getKids(); | |
if(widgets == null) { | |
widgets = new ArrayList(); | |
widgets.add(tmpfield.getWidget()); | |
} | |
Iterator<COSObjectable> widgetiterator = widgets.iterator(); | |
while (widgetiterator.hasNext()) { | |
COSObjectable next = widgetiterator.next(); | |
if (next instanceof PDField) { | |
PDField foundfield = (PDField) next; | |
ann = foundfield.getWidget(); | |
} else { | |
ann = (PDAnnotation) next; | |
} | |
if (ann.getPage().equals(page)) { | |
COSDictionary dict = ann.getDictionary(); | |
if (dict != null) { | |
if(tmpfield instanceof PDVariableText || tmpfield instanceof PDPushButton) { | |
COSDictionary ap = (COSDictionary) dict.getDictionaryObject("AP"); | |
if (ap != null) { | |
contentStream.appendRawCommands("q\n"); | |
COSArray rectarray = (COSArray) dict.getDictionaryObject("Rect"); | |
if (rectarray != null) { | |
float[] rect = rectarray.toFloatArray(); | |
String s = " 1 0 0 1 " + Float.toString(rect[0]) + " " + Float.toString(rect[1]) + " cm\n"; | |
contentStream.appendRawCommands(s); | |
} | |
COSStream stream = (COSStream) ap.getDictionaryObject("N"); | |
if (stream != null) { | |
InputStream ioStream = stream.getUnfilteredStream(); | |
ByteArrayOutputStream byteArray = new ByteArrayOutputStream(); | |
byte[] buffer = new byte[4096]; | |
int amountRead = 0; | |
while ((amountRead = ioStream.read(buffer, 0, buffer.length)) != -1) { | |
byteArray.write(buffer, 0, amountRead); | |
} | |
contentStream.appendRawCommands(byteArray.toString() + "\n"); | |
} | |
contentStream.appendRawCommands("Q\n"); | |
} | |
} else if (tmpfield instanceof PDChoiceButton) { | |
COSDictionary ap = (COSDictionary) dict.getDictionaryObject("AP"); | |
if(ap != null) { | |
contentStream.appendRawCommands("q\n"); | |
COSArray rectarray = (COSArray) dict.getDictionaryObject("Rect"); | |
if (rectarray != null) { | |
float[] rect = rectarray.toFloatArray(); | |
String s = " 1 0 0 1 " + Float.toString(rect[0]) + " " + Float.toString(rect[1]) + " cm\n"; | |
contentStream.appendRawCommands(s); | |
} | |
COSName cbValue = (COSName) dict.getDictionaryObject(COSName.AS); | |
COSDictionary d = (COSDictionary) ap.getDictionaryObject(COSName.D); | |
if (d != null) { | |
COSStream stream = (COSStream) d.getDictionaryObject(cbValue); | |
if(stream != null) { | |
InputStream ioStream = stream.getUnfilteredStream(); | |
ByteArrayOutputStream byteArray = new ByteArrayOutputStream(); | |
byte[] buffer = new byte[4096]; | |
int amountRead = 0; | |
while ((amountRead = ioStream.read(buffer, 0, buffer.length)) != -1) { | |
byteArray.write(buffer, 0, amountRead); | |
} | |
contentStream.appendRawCommands(byteArray.toString() + "\n"); | |
} | |
} | |
COSDictionary n = (COSDictionary) ap.getDictionaryObject(COSName.N); | |
if (n != null) { | |
COSStream stream = (COSStream) n.getDictionaryObject(cbValue); | |
if(stream != null) { | |
InputStream ioStream = stream.getUnfilteredStream(); | |
ByteArrayOutputStream byteArray = new ByteArrayOutputStream(); | |
byte[] buffer = new byte[4096]; | |
int amountRead = 0; | |
while ((amountRead = ioStream.read(buffer, 0, buffer.length)) != -1) { | |
byteArray.write(buffer, 0, amountRead); | |
} | |
contentStream.appendRawCommands(byteArray.toString() + "\n"); | |
} | |
} | |
contentStream.appendRawCommands("Q\n"); | |
} | |
} | |
} | |
} | |
} | |
} | |
// delete any field widget annotations and write it all to the page | |
// leave other annotations on the page | |
COSArrayList newanns = new COSArrayList(); | |
List anns = page.getAnnotations(); | |
ListIterator annotiterator = anns.listIterator(); | |
while (annotiterator.hasNext()) { | |
COSObjectable next = (COSObjectable) annotiterator.next(); | |
if (!(next instanceof PDAnnotationWidget)) { | |
newanns.add(next); | |
} | |
} | |
page.setAnnotations(newanns); | |
contentStream.close(); | |
} | |
// | |
// Delete all fields from the form and their widgets (kids) | |
// | |
for (PDField tmpfield : tmpfields) { | |
List kids = tmpfield.getKids(); | |
if(kids != null) kids.clear(); | |
} | |
tmpfields.clear(); | |
// Tell Adobe we don't have forms anymore. | |
PDDocumentCatalog pdCatalog = doc.getDocumentCatalog(); | |
PDAcroForm acroForm = pdCatalog.getAcroForm(); | |
COSDictionary acroFormDict = acroForm.getDictionary(); | |
COSArray cosFields = (COSArray) acroFormDict.getDictionaryObject("Fields"); | |
cosFields.clear(); | |
} | |
public static void main(String [] args) | |
{ | |
try { | |
// for testing | |
PDDocument doc = PDDocument.load("test.pdf"); | |
flattenPDF(doc); | |
doc.save("test_flattened.pdf"); | |
} | |
catch (Exception e) { | |
System.err.println("Exception: " + e.getLocalizedMessage()); | |
} | |
} | |
}; |
It works for me. Thank you!
Awesome, works as advertised!
Works as expected!
this is so close to being perfect... I lose checkboxes though when I run this.... any ideas?
Thanks. My simple case (code-filled form fields not showing up on iOS) solved by this.
Won't work for me either. I am using this file https://nofile.io/f/Y9YCkDV6u0b/test.pdf
Hi,
Well done for your job.
in my case I get a NullPointerException on
f.setValue(value);
Because
f.getAcroForm().getDefaultResources().getCOSObject().getCOSDictionary(COSName.FONT) == null
And it is used here where acroFormFontDict == null
Probably because the PDF is not well formed, but I'd like to catch that and apply a correction, for example by creating this dict, but how?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
form fields still not removed