Created
December 17, 2022 20:49
-
-
Save ashishnitinpatil/4fca4c031f0dbf6c9d287ecaf3d4e860 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# Edits minor text in a PDF while preserving original metadata
# Thanks to https://stackoverflow.com/a/69439785/2689986 & https://stackoverflow.com/a/49053629/2689986
from PyPDF2 import PdfFileReader, PdfFileWriter | |
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject, NameObject | |
# Path of the PDF that is read as the original document.
source_fp = "original.pdf"
# Path the edited copy is written to (opened in binary write mode).
destination_fp = "modified.pdf"
# Literal text to look for in the page content -> text substituted for it.
replacements = {"original text": "modified text"}
def replace_text(content, replacements=None):
    """Apply literal substitutions to the text-showing lines of a content stream.

    The stream is scanned line by line. A line is rewritten only when it lies
    after a line equal to ``BT`` and before the next line equal to ``ET``, and
    its last two characters are ``Tj``/``TJ`` (compared case-insensitively).
    On such a line every key of *replacements* is replaced by its value with
    ``str.replace``, in the mapping's iteration order. All other lines are
    copied through unchanged.

    Args:
        content: The decoded content stream as a ``str``.
        replacements: Mapping of text to find -> text to substitute. ``None``
            (the default) or an empty mapping leaves every line unchanged.

    Returns:
        The rebuilt stream with every line terminated by a single newline
        character. ``str.splitlines`` discards the original separators, so
        they are normalised. Empty *content* yields an empty string.
    """
    # A None default avoids sharing one mutable dict object between calls.
    if replacements is None:
        replacements = {}

    rebuilt = []
    in_text = False
    for line in content.splitlines():
        if line == "BT":
            in_text = True
        elif line == "ET":
            in_text = False
        elif in_text and line[-2:].lower() == "tj":
            # Only text-showing operators inside a BT ... ET section are edited.
            for old, new in replacements.items():
                line = line.replace(old, new)
        rebuilt.append(line + "\n")

    # Join once instead of growing a string with += on every line.
    return "".join(rebuilt)
def process_data(object, replacements):
    """Run the text replacements over one stream object and store the result.

    The stream's bytes are fetched with ``object.getData()``, decoded as
    UTF-8, passed through ``replace_text`` together with *replacements*, and
    re-encoded as UTF-8. The new bytes are stored with ``setData`` on
    ``object.decodedSelf`` when that attribute is not ``None``, otherwise on
    *object* itself.

    Decoding is strict, so data that is not valid UTF-8 raises
    ``UnicodeDecodeError``.
    """
    stream_text = object.getData().decode("utf-8")
    new_payload = replace_text(stream_text, replacements).encode("utf-8")

    # Pick the object that receives the edited bytes: the decoded copy when
    # one exists, the stream itself otherwise.
    target = object if object.decodedSelf is None else object.decodedSelf
    target.setData(new_payload)
reader = PdfFileReader(source_fp)
writer = PdfFileWriter()
stream_types = (DecodedStreamObject, EncodedStreamObject)

# copy (& replace as necessary) from original
for page_number in range(reader.getNumPages()):
    page = reader.getPage(page_number)
    contents = page.getContents()

    if isinstance(contents, stream_types):
        # The page content is a single stream.
        process_data(contents, replacements)
        # Force content replacement: when process_data stored the edited bytes
        # on the stream's decoded copy, the page has to point at that copy.
        # A stream without a decoded copy was edited in place, so there is
        # nothing to swap (and assigning None here would be an error).
        if contents.decodedSelf is not None:
            page[NameObject("/Contents")] = contents.decodedSelf
    elif isinstance(contents, list):
        # The page content is an array of streams. The isinstance(list) test
        # assumes PyPDF2's ArrayObject subclasses list -- TODO confirm.
        # Entries may be indirect references, so resolve each one *before*
        # testing its type.
        for index, entry in enumerate(contents):
            stream_obj = entry.getObject()
            if not isinstance(stream_obj, stream_types):
                continue
            process_data(stream_obj, replacements)
            if stream_obj.decodedSelf is not None:
                # Same forcing step as above, applied per array entry.
                contents[index] = stream_obj.decodedSelf
        page[NameObject("/Contents")] = contents
    # Otherwise (e.g. contents is None because the page has no content) the
    # page is copied unchanged.

    writer.addPage(page)

# copy metadata from original
document_info = reader.getDocumentInfo()
if document_info is not None:
    # NOTE(review): getDocumentInfo() is expected to return None when the
    # source has no document information dictionary -- confirm against the
    # PyPDF2 version in use.
    writer.addMetadata(document_info)

with open(destination_fp, "wb") as f:
    writer.write(f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment