Skip to content

Instantly share code, notes, and snippets.

@ashishnitinpatil
Created December 17, 2022 20:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ashishnitinpatil/4fca4c031f0dbf6c9d287ecaf3d4e860 to your computer and use it in GitHub Desktop.
Save ashishnitinpatil/4fca4c031f0dbf6c9d287ecaf3d4e860 to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
# Edits minor text in a PDF while preserving original metadata
# Thanks to https://stackoverflow.com/a/69439785/2689986 & https://stackoverflow.com/a/49053629/2689986
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject, NameObject
# Input PDF handed to the reader.
source_fp = "original.pdf"
# Output path the rewritten PDF is saved to.
destination_fp = "modified.pdf"
# Maps each piece of text to look for onto the text that takes its place.
replacements = {
    "original text": "modified text",
}
def replace_text(content, replacements=None):
    """Apply text substitutions to the text-showing lines of a content stream.

    The stream is scanned line by line. A line consisting of ``BT`` opens a
    text section and a line consisting of ``ET`` closes it. Inside a text
    section, any line whose last two characters are ``Tj`` (compared
    case-insensitively, so ``TJ`` matches too) has every key of
    ``replacements`` replaced by its value, in mapping order. All other lines
    are copied through untouched. Surrounding whitespace is ignored when
    recognising ``BT``/``ET``/``Tj`` but is preserved in the output.

    Args:
        content: The decoded content stream, as a string.
        replacements: Mapping of text to find to text to substitute. ``None``
            (the default) means no substitutions.

    Returns:
        The rewritten stream, with every line terminated by a single newline
        (so original line endings are normalised and a trailing newline is
        always present for non-empty input).
    """
    # A None default avoids sharing one dict instance across calls.
    if replacements is None:
        replacements = {}

    rewritten = []
    in_text = False
    for line in content.splitlines():
        # Match operators on the stripped line so indentation or trailing
        # spaces do not hide them; the emitted line keeps its whitespace.
        token = line.strip()
        if token == "BT":
            in_text = True
        elif token == "ET":
            in_text = False
        elif in_text and token[-2:].lower() == "tj":
            for old, new in replacements.items():
                line = line.replace(old, new)
        rewritten.append(line)

    # Join once instead of growing a string inside the loop.
    return "".join(line + "\n" for line in rewritten)
def process_data(object, replacements):
    """Rewrite the text of a stream object in place.

    Reads the stream's data, runs it through ``replace_text`` and stores the
    result back on the stream.

    Args:
        object: A stream object providing ``getData()``, ``setData()`` and a
            ``decodedSelf`` attribute. The name shadows the builtin but is
            kept so existing callers are unaffected.
        replacements: Mapping of text to find to text to substitute, passed
            straight to ``replace_text``.
    """
    raw = object.getData()

    # Content streams may hold bytes that are not valid UTF-8. With
    # surrogateescape such bytes survive the decode/encode round trip
    # unchanged instead of raising UnicodeDecodeError, while valid UTF-8
    # text is handled exactly as a strict decode would.
    text = raw.decode("utf-8", errors="surrogateescape")
    rewritten = replace_text(text, replacements)
    payload = rewritten.encode("utf-8", errors="surrogateescape")

    # When a decoded counterpart exists, write to it rather than to the
    # stream itself.
    # NOTE(review): presumably encoded streams reject setData() directly - confirm.
    target = object if object.decodedSelf is None else object.decodedSelf
    target.setData(payload)
reader = PdfFileReader(source_fp)
writer = PdfFileWriter()

# Stream classes whose data process_data() knows how to rewrite.
stream_types = (DecodedStreamObject, EncodedStreamObject)

# copy (& replace as necessary) from original
for page_number in range(reader.getNumPages()):
    page = reader.getPage(page_number)
    contents = page.getContents()

    if isinstance(contents, stream_types):
        # The page has a single content stream.
        process_data(contents, replacements)
        # Force content replacement, but only when a decoded copy exists:
        # an already-decoded stream was edited in place and has nothing to
        # swap in (its decodedSelf is None).
        if contents.decodedSelf is not None:
            page[NameObject("/Contents")] = contents.decodedSelf
    elif contents:
        # The page has a sequence of content streams. A page without any
        # contents (None or an empty sequence) is copied as-is.
        for index, entry in enumerate(contents):
            # Entries may be indirect references; resolve them before the
            # type check, otherwise no stream is ever recognised.
            stream = entry.getObject()
            if isinstance(stream, stream_types):
                process_data(stream, replacements)
                # Put the edited decoded copy into the sequence so the
                # writer emits the new data rather than the old encoded one.
                if stream.decodedSelf is not None:
                    contents[index] = stream.decodedSelf

    writer.addPage(page)

# copy metadata from original (skipped when the source carries none)
document_info = reader.getDocumentInfo()
if document_info is not None:
    writer.addMetadata(document_info)

with open(destination_fp, "wb") as f:
    writer.write(f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment