Skip to content

Instantly share code, notes, and snippets.

@xiaopc
Last active January 29, 2023 03:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xiaopc/8d41bc588c50dd419df9aaae6584e0d7 to your computer and use it in GitHub Desktop.
Save xiaopc/8d41bc588c50dd419df9aaae6584e0d7 to your computer and use it in GitHub Desktop.
Remove all text from a PDF
# Modified from https://gist.github.com/668/2c8f936697ded94394ff4a6ffa4ae87e
import sys
from pypdf import PdfReader, PdfWriter
from pypdf.generic import ContentStream, TextStringObject, NameObject
# Load the input PDF with pypdf. The path is passed directly so that PdfReader
# opens and reads the file itself, instead of this script opening a handle
# that it never closes.
reader = PdfReader(sys.argv[1])
writer = PdfWriter()
# Migration from PyPDF2 < 2.0.0
def b_(s):
    """Return *s* as bytes, encoding text as Latin-1 with a UTF-8 fallback.

    Args:
        s: A ``bytes`` object (returned unchanged) or a ``str`` to encode.

    Returns:
        ``s`` itself when it is already bytes (including bytes subclasses);
        otherwise the Latin-1 encoding of ``s``, or its UTF-8 encoding when
        ``s`` contains characters that Latin-1 cannot represent.
    """
    if isinstance(s, bytes):
        return s
    try:
        return s.encode("latin-1")
    except UnicodeEncodeError:
        # Characters above U+00FF do not exist in Latin-1; fall back to UTF-8.
        return s.encode("utf-8")
# Operators that paint text in a PDF content stream: show a string (Tj), show
# strings with individual glyph positioning (TJ), and the two "move to the
# next line and show a string" operators (' and ").
TEXT_SHOWING_OPERATORS = frozenset((b"Tj", b"TJ", b"'", b'"'))

# Strip the text-showing operations from every page and collect the pages in
# the writer.
# NOTE: only each page's own /Contents stream is filtered here; content
# streams reached through other objects are not visited by this loop.
for page in reader.pages:
    # A page without a /Contents entry has nothing to strip; it is still
    # copied to the output unchanged.
    if "/Contents" in page:
        content_object = page["/Contents"].get_object()
        content = ContentStream(content_object, reader)
        # Each operation is an (operands, operator) pair; keep everything
        # whose operator does not paint text.
        content.operations = [
            operation
            for operation in content.operations
            if operation[1] not in TEXT_SHOWING_OPERATORS
        ]
        # Install the filtered stream as the page's content.
        page[NameObject("/Contents")] = content
        page.compress_content_streams()
    # Add the page to the output.
    writer.add_page(page)

# Write the result; the context manager guarantees the file is flushed and
# closed even if writing fails.
with open(sys.argv[2], "wb") as output_stream:
    writer.write(output_stream)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment