Skip to content

Instantly share code, notes, and snippets.

@668
Forked from raphiz/pdf_remove_watermark.py
Created August 9, 2017 07:39
Show Gist options
  • Save 668/2c8f936697ded94394ff4a6ffa4ae87e to your computer and use it in GitHub Desktop.
Save 668/2c8f936697ded94394ff4a6ffa4ae87e to your computer and use it in GitHub Desktop.
PDF watermark removal
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.pdf import ContentStream
from PyPDF2.generic import TextStringObject, NameObject
from PyPDF2.utils import b_
# Remove a text watermark from every page of input.pdf and write the result
# to output.pdf. A watermark is any TJ (show text array) operation whose
# first array element is a text string starting with `wm_text`.
wm_text = 'Persönliches Exemplar von'
replace_with = ''

# Keep the input file open until the output has been written: the reader may
# still need to resolve objects from the input stream during serialization.
with open('input.pdf', "rb") as input_stream:
    # Load PDF into pyPDF
    source = PdfFileReader(input_stream)
    output = PdfFileWriter()

    # For each page (use a separate index name so the page object does not
    # shadow the loop counter)
    for page_number in range(source.getNumPages()):
        # Get the current page and its contents
        page = source.getPage(page_number)

        # A page without a content stream has nothing to clean; copy it as is.
        if "/Contents" in page:
            content_object = page["/Contents"].getObject()
            content = ContentStream(content_object, source)

            # Loop over all pdf elements
            for operands, operator in content.operations:
                # You might adapt this part depending on your PDF file
                if operator != b_("TJ") or not operands:
                    continue
                pieces = operands[0]
                # TJ takes an array operand; skip malformed or empty ones
                # instead of failing on pieces[0].
                if not isinstance(pieces, list) or not pieces:
                    continue
                text = pieces[0]
                if isinstance(text, TextStringObject) and text.startswith(wm_text):
                    # Replace the array's elements in place so the operand
                    # stays an array, as the TJ operator requires.
                    pieces[:] = [TextStringObject(replace_with)]

            # Set the modified content as content object on the page
            page[NameObject('/Contents')] = content

        # Add the page to the output
        output.addPage(page)

    # Write the stream; the context manager flushes and closes the file.
    with open("output.pdf", "wb") as outputStream:
        output.write(outputStream)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment