Skip to content

Instantly share code, notes, and snippets.

@gaoconghui
Last active January 20, 2023 16:29
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save gaoconghui/f21057c41c9386d0dcf32691c2d24692 to your computer and use it in GitHub Desktop.
Save gaoconghui/f21057c41c9386d0dcf32691c2d24692 to your computer and use it in GitHub Desktop.
删除pdf中指定的字(如水印 版权标记)
# -*- coding: utf-8 -*-
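"""
Given the path of a PDF and a list of keyword groups, rewrite the PDF's
content and delete every piece of text that matches one of the groups.
"""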
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import ArrayObject, NameObject, TextStringObject
from PyPDF2.pdf import ContentStream
from PyPDF2.utils import b_
def may_match(text, match_list):
    """Return True if ``text`` contains every keyword of at least one group.

    Args:
        text: Object supporting the ``in`` operator (typically a string);
            each keyword is tested with ``key in text``.
        match_list: Iterable of keyword groups. A group matches when all of
            its keywords are found in ``text``.

    Returns:
        True if any non-empty group matches, otherwise False.
    """
    # An empty group is skipped on purpose: ``all()`` over no keywords is
    # vacuously true, so it would otherwise flag every text for removal.
    return any(
        keys and all(key in text for key in keys)
        for keys in match_list
    )
class PdfTripper(object):
    """Copy a PDF while blanking text-showing operations that match keywords.

    Every page of the input is scanned for ``Tj`` / ``TJ`` operations. When
    the shown text matches one of the keyword groups in ``remove_list`` (as
    decided by ``may_match``), the operand is replaced by an empty one, and
    the resulting pages are written to ``output_path``.

    An instance is meant to be used once: ``execute()`` closes the input
    file when it finishes.
    """

    def __init__(self, input_path, output_path, remove_list):
        """Open ``input_path`` for reading and prepare an empty writer.

        Args:
            input_path: Path of the PDF to read.
            output_path: Path the processed PDF is written to.
            remove_list: Keyword groups handed to ``may_match``.
        """
        # The handle is kept on the instance because the reader is still
        # used (getNumPages/getPage) after construction; it is released by
        # close().
        self._input_file = open(input_path, "rb")
        try:
            self.reader = PdfFileReader(self._input_file)
        except Exception:
            # Do not leak the handle when the file is not a readable PDF.
            self._input_file.close()
            raise
        self.writer = PdfFileWriter()
        self.output_path = output_path
        self.remove_list = remove_list

    def close(self):
        """Close the input file opened by the constructor."""
        self._input_file.close()

    def execute(self):
        """Process every page, write the result, then release the input."""
        try:
            self.process_content()
            # ``with`` guarantees the output is flushed and closed.
            with open(self.output_path, "wb") as output_file:
                self.writer.write(output_file)
        finally:
            self.close()

    @staticmethod
    def _operand_text(operand):
        """Return the text carried by a ``Tj`` / ``TJ`` operand.

        A ``TJ`` operand is an array mixing string pieces with numeric
        spacing adjustments; the string pieces are concatenated so keywords
        can be searched in the whole run. Any other operand is returned
        unchanged.
        """
        if not isinstance(operand, list):
            return operand
        text_type = type(u"")
        pieces = [
            # Byte strings are decoded so they can be joined with text ones.
            piece.decode("latin-1") if isinstance(piece, bytes) else piece
            for piece in operand
            if isinstance(piece, (bytes, text_type))
        ]
        return u"".join(pieces)

    def process_content(self):
        """Blank matching text operations on each page and queue the page."""
        show_array = b_("TJ")
        show_string = b_("Tj")
        for page_num in range(self.reader.getNumPages()):
            page = self.reader.getPage(page_num)
            # A page without a /Contents entry has nothing to filter; it is
            # still copied to the output.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, self.reader)
                for operands, operator in content.operations:
                    # You might adapt this part depending on your PDF file
                    if operator != show_array and operator != show_string:
                        continue
                    if not operands:
                        continue
                    text = self._operand_text(operands[0])
                    if may_match(text, self.remove_list):
                        print(text)
                        if operator == show_array:
                            # TJ takes an array operand, so blank it with
                            # an empty array rather than a string.
                            operands[0] = ArrayObject()
                        else:
                            operands[0] = TextStringObject('')
                page[NameObject('/Contents')] = content
            self.writer.addPage(page)
if __name__ == "__main__":
    # Text to match for deletion: each inner list is one group, e.g.
    # ["ASTM", "International"] means the text contains both words.
    watermark_phrases = [
        ["ASTM", "International"],
        ["International,", "Copyright"],
        ["Licensee="],
        ["Provided", "by", "IHS", "under", "license", "with", "ASTM"],
        ["No", "reproduction", "or", "networking"],
        ["Not", "for", "Resale"],
    ]
    source_pdf = "/tmp/test2.pdf"
    target_pdf = "/tmp/output.pdf"
    tripper = PdfTripper(source_pdf, target_pdf, watermark_phrases)
    tripper.execute()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment