Last active
January 20, 2023 16:29
-
-
Save gaoconghui/f21057c41c9386d0dcf32691c2d24692 to your computer and use it in GitHub Desktop.
删除pdf中指定的字(如水印 版权标记)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
给定一个pdf路径,以及一个列表,可修改pdf内容,删除所有符合条件的文字。 | |
""" | |
from PyPDF2 import PdfFileReader, PdfFileWriter | |
from PyPDF2.generic import TextStringObject, NameObject | |
from PyPDF2.pdf import ContentStream | |
from PyPDF2.utils import b_ | |
def may_match(text, match_list):
    """Return True if *text* contains every key of at least one key group.

    Args:
        text: The operand of a text-showing operator. Either a single
            string (``Tj``) or a list mixing string fragments with numeric
            kerning adjustments (``TJ``).
        match_list: An iterable of key groups; each group is an iterable of
            substrings that must ALL occur in the text for it to match.

    Returns:
        bool: True as soon as one key group is fully contained in the text,
        otherwise False (including when ``match_list`` is empty).
    """
    if isinstance(text, (list, tuple)):
        # A TJ operand is an array: the visible text is the concatenation of
        # its string fragments, the numbers in between only adjust spacing.
        # Testing ``key in <list>`` would compare whole elements instead of
        # searching the text, so join the fragments first. Only text strings
        # are kept; raw byte strings cannot be safely mixed in.
        text_type = type(u"")
        text = u"".join(part for part in text if isinstance(part, text_type))
    return any(all(key in text for key in keys) for keys in match_list)
class PdfTripper(object):
    """Remove matching text-showing operations from every page of a PDF.

    The input PDF is read page by page; every ``Tj``/``TJ`` operation whose
    operand matches ``remove_list`` (see ``may_match``) has its text emptied,
    and the resulting pages are written to ``output_path``.

    An instance is single-use: ``execute`` closes the input file when it
    finishes.
    """

    def __init__(self, input_path, output_path, remove_list):
        """Open the input PDF and remember the output path and match rules.

        Args:
            input_path: Path of the PDF to read.
            output_path: Path the cleaned PDF is written to by ``execute``.
            remove_list: List of key groups passed to ``may_match``.
        """
        # Keep the handle so it can be closed explicitly; the reader is still
        # used while pages are processed, so it must stay open until then.
        self._input_file = open(input_path, "rb")
        try:
            self.reader = PdfFileReader(self._input_file)
        except Exception:
            # Do not leak the handle if the file is not a readable PDF.
            self._input_file.close()
            raise
        self.writer = PdfFileWriter()
        self.output_path = output_path
        self.remove_list = remove_list

    def execute(self):
        """Process all pages, write the output PDF and close the input file."""
        try:
            self.process_content()
            # ``with`` guarantees the output is flushed and closed.
            with open(self.output_path, "wb") as output_file:
                self.writer.write(output_file)
        finally:
            self._input_file.close()

    def process_content(self):
        """Blank matching text operations on each page and queue the page."""
        text_operators = (b_("TJ"), b_("Tj"))
        for page_num in range(self.reader.getNumPages()):
            page = self.reader.getPage(page_num)
            # A page without a content stream has nothing to strip; it is
            # still copied to the output unchanged.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, self.reader)
                for operands, operator in content.operations:
                    # You might adapt this part depending on your PDF file
                    if operator not in text_operators:
                        continue
                    text = operands[0]
                    if not may_match(text, self.remove_list):
                        continue
                    print(text)
                    if isinstance(text, list):
                        # TJ expects an array operand: empty it in place
                        # instead of swapping in a string.
                        del text[:]
                    else:
                        operands[0] = TextStringObject('')
                page[NameObject('/Contents')] = content
            self.writer.addPage(page)
if __name__ == '__main__':
    import sys

    # Usage: python <script> [input.pdf [output.pdf]]
    # Both paths are optional and fall back to the original defaults.
    cli_args = sys.argv[1:]
    input_path = cli_args[0] if len(cli_args) > 0 else "/tmp/test2.pdf"
    output_path = cli_args[1] if len(cli_args) > 1 else "/tmp/output.pdf"
    # Text to match and delete. Each inner list is one rule: a piece of text
    # matches when it contains ALL the words of that rule, e.g.
    # ["ASTM", "International"] requires both words to be present.
    remove_list = [
        ["ASTM", "International"],
        ["International,", "Copyright"],
        ["Licensee="],
        ["Provided", "by", "IHS", "under", "license", "with", "ASTM"],
        ["No", "reproduction", "or", "networking"],
        ["Not", "for", "Resale"],
    ]
    PdfTripper(input_path, output_path, remove_list).execute()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment