Created
August 26, 2020 12:07
-
-
Save aqzlpm11/f69381d9dfacb26b50bf004a2973a900 to your computer and use it in GitHub Desktop.
去除 扫描全能王 pdf 的广告二维码
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import img2pdf | |
# =========== Overall flow ===========
# pdf --> jpg (extract the JPEGs already embedded in the PDF)
# filter out the unwanted images
# jpg --> pdf
# ====================================
def extract_all_jpg_from_pdf(pdf_file):
    """Return the raw JPEG images embedded in a PDF file, in file order.

    The PDF is not parsed.  The file is scanned for ``stream`` ...
    ``endstream`` sections, and every section that contains a JPEG
    start-of-image marker (FF D8) followed by an end-of-image marker (FF D9)
    contributes one image.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        A list of ``bytes`` objects, one per JPEG found.  The list is empty
        when the file contains no JPEG stream.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    soi = b"\xff\xd8"  # JPEG start-of-image marker
    eoi = b"\xff\xd9"  # JPEG end-of-image marker
    end_tag = b"endstream"

    with open(pdf_file, "rb") as file:
        pdf = file.read()

    res = []
    pointer = 0
    while True:
        pointer = pdf.find(b"stream", pointer)
        if pointer < 0:
            break
        limit = pdf.find(end_tag, pointer)
        if limit < 0:
            # Unterminated stream: nothing more can be extracted reliably.
            break
        stream_start = pointer
        # Resume after this "endstream" so that the "stream" it ends with is
        # not mistaken for the start of another stream.
        pointer = limit + len(end_tag)

        # Search only inside the current stream.  An unbounded search could
        # pair a start marker from a later stream with an end marker from
        # this one and yield an empty image.
        x = pdf.find(soi, stream_start, limit)
        if x < 0:
            continue
        # Use the last end marker in the stream: a JPEG with an embedded
        # thumbnail contains an earlier FF D9 that would truncate the image.
        y = pdf.rfind(eoi, x + len(soi), limit)
        if y < 0:
            continue
        res.append(pdf[x:y + len(eoi)])
    return res
if __name__ == '__main__':
    # Usage: python <script> <input.pdf>
    # Writes "<input.pdf>-noAD.pdf" next to the input file.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <input.pdf>")
    input_file = sys.argv[1]
    output_file = input_file+'-noAD.pdf'

    # Extract the JPEG images embedded in the source PDF.
    images = extract_all_jpg_from_pdf(input_file)

    # In PDFs produced by CamScanner every other image is an advertising
    # QR code; keep only the images at odd indices (the 2nd, 4th, ...).
    images = images[1::2]
    if not images:
        sys.exit(f"no page images found in {input_file}")

    # Re-assemble the remaining images into an A4 PDF.
    a4_input = (img2pdf.mm_to_pt(210), img2pdf.mm_to_pt(297))
    layout_fun = img2pdf.get_layout_fun(a4_input, auto_orient=True)
    # Convert before opening the output so that a conversion failure does
    # not leave an empty output file behind.
    pdf_bytes = img2pdf.convert(images, layout_fun=layout_fun)
    with open(output_file, "wb") as f:
        f.write(pdf_bytes)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment