Skip to content

Instantly share code, notes, and snippets.

@aqzlpm11
Created August 26, 2020 12:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aqzlpm11/f69381d9dfacb26b50bf004a2973a900 to your computer and use it in GitHub Desktop.
Save aqzlpm11/f69381d9dfacb26b50bf004a2973a900 to your computer and use it in GitHub Desktop.
去除 扫描全能王 pdf 的广告二维码
import sys
import img2pdf
# =========== 总流程 ============
# pdf --> jpg (抽取内部原有的jpg)
# 过滤掉不要的图片
# jpg --> pdf
# ===============================
def extract_all_jpg_from_pdf(pdf_file):
    """Return the raw JPEG byte strings embedded in the streams of a PDF.

    The file is scanned as plain bytes rather than parsed as a PDF: each
    ``stream`` ... ``endstream`` span is searched for a JPEG start-of-image
    marker (FF D8) and for the last end-of-image marker (FF D9) that follows
    it inside the same span.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        A list of ``bytes`` objects in file order, one per stream span that
        contains both markers. Spans without a complete marker pair are
        skipped, so the list never contains empty entries.
    """
    stream_kw = b"stream"
    endstream_kw = b"endstream"
    soi = b"\xff\xd8"  # JPEG start-of-image marker
    eoi = b"\xff\xd9"  # JPEG end-of-image marker

    with open(pdf_file, "rb") as file:
        pdf = file.read()

    res = []
    pointer = 0
    while True:
        pointer = pdf.find(stream_kw, pointer)
        if pointer < 0:
            break
        limit = pdf.find(endstream_kw, pointer)
        if limit < 0:
            # Unterminated stream: nothing reliable left to extract.
            break

        # Both markers must lie inside the current stream. Searching for the
        # start marker past `limit` would pair it with an end marker from a
        # different stream and yield an empty or bogus slice.
        start = pdf.find(soi, pointer, limit)
        # Take the LAST end marker so that an earlier FF D9 inside the image
        # data (e.g. an embedded thumbnail) does not truncate the image.
        end = pdf.rfind(eoi, start, limit) if start >= 0 else -1

        # Resume scanning after this stream's "endstream" keyword.
        pointer = limit + len(endstream_kw)
        if end < 0:
            continue
        res.append(pdf[start:end + len(eoi)])
    return res
if __name__ == '__main__':
    # Fail with a usage message instead of a bare IndexError when the input
    # path is missing. Extra arguments are ignored, as before.
    if len(sys.argv) < 2:
        sys.exit("usage: {} INPUT.pdf".format(sys.argv[0]))

    input_file = sys.argv[1]
    output_file = input_file + '-noAD.pdf'

    # Extract the JPEG images already embedded in the input PDF.
    images = extract_all_jpg_from_pdf(input_file)

    # In PDFs generated by CamScanner, half of the images are QR codes.
    # Drop them by keeping only the odd-indexed entries (2nd, 4th, ...).
    images = images[1::2]
    if not images:
        sys.exit("no page images found in " + input_file)

    # Re-assemble the remaining images into a PDF with A4 pages
    # (210 mm x 297 mm, converted to points).
    a4_input = (img2pdf.mm_to_pt(210), img2pdf.mm_to_pt(297))
    layout_fun = img2pdf.get_layout_fun(a4_input, auto_orient=True)

    # Convert before opening the output so that a conversion error does not
    # leave an empty output file behind.
    pdf_bytes = img2pdf.convert(images, layout_fun=layout_fun)
    with open(output_file, "wb") as f:
        f.write(pdf_bytes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment