Created
June 10, 2024 06:42
-
-
Save bigshans/02f383bc735175c3eaa6c78f43b0c685 to your computer and use it in GitHub Desktop.
py-pdfbookmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys

import PyPDF2
class PdfDirGenerator:
    """Add an outline (bookmarks) to a PDF from a plain-text table of contents.

    Every non-blank line of the text file describes one bookmark as
    whitespace-separated fields ``[level] title page``:

    * ``level`` (optional) is a label such as ``1.2``; the number of
      ``levelmark`` occurrences in it determines the nesting depth.
    * ``title`` is everything between the level label and the last field.
    * ``page`` is an integer; ``offset`` is added to it before use.

    A line with exactly two fields is treated as ``title page`` with an
    empty level label (a top-level bookmark).
    """

    def __init__(self, pdf_path:str, txt_path:str, offset:int, out_path:str=None, levelmark:str='.'):
        self.pdf_path = pdf_path    # path of the source PDF
        self.txt_path = txt_path    # text file holding the table of contents
        self.offset = offset        # added to every page number read from the text file
        self.out_path = out_path    # output path; derived from pdf_path in run() when None
        self.levelmark = levelmark  # separator whose count in a level label gives the depth
        # dir_parent[k] holds the most recently added bookmark of depth k;
        # index 0 stays None so depth-1 bookmarks are added without a parent.
        self.dir_parent = [None]

    def getLevelId(self, level):
        """Return the 1-based depth of a bookmark from its level label.

        The depth is the number of ``levelmark`` occurrences plus one:

        * depth 1: no mark, e.g. "Chapter1", "AppendixA"
        * depth 2: one mark, e.g. "1.1", "A.1"
        * depth 3: two marks, e.g. "2.1.3"
        """
        # An empty mark can never match anything; str.count('') would
        # otherwise report len(level) + 1 occurrences.
        if not self.levelmark:
            return 1
        # str.count also handles marks longer than one character.
        return level.count(self.levelmark) + 1

    def _parseLine(self, line):
        """Split one table-of-contents line into ``(level, title, page)``.

        ``page`` already includes ``self.offset``. Raises ``ValueError`` when
        the line has fewer than two fields or its last field is not an integer.
        """
        # split() without arguments tolerates tabs, repeated spaces and the
        # trailing newline, none of which should end up in a field.
        fields = line.split()
        if len(fields) < 2:
            raise ValueError("expected '[level] title page', got %r" % line)
        if len(fields) == 2:
            # No level label present: treat as a top-level entry.
            fields.insert(0, '')
        level = fields[0]
        title = ' '.join(fields[1:-1])
        page = int(fields[-1]) + self.offset
        return level, title, page

    def run(self):
        """Read the table of contents, add the bookmarks and write the new PDF.

        When ``out_path`` is None it is set to the source path with its
        extension replaced by ``(书签).pdf``. Raises ``ValueError`` (with the
        file name and line number) for a malformed table-of-contents line.
        """
        print("--------------------------- Adding the bookmark ---------------------------")
        print(" * PDF Source: %s" % self.pdf_path)
        print(" * TXT Source: %s" % self.txt_path)
        print(" * Offset: %d" % self.offset)
        print("---------------------------------------------------------------------------")

        # 'utf-8-sig' reads plain UTF-8 too, but drops a leading BOM that would
        # otherwise become part of the first level label.
        with open(self.txt_path, 'r', encoding='utf-8-sig') as txt:
            # Keep the original line number for error messages; skip blank lines.
            entries = [(lineno, line) for lineno, line in enumerate(txt, start=1) if line.strip()]

        pdf_reader = PyPDF2.PdfFileReader(self.pdf_path)
        pdf_writer = PyPDF2.PdfFileWriter()
        pdf_writer.cloneDocumentFromReader(pdf_reader)
        # Known issue: "ValueError: {'/Type': '/Outlines', '/Count': 0} is not in list".
        # Workaround: patch getOutlineRoot in ${PYTHON_PATH}/site-packages/PyPDF2/pdf.py
        # Reference: https://www.codetd.com/en/article/11823498

        # Start from a clean parent stack so a second run() does not attach
        # bookmarks to objects that belong to a previous writer.
        self.dir_parent = [None]
        total = len(entries)
        for done, (lineno, line) in enumerate(entries, start=1):
            try:
                level, title, page = self._parseLine(line)
            except ValueError as err:
                raise ValueError("%s, line %d: %s" % (self.txt_path, lineno, err)) from err

            # 1. Compute the depth of this entry.
            # 2. Its parent is the latest bookmark one level up: dir_parent[depth-1].
            # 3. Store the new bookmark in dir_parent[depth].
            level_id = self.getLevelId(level)
            # An entry may skip depths (e.g. "1.1.1" straight after "1"), so
            # grow the list until the index exists.
            while level_id >= len(self.dir_parent):
                self.dir_parent.append(None)
            if level != '':
                title = level + ' ' + title
            # addBookmark is given page-1 for the 1-based page read from the file.
            self.dir_parent[level_id] = pdf_writer.addBookmark(title, page-1, self.dir_parent[level_id-1])
            print(" * [%d/%d finished] level: %s(%d), title: %s, page: %d" % (done, total, level, level_id, title, page))

        if self.out_path is None:
            # splitext copes with any extension length (or none at all).
            self.out_path = os.path.splitext(self.pdf_path)[0] + '(书签).pdf'
        with open(self.out_path, 'wb') as out_pdf:
            pdf_writer.write(out_pdf)
        print("---------------------------------------------------------------------------")
        print(" * Save: %s" % self.out_path)
        print("---------------------------------- Done! ----------------------------------")
if __name__ == '__main__':
    # Command line: PDF_PATH TXT_PATH OFFSET [OUT_PATH [LEVELMARK]]
    argv = sys.argv
    # Validate explicitly: an assert would be stripped under "python -O" and
    # gives the user no hint about the expected arguments.
    if len(argv) < 4:
        sys.exit("usage: %s PDF_PATH TXT_PATH OFFSET [OUT_PATH [LEVELMARK]]" % argv[0])

    # Optional arguments fall back to the constructor's defaults.
    opath = argv[4] if len(argv) > 4 else None
    mark = argv[5] if len(argv) > 5 else '.'

    pdg = PdfDirGenerator(
        pdf_path=argv[1],
        txt_path=argv[2],
        offset=int(argv[3]),  # usually the page number on which the table of contents ends
        out_path=opath,
        levelmark=mark,
    )
    pdg.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment