Last active
July 16, 2024 15:34
-
-
Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
Write an outline specified in JSON format into a PDF document using qpdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from __future__ import annotations | |
import argparse | |
import json | |
import sys | |
def main():
    """Parse the command line and print the updated qpdf JSON to stdout.

    Reads the qpdf JSON dump (``--json``) and the user-written outline
    (``--outline``), builds the merged document via ``build_output_json``
    and serializes the result to standard output.
    """
    # Pass the text as ``description``: the first positional parameter of
    # ArgumentParser is ``prog``, which would make the usage line read
    # "usage: Rewrite PDF outlines ...".
    parser = argparse.ArgumentParser(description="Rewrite PDF outlines")
    parser.add_argument("--json",
                        help="JSON file created by qpdf",
                        required=True)
    parser.add_argument("--outline",
                        help="Your new outline file, in JSON format",
                        required=True)
    # NOTE: --input is required on the command line, but its value is not
    # read anywhere in this function.
    parser.add_argument("--input",
                        help="Original input PDF file to update",
                        required=True)
    # type=int is needed because the offset is added to integer page
    # numbers; without it a user-supplied value arrives as a str.
    parser.add_argument(
        "--offset",
        help="Page offset to add to each target in the outline JSON",
        type=int,
        default=0)
    args = parser.parse_args()
    j = build_output_json(args.json, args.outline, args.offset)
    json.dump(j, sys.stdout)
def build_output_json(json_fname: str, outline_fname: str, offset: int):
    """Load a qpdf JSON dump and graft a new outline tree into it.

    Args:
        json_fname: Path of the qpdf JSON dump. It is read for its
            ``pages`` list and its ``qpdf`` section (``maxobjectid`` and
            the object table).
        outline_fname: Path of a JSON file holding a list of outline items
            (see ``add_outline_item`` for the keys read from each item).
        offset: Integer added to every item's ``dest`` before it is used
            as an index into the page list.

    Returns:
        The loaded qpdf JSON structure, with a new ``/Outlines`` object and
        one object per outline item added, and the catalog's ``/Outlines``
        entry pointing at the new root.
    """
    with open(json_fname, encoding="utf-8") as f:
        j = json.load(f)
    with open(outline_fname, encoding="utf-8") as f:
        outline = json.load(f)

    pages = [page["object"] for page in j["pages"]]
    # New objects are numbered starting just past the largest existing id.
    ids = ObjectIdAllocator(j["qpdf"][0]["maxobjectid"] + 1)
    catalog = get_catalog(j)

    outlines_id = ids.next_id()
    outlines = insert_new_object(j, outlines_id)
    outlines["/Type"] = "/Outlines"

    bookmarks = [
        add_outline_item(j, pages, item, outlines_id, offset, ids)
        for item in outline
    ]
    # Chain the top-level siblings into a doubly linked list.
    for (bookmark_id, bookmark), (next_id, next_bookmark) in zip(
            bookmarks, bookmarks[1:]):
        bookmark["/Next"] = f"{next_id} 0 R"
        next_bookmark["/Prev"] = f"{bookmark_id} 0 R"

    catalog["/Outlines"] = f"{outlines_id} 0 R"
    # An empty outline file yields an outline root with no entries rather
    # than an IndexError.
    if bookmarks:
        outlines["/First"] = f"{bookmarks[0][0]} 0 R"
        outlines["/Last"] = f"{bookmarks[-1][0]} 0 R"
    return j
def get_catalog(j):
    """Return the value dictionary of the PDF ``/Catalog`` object.

    Scans the object table at ``j["qpdf"][1]`` for the first ``obj:`` entry
    whose ``value`` is a dictionary with ``/Type`` equal to ``/Catalog``.

    Raises:
        Exception: If no such object exists.
    """
    for key, entry in j["qpdf"][1].items():
        if not key.startswith("obj:"):
            continue
        if not isinstance(entry, dict):
            continue
        value = entry.get("value")
        # Object values may be numbers, strings or arrays; only a dictionary
        # can carry a /Type key, and membership tests on the other kinds
        # either raise or give misleading answers.
        if isinstance(value, dict) and value.get("/Type") == "/Catalog":
            return value
    raise Exception("could not find a PDF /Catalog")
def add_outline_item(j, pages, item, parent_id, offset: int,
                     ids: ObjectIdAllocator):
    """Create the object for one outline item and, recursively, its children.

    Args:
        j: The qpdf JSON structure that receives the new objects.
        pages: List of page object references, indexed by
            ``item["dest"] + offset``.
        item: Mapping with ``title`` and ``dest`` keys and an optional
            ``children`` list of nested items of the same shape.
        parent_id: Object id written into this item's ``/Parent`` reference.
        offset: Integer added to ``dest`` before the page lookup.
        ids: Allocator that supplies fresh object ids.

    Returns:
        A ``(object_id, bookmark_dict)`` tuple for the new item.
    """
    item_id = ids.next_id()
    title = item["title"]
    page_ref = pages[item["dest"] + offset]

    bookmark = insert_new_object(j, item_id)
    bookmark["/Dest"] = [page_ref, "/XYZ", None, None, None]
    bookmark["/Parent"] = f"{parent_id} 0 R"
    bookmark["/Title"] = f"u:{title}"

    # A missing, null or empty "children" entry all mean "leaf item".
    children = [
        add_outline_item(j, pages, child, item_id, offset, ids)
        for child in item.get("children") or []
    ]
    # Chain the children into a doubly linked sibling list.
    for (child_id, child), (next_child_id, next_child) in zip(
            children, children[1:]):
        child["/Next"] = f"{next_child_id} 0 R"
        next_child["/Prev"] = f"{child_id} 0 R"
    # Only emit /First and /Last when there is at least one child, so an
    # empty list no longer raises IndexError.
    if children:
        bookmark["/First"] = f"{children[0][0]} 0 R"
        bookmark["/Last"] = f"{children[-1][0]} 0 R"
    return (item_id, bookmark)
def insert_new_object(j, id):
    """Register an empty object under ``obj:<id> 0 R`` and return its value.

    The new entry is stored in the object table at ``j["qpdf"][1]`` as
    ``{"value": {}}``. The returned dictionary is that same inner ``value``
    dict, so callers fill the object in by mutating the return value.
    """
    value = {}
    j["qpdf"][1][f"obj:{id} 0 R"] = {"value": value}
    return value
class ObjectIdAllocator:
    """Hands out consecutive integer object ids."""

    def __init__(self, next_id: int):
        """Start allocation at ``next_id``."""
        self._next_id = next_id

    def next_id(self) -> int:
        """Return the next unused id and advance the counter by one."""
        allocated = self._next_id
        self._next_id += 1
        return allocated
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean up PDF for ingestion: remove encryption and disable object streams.
qpdf --decrypt --object-streams=disable original.pdf in.pdf
# Create JSON dump of relevant metadata
qpdf --json in.pdf in.json
# Create outline JSON. The script reads a list of items, each with a "title",
# a "dest" (integer index into the page list; 0 is the first page when
# --offset is 0) and an optional "children" list of nested items, e.g.
#   [{"title": "Chapter 1", "dest": 0,
#     "children": [{"title": "Section 1.1", "dest": 2}]}]
vim outline.json
# Write outline data into JSON dump, overwriting old outline if any.
./rewrite-pdf-outline.py --json in.json --outline outline.json --input in.pdf >out.json
# Write output JSON data into final PDF.
qpdf in.pdf out.pdf --update-from-json=out.json
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Do you have an example `outline.json` file you could share?