Skip to content

Instantly share code, notes, and snippets.

@bsidhom
Last active July 16, 2024 15:34
Show Gist options
  • Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
Write an outline specified in JSON format into a PDF document using qpdf
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import sys
def main():
    """Parse the command line and print the updated qpdf JSON to stdout.

    The heavy lifting is done by ``build_output_json``; this function only
    wires the command-line options to it and serializes the result.
    """
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, which would replace the
    # program name in the usage line.
    parser = argparse.ArgumentParser(description="Rewrite PDF outlines")
    parser.add_argument("--json",
                        help="JSON file created by qpdf",
                        required=True)
    parser.add_argument("--outline",
                        help="Your new outline file, in JSON format",
                        required=True)
    # NOTE: --input is required for command-line compatibility, but its value
    # is not read anywhere in this function.
    parser.add_argument("--input",
                        help="Original input PDF file to update",
                        required=True)
    # Without type=int a user-supplied offset arrives as a string, and adding
    # it to an integer page number raises TypeError.
    parser.add_argument(
        "--offset",
        help="Page offset to add to each target in the outline JSON",
        type=int,
        default=0)
    args = parser.parse_args()
    j = build_output_json(args.json, args.outline, args.offset)
    json.dump(j, sys.stdout)
def build_output_json(json_fname: str, outline_fname: str, offset: int):
    """Load a qpdf JSON dump and splice a new outline tree into it.

    Args:
        json_fname: Path to the JSON dump of the PDF. Its ``"pages"`` list and
            ``"qpdf"`` array are read.
        outline_fname: Path to a JSON file holding a list of outline items
            (see ``add_outline_item`` for the item format).
        offset: Value added to every item's ``"dest"`` before it is used as an
            index into the page list.

    Returns:
        The loaded JSON document, mutated in place: a new ``/Outlines`` object
        and one object per outline item are added, and the catalog's
        ``/Outlines`` entry points at the new root.
    """
    # JSON is UTF-8; do not depend on the platform's locale encoding, which
    # would garble non-ASCII titles on some systems.
    with open(json_fname, encoding="utf-8") as f:
        j = json.load(f)
    with open(outline_fname, encoding="utf-8") as f:
        outline = json.load(f)

    pages = [page["object"] for page in j["pages"]]
    # New objects are numbered starting right after the highest existing id.
    ids = ObjectIdAllocator(j["qpdf"][0]["maxobjectid"] + 1)
    catalog = get_catalog(j)

    outlines_id = ids.next_id()
    outlines = insert_new_object(j, outlines_id)
    outlines["/Type"] = "/Outlines"

    bookmarks = [
        add_outline_item(j, pages, item, outlines_id, offset, ids)
        for item in outline
    ]
    # Chain the top-level items together as a doubly linked list.
    for (item_id, bookmark), (next_id, next_bookmark) in zip(bookmarks,
                                                             bookmarks[1:]):
        bookmark["/Next"] = f"{next_id} 0 R"
        next_bookmark["/Prev"] = f"{item_id} 0 R"

    catalog["/Outlines"] = f"{outlines_id} 0 R"
    # An empty outline has no first/last item to point at; leave the entries
    # out instead of failing with IndexError.
    if bookmarks:
        first_id = bookmarks[0][0]
        outlines["/First"] = f"{first_id} 0 R"
        last_id = bookmarks[-1][0]
        outlines["/Last"] = f"{last_id} 0 R"
    return j
def get_catalog(j):
    """Return the value dictionary of the PDF ``/Catalog`` object.

    Scans the ``"obj:..."`` entries of ``j["qpdf"][1]`` and returns the first
    ``"value"`` that is a dictionary whose ``/Type`` is ``/Catalog``. The
    returned dictionary is the one stored in ``j``, so mutating it updates the
    document.

    Raises:
        Exception: If no catalog object is present.
    """
    for key, obj in j["qpdf"][1].items():
        if not key.startswith("obj:"):
            continue
        value = obj.get("value") if isinstance(obj, dict) else None
        # Object values are not always dictionaries (they may be numbers,
        # strings, arrays, ...). A plain `"/Type" in value` test raises
        # TypeError on an int and turns into a substring check on a string.
        if isinstance(value, dict) and value.get("/Type") == "/Catalog":
            return value
    raise Exception("could not find a PDF /Catalog")
def add_outline_item(j, pages, item, parent_id, offset: int,
                     ids: ObjectIdAllocator):
    """Create the outline object for ``item`` and, recursively, its children.

    Args:
        j: The qpdf JSON document; new objects are inserted into it.
        pages: List of page object references, indexed by page position.
        item: Outline entry with a ``"title"``, a ``"dest"`` page index and an
            optional ``"children"`` list of nested entries of the same shape.
        parent_id: Object id of the parent outline node.
        offset: Value added to ``item["dest"]`` before indexing ``pages``.
        ids: Allocator supplying fresh object ids.

    Returns:
        A ``(id, bookmark)`` tuple: the new object's id and its value
        dictionary (the same dictionary that was stored in ``j``).

    Raises:
        IndexError: If ``dest + offset`` does not address an existing page.
    """
    id = ids.next_id()
    title = item["title"]
    page_index = item["dest"] + offset
    # Reject negative indices explicitly: Python would otherwise wrap around
    # and silently link the bookmark to a page at the end of the document.
    if not 0 <= page_index < len(pages):
        raise IndexError(
            f"outline item {title!r}: page index {page_index} is out of "
            f"range for a document with {len(pages)} pages")
    page_ref = pages[page_index]

    bookmark = insert_new_object(j, id)
    # /XYZ destination on the target page with left, top and zoom set to null.
    bookmark["/Dest"] = [page_ref, "/XYZ", None, None, None]
    bookmark["/Parent"] = f"{parent_id} 0 R"
    # The "u:" prefix marks the value as a Unicode string in qpdf's JSON.
    bookmark["/Title"] = f"u:{title}"

    children = [
        add_outline_item(j, pages, child, id, offset, ids)
        for child in item.get("children") or ()
    ]
    # Chain siblings together as a doubly linked list.
    for (child_id, bm), (next_child_id, next_bm) in zip(children,
                                                        children[1:]):
        bm["/Next"] = f"{next_child_id} 0 R"
        next_bm["/Prev"] = f"{child_id} 0 R"
    # A missing or empty "children" list means a leaf node: there is no
    # first/last child to reference (indexing would raise IndexError).
    if children:
        first_id = children[0][0]
        bookmark["/First"] = f"{first_id} 0 R"
        last_id = children[-1][0]
        bookmark["/Last"] = f"{last_id} 0 R"
    return (id, bookmark)
def insert_new_object(j, id):
    """Register an empty object under ``obj:<id> 0 R`` and return its value.

    The entry is stored in ``j["qpdf"][1]`` as ``{"value": {}}``; the inner
    dictionary is returned so the caller can fill it in place.
    """
    contents = {}
    j["qpdf"][1][f"obj:{id} 0 R"] = {"value": contents}
    return contents
class ObjectIdAllocator:
    """Hands out consecutive integer object ids, starting from a given value."""

    def __init__(self, next_id: int):
        # The id that the next call to next_id() will return.
        self._next_id = next_id

    def next_id(self):
        """Return the current id and advance the counter by one."""
        allocated = self._next_id
        self._next_id = allocated + 1
        return allocated
# Run the command-line entry point only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()
# Write a decrypted copy of the original PDF with object streams disabled.
qpdf --decrypt --object-streams=disable original.pdf in.pdf
# Create a JSON dump of the cleaned-up PDF; the script reads its "pages" and
# "qpdf" entries.
qpdf --json in.pdf in.json
# Create the outline JSON: a list of items, each with a "title", a "dest"
# (index into the PDF's page list, before --offset is added) and, optionally,
# a "children" list of nested items in the same format, e.g.
#   [{"title": "Chapter 1", "dest": 0,
#     "children": [{"title": "Section 1.1", "dest": 2}]}]
vim outline.json
# Write outline data into the JSON dump; the catalog's /Outlines entry is
# replaced, so any old outline is no longer referenced.
./rewrite-pdf-outline.py --json in.json --outline outline.json --input in.pdf >out.json
# Write output JSON data into final PDF.
qpdf in.pdf out.pdf --update-from-json=out.json
@foolishgrunt
Copy link

Do you have an example outline.json file you could share?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment