Last active
April 30, 2022 17:35
Star
You must be signed in to star a gist
Converting PDF table of contents into DjVu outline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree | |
import djvu.decode as djvu | |
import sys | |
__author__ = 'soshial' | |
'''
1. export bookmarks from the PDF using pdfminer:
> python tools/dumppdf.py -T ~/book.pdf > ~/toc_pdf.xml
2. use this script to convert the bookmarks (it needs python-djvulibre, but the conversion could easily be done without it, using pages; see http://www.ub-filosofie.ro/~solcan/wt/gnu/d/bdjv.html)
> outline_pdf2djvu.py toc_pdf.xml outline_djvu.txt book.djvu
3. use djvulibre to import the bookmarks into the DjVu file
> djvused -e 'set-outline outline_djvu.txt' book.djvu -s
'''
def print_bookmark(level, title, page, file=None):
    """Write the opening part of one DjVu outline entry.

    The entry is emitted as ``("<title>" "#<page>"`` and is deliberately left
    without its closing parenthesis: the caller closes it later, once it
    knows whether the following bookmark is nested inside this one.

    Args:
        level: Nesting depth of the bookmark; the line is indented by one
            space per level.
        title: Bookmark text. Surrounding whitespace is stripped, and
            backslashes and double quotes are escaped so that they cannot
            terminate the quoted string early. ``None`` is treated as an
            empty title.
        page: Target written after ``#`` (the caller passes the file name
            of the destination page).
        file: Writable text stream. When omitted, the module-level handle
            ``f`` opened by the script's main section is used.
    """
    if file is None:
        # Backward compatibility: existing three-argument callers rely on
        # the global output handle ``f``.
        file = f
    # Escape backslashes first so the backslashes introduced for quotes are
    # not doubled afterwards.
    escaped = (title or '').strip().replace('\\', '\\\\').replace('"', '\\"')
    print('{}("{}" "#{}"'.format(' ' * level, escaped, page), file=file)
if __name__ == "__main__":
    # Expected invocation:
    #   outline_pdf2djvu.py toc_pdf.xml outline_djvu.txt book.djvu
    # The original code printed sys.argv and quit unconditionally, which left
    # the whole conversion unreachable; only bail out on a wrong argument count.
    if len(sys.argv) != 4:
        print('usage: ' + sys.argv[0] + ' toc_pdf.xml outline_djvu.txt book.djvu',
              file=sys.stderr)
        sys.exit(1)

    # Open the DjVu document and collect the file name of every page; these
    # names are what the outline entries point at.
    djvu_document = djvu.Context().new_document(djvu.FileURI(sys.argv[3]))
    djvu_document.decoding_job.wait()
    page_filenames = [page.file.name for page in djvu_document.pages]

    e = xml.etree.ElementTree.parse(sys.argv[1]).getroot()
    prev_level = 0
    # Fallback target for an outline entry that carries no <pageno> child:
    # such an entry reuses the page of the previous bookmark (or the first
    # page if it is the very first entry) instead of raising NameError.
    page = 0
    # NOTE: the handle must stay a module-level name ``f`` because
    # print_bookmark writes to it when called with three arguments.
    with open(sys.argv[2], 'w', encoding='utf-8') as f:
        print('(bookmarks ', file=f)
        for bookmark in e.findall('outline'):
            level = int(bookmark.get('level'))
            title = bookmark.get('title')
            for child in bookmark:
                if child.tag == "pageno":
                    # NOTE(review): the value is used directly as an index
                    # into page_filenames, i.e. treated as 0-based; confirm
                    # against the dumppdf version that produced the XML.
                    page = int(child.text)
            if prev_level >= level:
                # Close the previous entry plus every ancestor that is not
                # an ancestor of the current bookmark.
                print(')' * (prev_level - level + 1), file=f)
            print_bookmark(level, title, page_filenames[page])
            prev_level = level
        # Close the entries still open and the enclosing "(bookmarks" list.
        print(')' * (prev_level + 1), file=f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment