Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Converting PDF table of contents into DjVu outline
import xml.etree.ElementTree
import djvu.decode as djvu
import sys
__author__ = 'soshial'
'''
1. export bookmarks from PDF using pdfminer:
> python tools/dumppdf.py -T ~/book.pdf > ~/toc_pdf.xml
2. use this script to convert the bookmarks (it requires python-djvulibre, but the conversion could easily be done without it, using pages; see http://www.ub-filosofie.ro/~solcan/wt/gnu/d/bdjv.html)
> outline_pdf2djvu.py toc_pdf.xml outline_djvu.txt book.djvu
3. use djvulibre to import the bookmarks into the DjVu file
> djvused -e 'set-outline outline_djvu.txt' book.djvu -s
'''
def print_bookmark(level, title, page, out=None):
    """Write the opening part of one DjVu outline entry.

    Emits ``("<title>" "#<page>"`` indented by ``level`` spaces and followed
    by a newline.  The entry's closing parenthesis is intentionally not
    written here: the caller emits it later, once it knows whether the
    following bookmarks are nested inside this one.

    Args:
        level: Nesting depth of the bookmark; used as the number of leading
            spaces on the emitted line.
        title: Bookmark caption.  Surrounding whitespace is stripped, and
            backslashes and double quotes are backslash-escaped so the
            caption cannot terminate the quoted string early.
        page: Page identifier written after ``#`` as the link target.
        out: Text stream to write to.  When omitted, the module-level ``f``
            (the outline file opened by the script section) is used.
    """
    if out is None:
        out = f  # module-level handle opened by the script's main section
    # Escape the backslash first so the backslashes added for quotes are
    # not themselves doubled.
    caption = title.strip().replace('\\', '\\\\').replace('"', '\\"')
    print('{}("{}" "#{}"'.format(' ' * level, caption, page), file=out)
if __name__ == "__main__":
    # Expected invocation:
    #   <script> toc_pdf.xml outline_djvu.txt book.djvu
    # (A leftover debugging print/quit pair used to stop the script right
    # here, so the conversion below never ran.)
    if len(sys.argv) != 4:
        print('usage: ' + sys.argv[0] + ' toc_pdf.xml outline_djvu.txt book.djvu',
              file=sys.stderr)
        sys.exit(2)
    toc_xml_path, outline_path, djvu_path = sys.argv[1:4]

    # Open the DjVu document and wait for its decoding job to finish before
    # reading the per-page component file names.
    djvu_document = djvu.Context().new_document(djvu.FileURI(djvu_path))
    djvu_document.decoding_job.wait()
    page_filenames = [page.file.name for page in djvu_document.pages]

    e = xml.etree.ElementTree.parse(toc_xml_path).getroot()

    prev_level = 0  # level of the entry that is still open; 0 = none yet
    page = None     # index of the most recently seen <pageno>

    # The handle must stay bound to the module-level name `f`:
    # print_bookmark() writes to that global.
    with open(outline_path, 'w', encoding='utf-8') as f:
        print('(bookmarks ', file=f)
        for bookmark in e.findall('outline'):
            level = int(bookmark.get('level'))
            title = bookmark.get('title')

            # Use the last direct <pageno> child, if any.
            page_elements = bookmark.findall('pageno')
            if page_elements:
                page = int(page_elements[-1].text)
            elif page is None:
                # Nothing to fall back on: no earlier bookmark had a page.
                sys.exit('bookmark %r has no <pageno> element' % (title,))
            # Otherwise the bookmark keeps pointing at the previous
            # bookmark's page.

            # Close the previous entry and any deeper ancestors when this
            # bookmark is a sibling of, or shallower than, the previous one.
            if prev_level >= level:
                print(')' * (prev_level - level + 1), file=f)

            # NOTE(review): `page` is used directly as a 0-based index into
            # page_filenames - confirm the numbering base of <pageno> in the
            # exported XML.
            print_bookmark(level, title, page_filenames[page])
            prev_level = level

        # Close every entry still open plus the enclosing "(bookmarks" list.
        print(')' * (prev_level + 1), file=f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment