This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# parse_toc.py
from __future__ import print_function

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
def _normalize_title(title):
    """Return *title* as a native ``str`` with whitespace collapsed.

    Newlines are removed outright (not replaced by a space), then every
    remaining run of whitespace is collapsed to a single space.
    """
    if not isinstance(title, str):
        if isinstance(title, bytes):
            # Python 3 raw bytes: decode so the str operations below work.
            title = title.decode('utf8', 'replace')
        else:
            # Python 2 ``unicode``: encode to UTF-8 bytes so that printing
            # does not depend on the console encoding.
            title = title.encode('utf8')
    return ' '.join(title.replace('\n', '').split())


def parse(filename, maxlevel):
    """Print the outline (table of contents) of a PDF file.

    Each outline entry whose level is less than or equal to *maxlevel* is
    printed on its own line, indented by one space per level.

    Args:
        filename: Path of the PDF file to read.
        maxlevel: Deepest outline level to print (inclusive).

    Raises:
        Whatever ``open``, ``PDFParser``, ``PDFDocument`` or
        ``get_outlines`` raise; nothing is caught here.

    NOTE(review): a user of this script reported
    ``RuntimeError: maximum recursion depth exceeded`` on a PDF with 3520
    level-one entries. Presumably this originates in the outline traversal
    of the PDF library rather than in this function -- confirm. Splitting
    the PDF into smaller parts was the reported workaround.
    """
    # Keep the file open for the whole traversal (the outline entries are
    # consumed while the parser still needs its stream) and guarantee it is
    # closed afterwards, even if parsing fails part-way.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        for level, title, _dest, _action, _se in doc.get_outlines():
            if level > maxlevel:
                continue
            print(' ' * level, _normalize_title(title))
# Command-line entry point: ``parse_toc.py xxx.pdf level``.
if __name__ == '__main__':
    import sys

    usage = 'Usage: %s xxx.pdf level\n' % sys.argv[0]

    # Exactly two arguments are required: the PDF path and the max level.
    if len(sys.argv) != 3:
        sys.stderr.write(usage)
        sys.exit(2)

    # A non-integer level is a usage error, not a reason for a traceback.
    try:
        max_level = int(sys.argv[2])
    except ValueError:
        sys.stderr.write(usage)
        sys.exit(2)

    parse(sys.argv[1], max_level)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I had to parse a PDF which contains 3520 pages, each one being a level-one entry, so the script produced the following error:
RuntimeError: maximum recursion depth exceeded while calling a Python object
and stopped. As a workaround, I split the PDF into 500-page parts, and everything went fine.
Thank you for your script.