Skip to content

Instantly share code, notes, and snippets.

@tosh1ki
Created December 27, 2014 07:11
Show Gist options
  • Save tosh1ki/347d7b395c0104ec42ef to your computer and use it in GitHub Desktop.
Save tosh1ki/347d7b395c0104ec42ef to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'tosh1ki'
__email__ = 'tosh1ki@yahoo.co.jp'
__date__ = '2014-12-27'
import io
import re
# Typographic punctuation mapped to plain ASCII before parsing; the title
# character class in TOC_LINE_RE only lists the ASCII forms.
PUNCTUATION_REPLACEMENTS = (
    (u'\u201c', u'"'),  # left double quotation mark
    (u'\u201d', u'"'),  # right double quotation mark
    (u'\u2013', u'-'),  # en dash
    (u'\u2019', u"'"),  # right single quotation mark
)

# One table-of-contents line, in three groups:
#   1. dotted section number, e.g. "1", "2.3" or "4.1.2"
#   2. title (restricted character set; "." is excluded so that the dot
#      leaders are not swallowed into the title)
#   3. page number, preceded by optional dot leaders ("....." or ". . . .")
# "'" is part of the title class because the right single quotation mark is
# normalised to it; without it every title with an apostrophe was skipped.
TOC_LINE_RE = re.compile(r"""
    ^(\d+(?:\.\d+)*)\s
    ([\-\s\w\d:<>,()"'?]+)
    (?:\s\.\s|\.)*\s(\d+)$
    """, re.X)


def normalize_punctuation(text):
    """Return *text* with curly quotes and en dashes replaced by ASCII."""
    for fancy, plain in PUNCTUATION_REPLACEMENTS:
        text = text.replace(fancy, plain)
    return text


def toc_to_bookmarks(text, page_offset=0):
    """Convert table-of-contents text into bookmark records.

    Each line of *text* that looks like ``<number> <title> ... <page>``
    yields one record; every other line is ignored.

    Args:
        text: the table of contents as (unicode) text, one entry per line.
        page_offset: value added to every page number found in *text*.

    Returns:
        A list of strings, one per matched entry. Each string holds the
        four lines ``BookmarkBegin``, ``BookmarkTitle: ...``,
        ``BookmarkLevel: ...`` and ``BookmarkPageNumber: ...`` joined by
        newlines (no trailing newline).
        NOTE(review): this record layout looks like the one used by
        pdftk's ``update_info`` -- confirm against the consuming tool.
    """
    records = []
    for line in normalize_punctuation(text).split('\n'):
        match = TOC_LINE_RE.search(line)
        if match is None:
            continue
        number, title, page = match.groups()
        # The title group can capture the whitespace that precedes the dot
        # leaders; drop it so it does not end up in the bookmark title.
        title = title.strip()
        # Nesting level == number of dot-separated components ("2.3.1" -> 3).
        depth = len(number.split('.'))
        records.append('\n'.join([
            'BookmarkBegin',
            'BookmarkTitle: ' + number + ' ' + title,
            'BookmarkLevel: ' + str(depth),
            'BookmarkPageNumber: ' + str(int(page) + page_offset),
        ]))
    return records


def main(toc_path='toc.txt', page_begin=19):
    """Read *toc_path* and print one bookmark record per TOC entry.

    Args:
        toc_path: path of the text file holding the table of contents.
        page_begin: offset between the page number printed in the PDF and
            the actual page number.
    """
    # Decode explicitly as UTF-8 so the punctuation table matches on both
    # Python 2 and Python 3; undecodable bytes are replaced rather than
    # raised on, and lines containing them simply fail to match.
    with io.open(toc_path, encoding='utf-8', errors='replace') as f:
        txt = f.read()
    for record in toc_to_bookmarks(txt, page_begin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment