tosh1ki/pdfmine.py

## pdfmine.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = 'tosh1ki'
__email__ = 'tosh1ki@yahoo.co.jp'
__date__ = '2014-12-27'

import re


# Typographic characters mapped to plain ASCII so they fit the title
# character class used by _TOC_LINE_RE below.
_PUNCTUATION_FIXES = (
    (u'\u201c', u'"'),   # left double quotation mark
    (u'\u201d', u'"'),   # right double quotation mark
    (u'\u2013', u'-'),   # en dash
    (u'\u2019', u'\''),  # right single quotation mark
)

# One table-of-contents line: section number, title, optional dot leaders,
# printed page number.  Kept as a raw string so the regex escapes are not
# interpreted (and warned about) as string escapes.
_TOC_LINE_RE = re.compile(r"""
    ^(\d+(?:\.\d+)*)\s
    ([\-\s\w\d:<>,()"?]+)
    (?:\s\.\s|\.)*\s(\d+)$
    """, re.X)


def toc_to_bookmarks(txt, page_begin=19):
    """Convert table-of-contents text into bookmark record lines.

    Each line of *txt* that looks like ``<section number> <title> <page>``
    (optionally with dot leaders before the page) yields four output lines:
    ``BookmarkBegin``, ``BookmarkTitle``, ``BookmarkLevel`` and
    ``BookmarkPageNumber``.  Lines that do not match are skipped.

    Args:
        txt: The table-of-contents text.  Byte strings are decoded as
            UTF-8 first (undecodable bytes are replaced, which simply makes
            the affected line fail to match).
        page_begin: Offset added to every printed page number, i.e. the gap
            between the page number printed in the PDF and the actual page
            index.

    Returns:
        A flat list of record lines, four per matched entry, in input order.
    """
    if isinstance(txt, bytes):
        txt = txt.decode('utf-8', 'replace')

    # Normalise typographic punctuation to ASCII before matching.
    for fancy, plain in _PUNCTUATION_FIXES:
        txt = txt.replace(fancy, plain)

    records = []
    for line in txt.split('\n'):
        found = _TOC_LINE_RE.match(line)
        if found is None:
            continue
        number, title, page = found.groups()
        # Nesting depth is the number of dotted components: "2.3.1" -> 3.
        depth = len(number.split('.'))
        records.extend([
            'BookmarkBegin',
            'BookmarkTitle: ' + number + ' ' + title,
            'BookmarkLevel: ' + str(depth),
            'BookmarkPageNumber: ' + str(int(page) + page_begin),
        ])
    return records


def main(toc_path='toc.txt', page_begin=19):
    """Read *toc_path* and print its bookmark records to standard output.

    NOTE(review): the Bookmark* record layout looks like the input expected
    by pdftk's ``update_info`` -- confirm against the intended consumer.
    """
    # io.open gives the same decoded, newline-normalised text on Python 2
    # and Python 3; imported here because this block is the script entry.
    import io

    with io.open(toc_path, 'r', encoding='utf-8', errors='replace') as f:
        txt = f.read()

    for record_line in toc_to_bookmarks(txt, page_begin):
        print(record_line)


if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	__author__ = 'tosh1ki'
	__email__ = 'tosh1ki@yahoo.co.jp'
	__date__ = '2014-12-27'

	import re


	# Typographic characters mapped to plain ASCII so they fit the title
	# character class used by _TOC_LINE_RE below.
	_PUNCTUATION_FIXES = (
	    (u'\u201c', u'"'),   # left double quotation mark
	    (u'\u201d', u'"'),   # right double quotation mark
	    (u'\u2013', u'-'),   # en dash
	    (u'\u2019', u'\''),  # right single quotation mark
	)

	# One table-of-contents line: section number, title, optional dot
	# leaders, printed page number.  The alternation in the dot-leader group
	# must be an unescaped "|"; an escaped pipe would only match a literal
	# "|" and reject ordinary ". . ." leaders.
	_TOC_LINE_RE = re.compile(r"""
	    ^(\d+(?:\.\d+)*)\s
	    ([\-\s\w\d:<>,()"?]+)
	    (?:\s\.\s|\.)*\s(\d+)$
	    """, re.X)


	def toc_to_bookmarks(txt, page_begin=19):
	    """Convert table-of-contents text into bookmark record lines.

	    Each line of *txt* shaped like ``<section number> <title> <page>``
	    (optionally with dot leaders before the page) yields four output
	    lines: ``BookmarkBegin``, ``BookmarkTitle``, ``BookmarkLevel`` and
	    ``BookmarkPageNumber``.  Non-matching lines are skipped.

	    Args:
	        txt: The table-of-contents text.  Byte strings are decoded as
	            UTF-8 first, with undecodable bytes replaced.
	        page_begin: Offset added to every printed page number, i.e. the
	            gap between the page number printed in the PDF and the
	            actual page index.

	    Returns:
	        A flat list of record lines, four per matched entry.
	    """
	    if isinstance(txt, bytes):
	        txt = txt.decode('utf-8', 'replace')

	    # Normalise typographic punctuation to ASCII before matching.
	    for fancy, plain in _PUNCTUATION_FIXES:
	        txt = txt.replace(fancy, plain)

	    records = []
	    for line in txt.split('\n'):
	        found = _TOC_LINE_RE.match(line)
	        if found is None:
	            continue
	        number, title, page = found.groups()
	        # Nesting depth is the number of dotted components.
	        depth = len(number.split('.'))
	        records.extend([
	            'BookmarkBegin',
	            'BookmarkTitle: ' + number + ' ' + title,
	            'BookmarkLevel: ' + str(depth),
	            'BookmarkPageNumber: ' + str(int(page) + page_begin),
	        ])
	    return records


	def main(toc_path='toc.txt', page_begin=19):
	    """Read *toc_path* and print its bookmark records to standard output."""
	    # io.open behaves the same on Python 2 and Python 3; imported here
	    # because this block is the script entry point.
	    import io

	    with io.open(toc_path, 'r', encoding='utf-8', errors='replace') as f:
	        txt = f.read()

	    for record_line in toc_to_bookmarks(txt, page_begin):
	        print(record_line)


	if __name__ == '__main__':
	    main()