shimizukawa/converter.py

## converter.py
import pathlib
import dataclasses
import typing
import re

import ebooklib
from ebooklib import epub
import html2text
import m2r


# Root directory for everything this script generates (chapters, index, images).
OUT_DIR = pathlib.Path('./output/')
# Created eagerly at import time so later writes can rely on it existing;
# an already existing directory is not an error.
OUT_DIR.mkdir(parents=True, exist_ok=True)


@dataclasses.dataclass
class Heading:
    """A table-of-contents entry paired with its section number."""

    # Section number as a tuple of 1-based positions, one per nesting level
    # (e.g. ``(2, 3)``). Its length is what ``main`` uses as the heading depth,
    # so it is a variable-length tuple, not a 1-tuple.
    sec: typing.Tuple[int, ...]
    # The TOC entry itself; ``main`` reads its ``href`` and ``title``.
    node: typing.Union[epub.Link, epub.Section]


def process_toc(toc, sec=()) -> typing.List[Heading]:
    """Flatten a nested table of contents into a flat list of headings.

    Args:
        toc: A TOC fragment. A ``list`` is treated as a sequence of siblings,
            a ``tuple`` as a pair whose first two elements are both processed
            at the current level (presumably ebooklib's
            ``(section, children)`` form -- confirm), and anything else as a
            single entry.
        sec: Section number accumulated so far, as a tuple of 1-based
            positions.

    Returns:
        The headings in depth-first order, each carrying its section number.
    """
    # Leaf: a single entry, numbered with whatever has been accumulated.
    if not isinstance(toc, (tuple, list)):
        return [Heading(sec, toc)]

    # Pair: both halves are processed with the *same* section number.
    if isinstance(toc, tuple):
        return process_toc(toc[0], sec) + process_toc(toc[1], sec)

    # List of siblings: each one gets its 1-based position appended.
    headings = []
    for position, entry in enumerate(toc, start=1):
        headings.extend(process_toc(entry, sec=sec + (position,)))
    return headings


# Characters replaced by '-' when a title is turned into a file name:
# any whitespace, plus path separators and characters that are not allowed in
# file names on common platforms (? : * / \ < > | ").
NORMALIZER = re.compile(r'[\s?:*/\\<>|"]')


def normalize(name):
    """Turn a heading title into a string that is safe to use as a file name.

    Every character matched by ``NORMALIZER`` is replaced by ``-`` and runs of
    consecutive ``-`` are then collapsed into a single one. Leading and
    trailing hyphens are kept.

    Args:
        name: The title text.

    Returns:
        The normalized string.
    """
    normalized = NORMALIZER.sub('-', name)
    # Adjacent replaced characters (e.g. ": ") would otherwise yield "--".
    normalized = re.sub(r'-+', '-', normalized)
    return normalized


def main():
    """Convert the EPUB book into reStructuredText files under ``OUT_DIR``.

    Three passes are made over the book:

    1. Chapters -- the document behind every TOC heading is converted
       HTML -> Markdown (``html2text``) -> reST (``m2r``). A depth-1 heading
       starts a new ``<normalized title>.rst`` file; deeper headings are
       appended to the file of the most recent depth-1 heading.
    2. Index -- ``index.rst`` is written with a ``toctree`` listing the
       chapter files.
    3. Assets -- images and the cover are written out; document, navigation
       and style items are skipped.

    Raises:
        ValueError: If a nested heading appears before any depth-1 heading,
            so there is no chapter file to append it to.
        NotImplementedError: If the book contains an item that is neither a
            document, navigation, style, image nor cover item.
    """
    book = epub.read_epub('EXPERT_PYTHON_PROGRAMMING_THIRD_EDITION-9781789808896.epub')

    heads = process_toc(book.toc)
    names = []
    file = None       # chapter file currently being written
    started = set()   # chapter files already (re)created during this run

    # writing chapters
    for h in heads:
        item = book.get_item_with_href(h.node.href)
        depth = len(h.sec)
        md = html2text.html2text(item.get_content().decode())

        # heading level
        # partition() tolerates markdown without any newline (rest is '').
        first_line, _, rest_lines = md.partition('\n')
        if depth == 1 and first_line.startswith('# '):
            # Chapter title: m2r yields "title\nunderline"; reuse the
            # underline as an overline as well.
            rst_first_line = m2r.convert(first_line)
            _t, _hr = rst_first_line.strip().split('\n')
            rst_first_line = f'{_hr}\n{_t}\n{_hr}\n'
        else:
            # Each TOC level below 2 adds one '#' to the first line;
            # for depth <= 2 the multiplier is <= 0 and nothing is added.
            first_line = '#' * (depth-2) + first_line
            rst_first_line = m2r.convert(first_line)

        rst = rst_first_line + m2r.convert(rest_lines)

        if depth == 1:
            basename = normalize(h.node.title)
            file = OUT_DIR / f'{basename}.rst'
            names.append(file.relative_to('.').stem)
            print('Writing ...', h.sec[0], file.relative_to('.'))
        if file is None:
            raise ValueError(
                'nested TOC entry %r appears before any top-level chapter'
                % (h.node.title,))

        # Truncate a chapter file the first time it is touched in this run so
        # that re-running the script does not duplicate content; append for
        # every later section that belongs to the same file.
        mode = 'a' if file in started else 'w'
        started.add(file)
        with file.open(mode, encoding='utf-8') as f:
            f.write(rst)

    # writing index
    # Same encoding as the chapter files; names may contain non-ASCII text.
    with (OUT_DIR / 'index.rst').open('w', encoding='utf-8') as f:
        f.write('Expert Python Programming 3rd\n')
        f.write('=============================\n')
        f.write('.. toctree::\n')
        f.write('\n')
        f.write(''.join([f'   {name}\n' for name in names]))

    # writing images
    for item in book.get_items():
        item_type = item.get_type()
        if item_type == ebooklib.ITEM_DOCUMENT:
            continue
        elif item_type == ebooklib.ITEM_NAVIGATION:
            print('skip navigation file')
            continue
        elif item_type == ebooklib.ITEM_STYLE:
            print('skip style file')
            continue

        target = OUT_DIR / item.get_name()
        if item_type == ebooklib.ITEM_COVER:
            # The cover is always stored as cover.png next to its original
            # location.
            target = target.with_name('cover.png')
        elif item_type != ebooklib.ITEM_IMAGE:
            raise NotImplementedError('Unknown Type: %s' % item)

        target.parent.mkdir(exist_ok=True, parents=True)
        print('Writing ...', target.relative_to('.'))
        target.write_bytes(item.get_content())


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

## requirements.lock
docutils==0.16
EbookLib==0.17.1
html2text==2020.1.16
lxml==4.5.0
m2r==0.2.1
mistune==0.8.4
six==1.14.0

## requirements.txt
EbookLib
html2text
m2r
	import pathlib
	import dataclasses
	import typing
	import re

	import ebooklib
	from ebooklib import epub
	import html2text
	import m2r


	# Root directory for everything this script generates (chapters, index, images).
	OUT_DIR = pathlib.Path('./output/')
	# Created eagerly at import time so later writes can rely on it existing;
	# an already existing directory is not an error.
	OUT_DIR.mkdir(parents=True, exist_ok=True)


	@dataclasses.dataclass
	class Heading:
	    """A table-of-contents entry paired with its section number."""

	    # Section number as a tuple of 1-based positions, one per nesting level
	    # (e.g. ``(2, 3)``). Its length is what ``main`` uses as the heading
	    # depth, so it is a variable-length tuple, not a 1-tuple.
	    sec: typing.Tuple[int, ...]
	    # The TOC entry itself; ``main`` reads its ``href`` and ``title``.
	    node: typing.Union[epub.Link, epub.Section]


	def process_toc(toc, sec=()) -> typing.List[Heading]:
	    """Flatten a nested table of contents into a flat list of headings.

	    Args:
	        toc: A TOC fragment. A ``list`` is treated as a sequence of
	            siblings, a ``tuple`` as a pair whose first two elements are
	            both processed at the current level (presumably ebooklib's
	            ``(section, children)`` form -- confirm), and anything else as
	            a single entry.
	        sec: Section number accumulated so far, as a tuple of 1-based
	            positions.

	    Returns:
	        The headings in depth-first order, each carrying its section
	        number.
	    """
	    r = []
	    if isinstance(toc, tuple):
	        # Pair: both halves are processed with the same section number.
	        r += process_toc(toc[0], sec)
	        r += process_toc(toc[1], sec)
	    elif isinstance(toc, list):
	        # Siblings: each one gets its 1-based position appended.
	        for i, item in enumerate(toc):
	            s = sec + (i+1, )
	            r += process_toc(item, sec=s)
	    else:
	        # Leaf: a single entry with the accumulated section number.
	        r += [Heading(sec, toc)]

	    return r


	# Characters replaced by '-' when a title is turned into a file name:
	# any whitespace, plus path separators and characters that are not allowed
	# in file names on common platforms (? : * / \ < > | ").
	NORMALIZER = re.compile(r'[\s?:*/\\<>|"]')


	def normalize(name):
	    """Turn a heading title into a string that is safe to use as a file name.

	    Every character matched by ``NORMALIZER`` is replaced by ``-`` and runs
	    of consecutive ``-`` are then collapsed into a single one. Leading and
	    trailing hyphens are kept.

	    Args:
	        name: The title text.

	    Returns:
	        The normalized string.
	    """
	    normalized = NORMALIZER.sub('-', name)
	    # Adjacent replaced characters (e.g. ": ") would otherwise yield "--".
	    normalized = re.sub(r'-+', '-', normalized)
	    return normalized


	def main():
	    """Convert the EPUB book into reStructuredText files under ``OUT_DIR``.

	    Three passes are made over the book:

	    1. Chapters -- the document behind every TOC heading is converted
	       HTML -> Markdown (``html2text``) -> reST (``m2r``). A depth-1
	       heading starts a new ``<normalized title>.rst`` file; deeper
	       headings are appended to the file of the most recent depth-1
	       heading.
	    2. Index -- ``index.rst`` is written with a ``toctree`` listing the
	       chapter files.
	    3. Assets -- images and the cover are written out; document,
	       navigation and style items are skipped.

	    Raises:
	        ValueError: If a nested heading appears before any depth-1
	            heading, so there is no chapter file to append it to.
	        NotImplementedError: If the book contains an item that is neither
	            a document, navigation, style, image nor cover item.
	    """
	    book = epub.read_epub('EXPERT_PYTHON_PROGRAMMING_THIRD_EDITION-9781789808896.epub')

	    heads = process_toc(book.toc)
	    names = []
	    file = None       # chapter file currently being written
	    started = set()   # chapter files already (re)created during this run

	    # writing chapters
	    for h in heads:
	        item = book.get_item_with_href(h.node.href)
	        depth = len(h.sec)
	        md = html2text.html2text(item.get_content().decode())

	        # heading level
	        # partition() tolerates markdown without any newline (rest is '').
	        first_line, _, rest_lines = md.partition('\n')
	        if depth == 1 and first_line.startswith('# '):
	            # Chapter title: m2r yields "title\nunderline"; reuse the
	            # underline as an overline as well.
	            rst_first_line = m2r.convert(first_line)
	            _t, _hr = rst_first_line.strip().split('\n')
	            rst_first_line = f'{_hr}\n{_t}\n{_hr}\n'
	        else:
	            # Each TOC level below 2 adds one '#' to the first line;
	            # for depth <= 2 the multiplier is <= 0 and nothing is added.
	            first_line = '#' * (depth-2) + first_line
	            rst_first_line = m2r.convert(first_line)

	        rst = rst_first_line + m2r.convert(rest_lines)

	        if depth == 1:
	            basename = normalize(h.node.title)
	            file = OUT_DIR / f'{basename}.rst'
	            names.append(file.relative_to('.').stem)
	            print('Writing ...', h.sec[0], file.relative_to('.'))
	        if file is None:
	            raise ValueError(
	                'nested TOC entry %r appears before any top-level chapter'
	                % (h.node.title,))

	        # Truncate a chapter file the first time it is touched in this run
	        # so that re-running the script does not duplicate content; append
	        # for every later section that belongs to the same file.
	        mode = 'a' if file in started else 'w'
	        started.add(file)
	        with file.open(mode, encoding='utf-8') as f:
	            f.write(rst)

	    # writing index
	    # Same encoding as the chapter files; names may contain non-ASCII text.
	    with (OUT_DIR / 'index.rst').open('w', encoding='utf-8') as f:
	        f.write('Expert Python Programming 3rd\n')
	        f.write('=============================\n')
	        f.write('.. toctree::\n')
	        f.write('\n')
	        # toctree entries are indented by three spaces under the directive.
	        f.write(''.join([f'   {name}\n' for name in names]))

	    # writing images
	    for item in book.get_items():
	        item_type = item.get_type()
	        if item_type == ebooklib.ITEM_DOCUMENT:
	            continue
	        elif item_type == ebooklib.ITEM_NAVIGATION:
	            print('skip navigation file')
	            continue
	        elif item_type == ebooklib.ITEM_STYLE:
	            print('skip style file')
	            continue

	        target = OUT_DIR / item.get_name()
	        if item_type == ebooklib.ITEM_COVER:
	            # The cover is always stored as cover.png next to its original
	            # location.
	            target = target.with_name('cover.png')
	        elif item_type != ebooklib.ITEM_IMAGE:
	            raise NotImplementedError('Unknown Type: %s' % item)

	        target.parent.mkdir(exist_ok=True, parents=True)
	        print('Writing ...', target.relative_to('.'))
	        target.write_bytes(item.get_content())


	# Run the conversion only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()
	docutils==0.16
	EbookLib==0.17.1
	html2text==2020.1.16
	lxml==4.5.0
	m2r==0.2.1
	mistune==0.8.4
	six==1.14.0