Skip to content

Instantly share code, notes, and snippets.

@shimizukawa
Created February 16, 2020 20:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shimizukawa/474ecf0905e57820ebb8d2b3b65433f3 to your computer and use it in GitHub Desktop.
Save shimizukawa/474ecf0905e57820ebb8d2b3b65433f3 to your computer and use it in GitHub Desktop.
EPUBをSphinxソースに変換する
import pathlib
import dataclasses
import typing
import re
import ebooklib
from ebooklib import epub
import html2text
import m2r
# Directory that receives every generated file (chapters, index, images).
# It is created as a module-level side effect, so it exists before main() runs.
OUT_DIR = pathlib.Path('./output/')
OUT_DIR.mkdir(exist_ok=True, parents=True)
@dataclasses.dataclass
class Heading:
    """A table-of-contents entry together with its position in the outline."""

    # Path of 1-based indices from the top of the ToC down to this entry,
    # e.g. (2, 1) is the first child of the second top-level entry. Its
    # length is the nesting depth, so the tuple is variable-length.
    sec: typing.Tuple[int, ...]
    # The ToC entry itself; callers read its ``href`` and ``title``.
    node: typing.Union[epub.Link, epub.Section]
def process_toc(toc, sec=()) -> typing.List[Heading]:
    """Flatten a nested table of contents into a list of headings.

    Args:
        toc: A ToC node. A tuple is handled as a pair whose two elements are
            both processed at the current position; a list numbers each of
            its items from 1; anything else is taken as a single entry.
        sec: Section-number path of the enclosing node (empty at the top).

    Returns:
        The headings in document order, each carrying its section path.
    """
    if isinstance(toc, tuple):
        # Both halves of the pair keep the position of the pair itself.
        return process_toc(toc[0], sec) + process_toc(toc[1], sec)

    if isinstance(toc, list):
        headings = []
        for position, entry in enumerate(toc, start=1):
            headings.extend(process_toc(entry, sec=sec + (position,)))
        return headings

    # Leaf: a single entry at the current position.
    return [Heading(sec, toc)]
# Characters that must not end up in a generated file name: any whitespace,
# the path separators ``/`` and ``\``, and the characters Windows rejects in
# file names (``? : * < > | "``).
NORMALIZER = re.compile(r'[\s?:*/\\<>|"]')


def normalize(name):
    """Turn a heading title into a string that is safe to use as a file name.

    Every character matched by ``NORMALIZER`` is replaced with ``-`` and runs
    of consecutive hyphens are then collapsed into a single one. Path
    separators are replaced too, so the result never refers to a
    subdirectory.

    Args:
        name: The title text to convert.

    Returns:
        The sanitized name. Leading and trailing hyphens are kept.
    """
    normalized = NORMALIZER.sub('-', name)
    # Adjacent replaced characters (e.g. ": ") would otherwise yield "--".
    return re.sub(r'-+', '-', normalized)
def main():
    """Convert the EPUB into Sphinx reST sources under ``OUT_DIR``.

    Writes one ``.rst`` file per top-level table-of-contents entry (nested
    entries are appended to the file of the chapter they follow), then an
    ``index.rst`` whose toctree lists those chapters, and finally copies the
    image and cover items out of the book.

    Raises:
        NotImplementedError: If the book contains an item that is not a
            document, navigation, style, image or cover item.
    """
    book = epub.read_epub('EXPERT_PYTHON_PROGRAMMING_THIRD_EDITION-9781789808896.epub')
    heads = process_toc(book.toc)
    names = _write_chapters(book, heads)
    _write_index(names)
    _write_assets(book)


def _write_chapters(book, heads):
    """Write each heading's content to its chapter file; return chapter names."""
    names = []
    # Chapter files already (re)created during this run. The first write to a
    # file truncates it so that re-running the script does not append a second
    # copy of every chapter; later writes in the same run append.
    started = set()
    for h in heads:
        item = book.get_item_with_href(h.node.href)
        depth = len(h.sec)
        md = html2text.html2text(item.get_content().decode())

        # heading level
        first_line, rest_lines = md.split('\n', 1)
        if depth == 1 and first_line.startswith('# '):
            # Chapter title: expects m2r to return the title line followed by
            # one underline line; the underline is reused as an overline.
            rst_first_line = m2r.convert(first_line)
            _t, _hr = rst_first_line.strip().split('\n')
            rst_first_line = f'{_hr}\n{_t}\n{_hr}\n'
        else:
            # Adds one extra '#' per nesting level beyond the second
            # (nothing is added for depth 1 or 2).
            first_line = '#' * (depth-2) + first_line
            rst_first_line = m2r.convert(first_line)
        rst = rst_first_line + m2r.convert(rest_lines)

        if depth == 1:
            # A top-level entry starts a new chapter file; deeper entries
            # keep writing to the most recent one. Assumes the first heading
            # is top-level, otherwise ``file`` is not yet bound.
            file = OUT_DIR / f'{normalize(h.node.title)}.rst'
            names.append(file.relative_to('.').stem)
            print('Writing ...', h.sec[0], file.relative_to('.'))

        mode = 'a' if file in started else 'w'
        started.add(file)
        with file.open(mode, encoding='utf-8') as f:
            f.write(rst)
    return names


def _write_index(names):
    """Write ``index.rst`` with a toctree entry for every chapter name."""
    # Same encoding as the chapter files, so chapter names containing
    # non-ASCII characters do not depend on the platform default encoding.
    with (OUT_DIR / 'index.rst').open('w', encoding='utf-8') as f:
        f.write('Expert Python Programming 3rd\n')
        f.write('=============================\n')
        f.write('.. toctree::\n')
        f.write('\n')
        f.write(''.join([f' {name}\n' for name in names]))


def _write_assets(book):
    """Copy image and cover items of the book into ``OUT_DIR``."""
    for item in book.get_items():
        file = (OUT_DIR / item.get_name())
        kind = item.get_type()
        if kind == ebooklib.ITEM_DOCUMENT:
            # Documents were already converted to reST chapters.
            continue
        if kind == ebooklib.ITEM_NAVIGATION:
            print('skip navigation file')
            continue
        if kind == ebooklib.ITEM_STYLE:
            print('skip style file')
            continue

        file.parent.mkdir(exist_ok=True, parents=True)
        if kind == ebooklib.ITEM_IMAGE:
            print('Writing ...', file.relative_to('.'))
            file.write_bytes(item.get_content())
        elif kind == ebooklib.ITEM_COVER:
            # The cover is always stored under the fixed name cover.png.
            file = file.with_name('cover.png')
            print('Writing ...', file.relative_to('.'))
            file.write_bytes(item.get_content())
        else:
            raise NotImplementedError('Unknown Type: %s' % item)
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()
docutils==0.16
EbookLib==0.17.1
html2text==2020.1.16
lxml==4.5.0
m2r==0.2.1
mistune==0.8.4
six==1.14.0
EbookLib
html2text
m2r
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment