#!/usr/bin/env python3
import re
from contextlib import closing
from epub import create_epub, DEFAULT_STYLESHEET
import requests
import requests_cache
from bs4 import BeautifulSoup
from smartypants import smartypants
# Book metadata; passed to create_epub() for the title page and package file.
author = 'Pusakuronu'
title = 'Dungeon Keeper Ami'
publisher = 'Anime Addventure'

# Base URL of the thread to scrape; further pages are fetched from
# f'{url_base}/page-{n}'.
url_base = ''
#list_url = url_base + 'authors/{author}.html?tag={title}'.format(author=author, title=title.replace(' ', '+'))
#story_url_re = re.compile(r'.*/(\d+).html')
#header_re = re.compile(r'{}: (.+) \[Episode \d+\]'.format(title))

# URL the title picture is downloaded from.
titlepic_url = ''

# Extra CSS appended to the epub module's DEFAULT_STYLESHEET: renders <hr> as
# a thin centred rule decorated with a "☿" glyph.
stylesheet = '''\
hr {
    margin: 0 20% 55px; padding: 19px 0; line-height: 38px;
    border: none; border-bottom: 1px solid black; text-align: center;
    color: inherit; background-color: inherit;
}
hr:before {
    content: "☿"; display: inline-block; float: left; position: relative; left: -8px;
    margin: 0 50%; padding: 0 4px; background-color: inherit; cursor: default;
}
'''
def parse(url):
    """Download *url* and return it parsed with BeautifulSoup's html5lib parser.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: the server did not respond within the timeout.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=60)
    # Fail loudly rather than parsing an error page as if it were thread content.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html5lib')
def get_posts(pages):
    """Yield ``(title, body)`` for every threadmarked post in *pages*.

    pages: iterable of post elements offering a BeautifulSoup-style
        ``find(class_=...)``.

    Posts without a ``threadmarker`` element are skipped. The title is the
    last child node of the marker's ``label`` element with surrounding
    whitespace stripped; the body is whatever ``find(class_='messageText')``
    returns for the post.
    """
    for post in pages:
        marker = post.find(class_='threadmarker')
        if not marker:
            continue
        # Only the trailing node of the label is the title. Taking the last
        # child (instead of demanding exactly three children) keeps a label
        # with a different number of leading nodes from raising ValueError.
        *_, title = marker.find(class_='label').children
        yield title.strip(), post.find(class_='messageText')
# Fetch the first page of the thread, then every further page named by its
# page navigation.
thread_pages = [parse(url_base)]
nav = thread_pages[0].find(class_='PageNav')
# find() returns None when the page has no PageNav element (presumably a
# single-page thread); in that case there is nothing more to fetch.
if nav is not None:
    first_page, last_page = int(nav['data-start']), int(nav['data-last'])
    thread_pages += [parse(f'{url_base}/page-{p}') for p in range(first_page, last_page + 1)]

# Flatten all pages into one list of posts and keep the threadmarked ones.
pages = [msg for tp in thread_pages for msg in tp.find_all('li', class_='message')]
posts = list(get_posts(pages))

# One chapter per threadmarked post: (id, header, html). The position in the
# thread doubles as the chapter id, which create_epub uses as the file name.
chapters = [
    (id_, header, smartypants(f'<h1>{header}</h1>\n{html.prettify()}'))
    for id_, (header, html) in enumerate(posts)
]

# With no title picture URL configured, pass None so create_epub builds its
# text-only title page instead of requesting an empty URL.
if titlepic_url:
    titlepic_response = requests.get(titlepic_url)
    titlepic_response.raise_for_status()
    titlepic = titlepic_response.content
else:
    titlepic = None

create_epub(title, author, publisher, chapters, titlepic=titlepic, stylesheet=DEFAULT_STYLESHEET + stylesheet)
import sys
from zipfile import ZipFile, ZIP_DEFLATED
from bs4 import BeautifulSoup, Tag
# Location of the title picture inside the OEBPS directory, and the manifest
# entry that declares it.
TITLEPIC_PATH = 'images/title.png'
TITLEPIC_ITEM = '\t\t<item id="imgl" href="{}" media-type="image/png"/>'.format(TITLEPIC_PATH)
# Chapter id (and therefore file name) of the generated title page.
TITLEPAGE_ID = '0-titlepage'

XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'

# NOTE(review): in the templates below, the closing tags, the
# dc:title/dc:language/dc:publisher, docTitle and navLabel elements, and the
# namespace URIs were missing or blank and have been filled in following the
# EPUB 2 (OCF / OPF / NCX) schemas and the keyword arguments create_epub
# passes -- confirm against the intended templates.

# Stylesheet used when the caller does not supply one.
DEFAULT_STYLESHEET = '''\
img {
    max-width: 100%;
    max-height: 100%;
}
'''

# META-INF/container.xml: points the reading system at the package file.
CONTAINER = XML_HEADER + '''
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
    <rootfiles>
        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
    </rootfiles>
</container>
'''

# OEBPS/content.opf: metadata, manifest and spine.
# Call as content(title=, author=, publisher=, uuid=, titlepic_item=, items=, itemrefs=).
content = (XML_HEADER + '''
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookID" version="2.0">
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
        <dc:title>{title}</dc:title>
        <dc:creator opf:role="aut">{author}</dc:creator>
        <dc:language>en</dc:language>
        <dc:rights>Public Domain</dc:rights>
        <dc:publisher>{publisher}</dc:publisher>
        <dc:identifier id="BookID" opf:scheme="UUID">{uuid}</dc:identifier>
    </metadata>
    <manifest>
        <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
        <item id="style" href="stylesheet.css" media-type="text/css"/>
{titlepic_item}
{items}
    </manifest>
    <spine toc="ncx">
{itemrefs}
    </spine>
</package>
''').format

# Manifest and spine entries for one chapter; call as item(id=...) / itemref(id=...).
item = '\t\t<item id="{id}" href="{id}.xhtml" media-type="application/xhtml+xml"/>'.format
itemref = '\t\t<itemref idref="{id}"/>'.format

# OEBPS/toc.ncx: table of contents. Call as toc(title=, uuid=, navpoints=).
toc = (XML_HEADER + '''
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
    <head>
        <meta name="dtb:uid" content="{uuid}"/>
        <meta name="dtb:depth" content="1"/>
        <meta name="dtb:totalPageCount" content="0"/>
        <meta name="dtb:maxPageNumber" content="0"/>
    </head>
    <docTitle>
        <text>{title}</text>
    </docTitle>
    <navMap>
{navpoints}
    </navMap>
</ncx>
''').format

# One table-of-contents entry; call as navpoint(id=, header=, order=).
navpoint = '''\
        <navPoint id="{id}" playOrder="{order}">
            <navLabel>
                <text>{header}</text>
            </navLabel>
            <content src="{id}.xhtml"/>
        </navPoint>'''.format

# Title page markup; call as titlepage(title=, content=). Literal CSS braces
# are doubled because the template goes through str.format.
titlepage = '''\
<html>
<head>
    <title>{title}</title>
    <style type="text/css">
        @page {{ padding: 0; margin: 0 }}
        body {{ text-align: center; padding: 0; margin: 0 }}
    </style>
</head>
<body>
{content}
</body>
</html>
'''.format
def _wrap_children(page, parent, name, keep=()):
    """Move the children of *parent* into a new <name> tag appended to *parent*.

    Children whose tag name is listed in *keep* stay where they are.
    Returns the new wrapper tag.
    """
    wrapper = page.new_tag(name)
    # Iterate over a copy: extract() mutates parent.contents.
    for child in list(parent.contents):
        if getattr(child, 'name', None) in keep:
            continue
        wrapper.append(child.extract())
    parent.append(wrapper)
    return wrapper


def xhtmlify(page):
    """Turn the parsed document *page* into an XHTML document, in place.

    Ensures <html>, <head> and <body> elements exist, gives the document a
    <title> taken from its first <h1> (when it has an <h1> and no <title>
    yet), sets the XHTML namespace and language on <html>, and flags the
    document as XML so an XML header is emitted on output.

    page: BeautifulSoup document; modified in place. Returns None.
    """
    html_tag = page.find('html')

    if page.find('body') is None:  # assume sequence of divs/paragraphs/…
        container = page if html_tag is None else html_tag
        _wrap_children(page, container, 'body', keep=('head',))

    if html_tag is None:  # assume plain body tag
        html_tag = _wrap_children(page, page, 'html')

    # Look up an existing <head> first, so head_tag is bound on every path.
    head_tag = page.find('head')
    if head_tag is None:
        head_tag = page.new_tag('head')
        html_tag.insert(0, head_tag)

    h1 = page.find('h1')
    # Do not add a second <title> to a document that already has one.
    if h1 is not None and head_tag.find('title') is None:
        title_tag = page.new_tag('title')
        title_tag.string = h1.get_text()
        head_tag.insert(0, title_tag)

    # An empty xmlns would put the elements in no namespace at all.
    html_tag['xmlns'] = 'http://www.w3.org/1999/xhtml'
    html_tag['xml:lang'] = 'en'
    page.is_xml = True  # emit xml header
def create_parts(title_page, chapters):
    """yields toc and index entries, as well as chapter tuples with prepended title page

    title_page: markup of the title page; it becomes the first chapter, with
        id TITLEPAGE_ID and play order 1.
    chapters: sequence of (id, header, page) tuples; numbered from play order 2.

    Each yielded value is a 4-tuple (manifest item, spine itemref, navpoint,
    chapter tuple), in the order create_epub unpacks it.
    """
    yield (
        item(id=TITLEPAGE_ID),
        itemref(id=TITLEPAGE_ID),
        navpoint(id=TITLEPAGE_ID, header='Title page', order=1),
        (TITLEPAGE_ID, 'Title page', title_page))

    for order, chapter in enumerate(chapters, 2):
        id_, header, _ = chapter
        yield (
            item(id=id_),
            itemref(id=id_),
            navpoint(id=id_, header=header, order=order),
            chapter)
def create_epub(title, author, publisher, chapters, path=None, *, uuid=None, titlepic=None, stylesheet=DEFAULT_STYLESHEET):
    """Creates and saves an epub file.

    chapters: sequence of (id, title, page) tuples. The ids are used as filenames.
        page can be a string or BeautifulSoup. it may be a body tag, a series of content tags, or a whole (X)HTML document.
    path: path to write to. Defaults to '<author> – <title>.epub'.
    uuid: book identifier. Defaults to '<author>-<title>', lowercased, with
        spaces replaced by underscores.
    titlepic: path to or bytes of png file. Without it the title page shows
        title, author and publisher as headings.
    stylesheet: CSS text written to OEBPS/stylesheet.css.
    """
    # Imported here because the module-level import only brings in ZIP_DEFLATED.
    from zipfile import ZIP_STORED

    if path is None:
        path = '{} – {}.epub'.format(author, title)
    if uuid is None:
        uuid = '{}-{}'.format(author, title).replace(' ', '_').lower()

    if titlepic is None:
        titlepic_item = ''
        title_page = titlepage(title=title, content='<h1>{}</h1>\n<h2>{}</h2>\n<h3>{}</h3>'.format(title, author, publisher))
    else:
        titlepic_item = TITLEPIC_ITEM
        if isinstance(titlepic, str):  # a path: load the image bytes
            with open(titlepic, 'rb') as f:
                titlepic = f.read()
        title_page = titlepage(title=title, content='<img src="{}"/>'.format(TITLEPIC_PATH))

    items, itemrefs, navpoints, chapters = zip(*create_parts(title_page, chapters))

    with ZipFile(path, 'w', ZIP_DEFLATED) as epub:
        # The EPUB container format requires 'mimetype' to be the first entry
        # in the archive and to be stored uncompressed.
        epub.writestr('mimetype', 'application/epub+zip', compress_type=ZIP_STORED)
        epub.writestr('META-INF/container.xml', CONTAINER)
        if titlepic:
            epub.writestr('OEBPS/' + TITLEPIC_PATH, titlepic)
        epub.writestr('OEBPS/stylesheet.css', stylesheet)
        epub.writestr('OEBPS/content.opf', content(title=title, author=author, publisher=publisher, uuid=uuid,
            titlepic_item=titlepic_item, items='\n'.join(items), itemrefs='\n'.join(itemrefs)))
        epub.writestr('OEBPS/toc.ncx', toc(title=title, uuid=uuid, navpoints='\n'.join(navpoints)))

        for id_, _, chapter in chapters:
            # BeautifulSoup is itself a Tag subclass, so whole documents must
            # be recognised before the plain-Tag case.
            if isinstance(chapter, BeautifulSoup):
                pass
            elif isinstance(chapter, Tag):
                # A bare tag: give it a fresh document to live in.
                t, chapter = chapter, BeautifulSoup('<!doctype html><meta charset=utf-8>', 'html5lib')
                chapter.body.append(t)
            else:
                chapter = BeautifulSoup(chapter, 'html5lib')

            chapter.find('head').append(chapter.new_tag('link', href='stylesheet.css', type='text/css', rel='stylesheet'))
            xhtmlify(chapter)  # enforce XHTML
            epub.writestr('OEBPS/{}.xhtml'.format(id_), chapter.prettify())
raekuul commented Mar 6, 2015

The script is leaving out chunks of chapters, and it isn't grabbing any chapters after "Frenzied Fortification".

