Skip to content

Instantly share code, notes, and snippets.

@flying-sheep
Last active June 17, 2019 10:52
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save flying-sheep/7958790 to your computer and use it in GitHub Desktop.
Save flying-sheep/7958790 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
from contextlib import closing
from epub import create_epub, DEFAULT_STYLESHEET
import requests
import requests_cache
from bs4 import BeautifulSoup
from smartypants import smartypants
# Book metadata handed to create_epub() further down in this script.
author = 'Pusakuronu'
title = 'Dungeon Keeper Ami'
publisher = 'Anime Addventure'
# First page of the thread to scrape; further pages are requested as
# '<url_base>/page-<n>'.
url_base = 'https://forums.sufficientvelocity.com/threads/dungeon-keeper-ami-sailor-moon-dungeon-keeper-story-only-thread.30066'
# Commented-out leftovers; nothing in this script uses them.
#list_url = url_base + 'authors/{author}.html?tag={title}'.format(author=author, title=title.replace(' ', '+'))
#story_url_re = re.compile(r'.*/(\d+).html')
#header_re = re.compile(r'{}: (.+) \[Episode \d+\]'.format(title))
# Image that is downloaded and passed to create_epub() as the title picture.
titlepic_url = 'http://fc00.deviantart.net/fs70/f/2011/063/b/e/dungeon_keeper_ami_by_paulobarrios-d3avri8.jpg'
# Extra CSS appended to DEFAULT_STYLESHEET: styles <hr> as a thin bottom rule
# and inserts a "☿" glyph through the hr:before pseudo-element.
stylesheet = '''\
hr {
margin: 0 20% 55px; padding: 19px 0; line-height: 38px;
border: none; border-bottom: 1px solid black; text-align: center;
color: inherit; background-color: inherit;
}
hr:before {
content: "☿"; display: inline-block; float: left; position: relative; left: -8px;
margin: 0 50%; padding: 0 4px; background-color: inherit; cursor: default;
}
'''
def parse(url):
    """Download *url* and return it parsed with BeautifulSoup's html5lib parser.

    Raises requests.HTTPError when the server answers with a 4xx/5xx status,
    so an error page is never parsed as if it were thread content.
    """
    r = requests.get(url)
    # Fail here with a clear HTTP error rather than later with a confusing
    # "element not found" error on an error page.
    r.raise_for_status()
    return BeautifulSoup(r.text, 'html5lib')
def get_posts(pages):
    """Yield a (title, message body tag) pair for every threadmarked post.

    pages: iterable of parsed message elements. Elements without a
    'threadmarker' child are skipped. The title is the third child of the
    marker's 'label' element, stripped of surrounding whitespace.
    """
    for post in pages:
        marker = post.find(class_='threadmarker')
        if not marker:
            continue
        # The label is expected to hold exactly three children; only the
        # last one carries the title text.
        label_children = marker.find(class_='label').children
        _, _, heading = label_children
        yield heading.strip(), post.find(class_='messageText')
# Cache HTTP responses (cache name 'dkami') so repeated runs do not
# re-download every thread page.
requests_cache.install_cache('dkami')

# The first page carries the page navigation that tells us which further
# pages to request.
first_page = parse(url_base)
nav = first_page.find(class_='PageNav')
first_no = int(nav['data-start'])
last_no = int(nav['data-last'])
thread_pages = [first_page]
for page_no in range(first_no, last_no + 1):
    thread_pages.append(parse(f'{url_base}/page-{page_no}'))

# Flatten all thread pages into one list of message elements.
pages = []
for thread_page in thread_pages:
    pages.extend(thread_page.find_all('li', class_='message'))

posts = list(get_posts(pages))

# One chapter per threadmarked post: (id, header, html string with heading).
chapters = [
    (index, heading, smartypants(f'<h1>{heading}</h1>\n{body.prettify()}'))
    for index, (heading, body) in enumerate(posts)
]

titlepic = requests.get(titlepic_url).content
create_epub(
    title, author, publisher, chapters,
    titlepic=titlepic,
    stylesheet=DEFAULT_STYLESHEET + stylesheet,
)
import sys
from zipfile import ZipFile, ZIP_DEFLATED
from bs4 import BeautifulSoup, Tag
# Path of the title picture relative to the OEBPS directory of the archive.
TITLEPIC_PATH = 'images/title.png'
# Manifest entry declaring the title picture.
# NOTE(review): path and media type are fixed to PNG, while create_epub()
# writes whatever bytes it is given — other image formats end up mislabeled.
TITLEPIC_ITEM = '\t\t<item id="imgl" href="{}" media-type="image/png"/>'.format(TITLEPIC_PATH)
# Id of the generated title page; ids double as file names ('<id>.xhtml').
TITLEPAGE_ID = '0-titlepage'
XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'
# Base CSS: keeps images within the page bounds.
DEFAULT_STYLESHEET = '''\
img {
max-width: 100%;
max-height: 100%;
}
'''
# Content of META-INF/container.xml; points at the OPF package file.
CONTAINER = XML_HEADER + '''
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''
# Formatter for OEBPS/content.opf (bound str.format). Fields: title, author,
# publisher, uuid, titlepic_item, items, itemrefs.
content = (XML_HEADER + '''
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookID" version="2.0">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>{title}</dc:title>
<dc:creator opf:role="aut">{author}</dc:creator>
<dc:language>en-US</dc:language>
<dc:rights>Public Domain</dc:rights>
<dc:publisher>{publisher}</dc:publisher>
<dc:identifier id="BookID" opf:scheme="UUID">{uuid}</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="style" href="stylesheet.css" media-type="text/css"/>
{titlepic_item}
{items}
</manifest>
<spine toc="ncx">
{itemrefs}
</spine>
</package>
''').format
# Formatters for one manifest <item> and one spine <itemref>. Field: id.
item = '\t\t<item id="{id}" href="{id}.xhtml" media-type="application/xhtml+xml"/>'.format
itemref = '\t\t<itemref idref="{id}"/>'.format
# Formatter for OEBPS/toc.ncx (bound str.format). Fields: uuid, title, navpoints.
toc = (XML_HEADER + '''
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="{uuid}"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>{title}</text>
</docTitle>
<navMap>
{navpoints}
</navMap>
</ncx>
''').format
# Formatter for one table-of-contents entry. Fields: id, order, header.
navpoint = '''\
<navPoint id="{id}" playOrder="{order}">
<navLabel>
<text>{header}</text>
</navLabel>
<content src="{id}.xhtml"/>
</navPoint>'''.format
# Formatter for the title page document. Fields: title, content.
# The doubled braces are literal CSS braces escaped for str.format.
titlepage = '''\
<html>
<head>
<title>{title}</title>
<style type="text/css">
@page {{ padding: 0; margin: 0 }}
body {{ text-align: center; padding: 0; margin: 0 }}
</style>
</head>
<body>
{content}
</body>
</html>
'''.format
def xhtmlify(page):
    """Complete a parsed page into an XHTML document, modifying it in place.

    page: BeautifulSoup document. It may contain a whole (X)HTML document,
    a lone body tag, or just a sequence of content tags; missing html, body
    and head elements are created around the existing content.
    If the head has no title, one is derived from the first h1 (if any).
    The html element gets the XHTML namespace and an xml:lang of 'en'.
    """
    html_tag = page.find('html')
    if html_tag is None:
        body_tag = page.find('body')
        if body_tag is None:  # assume sequence of divs/paragraphs/…
            body_tag = page.new_tag('body')
            # wrap() cannot be applied to the document root itself, so the
            # top-level nodes are moved instead. append() re-parents each
            # node, hence the iteration over a copy of the child list.
            for node in list(page.contents):
                body_tag.append(node)
            page.append(body_tag)
        # assume plain body tag
        html_tag = body_tag.wrap(page.new_tag('html'))
    elif page.find('body') is None:
        # html element without body: gather everything but the head in one.
        body_tag = page.new_tag('body')
        for node in list(html_tag.contents):
            if getattr(node, 'name', None) != 'head':
                body_tag.append(node)
        html_tag.append(body_tag)

    head_tag = page.find('head')
    if head_tag is None:
        head_tag = page.new_tag('head')
        html_tag.insert(0, head_tag)

    # Derive the title independently of whether the head had to be created:
    # parsers like html5lib always supply a head, but not a title.
    if head_tag.find('title') is None:
        h1 = page.find('h1')
        if h1 is not None:
            title_tag = page.new_tag('title')
            title_tag.string = h1.get_text()
            head_tag.insert(0, title_tag)

    html_tag['xmlns'] = 'http://www.w3.org/1999/xhtml'
    html_tag['xml:lang'] = 'en'
    page.is_xml = True  # emit xml header
def create_parts(title_page, chapters):
    """Yield one (manifest item, spine itemref, navpoint, chapter) tuple per page.

    The first tuple describes the title page (play order 1); the given
    chapters follow in their original order, numbered from 2.
    chapters: iterable of (id, header, page) tuples; each is passed through
    unchanged as the last element of its result tuple.
    """
    def entry(chapter, order):
        # Build the three index strings that belong to one chapter tuple.
        id_, header, _ = chapter
        return (
            item(id=id_),
            itemref(id=id_),
            navpoint(id=id_, header=header, order=order),
            chapter,
        )

    yield entry((TITLEPAGE_ID, 'Title page', title_page), 1)
    for order, chapter in enumerate(chapters, 2):
        yield entry(chapter, order)
def create_epub(title, author, publisher, chapters, path=None, *, uuid=None, titlepic=None, stylesheet=DEFAULT_STYLESHEET):
    """Creates and saves an epub file.

    chapters: sequence of (id, title, page) tuples. The ids are used as filenames.
        page can be a string, a BeautifulSoup document or a single Tag. it may be
        a body tag, a series of content tags, or a whole (X)HTML document.
    path: path to write to. defaults to '<author> – <title>.epub'.
    uuid: book identifier. defaults to a slug built from author and title.
    titlepic: path to or bytes of png file. if omitted, a text title page is generated.
    stylesheet: CSS written to stylesheet.css and linked from every page.
    """
    # Only ZipFile and ZIP_DEFLATED are imported at module level.
    from zipfile import ZIP_STORED

    if path is None:
        path = '{} – {}.epub'.format(author, title)
    if uuid is None:
        uuid = '{}-{}'.format(author, title).replace(' ', '_').lower()

    if titlepic is None:
        titlepic_item = ''
        title_page = titlepage(title=title, content='<h1>{}</h1>\n<h2>{}</h2>\n<h3>{}</h3>'.format(title, author, publisher))
    else:
        titlepic_item = TITLEPIC_ITEM
        if isinstance(titlepic, str):
            # A str is a file path: read the image bytes from that file.
            with open(titlepic, 'rb') as titlepic_file:
                titlepic = titlepic_file.read()
        title_page = titlepage(title=title, content='<img src="{}"/>'.format(TITLEPIC_PATH))

    items, itemrefs, navpoints, chapters = zip(*create_parts(title_page, chapters))

    with ZipFile(path, 'w', ZIP_DEFLATED) as epub:
        # The EPUB container format requires the mimetype entry to come
        # first and to be stored without compression.
        epub.writestr('mimetype', 'application/epub+zip', compress_type=ZIP_STORED)
        epub.writestr('META-INF/container.xml', CONTAINER)
        # Write the picture whenever the manifest declares it.
        if titlepic is not None:
            epub.writestr('OEBPS/' + TITLEPIC_PATH, titlepic)
        epub.writestr('OEBPS/stylesheet.css', stylesheet)
        epub.writestr('OEBPS/content.opf', content(
            title=title, author=author, publisher=publisher, uuid=uuid,
            titlepic_item=titlepic_item, items='\n'.join(items), itemrefs='\n'.join(itemrefs)))
        epub.writestr('OEBPS/toc.ncx', toc(title=title, uuid=uuid, navpoints='\n'.join(navpoints)))

        for id_, _, chapter in chapters:
            if not isinstance(chapter, Tag):
                # Markup string: parse it into a full document.
                chapter = BeautifulSoup(chapter, 'html5lib')
            elif not isinstance(chapter, BeautifulSoup):
                # Bare Tag (BeautifulSoup itself subclasses Tag, so whole
                # documents must not end up here): place it inside an empty
                # document rather than next to its <html> element.
                shell = BeautifulSoup('<!doctype html><meta charset=utf-8>', 'html5lib')
                if chapter.name == 'html':
                    shell.html.replace_with(chapter)
                elif chapter.name == 'body':
                    shell.body.replace_with(chapter)
                else:
                    shell.body.append(chapter)
                chapter = shell

            xhtmlify(chapter)
            chapter.find('head').append(chapter.new_tag('link', href='stylesheet.css', type='text/css', rel='stylesheet'))
            # enforce XHTML
            epub.writestr('OEBPS/{}.xhtml'.format(id_), chapter.prettify())
@raekuul
Copy link

raekuul commented Mar 6, 2015

The script is leaving out chunks of chapters and isn't grabbing chapters after Frenzied Fortification

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment