flying-sheep/dkami.py

## dkami.py
#!/usr/bin/env python3

import re
from contextlib import closing
from epub import create_epub, DEFAULT_STYLESHEET

import requests
import requests_cache
from bs4 import BeautifulSoup
from smartypants import smartypants


# Book metadata handed to create_epub() at the end of the script.
author = 'Pusakuronu'
title = 'Dungeon Keeper Ami'
publisher = 'Anime Addventure'

# Address of the thread to scrape; further pages are requested as '<url_base>/page-<n>'.
url_base = 'https://forums.sufficientvelocity.com/threads/dungeon-keeper-ami-sailor-moon-dungeon-keeper-story-only-thread.30066'
# Commented-out leftovers; nothing in the visible script uses these names.
#list_url = url_base + 'authors/{author}.html?tag={title}'.format(author=author, title=title.replace(' ', '+'))
#story_url_re = re.compile(r'.*/(\d+).html')
#header_re = re.compile(r'{}: (.+) \[Episode \d+\]'.format(title))

# Cover image; its downloaded bytes are passed to create_epub() as titlepic.
# NOTE(review): the URL ends in .jpg while epub.py stores the title image as images/title.png — confirm the format.
titlepic_url = 'http://fc00.deviantart.net/fs70/f/2011/063/b/e/dungeon_keeper_ami_by_paulobarrios-d3avri8.jpg'

# Extra CSS appended to DEFAULT_STYLESHEET: styles <hr> as a bottom border with a "☿" glyph inserted before it.
stylesheet = '''\
hr {
 margin: 0 20% 55px; padding: 19px 0; line-height: 38px;
 border: none; border-bottom: 1px solid black; text-align: center;
 color: inherit; background-color: inherit;
}
hr:before {
 content: "☿"; display: inline-block; float: left; position: relative; left: -8px;
 margin: 0 50%; padding: 0 4px; background-color: inherit; cursor: default;
}
'''


def parse(url):
	"""Fetch url and return the response body parsed with html5lib.

	url: address of the page to download.

	Returns the BeautifulSoup tree of the response text.
	Raises requests.HTTPError for a 4xx/5xx response, so that an error page is
	never parsed as if it were thread content.
	"""
	response = requests.get(url)
	response.raise_for_status()
	return BeautifulSoup(response.text, 'html5lib')

def get_posts(pages):
	"""Yield a (title, body) pair for every threadmarked message.

	pages: iterable of parsed message elements. Messages without an element of
		class 'threadmarker' are skipped.

	The title is the stripped third child of the marker's 'label' element (the
	label must have exactly three children); the body is whatever the message
	returns for class 'messageText'.
	"""
	for message in pages:
		marker = message.find(class_='threadmarker')
		if not marker:
			continue
		label = marker.find(class_='label')
		_lead, _prefix, caption = label.children
		yield caption.strip(), message.find(class_='messageText')


# Install a requests cache named 'dkami' so repeated runs can reuse earlier responses.
requests_cache.install_cache('dkami')


# The first page is fetched on its own because it carries the page navigation element.
thread_pages = [parse(url_base)]
nav = thread_pages[0].find(class_='PageNav')
# find() returns None when the first page has no PageNav element
# (presumably a single-page thread); then there is nothing more to fetch.
if nav is not None:
	thread_pages += [parse(f'{url_base}/page-{p}') for p in range(int(nav['data-start']), int(nav['data-last'])+1)]

# Flatten all thread pages into one list of message elements.
pages = [msg for tp in thread_pages for msg in tp.find_all('li', class_='message')]

posts = list(get_posts(pages))

# Each chapter is (id, header, html): the post's position, its threadmark title,
# and the post body under an <h1> heading, run through smartypants.
chapters = []
for id_, (header, html) in enumerate(posts):
	html = smartypants(f'<h1>{header}</h1>\n{html.prettify()}')
	chapters.append((id_, header, html))

# Fail loudly on a bad download instead of embedding an error page as the cover.
titlepic_response = requests.get(titlepic_url)
titlepic_response.raise_for_status()
titlepic = titlepic_response.content

create_epub(title, author, publisher, chapters, titlepic=titlepic, stylesheet=DEFAULT_STYLESHEET + stylesheet)

## epub.py
import sys

from zipfile import ZipFile, ZIP_DEFLATED

from bs4 import BeautifulSoup, Tag

# Location of the title image, relative to the OEBPS directory of the archive.
TITLEPIC_PATH = 'images/title.png'
# Manifest entry declaring the title image.
TITLEPIC_ITEM = '\t\t<item id="imgl" href="{}" media-type="image/png"/>'.format(TITLEPIC_PATH)
# Id of the generated title page; '<id>.xhtml' is its file name.
TITLEPAGE_ID = '0-titlepage'

# XML declaration that starts each XML template in this module.
XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'

# Stylesheet used when create_epub() is not given one: caps images at 100% width and height.
DEFAULT_STYLESHEET = '''\
img {
 max-width: 100%;
 max-height: 100%;
}
'''

# Contents of META-INF/container.xml: names OEBPS/content.opf as the root file.
CONTAINER = XML_HEADER + '''
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
 <rootfiles>
  <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
 </rootfiles>
</container>
'''

# Template for OEBPS/content.opf (metadata, manifest and spine).
# `content` is the bound str.format of the template; call it with the keywords
# title, author, publisher, uuid, titlepic_item, items and itemrefs.
content = (XML_HEADER + '''
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookID" version="2.0">
 <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
  <dc:title>{title}</dc:title>
  <dc:creator opf:role="aut">{author}</dc:creator>
  <dc:language>en-US</dc:language>
  <dc:rights>Public Domain</dc:rights>
  <dc:publisher>{publisher}</dc:publisher>
  <dc:identifier id="BookID" opf:scheme="UUID">{uuid}</dc:identifier>
 </metadata>
 <manifest>
  <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
  <item id="style" href="stylesheet.css" media-type="text/css"/>
{titlepic_item}
{items}
 </manifest>
 <spine toc="ncx">
{itemrefs}
 </spine>
</package>
''').format

# One manifest entry / one spine entry per page; both are called with id=<page id>.
item = '\t\t<item id="{id}" href="{id}.xhtml" media-type="application/xhtml+xml"/>'.format
itemref = '\t\t<itemref idref="{id}"/>'.format

# Template for OEBPS/toc.ncx; call with the keywords uuid, title and navpoints.
toc = (XML_HEADER + '''
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
 <meta name="dtb:uid" content="{uuid}"/>
 <meta name="dtb:depth" content="1"/>
 <meta name="dtb:totalPageCount" content="0"/>
 <meta name="dtb:maxPageNumber" content="0"/>
</head>

<docTitle>
 <text>{title}</text>
</docTitle>

<navMap>
{navpoints}
</navMap>
</ncx>
''').format

# One table-of-contents entry; call with the keywords id, order and header.
navpoint = '''\
 <navPoint id="{id}" playOrder="{order}">
  <navLabel>
   <text>{header}</text>
  </navLabel>
  <content src="{id}.xhtml"/>
 </navPoint>'''.format

# HTML skeleton of the title page; call with the keywords title and content.
# The doubled braces are literal CSS braces escaped for str.format.
titlepage = '''\
<html>
 <head>
  <title>{title}</title>
  <style type="text/css">
   @page {{ padding: 0; margin: 0 }}
   body {{ text-align: center; padding: 0; margin: 0 }}
  </style>
 </head>
 <body>
  {content}
 </body>
</html>
'''.format

def xhtmlify(page):
	"""Complete page in place so that it serialises as an XHTML document.

	page: BeautifulSoup document holding a whole document, a plain body tag,
		or a sequence of divs/paragraphs/….

	Missing body, html and head elements are created. If the head has no title,
	one is taken from the first h1 (when there is one). The html element gets
	the XHTML namespace and xml:lang="en", and is_xml is set so that the
	serialised output starts with an XML header.
	"""
	from bs4 import Doctype  # local import: the module itself only imports BeautifulSoup and Tag

	def adopt_children(parent, wrapper, keep=()):
		"""Move parent's children into wrapper, then attach wrapper to parent.

		The doctype and any tag whose name is listed in keep stay where they are.
		"""
		for node in list(parent.contents):  # copy: extract() mutates parent.contents
			if isinstance(node, Doctype) or getattr(node, 'name', None) in keep:
				continue
			wrapper.append(node.extract())
		parent.append(wrapper)
		return wrapper

	# Tag.wrap() takes a single argument and replaces the element within its
	# parent; the document itself has no parent, so the wrappers are built by
	# moving the children instead.
	html_tag = page.find('html')
	if not page.find('body'):  # assume sequence of divs/paragraphs/…
		adopt_children(html_tag or page, page.new_tag('body'), keep=('head',))

	if not html_tag:  # assume plain body tag
		html_tag = adopt_children(page, page.new_tag('html'))

	head_tag = page.find('head')
	if not head_tag:
		head_tag = page.new_tag('head')
		html_tag.insert(0, head_tag)

	# Also covers a head that exists but is empty, as html5lib creates for fragments.
	if not head_tag.find('title'):
		h1 = page.find('h1')
		if h1:
			title_tag = page.new_tag('title')
			title_tag.string = h1.get_text()
			head_tag.insert(0, title_tag)

	html_tag['xmlns'] = 'http://www.w3.org/1999/xhtml'
	html_tag['xml:lang'] = 'en'
	page.is_xml = True  # emit xml header

def create_parts(title_page, chapters):
	"""Yield the manifest, spine and TOC entries for the title page and every chapter.

	title_page: markup of the title page; it is emitted first, with play order 1.
	chapters: iterable of (id, header, page) tuples; play order starts at 2.

	Each yielded value is (item, itemref, navpoint, chapter tuple). Chapter
	headers are XML-escaped before they are placed into the navpoint, so a
	header containing '&' or '<' cannot break the TOC document.
	"""
	from xml.sax.saxutils import escape  # local import: not among the module-level imports

	yield (
		item(id=TITLEPAGE_ID),
		itemref(id=TITLEPAGE_ID),
		navpoint(id=TITLEPAGE_ID, header='Title page', order=1),
		(TITLEPAGE_ID, 'Title page', title_page))

	for order, chapter in enumerate(chapters, 2):
		id_, header, _ = chapter
		yield (
			item(id=id_),
			itemref(id=id_),
			navpoint(id=id_, header=escape(str(header)), order=order),
			chapter)

def create_epub(title, author, publisher, chapters, path=None, *, uuid=None, titlepic=None, stylesheet=DEFAULT_STYLESHEET):
	"""Creates and saves an epub file.

	title, author, publisher: plain-text metadata. They are XML-escaped before
		being substituted into the package, TOC and title page templates.
	chapters: sequence of (id, title, page) tuples. The ids are used as filenames.
		page can be a string, a Tag or a BeautifulSoup. It may be a body tag,
		a series of content tags, or a whole (X)HTML document.
	path: path to write to. Defaults to '<author> – <title>.epub'.
	uuid: book identifier. Defaults to '<author>-<title>', lowercased, with
		spaces replaced by underscores.
	titlepic: path to or bytes of png file.
	stylesheet: CSS text stored as OEBPS/stylesheet.css.
	"""
	# Local imports: neither name is among the module-level imports.
	from xml.sax.saxutils import escape
	from zipfile import ZIP_STORED

	def xml_safe(text):
		"""Escape text for XML element content or a double-quoted attribute value."""
		return escape(str(text), {'"': '&quot;'})

	if path is None:
		path = '{} – {}.epub'.format(author, title)

	if uuid is None:
		uuid = '{}-{}'.format(author, title).replace(' ', '_').lower()

	safe_title = xml_safe(title)
	safe_author = xml_safe(author)
	safe_publisher = xml_safe(publisher)
	safe_uuid = xml_safe(uuid)

	if titlepic is None:
		titlepic_item = ''
		title_content = '<h1>{}</h1>\n<h2>{}</h2>\n<h3>{}</h3>'.format(safe_title, safe_author, safe_publisher)
	else:
		titlepic_item = TITLEPIC_ITEM
		if isinstance(titlepic, str):
			# titlepic is a path here: replace it with the bytes of that file.
			with open(titlepic, 'rb') as image_file:
				titlepic = image_file.read()
		title_content = '<img src="{}"/>'.format(TITLEPIC_PATH)
	title_page = titlepage(title=safe_title, content=title_content)

	items, itemrefs, navpoints, chapters = zip(*create_parts(title_page, chapters))

	with ZipFile(path, 'w', ZIP_DEFLATED) as epub:
		# The EPUB container format wants 'mimetype' as the first entry, stored uncompressed.
		epub.writestr('mimetype', 'application/epub+zip', compress_type=ZIP_STORED)
		epub.writestr('META-INF/container.xml', CONTAINER)

		if titlepic:
			epub.writestr('OEBPS/' + TITLEPIC_PATH, titlepic)
		epub.writestr('OEBPS/stylesheet.css', stylesheet)
		epub.writestr('OEBPS/content.opf', content(title=safe_title, author=safe_author, publisher=safe_publisher, uuid=safe_uuid,
			titlepic_item=titlepic_item, items='\n'.join(items), itemrefs='\n'.join(itemrefs)))
		epub.writestr('OEBPS/toc.ncx', toc(title=safe_title, uuid=safe_uuid, navpoints='\n'.join(navpoints)))

		for id_, _, chapter in chapters:
			# BeautifulSoup is a subclass of Tag, so whole documents are tested for first.
			if isinstance(chapter, BeautifulSoup):
				document = chapter
			elif isinstance(chapter, Tag):
				document = BeautifulSoup('<!doctype html><meta charset=utf-8>', 'html5lib')
				# Place the tag inside the document, not after its closing </html>.
				if chapter.name == 'body':
					document.body.replace_with(chapter)
				else:
					document.body.append(chapter)
			else:
				document = BeautifulSoup(chapter, 'html5lib')

			xhtmlify(document)

			document.find('head').append(document.new_tag('link', href='stylesheet.css', type='text/css', rel='stylesheet'))

			# enforce XHTML
			epub.writestr('OEBPS/{}.xhtml'.format(id_), document.prettify())
	#!/usr/bin/env python3

	import re
	from contextlib import closing
	from epub import create_epub, DEFAULT_STYLESHEET

	import requests
	import requests_cache
	from bs4 import BeautifulSoup
	from smartypants import smartypants


	# NOTE(review): this run repeats the script constants that appear earlier in the file, at a different indentation — confirm whether the copy is intended.
	# Book metadata handed to create_epub().
	author = 'Pusakuronu'
	title = 'Dungeon Keeper Ami'
	publisher = 'Anime Addventure'

	# Address of the thread to scrape; further pages are requested as '<url_base>/page-<n>'.
	url_base = 'https://forums.sufficientvelocity.com/threads/dungeon-keeper-ami-sailor-moon-dungeon-keeper-story-only-thread.30066'
	# Commented-out leftovers; nothing in the visible script uses these names.
	#list_url = url_base + 'authors/{author}.html?tag={title}'.format(author=author, title=title.replace(' ', '+'))
	#story_url_re = re.compile(r'.*/(\d+).html')
	#header_re = re.compile(r'{}: (.+) \[Episode \d+\]'.format(title))

	# Cover image; its downloaded bytes are passed to create_epub() as titlepic.
	titlepic_url = 'http://fc00.deviantart.net/fs70/f/2011/063/b/e/dungeon_keeper_ami_by_paulobarrios-d3avri8.jpg'

	# Extra CSS appended to DEFAULT_STYLESHEET; every line inside the literal starts with a tab.
	stylesheet = '''\
	hr {
	margin: 0 20% 55px; padding: 19px 0; line-height: 38px;
	border: none; border-bottom: 1px solid black; text-align: center;
	color: inherit; background-color: inherit;
	}
	hr:before {
	content: "☿"; display: inline-block; float: left; position: relative; left: -8px;
	margin: 0 50%; padding: 0 4px; background-color: inherit; cursor: default;
	}
	'''


	def parse(url):
		"""Fetch url and return the response body parsed with html5lib.

		url: address of the page to download.

		Returns the BeautifulSoup tree of the response text.
		Raises requests.HTTPError for a 4xx/5xx response, so that an error page
		is never parsed as if it were thread content.
		"""
		response = requests.get(url)
		response.raise_for_status()
		return BeautifulSoup(response.text, 'html5lib')

	def get_posts(pages):
		"""Yield a (title, body) pair for every threadmarked message.

		pages: iterable of parsed message elements. Messages without an element
			of class 'threadmarker' are skipped.

		The title is the stripped third child of the marker's 'label' element
		(the label must have exactly three children); the body is whatever the
		message returns for class 'messageText'.
		"""
		for p in pages:
			marked = p.find(class_='threadmarker')
			if marked:
				_, _, title = marked.find(class_='label').children
				yield title.strip(), p.find(class_='messageText')


	# Install a requests cache named 'dkami' so repeated runs can reuse earlier responses.
	requests_cache.install_cache('dkami')


	# The first page is fetched on its own because it carries the page navigation element.
	thread_pages = [parse(url_base)]
	nav = thread_pages[0].find(class_='PageNav')
	# find() returns None when the first page has no PageNav element
	# (presumably a single-page thread); then there is nothing more to fetch.
	if nav is not None:
		thread_pages += [parse(f'{url_base}/page-{p}') for p in range(int(nav['data-start']), int(nav['data-last'])+1)]

	# Flatten all thread pages into one list of message elements.
	pages = [msg for tp in thread_pages for msg in tp.find_all('li', class_='message')]

	posts = list(get_posts(pages))

	# Each chapter is (id, header, html): the post's position, its threadmark title,
	# and the post body under an <h1> heading, run through smartypants.
	chapters = []
	for id_, (header, html) in enumerate(posts):
		html = smartypants(f'<h1>{header}</h1>\n{html.prettify()}')
		chapters.append((id_, header, html))

	# Fail loudly on a bad download instead of embedding an error page as the cover.
	titlepic_response = requests.get(titlepic_url)
	titlepic_response.raise_for_status()
	titlepic = titlepic_response.content

	create_epub(title, author, publisher, chapters, titlepic=titlepic, stylesheet=DEFAULT_STYLESHEET + stylesheet)
	import sys

	from zipfile import ZipFile, ZIP_DEFLATED

	from bs4 import BeautifulSoup, Tag

	# NOTE(review): this run repeats the epub constants that appear earlier in the file, at a different indentation — confirm whether the copy is intended.
	# Location of the title image, relative to the OEBPS directory of the archive.
	TITLEPIC_PATH = 'images/title.png'
	# Manifest entry declaring the title image.
	TITLEPIC_ITEM = '\t\t<item id="imgl" href="{}" media-type="image/png"/>'.format(TITLEPIC_PATH)
	# Id of the generated title page; '<id>.xhtml' is its file name.
	TITLEPAGE_ID = '0-titlepage'

	# XML declaration that starts each XML template.
	XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'

	# Stylesheet used when create_epub() is not given one: caps images at 100% width and height.
	DEFAULT_STYLESHEET = '''\
	img {
	max-width: 100%;
	max-height: 100%;
	}
	'''

	# Contents of META-INF/container.xml: names OEBPS/content.opf as the root file.
	CONTAINER = XML_HEADER + '''
	<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
	<rootfiles>
	<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
	</rootfiles>
	</container>
	'''

	# Template for OEBPS/content.opf (metadata, manifest and spine).
	# `content` is the bound str.format of the template; call it with the keywords
	# title, author, publisher, uuid, titlepic_item, items and itemrefs.
	content = (XML_HEADER + '''
	<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookID" version="2.0">
	<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
	<dc:title>{title}</dc:title>
	<dc:creator opf:role="aut">{author}</dc:creator>
	<dc:language>en-US</dc:language>
	<dc:rights>Public Domain</dc:rights>
	<dc:publisher>{publisher}</dc:publisher>
	<dc:identifier id="BookID" opf:scheme="UUID">{uuid}</dc:identifier>
	</metadata>
	<manifest>
	<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
	<item id="style" href="stylesheet.css" media-type="text/css"/>
	{titlepic_item}
	{items}
	</manifest>
	<spine toc="ncx">
	{itemrefs}
	</spine>
	</package>
	''').format

	# One manifest entry / one spine entry per page; both are called with id=<page id>.
	item = '\t\t<item id="{id}" href="{id}.xhtml" media-type="application/xhtml+xml"/>'.format
	itemref = '\t\t<itemref idref="{id}"/>'.format

	# Template for OEBPS/toc.ncx; call with the keywords uuid, title and navpoints.
	toc = (XML_HEADER + '''
	<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
	<head>
	<meta name="dtb:uid" content="{uuid}"/>
	<meta name="dtb:depth" content="1"/>
	<meta name="dtb:totalPageCount" content="0"/>
	<meta name="dtb:maxPageNumber" content="0"/>
	</head>

	<docTitle>
	<text>{title}</text>
	</docTitle>

	<navMap>
	{navpoints}
	</navMap>
	</ncx>
	''').format

	# One table-of-contents entry; call with the keywords id, order and header.
	navpoint = '''\
	<navPoint id="{id}" playOrder="{order}">
	<navLabel>
	<text>{header}</text>
	</navLabel>
	<content src="{id}.xhtml"/>
	</navPoint>'''.format

	# HTML skeleton of the title page; call with the keywords title and content.
	# The doubled braces are literal CSS braces escaped for str.format.
	titlepage = '''\
	<html>
	<head>
	<title>{title}</title>
	<style type="text/css">
	@page {{ padding: 0; margin: 0 }}
	body {{ text-align: center; padding: 0; margin: 0 }}
	</style>
	</head>
	<body>
	{content}
	</body>
	</html>
	'''.format

	def xhtmlify(page):
		"""Complete page in place so that it serialises as an XHTML document.

		page: BeautifulSoup document holding a whole document, a plain body tag,
			or a sequence of divs/paragraphs/….

		Missing body, html and head elements are created. If the head has no
		title, one is taken from the first h1 (when there is one). The html
		element gets the XHTML namespace and xml:lang="en", and is_xml is set so
		that the serialised output starts with an XML header.
		"""
		from bs4 import Doctype  # local import: only BeautifulSoup and Tag are imported above

		def adopt_children(parent, wrapper, keep=()):
			"""Move parent's children into wrapper, then attach wrapper to parent.

			The doctype and any tag whose name is listed in keep stay where they are.
			"""
			for node in list(parent.contents):  # copy: extract() mutates parent.contents
				if isinstance(node, Doctype) or getattr(node, 'name', None) in keep:
					continue
				wrapper.append(node.extract())
			parent.append(wrapper)
			return wrapper

		# Tag.wrap() takes a single argument and replaces the element within its
		# parent; the document itself has no parent, so the wrappers are built by
		# moving the children instead.
		html_tag = page.find('html')
		if not page.find('body'):  # assume sequence of divs/paragraphs/…
			adopt_children(html_tag or page, page.new_tag('body'), keep=('head',))

		if not html_tag:  # assume plain body tag
			html_tag = adopt_children(page, page.new_tag('html'))

		head_tag = page.find('head')
		if not head_tag:
			head_tag = page.new_tag('head')
			html_tag.insert(0, head_tag)

		# Also covers a head that exists but is empty, as html5lib creates for fragments.
		if not head_tag.find('title'):
			h1 = page.find('h1')
			if h1:
				title_tag = page.new_tag('title')
				title_tag.string = h1.get_text()
				head_tag.insert(0, title_tag)

		html_tag['xmlns'] = 'http://www.w3.org/1999/xhtml'
		html_tag['xml:lang'] = 'en'
		page.is_xml = True  # emit xml header

	def create_parts(title_page, chapters):
		"""Yield the manifest, spine and TOC entries for the title page and every chapter.

		title_page: markup of the title page; it is emitted first, with play order 1.
		chapters: iterable of (id, header, page) tuples; play order starts at 2.

		Each yielded value is (item, itemref, navpoint, chapter tuple). Chapter
		headers are XML-escaped before they are placed into the navpoint, so a
		header containing '&' or '<' cannot break the TOC document.
		"""
		from xml.sax.saxutils import escape  # local import: not among the imports above

		yield (
			item(id=TITLEPAGE_ID),
			itemref(id=TITLEPAGE_ID),
			navpoint(id=TITLEPAGE_ID, header='Title page', order=1),
			(TITLEPAGE_ID, 'Title page', title_page))

		for order, chapter in enumerate(chapters, 2):
			id_, header, _ = chapter
			yield (
				item(id=id_),
				itemref(id=id_),
				navpoint(id=id_, header=escape(str(header)), order=order),
				chapter)

	def create_epub(title, author, publisher, chapters, path=None, *, uuid=None, titlepic=None, stylesheet=DEFAULT_STYLESHEET):
		"""Creates and saves an epub file.

		title, author, publisher: plain-text metadata. They are XML-escaped before
			being substituted into the package, TOC and title page templates.
		chapters: sequence of (id, title, page) tuples. The ids are used as filenames.
			page can be a string, a Tag or a BeautifulSoup. It may be a body tag,
			a series of content tags, or a whole (X)HTML document.
		path: path to write to. Defaults to '<author> – <title>.epub'.
		uuid: book identifier. Defaults to '<author>-<title>', lowercased, with
			spaces replaced by underscores.
		titlepic: path to or bytes of png file.
		stylesheet: CSS text stored as OEBPS/stylesheet.css.
		"""
		# Local imports: neither name is among the imports above.
		from xml.sax.saxutils import escape
		from zipfile import ZIP_STORED

		def xml_safe(text):
			"""Escape text for XML element content or a double-quoted attribute value."""
			return escape(str(text), {'"': '&quot;'})

		if path is None:
			path = '{} – {}.epub'.format(author, title)

		if uuid is None:
			uuid = '{}-{}'.format(author, title).replace(' ', '_').lower()

		safe_title = xml_safe(title)
		safe_author = xml_safe(author)
		safe_publisher = xml_safe(publisher)
		safe_uuid = xml_safe(uuid)

		if titlepic is None:
			titlepic_item = ''
			title_content = '<h1>{}</h1>\n<h2>{}</h2>\n<h3>{}</h3>'.format(safe_title, safe_author, safe_publisher)
		else:
			titlepic_item = TITLEPIC_ITEM
			if isinstance(titlepic, str):
				# titlepic is a path here: replace it with the bytes of that file.
				with open(titlepic, 'rb') as image_file:
					titlepic = image_file.read()
			title_content = '<img src="{}"/>'.format(TITLEPIC_PATH)
		title_page = titlepage(title=safe_title, content=title_content)

		items, itemrefs, navpoints, chapters = zip(*create_parts(title_page, chapters))

		with ZipFile(path, 'w', ZIP_DEFLATED) as epub:
			# The EPUB container format wants 'mimetype' as the first entry, stored uncompressed.
			epub.writestr('mimetype', 'application/epub+zip', compress_type=ZIP_STORED)
			epub.writestr('META-INF/container.xml', CONTAINER)

			if titlepic:
				epub.writestr('OEBPS/' + TITLEPIC_PATH, titlepic)
			epub.writestr('OEBPS/stylesheet.css', stylesheet)
			epub.writestr('OEBPS/content.opf', content(title=safe_title, author=safe_author, publisher=safe_publisher, uuid=safe_uuid,
				titlepic_item=titlepic_item, items='\n'.join(items), itemrefs='\n'.join(itemrefs)))
			epub.writestr('OEBPS/toc.ncx', toc(title=safe_title, uuid=safe_uuid, navpoints='\n'.join(navpoints)))

			for id_, _, chapter in chapters:
				# BeautifulSoup is a subclass of Tag, so whole documents are tested for first.
				if isinstance(chapter, BeautifulSoup):
					document = chapter
				elif isinstance(chapter, Tag):
					document = BeautifulSoup('<!doctype html><meta charset=utf-8>', 'html5lib')
					# Place the tag inside the document, not after its closing </html>.
					if chapter.name == 'body':
						document.body.replace_with(chapter)
					else:
						document.body.append(chapter)
				else:
					document = BeautifulSoup(chapter, 'html5lib')

				xhtmlify(document)

				document.find('head').append(document.new_tag('link', href='stylesheet.css', type='text/css', rel='stylesheet'))

				# enforce XHTML
				epub.writestr('OEBPS/{}.xhtml'.format(id_), document.prettify())