Yasushi/eb.py

## eb.py
#!/usr/bin/env python3
# -*- coding: utf-8; -*-

import sys, os, glob, itertools, re, functools, pprint
from collections import OrderedDict

from lxml import etree
from ebooklib import epub


def xpath(query):
    """Compile *query* into a reusable ``etree.XPath`` evaluator.

    Two namespace prefixes are registered for use inside the query:
    ``h`` (XHTML) and ``re`` (EXSLT regular expressions).
    """
    prefixes = {
        'h': 'http://www.w3.org/1999/xhtml',
        're': 'http://exslt.org/regular-expressions',
    }
    return etree.XPath(query, namespaces=prefixes)

def xpath_class(cls, name="*", append=""):
    """Return an evaluator for descendants whose ``class`` contains *cls*.

    The class attribute is tested with a regular expression that wraps
    *cls* in word-boundary markers.

    Args:
        cls: class name to look for (inserted into the pattern verbatim).
        name: element name to restrict the search to; ``*`` means any.
        append: extra XPath steps added after the predicate.
    """
    query = ".//{0}[re:test(@class, '\\b{1}\\b')]{2}".format(name, cls, append)
    return xpath(query)

def xpath_id(id, name="*", append=""):
    """Return a function giving the first node matched by an ``@id`` lookup.

    Args:
        id: value the ``id`` attribute must equal.
        name: element name to restrict the search to; ``*`` means any.
        append: extra XPath steps added after the predicate.

    Returns:
        A one-argument callable; given a tree or element it yields the first
        match below it, or ``None`` when nothing matches.
    """
    def first_match(h):
        # The query is compiled on each call, as the lookup is built lazily.
        matches = xpath(".//%s[@id='%s']%s" % (name, id, append))(h)
        for hit in matches:
            return hit
        return None

    return first_match

class NTOC:
    """Index page of a novel together with the chapter pages it lists.

    Construction parses the index HTML, pulls out the identifier, titles,
    author name and synopsis element, reads the chapter listing from the
    first ``index_box`` element and loads every listed page as an ``Item``.
    """

    # Doctype written at the top of the serialized cover page.
    DOCTYPE = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'

    # Empty XHTML skeleton that coverpage() fills in.
    TEMPLATE = (
        u'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
        u'<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja">\n'
        u'  <head>\n'
        u'    <title>title</title>\n'
        u'    <link rel="stylesheet" type="text/css" href="style.css" />\n'
        u'  </head>\n'
        u'  <body>\n'
        u'  </body>\n'
        u'</html>\n'
    )

    def __init__(self, path):
        """Parse the index page at *path* and load all pages it links to.

        Raises:
            IndexError: if the page lacks the handheld alternate ``<link>``
                or an ``index_box`` element.
        """
        # A context manager closes the handle right after parsing instead of
        # leaving it to the garbage collector.
        with open(path) as src:
            self.dom = etree.parse(src, etree.HTMLParser())
        handheld_urls = self.dom.xpath('.//link[@rel="alternate"][@media="handheld"]/@href')
        # The identifier is the second-to-last path component of that URL.
        self.identifier = handheld_urls[0].split('/')[-2]
        self.require_font = self.identifier == 'n4955ee'
        self.chapter_index = None
        self.series_title = "".join(xpath_class('series_title', append='//text()')(self.dom))
        self.novel_title = "".join(xpath_class('novel_title', append='//text()')(self.dom))
        self.novel_writername = "".join(xpath_class('novel_writername', append='/a/text()')(self.dom))
        # Element with id "novel_ex", or None when the page has none.
        self.novel_ex = xpath_id('novel_ex')(self.dom)

        self.index_box = self._parse_index_box(xpath_class('index_box')(self.dom)[0])

        # Load every page once, in listing order, then index the pages by href.
        self.pages = []
        for (_chapter, entries) in self.index_box:
            for (href, _subtitle, _date) in entries:
                self.pages.append(Item(href))
        self.pagemap = {page.origpath: page for page in self.pages}

    @staticmethod
    def _parse_index_box(container):
        """Turn the children of *container* into ``[(chapter, entries)]``.

        A ``div`` child opens a new chapter titled with its text; a ``dl``
        child adds ``(href, link text, date)`` to the current chapter.
        Entries seen before any ``div`` are filed under the title ``""``.
        Chapters that end up without entries are dropped.
        """
        sections = [("", [])]
        for elem in container:
            if elem.tag == 'div':
                sections.append((elem.text, []))
            elif elem.tag == 'dl':
                link = xpath('./dd/a')(elem)[0]
                # Drop one leading and one trailing slash from the href.
                href = re.sub(r'^/|/$', '', link.attrib['href'])
                date = "".join(s.strip() for s in xpath('./dt/text()')(elem))
                revdate = "".join(s.strip() for s in xpath('./dt/span/@title')(elem))
                if revdate:
                    date = "%s (%s)" % (date, revdate)
                sections[-1][1].append((href, link.text, date))
        return [section for section in sections if section[1]]

    def make_toc(self, chapter_index=None):
        """Build ``self.toc``, ``self.htmltoc`` and ``self.metadata``.

        NOTE(review): this relies on ``TOC``, ``Metadata`` and
        ``self.generate_htmltoc``, none of which are defined in the visible
        part of this file -- confirm they exist before calling.

        Args:
            chapter_index: position in ``self.index_box`` of the only
                chapter to include, or None to include every chapter.
        """
        self.chapter_index = chapter_index
        self.toc = TOC()
        for (i, (chapter, pages)) in enumerate(self.index_box):
            if self.chapter_index is not None and self.chapter_index != i:
                continue
            first = True
            c = self.toc
            for (href, subtitle, date) in pages:
                if first and chapter:
                    # A titled chapter gets its own node, pointing at its
                    # first page; later pages hang below that node.
                    first = False
                    c = self.toc.add(chapter, self.pagemap[href].path)
                page = self.pagemap[href]
                c.add(page.subtitle, page.path, 'novel_no')

        self.htmltoc = self.generate_htmltoc()
        t = TOC(u'目次', self.htmltoc[0])
        t.parent = self.toc
        self.toc.children.insert(0, t)
        self.metadata = Metadata(self.title(), authors=(self.novel_writername,))

    def coverpage(self):
        """Return the cover page serialized as UTF-8 encoded XHTML.

        The page carries the novel title, an ``h2`` with the selected
        chapter title when ``chapter_index`` is set, and a copy of the
        synopsis element when the index page had one.
        """
        import copy

        dom = etree.fromstring(self.TEMPLATE)
        xpath('//h:title')(dom)[0].text = self.title()
        body = dom.find('{http://www.w3.org/1999/xhtml}body')
        etree.SubElement(body, 'h1').text = self.novel_title
        if self.chapter_index is not None:
            etree.SubElement(body, 'h2').text = self.selected_chapter_title()

        if self.novel_ex is not None:
            # Append a copy: appending the element itself would move it out
            # of self.dom, and appending None would raise TypeError.
            body.append(copy.deepcopy(self.novel_ex))
        return etree.tostring(dom, encoding='utf-8',
                              xml_declaration=True,
                              pretty_print=False,
                              doctype=self.DOCTYPE,
                              with_tail=True)

    def has_chaps(self):
        """Tell whether the listing holds more than one chapter group."""
        return len(self.index_box) > 1

    def _chapter_selected(self):
        """Tell whether output is limited to one chapter of several."""
        return self.has_chaps() and self.chapter_index is not None

    def title(self):
        """Return the book title, including the chapter title if selected."""
        if self._chapter_selected():
            chapter = self.index_box[self.chapter_index][0]
            return "%s %s - %s" % (self.novel_title, chapter, self.series_title)
        return "%s - %s" % (self.novel_title, self.series_title)

    def selected_chapter_title(self):
        """Return the selected chapter's title, or ``''`` if none applies."""
        if self._chapter_selected():
            return self.index_box[self.chapter_index][0]
        return ''

    def pagelist(self):
        """Return the pages of the selected chapter, or all pages."""
        if self._chapter_selected():
            entries = self.index_box[self.chapter_index][1]
            return [self.pagemap[href] for (href, _subtitle, _date) in entries]
        return self.pages

    def epubname(self, dir="."):
        """Return the output file name inside *dir*.

        With a chapter selected the name holds the novel title, the 1-based
        chapter number and the chapter title; otherwise only the novel title.
        """
        if self._chapter_selected():
            chapter = self.index_box[self.chapter_index][0]
            filename = "%s %d %s.epub" % (self.novel_title, self.chapter_index + 1, chapter)
            return os.path.join(dir, filename)
        return os.path.join(dir, self.novel_title + u'.epub')


class Item:
    """One chapter page: source location, name inside the EPUB, cleaned markup."""

    # Doctype written at the top of the serialized page.
    DOCTYPE = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'

    def __init__(self, path):
        """Load and clean the page found at *path*.

        Args:
            path: a file, or a directory that is read through its
                ``index.html``.
        """
        self.origpath = path
        self.srcpath = path
        if os.path.isdir(self.srcpath):
            self.srcpath = os.path.join(self.srcpath, 'index.html')
        (directory, filename) = os.path.split(self.srcpath)
        (basename, ext) = os.path.splitext(filename)
        if basename == 'index':
            # "dir/index.html" is named after its directory instead.
            (directory, basename) = os.path.split(directory)
        if ext in ('', '.html'):
            ext = '.xhtml'
        self.path = os.path.join(directory, basename + ext)
        self.id = self.path.replace("/", "_")
        self.dom = self.readHTML()
        # First text node of the element with id "novel_no", or None.
        self.no = xpath_id('novel_no', append="/text()")(self.dom)
        # Part of <title> before the first " - ".
        self.title = xpath('//title')(self.dom)[0].text.split(' - ')[0]
        self.chapter = "".join(xpath_class('chapter_title', name='p', append="/text()")(self.dom))
        self.subtitle = "".join(xpath_class("novel_subtitle", append="/text()")(self.dom))
        self.html = etree.tostring(self.dom, encoding='utf-8',
                                   xml_declaration=True,
                                   pretty_print=False,
                                   doctype=self.DOCTYPE,
                                   with_tail=True)

    def readHTML(self):
        """Parse ``self.srcpath`` and return its root with page chrome removed.

        Scripts, ``meta``/``link`` elements, comments and ``onload``/
        ``onclick`` attributes are stripped from the whole document. The
        second child of the root (assumed to be ``<body>``) then keeps only
        its ``#container`` child, minus the listed ad/navigation elements.
        """
        # Close the handle after parsing instead of leaking it.
        with open(self.srcpath) as src:
            h = etree.parse(src, etree.HTMLParser()).getroot()
        etree.strip_elements(h, 'script', 'meta', 'link', etree.Comment)
        etree.strip_attributes(h, 'onload', 'onclick')
        # Unwrap <rb> but keep its text.
        etree.strip_tags(h, 'rb')
        body = h[1]
        for e in xpath_class('contents1')(body):
            etree.strip_tags(e, 'a')
        for e in body.xpath('./*[not(@id="container")]'):
            body.remove(e)
        for e in body.xpath('.//*[contains(@class,"koukoku")]'):
            e.getparent().remove(e)
        for sel in ['toaster', 'narou_modal', 'novel_bn', 'twitter-share-button']:
            for e in xpath_class(sel)(body):
                e.getparent().remove(e)
        for sel in ['novel_attention', 'novel_footer', 'novel_hyouka', 'impression', 'recommend', 'review', 'pageTop']:
            e = xpath_id(sel)(body)
            if e is not None:
                e.getparent().remove(e)
        return h


def create_book(ntoc, outdir='eb'):
    """Assemble an EPUB from *ntoc* and write it into *outdir*.

    ``style.css`` (and ``SourceHanSerif-Medium.otf`` when
    ``ntoc.require_font`` is set) are read from the current working
    directory. The path of the written file is printed.

    Args:
        ntoc: parsed index supplying metadata, cover page and pages.
        outdir: directory receiving the ``.epub``; created when missing.
    """
    book = epub.EpubBook()

    # Metadata.
    book.set_identifier(ntoc.identifier)
    book.set_title(ntoc.title())
    book.set_language('ja')
    book.add_author(ntoc.novel_writername)

    intro = epub.EpubHtml(uid='intro', title=book.title, file_name="intro.xhtml", content=ntoc.coverpage())
    intro.add_link(href='style.css', rel='stylesheet', type='text/css')
    book.add_item(intro)

    def _create_chapter(item):
        """Wrap one page as an EPUB document linked to the stylesheet."""
        c = epub.EpubHtml(uid=item.id, title=item.subtitle, file_name=item.path, content=item.html)
        c.add_link(href='../style.css', rel='stylesheet', type='text/css')
        return c

    chaps = OrderedDict((page.origpath, _create_chapter(page)) for page in ntoc.pages)
    for chapter in chaps.values():
        book.add_item(chapter)

    # Table of contents: pages of an untitled group sit at the current level
    # (initially next to the cover); each titled group becomes
    # [Section, [pages...]].
    root = [intro]
    cur = root
    for (sec, children) in ntoc.index_box:
        # Truthiness rather than len(): the title comes from element text
        # and can be None.
        if sec:
            cur = []
            root.append([epub.Section(sec), cur])
        for (path, _subtitle, _date) in children:
            cur.append(chaps[path])
    book.toc = root

    # Default NCX and Nav documents.
    book.add_item(epub.EpubNcx())
    nav = epub.EpubNav()
    nav.add_link(href='style.css', rel='stylesheet', type='text/css')
    book.add_item(nav)

    # Stylesheet; files are read inside context managers so they get closed.
    with open("style.css") as css_file:
        css = epub.EpubItem(uid="style", file_name="style.css", media_type="text/css", content=css_file.read())
    if ntoc.require_font:
        with open("SourceHanSerif-Medium.otf", "rb") as font_file:
            font = epub.EpubItem(uid="font", file_name="SourceHanSerif-Medium.otf", media_type="application/x-font-otf", content=font_file.read())
        book.add_item(font)
        font_face = (
            '@font-face {\n'
            '  font-family: han;\n'
            '  src: url("SourceHanSerif-Medium.otf");\n'
            '}\n'
        )
        # Point every "serif" rule at the embedded font.
        css.set_content(font_face + css.get_content().replace('font-family: serif;', 'font-family: han;'))
    book.add_item(css)

    # Reading order: cover, navigation, then all pages.
    book.spine = [intro, 'nav', *chaps.values()]

    # Make sure the target directory exists before writing.
    os.makedirs(outdir, exist_ok=True)
    target = ntoc.epubname(dir=outdir)
    epub.write_epub(target, book, {})
    print(target)


if __name__ == '__main__':
    # The last command-line argument names the index page; without any
    # argument the script does nothing.
    cli_args = sys.argv[1:]
    if cli_args:
        ntoc = NTOC(cli_args[-1])
        create_book(ntoc)
	#!/usr/bin/env python3
	# -- coding: utf-8; --

	import sys, os, glob, itertools, re, functools, pprint
	from collections import OrderedDict

	from lxml import etree
	from ebooklib import epub


	def xpath(query):
	    """Compile *query* into a reusable ``etree.XPath`` evaluator.

	    Two namespace prefixes are registered for use inside the query:
	    ``h`` (XHTML) and ``re`` (EXSLT regular expressions).
	    """
	    prefixes = {
	        'h': 'http://www.w3.org/1999/xhtml',
	        're': 'http://exslt.org/regular-expressions',
	    }
	    return etree.XPath(query, namespaces=prefixes)

	def xpath_class(cls, name="*", append=""):
	    """Return an evaluator for descendants whose ``class`` contains *cls*.

	    The class attribute is tested with a regular expression that wraps
	    *cls* in word-boundary markers.

	    Args:
	        cls: class name to look for (inserted into the pattern verbatim).
	        name: element name to restrict the search to; ``*`` means any.
	        append: extra XPath steps added after the predicate.
	    """
	    return xpath(".//%s[re:test(@class, '\\b%s\\b')]%s" % (name, cls, append))

	def xpath_id(id, name="*", append=""):
	    """Return a function giving the first node matched by an ``@id`` lookup.

	    Args:
	        id: value the ``id`` attribute must equal.
	        name: element name to restrict the search to; ``*`` means any.
	        append: extra XPath steps added after the predicate.

	    Returns:
	        A one-argument callable; given a tree or element it yields the
	        first match below it, or ``None`` when nothing matches.
	    """
	    def first_match(h):
	        matches = xpath(".//%s[@id='%s']%s" % (name, id, append))(h)
	        return next(iter(matches), None)

	    return first_match

	class NTOC:
	    """Index page of a novel together with the chapter pages it lists.

	    Construction parses the index HTML, pulls out the identifier, titles,
	    author name and synopsis element, reads the chapter listing from the
	    first ``index_box`` element and loads every listed page as an ``Item``.
	    """

	    # Doctype written at the top of the serialized cover page.
	    DOCTYPE = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'

	    # Empty XHTML skeleton that coverpage() fills in.
	    TEMPLATE = (
	        u'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
	        u'<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja">\n'
	        u'  <head>\n'
	        u'    <title>title</title>\n'
	        u'    <link rel="stylesheet" type="text/css" href="style.css" />\n'
	        u'  </head>\n'
	        u'  <body>\n'
	        u'  </body>\n'
	        u'</html>\n'
	    )

	    def __init__(self, path):
	        """Parse the index page at *path* and load all pages it links to.

	        Raises:
	            IndexError: if the page lacks the handheld alternate
	                ``<link>`` or an ``index_box`` element.
	        """
	        # A context manager closes the handle right after parsing instead
	        # of leaving it to the garbage collector.
	        with open(path) as src:
	            self.dom = etree.parse(src, etree.HTMLParser())
	        handheld_urls = self.dom.xpath('.//link[@rel="alternate"][@media="handheld"]/@href')
	        # The identifier is the second-to-last path component of that URL.
	        self.identifier = handheld_urls[0].split('/')[-2]
	        self.require_font = self.identifier == 'n4955ee'
	        self.chapter_index = None
	        self.series_title = "".join(xpath_class('series_title', append='//text()')(self.dom))
	        self.novel_title = "".join(xpath_class('novel_title', append='//text()')(self.dom))
	        self.novel_writername = "".join(xpath_class('novel_writername', append='/a/text()')(self.dom))
	        # Element with id "novel_ex", or None when the page has none.
	        self.novel_ex = xpath_id('novel_ex')(self.dom)

	        self.index_box = self._parse_index_box(xpath_class('index_box')(self.dom)[0])

	        # Load every page once, in listing order, then index them by href.
	        self.pages = []
	        for (_chapter, entries) in self.index_box:
	            for (href, _subtitle, _date) in entries:
	                self.pages.append(Item(href))
	        self.pagemap = {page.origpath: page for page in self.pages}

	    @staticmethod
	    def _parse_index_box(container):
	        """Turn the children of *container* into ``[(chapter, entries)]``.

	        A ``div`` child opens a new chapter titled with its text; a ``dl``
	        child adds ``(href, link text, date)`` to the current chapter.
	        Entries seen before any ``div`` are filed under the title ``""``.
	        Chapters that end up without entries are dropped.
	        """
	        sections = [("", [])]
	        for elem in container:
	            if elem.tag == 'div':
	                sections.append((elem.text, []))
	            elif elem.tag == 'dl':
	                link = xpath('./dd/a')(elem)[0]
	                # Drop one leading and one trailing slash from the href.
	                href = re.sub(r'^/|/$', '', link.attrib['href'])
	                date = "".join(s.strip() for s in xpath('./dt/text()')(elem))
	                revdate = "".join(s.strip() for s in xpath('./dt/span/@title')(elem))
	                if revdate:
	                    date = "%s (%s)" % (date, revdate)
	                sections[-1][1].append((href, link.text, date))
	        return [section for section in sections if section[1]]

	    def make_toc(self, chapter_index=None):
	        """Build ``self.toc``, ``self.htmltoc`` and ``self.metadata``.

	        NOTE(review): this relies on ``TOC``, ``Metadata`` and
	        ``self.generate_htmltoc``, none of which are defined in the
	        visible part of this file -- confirm they exist before calling.

	        Args:
	            chapter_index: position in ``self.index_box`` of the only
	                chapter to include, or None to include every chapter.
	        """
	        self.chapter_index = chapter_index
	        self.toc = TOC()
	        for (i, (chapter, pages)) in enumerate(self.index_box):
	            if self.chapter_index is not None and self.chapter_index != i:
	                continue
	            first = True
	            c = self.toc
	            for (href, subtitle, date) in pages:
	                if first and chapter:
	                    # A titled chapter gets its own node, pointing at its
	                    # first page; later pages hang below that node.
	                    first = False
	                    c = self.toc.add(chapter, self.pagemap[href].path)
	                page = self.pagemap[href]
	                c.add(page.subtitle, page.path, 'novel_no')

	        self.htmltoc = self.generate_htmltoc()
	        t = TOC(u'目次', self.htmltoc[0])
	        t.parent = self.toc
	        self.toc.children.insert(0, t)
	        self.metadata = Metadata(self.title(), authors=(self.novel_writername,))

	    def coverpage(self):
	        """Return the cover page serialized as UTF-8 encoded XHTML.

	        The page carries the novel title, an ``h2`` with the selected
	        chapter title when ``chapter_index`` is set, and a copy of the
	        synopsis element when the index page had one.
	        """
	        import copy

	        dom = etree.fromstring(self.TEMPLATE)
	        xpath('//h:title')(dom)[0].text = self.title()
	        body = dom.find('{http://www.w3.org/1999/xhtml}body')
	        etree.SubElement(body, 'h1').text = self.novel_title
	        if self.chapter_index is not None:
	            etree.SubElement(body, 'h2').text = self.selected_chapter_title()

	        if self.novel_ex is not None:
	            # Append a copy: appending the element itself would move it
	            # out of self.dom, and appending None would raise TypeError.
	            body.append(copy.deepcopy(self.novel_ex))
	        return etree.tostring(dom, encoding='utf-8',
	                              xml_declaration=True,
	                              pretty_print=False,
	                              doctype=self.DOCTYPE,
	                              with_tail=True)

	    def has_chaps(self):
	        """Tell whether the listing holds more than one chapter group."""
	        return len(self.index_box) > 1

	    def _chapter_selected(self):
	        """Tell whether output is limited to one chapter of several."""
	        return self.has_chaps() and self.chapter_index is not None

	    def title(self):
	        """Return the book title, including the chapter title if selected."""
	        if self._chapter_selected():
	            chapter = self.index_box[self.chapter_index][0]
	            return "%s %s - %s" % (self.novel_title, chapter, self.series_title)
	        return "%s - %s" % (self.novel_title, self.series_title)

	    def selected_chapter_title(self):
	        """Return the selected chapter's title, or ``''`` if none applies."""
	        if self._chapter_selected():
	            return self.index_box[self.chapter_index][0]
	        return ''

	    def pagelist(self):
	        """Return the pages of the selected chapter, or all pages."""
	        if self._chapter_selected():
	            entries = self.index_box[self.chapter_index][1]
	            return [self.pagemap[href] for (href, _subtitle, _date) in entries]
	        return self.pages

	    def epubname(self, dir="."):
	        """Return the output file name inside *dir*.

	        With a chapter selected the name holds the novel title, the
	        1-based chapter number and the chapter title; otherwise only the
	        novel title.
	        """
	        if self._chapter_selected():
	            chapter = self.index_box[self.chapter_index][0]
	            filename = "%s %d %s.epub" % (self.novel_title, self.chapter_index + 1, chapter)
	            return os.path.join(dir, filename)
	        return os.path.join(dir, self.novel_title + u'.epub')


	class Item:
	    """One chapter page: source location, name inside the EPUB, cleaned markup."""

	    # Doctype written at the top of the serialized page.
	    DOCTYPE = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'

	    def __init__(self, path):
	        """Load and clean the page found at *path*.

	        Args:
	            path: a file, or a directory that is read through its
	                ``index.html``.
	        """
	        self.origpath = path
	        self.srcpath = path
	        if os.path.isdir(self.srcpath):
	            self.srcpath = os.path.join(self.srcpath, 'index.html')
	        (directory, filename) = os.path.split(self.srcpath)
	        (basename, ext) = os.path.splitext(filename)
	        if basename == 'index':
	            # "dir/index.html" is named after its directory instead.
	            (directory, basename) = os.path.split(directory)
	        if ext in ('', '.html'):
	            ext = '.xhtml'
	        self.path = os.path.join(directory, basename + ext)
	        self.id = self.path.replace("/", "_")
	        self.dom = self.readHTML()
	        # First text node of the element with id "novel_no", or None.
	        self.no = xpath_id('novel_no', append="/text()")(self.dom)
	        # Part of <title> before the first " - ".
	        self.title = xpath('//title')(self.dom)[0].text.split(' - ')[0]
	        self.chapter = "".join(xpath_class('chapter_title', name='p', append="/text()")(self.dom))
	        self.subtitle = "".join(xpath_class("novel_subtitle", append="/text()")(self.dom))
	        self.html = etree.tostring(self.dom, encoding='utf-8',
	                                   xml_declaration=True,
	                                   pretty_print=False,
	                                   doctype=self.DOCTYPE,
	                                   with_tail=True)

	    def readHTML(self):
	        """Parse ``self.srcpath`` and return its root with page chrome removed.

	        Scripts, ``meta``/``link`` elements, comments and ``onload``/
	        ``onclick`` attributes are stripped from the whole document. The
	        second child of the root (assumed to be ``<body>``) then keeps
	        only its ``#container`` child, minus the listed ad/navigation
	        elements.
	        """
	        # Close the handle after parsing instead of leaking it.
	        with open(self.srcpath) as src:
	            h = etree.parse(src, etree.HTMLParser()).getroot()
	        etree.strip_elements(h, 'script', 'meta', 'link', etree.Comment)
	        etree.strip_attributes(h, 'onload', 'onclick')
	        # Unwrap <rb> but keep its text.
	        etree.strip_tags(h, 'rb')
	        body = h[1]
	        for e in xpath_class('contents1')(body):
	            etree.strip_tags(e, 'a')
	        for e in body.xpath('./*[not(@id="container")]'):
	            body.remove(e)
	        for e in body.xpath('.//*[contains(@class,"koukoku")]'):
	            e.getparent().remove(e)
	        for sel in ['toaster', 'narou_modal', 'novel_bn', 'twitter-share-button']:
	            for e in xpath_class(sel)(body):
	                e.getparent().remove(e)
	        for sel in ['novel_attention', 'novel_footer', 'novel_hyouka', 'impression', 'recommend', 'review', 'pageTop']:
	            e = xpath_id(sel)(body)
	            if e is not None:
	                e.getparent().remove(e)
	        return h



	def create_book(ntoc, outdir='eb'):
	    """Assemble an EPUB from *ntoc* and write it into *outdir*.

	    ``style.css`` (and ``SourceHanSerif-Medium.otf`` when
	    ``ntoc.require_font`` is set) are read from the current working
	    directory. The path of the written file is printed.

	    Args:
	        ntoc: parsed index supplying metadata, cover page and pages.
	        outdir: directory receiving the ``.epub``; created when missing.
	    """
	    book = epub.EpubBook()

	    # Metadata.
	    book.set_identifier(ntoc.identifier)
	    book.set_title(ntoc.title())
	    book.set_language('ja')
	    book.add_author(ntoc.novel_writername)

	    intro = epub.EpubHtml(uid='intro', title=book.title, file_name="intro.xhtml", content=ntoc.coverpage())
	    intro.add_link(href='style.css', rel='stylesheet', type='text/css')
	    book.add_item(intro)

	    def _create_chapter(item):
	        """Wrap one page as an EPUB document linked to the stylesheet."""
	        c = epub.EpubHtml(uid=item.id, title=item.subtitle, file_name=item.path, content=item.html)
	        c.add_link(href='../style.css', rel='stylesheet', type='text/css')
	        return c

	    chaps = OrderedDict((page.origpath, _create_chapter(page)) for page in ntoc.pages)
	    for chapter in chaps.values():
	        book.add_item(chapter)

	    # Table of contents: pages of an untitled group sit at the current
	    # level (initially next to the cover); each titled group becomes
	    # [Section, [pages...]].
	    root = [intro]
	    cur = root
	    for (sec, children) in ntoc.index_box:
	        # Truthiness rather than len(): the title comes from element text
	        # and can be None.
	        if sec:
	            cur = []
	            root.append([epub.Section(sec), cur])
	        for (path, _subtitle, _date) in children:
	            cur.append(chaps[path])
	    book.toc = root

	    # Default NCX and Nav documents.
	    book.add_item(epub.EpubNcx())
	    nav = epub.EpubNav()
	    nav.add_link(href='style.css', rel='stylesheet', type='text/css')
	    book.add_item(nav)

	    # Stylesheet; files are read inside context managers so they get closed.
	    with open("style.css") as css_file:
	        css = epub.EpubItem(uid="style", file_name="style.css", media_type="text/css", content=css_file.read())
	    if ntoc.require_font:
	        with open("SourceHanSerif-Medium.otf", "rb") as font_file:
	            font = epub.EpubItem(uid="font", file_name="SourceHanSerif-Medium.otf", media_type="application/x-font-otf", content=font_file.read())
	        book.add_item(font)
	        font_face = (
	            '@font-face {\n'
	            '  font-family: han;\n'
	            '  src: url("SourceHanSerif-Medium.otf");\n'
	            '}\n'
	        )
	        # Point every "serif" rule at the embedded font.
	        css.set_content(font_face + css.get_content().replace('font-family: serif;', 'font-family: han;'))
	    book.add_item(css)

	    # Reading order: cover, navigation, then all pages.
	    book.spine = [intro, 'nav', *chaps.values()]

	    # Make sure the target directory exists before writing.
	    os.makedirs(outdir, exist_ok=True)
	    target = ntoc.epubname(dir=outdir)
	    epub.write_epub(target, book, {})
	    print(target)


	if __name__ == '__main__':
	    # The last command-line argument names the index page; without any
	    # argument the script does nothing.
	    if len(sys.argv) > 1:
	        ntoc = NTOC(sys.argv[-1])
	        create_book(ntoc)