Last active
December 17, 2015 07:39
-
-
Save rubyu/5574454 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Python path_to\merge_chainlp_epub.py "%1" "Merged_%~nx1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import argparse | |
from zipfile import ZipFile | |
from StringIO import StringIO | |
from io import TextIOWrapper | |
import re | |
def get_arg():
    """Parse the command line: positional input and output EPUB paths."""
    p = argparse.ArgumentParser(
        prog="merge_chainlp_epub.py",
        description="merges a section of EPUB published by ChainLP into a single page")
    for name in ("input", "output"):
        p.add_argument(name)
    return p.parse_args()
def parse_ncx(text):
    """Extract table-of-contents entries from a toc.ncx stream.

    Returns a list of {"resource": href, "index": page number} dicts,
    one per <content src="content/index_P<n>.xhtml"> element found.
    """
    pat = re.compile(r"<content src=\"(content/index_P(\d+)\.xhtml)\"")
    entries = []
    for line in TextIOWrapper(text, encoding="utf-8"):
        m = pat.search(line)
        if m is None:
            continue
        entries.append({"resource": m.group(1), "index": int(m.group(2))})
    return entries
def parse_opf(text):
    """Extract page entries from a metadata.opf stream.

    Returns a list of dicts with keys "id", "resource", "full_index"
    (e.g. "P003") and "index" (the numeric part), one per manifest
    <item> that points at a content/index_P<n>.xhtml page.
    """
    pat = re.compile(r"<item id=\"([^\"]+)\" href=\"(content/index_(P(\d+))\.xhtml)\"")
    pages = []
    for line in TextIOWrapper(text, encoding="utf-8"):
        m = pat.search(line)
        if m is None:
            continue
        pages.append({
            "id": m.group(1),
            "resource": m.group(2),
            "full_index": m.group(3),
            "index": int(m.group(4)),
        })
    return pages
def merge(pages, index):
    """Group pages into sections driven by the table of contents.

    Each TOC entry becomes a section. Every page falls into one of four
    cases: it matches a TOC entry exactly; a TOC entry with a smaller
    index exists; only TOC entries with larger indexes exist; or there
    is no TOC at all (then an empty list is returned).

    A page with an exact match becomes that section's anchor (its "id"
    and "full_index" are copied onto the section); otherwise the page is
    attached to the closest preceding section, falling back to the first
    following one. Copies of the pages accumulate in each section's
    "merged" list, in input order.
    """
    if not index:
        return []
    sections = [dict(entry, merged=[]) for entry in index]

    def _owner(page):
        # Exact match: the page is this section's own index page.
        for section in sections:
            if section["index"] == page["index"]:
                section["id"] = page["id"]
                section["full_index"] = page["full_index"]
                return section
        # Closest preceding section wins ...
        for section in reversed(sections):
            if section["index"] < page["index"]:
                return section
        # ... else the page precedes every entry: take the first later one.
        for section in sections:
            if section["index"] > page["index"]:
                return section

    for page in pages:
        copied = dict(page)
        _owner(copied)["merged"].append(copied)
    return sections
def rewrite_index_page(text, section):
    """Rewrite a section's index page, replacing its image tag with one
    <p><img> line per merged page, and return the page as a UTF-8 byte
    string with CRLF line endings.

    NOTE(review): every line starting with "<p><img" is replaced by the
    full merged list — assumes each source page contains exactly one
    such line; confirm against ChainLP's output format.
    """
    out = StringIO()
    src = TextIOWrapper(text, encoding="utf-8")
    for raw in src:
        line = raw.strip()
        if line.startswith("<p><img"):
            tags = ["<p><img src=\"resources/%(full_index)s.jpg\" width=\"600\" height=\"800\" /></p>" % page
                    for page in section["merged"]]
            line = "\r\n".join(tags)
        out.write(line.encode("utf-8"))
        out.write("\r\n")
    return out.getvalue()
def rewrite_metadata(text, sections):
    """Rewrite metadata.opf so that only the surviving section pages are
    registered, and return it as a UTF-8 byte string with CRLF endings.

    All original index-page <item>s and every <itemref> are dropped;
    fresh manifest items and spine itemrefs for the merged sections are
    inserted just before </manifest> and </spine> respectively.
    """
    out = StringIO()
    src = TextIOWrapper(text, encoding="utf-8")
    for raw in src:
        line = raw.strip()
        if line.startswith("</manifest>"):
            items = ["<item id=\"%(id)s\" href=\"%(resource)s\" media-type=\"application/xhtml+xml\"/>" % s
                     for s in sections]
            line = "\r\n".join(items) + "\r\n</manifest>"
        elif line.startswith("</spine>"):
            refs = ["<itemref idref=\"%(id)s\"/>" % s for s in sections]
            line = "\r\n".join(refs) + "\r\n</spine>"
        elif "content/index_P" in line or "<itemref" in line:
            # Old per-page entries are obsolete after the merge.
            line = ""
        if line:
            out.write(line.encode("utf-8"))
            out.write("\r\n")
    return out.getvalue()
if __name__ == "__main__": | |
arg = get_arg() | |
print "inout: %s" % arg.input | |
print "output: %s" % arg.output | |
""" | |
ncxから目次情報を取得 | |
目次に指定されているページに、その前後のページを統合する | |
metadataには統合されて意味のなくなったページは登録しない | |
""" | |
with ZipFile(arg.input, "r") as old, ZipFile(arg.output, "w") as new: | |
print "Parsing toc.ncx ...", | |
index = parse_ncx(old.open("toc.ncx")) | |
print "done." | |
print "Parsing metadata.opf ...", | |
pages = parse_opf(old.open("metadata.opf")) | |
print "done." | |
print "Marging index ...", | |
sections = merge(pages, index) | |
print "done." | |
def find_section(name, sections): | |
for section in sections: | |
for page in section["merged"]: | |
if name == page["resource"]: | |
return section | |
def is_section_owner(name, section): | |
if section["resource"] == name: | |
return True | |
print "Generating a new EPUB file ...", | |
for info in old.infolist(): | |
if info.filename == "metadata.opf": | |
text = rewrite_metadata(old.open(info), sections) | |
else: | |
section = find_section(info.filename, sections) | |
if section: #xhtml | |
if is_section_owner(info.filename, section): #index page | |
text = rewrite_index_page(old.open(info), section) | |
else: #merged and removed page | |
continue | |
else: #other | |
text = old.read(info) | |
new.writestr(info, text) | |
print "done." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment