Skip to content

Instantly share code, notes, and snippets.

@rubyu
Last active December 17, 2015 07:39
Show Gist options
  • Save rubyu/5574454 to your computer and use it in GitHub Desktop.
Save rubyu/5574454 to your computer and use it in GitHub Desktop.
Python path_to\merge_chainlp_epub.py "%1" "Merged_%~nx1"
# -*- coding: utf-8 -*-
import argparse
import re
from io import BytesIO, TextIOWrapper
from StringIO import StringIO
from zipfile import ZipFile
def get_arg(argv=None):
    """Parse command line arguments.

    argv -- optional list of argument strings; defaults to sys.argv[1:]
            (passing a list makes the parser testable).

    Returns an argparse.Namespace with `input` (source EPUB path) and
    `output` (destination EPUB path).
    """
    parser = argparse.ArgumentParser(prog="merge_chainlp_epub.py",
        description="merges a section of EPUB published by ChainLP into a single page")
    parser.add_argument("input")
    parser.add_argument("output")
    return parser.parse_args(argv)
def parse_ncx(text):
    """Read the table of contents out of toc.ncx.

    text -- binary file-like object holding the UTF-8 ncx document.

    Returns a list of dicts in document order, each with:
      "resource" -- xhtml path, e.g. "content/index_P3.xhtml"
      "index"    -- the page number as an int
    """
    toc = []
    content_pat = re.compile(r"<content src=\"(content/index_P(\d+)\.xhtml)\"")
    reader = TextIOWrapper(text, encoding="utf-8")
    for raw in reader:
        match = content_pat.search(raw)
        if match is None:
            continue
        toc.append({"resource": match.group(1),
                    "index": int(match.group(2))})
    return toc
def parse_opf(text):
    """Read the page manifest out of metadata.opf.

    text -- binary file-like object holding the UTF-8 opf document.

    Returns a list of dicts in document order, each with:
      "id"         -- the manifest item id
      "resource"   -- xhtml path, e.g. "content/index_P7.xhtml"
      "full_index" -- the zero-padded page label, e.g. "P7"
      "index"      -- the page number as an int
    """
    manifest = []
    item_pat = re.compile(r"<item id=\"([^\"]+)\" href=\"(content/index_(P(\d+))\.xhtml)\"")
    reader = TextIOWrapper(text, encoding="utf-8")
    for raw in reader:
        match = item_pat.search(raw)
        if match is None:
            continue
        manifest.append({
            "id": match.group(1),
            "resource": match.group(2),
            "full_index": match.group(3),
            "index": int(match.group(4)),
        })
    return manifest
def merge(pages, index):
    """Group every page under the toc entry it belongs to.

    For any page one of the following holds:
      * it is itself a toc entry,
      * some toc entry has a smaller index (the page follows it),
      * only toc entries with larger indexes exist,
      * there is no toc at all.

    pages -- page dicts from parse_opf
    index -- toc entry dicts from parse_ncx

    Returns one section dict per toc entry; each carries the toc
    entry's keys, the owning page's "id"/"full_index", and a "merged"
    list of every page (owner included) folded into that section.
    Returns [] when there is no toc.
    """
    if not index:
        return []
    # One section per toc entry, each starting with an empty page list.
    sections = [dict(entry, merged=[]) for entry in index]

    def exact_match(page):
        # The section whose index equals the page's, if any.
        return next((s for s in sections if s["index"] == page["index"]), None)

    def nearest_before(page):
        # The last section whose index is smaller than the page's.
        return next((s for s in reversed(sections) if s["index"] > page["index"] - 2 * (page["index"] - s["index"]) - 1 and s["index"] < page["index"]), None)

    def first_after(page):
        # The first section whose index is larger than the page's.
        return next((s for s in sections if s["index"] > page["index"]), None)

    for original in pages:
        page = dict(original)
        target = exact_match(page)
        if target is not None:
            # The page owns this section; remember its manifest identity.
            target["id"] = page["id"]
            target["full_index"] = page["full_index"]
        else:
            target = nearest_before(page) or first_after(page)
        target["merged"].append(page)
    return sections
def rewrite_index_page(text, section):
    """Rewrite a section's index page so it shows every merged page.

    text    -- binary file-like object holding the original xhtml
    section -- section dict whose "merged" list holds the page dicts

    Each line starting with "<p><img" is replaced by one image tag per
    merged page, pointing at resources/<full_index>.jpg (ChainLP pages
    are 600x800).  Returns the rewritten page as UTF-8 bytes.
    """
    # BytesIO instead of StringIO: we only ever write encoded bytes, so a
    # bytes buffer is correct on both Python 2 and 3 (StringIO accepted
    # the bytes on py2 only by str/bytes conflation).
    new = BytesIO()
    old = TextIOWrapper(text, encoding="utf-8")
    line = old.readline()
    while line:
        line = line.strip()
        if line.startswith("<p><img"):
            line = "\r\n".join(["<p><img src=\"resources/%(full_index)s.jpg\" width=\"600\" height=\"800\" /></p>" % page
                                for page in section["merged"]])
        new.write(line.encode("utf-8"))
        new.write(b"\r\n")
        line = old.readline()
    return new.getvalue()
def rewrite_metadata(text, sections):
    """Rewrite metadata.opf so only the kept section pages remain.

    text     -- binary file-like object holding the original opf
    sections -- section dicts produced by merge()

    Drops every original index_P* manifest item and every <itemref>,
    then re-emits one manifest item and one spine itemref per section
    just before </manifest> and </spine>.  Returns UTF-8 bytes.
    """
    # BytesIO instead of StringIO: we only ever write encoded bytes, so a
    # bytes buffer is correct on both Python 2 and 3 (StringIO accepted
    # the bytes on py2 only by str/bytes conflation).
    new = BytesIO()
    old = TextIOWrapper(text, encoding="utf-8")
    line = old.readline()
    while line:
        line = line.strip()
        if line.startswith("</manifest>"):
            line = "\r\n".join(["<item id=\"%(id)s\" href=\"%(resource)s\" media-type=\"application/xhtml+xml\"/>" % section
                                for section in sections])
            line += "\r\n</manifest>"
        elif line.startswith("</spine>"):
            line = "\r\n".join(["<itemref idref=\"%(id)s\"/>" % section
                                for section in sections])
            line += "\r\n</spine>"
        elif line.count("content/index_P") > 0 or line.count("<itemref") > 0:
            # Original per-page entries are superseded by the sections.
            line = ""
        if line:
            new.write(line.encode("utf-8"))
            new.write(b"\r\n")
        line = old.readline()
    return new.getvalue()
if __name__ == "__main__":
arg = get_arg()
print "inout: %s" % arg.input
print "output: %s" % arg.output
"""
ncxから目次情報を取得
目次に指定されているページに、その前後のページを統合する
metadataには統合されて意味のなくなったページは登録しない
"""
with ZipFile(arg.input, "r") as old, ZipFile(arg.output, "w") as new:
print "Parsing toc.ncx ...",
index = parse_ncx(old.open("toc.ncx"))
print "done."
print "Parsing metadata.opf ...",
pages = parse_opf(old.open("metadata.opf"))
print "done."
print "Marging index ...",
sections = merge(pages, index)
print "done."
def find_section(name, sections):
for section in sections:
for page in section["merged"]:
if name == page["resource"]:
return section
def is_section_owner(name, section):
if section["resource"] == name:
return True
print "Generating a new EPUB file ...",
for info in old.infolist():
if info.filename == "metadata.opf":
text = rewrite_metadata(old.open(info), sections)
else:
section = find_section(info.filename, sections)
if section: #xhtml
if is_section_owner(info.filename, section): #index page
text = rewrite_index_page(old.open(info), section)
else: #merged and removed page
continue
else: #other
text = old.read(info)
new.writestr(info, text)
print "done."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment