#!/usr/bin/env python
# -*- coding: utf-8 -*-
# AozoraEpub3 + kindlegen で作成した mobi テキストで、部分的に辞書が
# 引けなくなる時の対策スクリプト。
#
# epub の 本文 xhtml (0001.xhtmlなど)について、句読点の後に、おそらく
# 無害であろうと思われる、空の <span /> タグを挿入している。
# 但し、句読点の後が、ascii文字(white-space, <ruby>タグなど)だった場合は、
# <span/>の挿入は行わない。
# Version string of this script; printed by the -V command-line option.
_version = "0.9.1"
import os
import re
import shutil
import StringIO
import sys
import tempfile
import zipfile
# Sets of sentence-terminator characters, indexed by the value given to the
# -s command-line option.
SENTENCE_END = (
    (u"。",),        # 0: full stop only (the usual case)
    (u"。", u"、"),  # 1: full stop and comma (for severe cases)
)

# Pattern matching the archive names of the body-text files inside the epub.
AOZORAEPUB3_BODYFILE = r'''OPS/xhtml/[\d]+\.xhtml$'''

# Characters that suppress the <span /> when they directly follow a sentence
# terminator (checked in out_each_sentence()).  The name suggests "characters
# that must not be separated" (dashes, ellipses, repeat marks).
# Earlier value, without the trailing circle: u"—…‥〳〴〵"
BUNRI_KINSOKU = u"—…‥〳〴〵○"

# Closing brackets and closing quotation marks.
# Earlier value, without the last three quotation marks: u"」』)]}〕〉》】〙〗⦆"
OWARI_KAKKO = u"」』)]}〕〉》】〙〗⦆〟’”"

# Characters that also suppress the <span /> after a terminator: the comma,
# the full stop and every closing bracket.  The name suggests "characters
# prohibited at the start of a line".
GYOUTOU_KINSOKU = u"、。" + OWARI_KAKKO

# Set to True to print the paths of the temporary file/directory in use.
_isdebug = False
def expand_path(path):
    """Return *path* with a leading ``~`` and environment variables expanded.

    The user-directory prefix is expanded first, then ``$NAME``-style
    variables; unknown variables are left in place by os.path.expandvars().
    """
    return os.path.expandvars(os.path.expanduser(path))
def U(s):
    """Coerce *s* to a text (unicode) string.

    Byte strings are decoded as UTF-8, with undecodable bytes replaced by
    U+FFFD; any other object is converted with the text-type constructor.
    """
    if isinstance(s, bytes):
        # On Python 2 (2.6+) ``bytes`` is an alias of ``str``, so this is the
        # same check as before; spelling it ``bytes`` also keeps the helper
        # usable on interpreters where ``str`` has no decode() method.
        return s.decode("utf-8", "replace")
    try:
        text_type = unicode
    except NameError:
        # No separate ``unicode`` type: ``str`` is already the text type.
        text_type = str
    return text_type(s)
def oprif(port, fmt, *args):
    """Write a printf-style message to *port* as UTF-8 encoded bytes.

    port -- object with a write() method.
    fmt  -- format string (byte string or unicode).
    args -- values for the ``%`` operator.  Byte-string arguments are
            converted with U() first; other values are passed unchanged.

    When no *args* are given, *fmt* is written as-is without ``%``
    interpolation, so a literal percent sign needs no escaping.
    Characters that cannot be encoded are replaced rather than raising.
    """
    text = U(fmt)
    if args:
        text = text % tuple(U(a) if isinstance(a, str) else a for a in args)
    port.write(text.encode("utf-8", "replace"))
def prif(fmt, *args):
    """Write a printf-style message to standard output (see oprif())."""
    oprif(sys.stdout, fmt, *args)
def eprif(fmt, *args):
    """Write a printf-style message to standard error (see oprif())."""
    oprif(sys.stderr, fmt, *args)
def isascii(uc):
    """Return True if the single character *uc* has a code point below 128."""
    return ord(uc) < 128
# Context-manager wrapper, written for zipfile.ZipFile() on Python 2.6
# (ZipFile gained ``with`` support of its own only in Python 2.7).
class FileCtx(object):
    """Context manager that closes the wrapped object on exit.

    ``with FileCtx(obj) as f`` binds ``f`` to *obj* itself and calls
    ``obj.close()`` when the block ends, unless *obj* is ``sys.stdin``,
    which is left open.  Exceptions raised in the block are not suppressed.
    """

    def __init__(self, fobj):
        self.fobj = fobj

    def __enter__(self):
        return self.fobj

    def __exit__(self, exc_type, exc_value, traceback):
        # Identity, not equality: only the real stdin object is exempt.
        if self.fobj is not sys.stdin:
            self.fobj.close()
        return False  # let any exception propagate
class TempfileCtx(object):
    """Context manager that provides the path of a temporary ``.xhtml`` file.

    The file is created (empty) when the object is constructed and removed
    when the ``with`` block ends.  ``with TempfileCtx() as path`` binds
    ``path`` to the file name, not to an open file object.  Exceptions
    raised in the block are not suppressed.
    """

    def __init__(self):
        (fd, self.tmpfile) = tempfile.mkstemp(".xhtml", "_aoepub3hack_")
        # mkstemp() returns an open descriptor; only the path is handed out,
        # so the descriptor is closed right away.
        os.close(fd)
        if _isdebug:
            prif("tmpfile = %s\n", self.tmpfile)

    def __enter__(self):
        return self.tmpfile

    def __exit__(self, exc_type, exc_value, traceback):
        # Skip the removal when the file is already gone, so that cleanup
        # cannot raise and hide an exception coming out of the block.
        if os.path.exists(self.tmpfile):
            os.unlink(self.tmpfile)
        return False  # let any exception propagate
class TempdirCtx(object):
    """Context manager that provides the path of a temporary directory.

    The directory is created when the object is constructed and removed,
    together with everything inside it, when the ``with`` block ends.
    Exceptions raised in the block are not suppressed.
    """

    def __init__(self):
        self.tmpdir = tempfile.mkdtemp(".tmp", "_aoepub3hack_")
        if _isdebug:
            prif("tmpdir = %s\n", self.tmpdir)

    def __enter__(self):
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, traceback):
        # Memo from the original author, for finding leftovers on a Mac
        # (prefix updated to the one used above):
        #   find /var/folders -type d -iname '_aoepub3hack_*'
        shutil.rmtree(self.tmpdir)
        return False  # let any exception propagate
def out(uline, port=sys.stdout):
    """Write the unicode string *uline* to *port* encoded as UTF-8.

    uline -- text to write.
    port  -- object with a write() method that accepts the encoded bytes.

    If strict encoding fails, the text is encoded again with unencodable
    characters replaced, a warning showing the text before and after is
    written to standard error, and the replaced version is written to *port*.
    """
    try:
        sline = uline.encode("utf-8", "strict")
    except UnicodeError:
        sline = uline.encode("utf-8", "replace")
        eprif("WARNING: fail to unicode encoding\n")
        eprif(" %s\n", uline.rstrip())
        eprif(" ->%s\n", sline.decode("utf-8", "replace").rstrip())
    port.write(sline)
def out_each_sentence(line, port=sys.stdout, sep=SENTENCE_END[0]):
    """Write *line* to *port*, inserting ``<span />`` after sentence ends.

    line -- one line of xhtml as a UTF-8 byte string.
    port -- object with a write() method; receives UTF-8 encoded output.
    sep  -- collection of terminator characters (an entry of SENTENCE_END).

    A terminator found outside a ``<...>`` tag is followed by an empty
    ``<span />`` unless the next character is ASCII (white-space, the "<"
    of a tag, ...) or is listed in GYOUTOU_KINSOKU or BUNRI_KINSOKU.  A
    terminator that is the last character of the line gets no span.
    Input bytes that cannot be decoded are replaced, and a warning is
    written to standard error.
    """
    try:
        uline = line.decode("utf-8", "strict")
    except UnicodeError:
        uline = line.decode("utf-8", "replace")
        eprif("WARNING: fail to unicode decoding\n")
        eprif(" %s\n", uline.rstrip())

    pending = []  # characters collected since the last write

    def flush():
        # Write the collected characters as one chunk; no-op when empty.
        if pending:
            out(u"".join(pending), port)
            del pending[:]

    intag = False      # currently between "<" and ">"
    will_span = False  # the previous character was a terminator
    for ch in uline:
        if will_span:
            will_span = False
            # The text up to and including the terminator has already been
            # flushed, so the span lands directly after the terminator.
            if not (isascii(ch) or ch in GYOUTOU_KINSOKU
                    or ch in BUNRI_KINSOKU):
                out(u"<span />", port)
        pending.append(ch)
        if intag:
            if ch == u">":
                intag = False
        elif ch == u"<":
            intag = True
        elif ch in sep:
            flush()
            will_span = True
    flush()
def _convert_bodyfile(srcpath, dstpath, sep, rxbody):
    """Copy xhtml file *srcpath* to *dstpath*, adding spans inside the body.

    Lines before the first one matching *rxbody* (the opening body tag) are
    copied unchanged; that line and all following ones go through
    out_each_sentence() with the terminator set *sep*.
    """
    # Both files are opened in binary mode: the input is handled as UTF-8
    # bytes and out_each_sentence() writes encoded bytes, so a text-mode
    # output file would rewrite the line endings on Windows.
    with open(dstpath, "wb") as outport:
        with open(srcpath, "rb") as inport:
            inbody = False
            for line in inport:
                if not inbody and rxbody.search(line):
                    inbody = True
                if inbody:
                    out_each_sentence(line, outport, sep)
                else:
                    outport.write(line)


def main(srcname, dstname, sep=SENTENCE_END[0]):
    """Create epub *dstname* from *srcname* with ``<span />`` markers added.

    srcname -- path of the input epub (a zip archive).
    dstname -- path of the epub to write.
    sep     -- terminator characters after which a span may be inserted
               (an entry of SENTENCE_END).

    The input is extracted into a temporary directory and re-packed entry
    by entry, in the original order and with each entry's original
    compression type (anything other than stored/deflated becomes
    deflated).  Entries whose name matches AOZORAEPUB3_BODYFILE are
    converted; all others are copied as they are.  Directory entries are
    not written to the output.
    """
    # Compiled once here instead of once per archive entry.
    rxbodyfile = re.compile(AOZORAEPUB3_BODYFILE, re.I)
    rxbody = re.compile(r'''<body\s*>|<body\s[^<>]*>''', re.I)

    with TempdirCtx() as tmpdir:
        with FileCtx(zipfile.ZipFile(srcname, "r")) as inzp:
            infolist = inzp.infolist()
            inzp.extractall(tmpdir)

        with FileCtx(zipfile.ZipFile(U(dstname).encode("utf-8", "replace"),
                                     "w", zipfile.ZIP_STORED)) as outzp:
            for zinfo in infolist:
                compression = zinfo.compress_type
                if compression not in (zipfile.ZIP_DEFLATED,
                                       zipfile.ZIP_STORED):
                    compression = zipfile.ZIP_DEFLATED
                arcname = zinfo.filename

                if arcname.endswith("/"):
                    # A directory entry: it has no file on disk to copy,
                    # which is expected and not an error.
                    continue

                srcpath = os.path.join(tmpdir, arcname)
                if not os.path.isfile(srcpath):
                    eprif("ERROR: BUG?, No such a file - %s\n", srcpath)
                    continue

                if not rxbodyfile.search(arcname):
                    outzp.write(srcpath, arcname, compression)
                    continue

                # Body text: write a converted copy to a temporary file and
                # store that under the original archive name.
                with TempfileCtx() as convname:
                    _convert_bodyfile(srcpath, convname, sep, rxbody)
                    outzp.write(convname, arcname, compression)
if __name__ == '__main__':
    # Command-line entry point:
    #   script {OPTIONS} input_epub3 output_epub3

    def usage():
        """Print the usage text to standard error and exit with status 1."""
        # U() decodes the script name explicitly; interpolating a byte
        # string into a unicode format would decode it as ASCII and fail
        # on a non-ASCII path.
        name = U(os.path.basename(sys.argv[0]))
        sys.stderr.write((u'''\
Usage: %s {OPTIONS} input_epub3 output_epub3
input_epub3 --- AozoraEpub3 が出力する epub3 ファイル名
output_epub3 -- 変換後の epub3 ファイル名
OPTIONS:
-s 0|1 --- 0 ならば、句点(。) の後に <span /> を加える
1 ならば、句点(。) 及び、読点(、) の後に <span /> を加える
(デフォルトは、1)
-V --- バージョン番号の表示
-h --- Help表示
''' % name).encode("utf-8", "replace"))
        sys.exit(1)

    # Default terminator set is index 1 (full stop and comma), as stated
    # in the usage text.
    (srcname, dstname, sep) = (None, None, SENTENCE_END[1])
    arglist = sys.argv[1:]
    while arglist:
        arg = arglist.pop(0)
        if arg.startswith("-"):
            if arg.startswith("-h"):
                usage()
            elif arg == "-s":
                try:
                    index = int(arglist.pop(0))
                    if index < 0:
                        # Reject negative values instead of letting them
                        # index SENTENCE_END from the end.
                        raise IndexError(index)
                    sep = SENTENCE_END[index]
                except (ValueError, IndexError):
                    # Missing, non-numeric or out-of-range value.
                    usage()
            elif arg == "-V":
                prif("version %s\n", _version)
                sys.exit(0)
            else:
                usage()
        elif not srcname:
            # First positional argument: the input file, which must exist.
            srcname = expand_path(arg)
            if not os.path.isfile(srcname):
                eprif("No such a file - %s\n", srcname)
                sys.exit(1)
        elif not dstname:
            # Second positional argument: the output file.
            dstname = expand_path(arg)
        else:
            # More than two positional arguments.
            usage()

    if srcname and dstname:
        main(srcname, dstname, sep)
    else:
        usage()
# (GitHub page footer text removed here; it was not part of the script.)