Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# AozoraEpub3 + kindlegen で作成した mobi テキストで、部分的に辞書が
# 引けなくなる時の対策スクリプト。
#
# epub の 本文 xhtml (0001.xhtmlなど)について、句読点の後に、おそらく
# 無害であろうと思われる、空の <span /> タグを挿入している。
# 但し、句読点の後が、ascii文字(white-space, <ruby>タグなど)だった場合は、
# <span/>の挿入は行わない。
# Version string of this script; printed by the "-V" command-line option.
_version="0.9.1"
import shutil, tempfile, zipfile, os, sys, re
import StringIO
# Separator sets after which an empty <span /> may be inserted.  The "-s"
# command-line option selects one of them by index:
#   0 -- ideographic full stop only (normal case)
#   1 -- ideographic full stop and ideographic comma (severe case)
# The entries had degraded to empty strings, which can never equal a single
# character, so no <span /> was ever inserted.  They are restored here to
# the characters named in the usage text, written as escapes so they
# survive re-encoding of this file.
SENTENCE_END = (
    (u"\u3002",),            # normal: "。"
    (u"\u3002", u"\u3001"),  # severe: "。" and "、"
)
# Pattern for the archive names of the body-text files inside the epub;
# main() matches it (case-insensitively) against each zip entry name.
AOZORAEPUB3_BODYFILE = r'''OPS/xhtml/[\d]+\.xhtml$'''
# Characters that suppress the <span /> insertion when they directly follow
# a sentence separator (checked in out_each_sentence).
# Earlier value, kept for reference:
#BUNRI_KINSOKU = u"—…‥〳〴〵"
BUNRI_KINSOKU = u"—…‥〳〴〵○"
# Closing brackets and closing quotation marks.
# NOTE(review): ")", "]" and "}" are ASCII here, so isascii() already
# covers them; possibly they were full-width originally -- TODO confirm.
# Earlier value, kept for reference:
#OWARI_KAKKO = u"」』)]}〕〉》】〙〗⦆"
OWARI_KAKKO = u"」』)]}〕〉》】〙〗⦆〟’”"
# Comma, full stop and the closing brackets above; like BUNRI_KINSOKU, a
# character from this set right after a separator suppresses the <span />.
GYOUTOU_KINSOKU = u"、。" + OWARI_KAKKO
# When True, TempfileCtx and TempdirCtx print the temporary paths they create.
#_isdebug = True
_isdebug = False
def expand_path(path):
    """Return *path* with a leading ``~`` and environment variables expanded.

    The user-home expansion is applied first, then the variable expansion.
    """
    home_expanded = os.path.expanduser(path)
    return os.path.expandvars(home_expanded)
def U(s):
if isinstance(s, str): return s.decode("utf-8", "replace")
return unicode(s)
def oprif(port, fmt, *args):
    """Write a ``%``-formatted message to *port* as UTF-8 bytes.

    *fmt* is converted with ``U``.  When *args* are given, byte-string
    arguments are converted with ``U`` as well before the ``%`` formatting
    is applied; without *args* the format string is written unformatted.
    Characters that cannot be encoded are replaced.
    """
    text = U(fmt)
    if args:
        converted = []
        for arg in args:
            if isinstance(arg, str):
                arg = U(arg)
            converted.append(arg)
        text = text % tuple(converted)
    port.write(text.encode("utf-8", "replace"))
def prif(fmt, *args):
    """Format like ``oprif`` and write the message to standard output."""
    oprif(sys.stdout, fmt, *args)
def eprif(fmt, *args):
    """Format like ``oprif`` and write the message to standard error."""
    oprif(sys.stderr, fmt, *args)
def isascii(uc):
    """Return True when the single character *uc* has a code point below 128."""
    code_point = ord(uc)
    return code_point < 128
# Context-manager wrapper used for zipfile.ZipFile() objects on Python 2.6.
class FileCtx(object):
    """Hand out the wrapped object on entry and close it on exit.

    An object that compares equal to ``sys.stdin`` is left open.
    Exceptions raised inside the ``with`` body are never suppressed.
    """

    def __init__(self, fobj):
        self.fobj = fobj

    def __enter__(self):
        return self.fobj

    def __exit__(self, exc_type, exc_value, traceback):
        wrapped = self.fobj
        if wrapped == sys.stdin:
            return False  # keep stdin open; the exception still propagates
        wrapped.close()
        return False  # do not swallow the exception
class TempfileCtx(object):
    """Context manager around a scratch ``.xhtml`` file.

    The file is created when the object is constructed, its path is handed
    out on entry, and the file is deleted on exit.  Exceptions raised inside
    the ``with`` body are not suppressed.
    """

    def __init__(self):
        handle, path = tempfile.mkstemp(suffix=".xhtml",
                                        prefix="_aoepub3hack_")
        self.tmpfile = path
        # Only the path is used; the descriptor is closed right away
        # (the author marked this with "Fix me").
        os.close(handle)
        if _isdebug:
            prif("tmpfile = %s\n", self.tmpfile)

    def __enter__(self):
        return self.tmpfile

    def __exit__(self, exc_type, exc_value, traceback):
        os.unlink(self.tmpfile)
        return False  # do not swallow the exception
class TempdirCtx(object):
    """Context manager around a scratch directory.

    The directory is created when the object is constructed, its path is
    handed out on entry, and the whole tree is removed on exit.  Exceptions
    raised inside the ``with`` body are not suppressed.
    """

    def __init__(self):
        self.tmpdir = tempfile.mkdtemp(suffix=".tmp", prefix="_aoepub3hack_")
        if _isdebug:
            sys.stdout.write("tmpdir = %s\n" % self.tmpdir)

    def __enter__(self):
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, traceback):
        # Author's memo for hunting leftovers on a Mac:
        #   find /var/folders -type d -iname '_zipdir*'
        # (the prefix used by this class is "_aoepub3hack_").
        shutil.rmtree(self.tmpdir)
        return False  # do not swallow the exception
def out(uline, port=sys.stdout):
    """Write the unicode string *uline* to *port* encoded as UTF-8.

    If strict encoding fails, the string is encoded again with replacement
    characters and a warning showing both forms is written to stderr.
    """
    try:
        payload = uline.encode("utf-8", "strict")
    except UnicodeError:
        payload = uline.encode("utf-8", "replace")
        eprif("WARNING: fail to unicode encoding\n")
        original_text = uline.rstrip()
        eprif(" %s\n", original_text)
        replaced_text = payload.decode("utf-8", "replace").rstrip()
        eprif(" ->%s\n", replaced_text)
    port.write(payload)
def out_each_sentence(line, port=sys.stdout, sep=SENTENCE_END[0] ):
    """Copy *line* to *port*, adding ``<span />`` after sentence separators.

    *line* is a UTF-8 byte string; if strict decoding fails it is decoded
    with replacement characters and a warning is written to stderr.

    A character from *sep* that appears outside a ``<...>`` tag ends a
    sentence.  The text up to and including it is written out, and a
    ``<span />`` is written before the next character of the same line
    unless that character is ASCII (e.g. ``<`` or white-space) or is listed
    in GYOUTOU_KINSOKU or BUNRI_KINSOKU.
    """
    try:
        uline = line.decode("utf-8", "strict")
    except UnicodeError:
        uline = line.decode("utf-8", "replace")
        eprif("WARNING: fial to unicode decoding\n")
        eprif(" %s\n", uline.rstrip())

    pending = []  # characters collected but not yet written to port

    def flush_pending():
        # Write the collected characters as one chunk, if there are any.
        if pending:
            out(u"".join(pending), port)
            del pending[:]

    inside_tag = False
    span_candidate = False  # True right after a separator was written
    for ch in uline:
        if span_candidate:
            span_candidate = False
            blocked = (isascii(ch) or (ch in GYOUTOU_KINSOKU)
                       or (ch in BUNRI_KINSOKU))
            if not blocked:
                out(u"<span />", port)
        pending.append(ch)
        if inside_tag:
            # Stay in tag mode until the closing ">".
            inside_tag = (ch != ">")
        elif ch == "<":
            inside_tag = True
        elif ch in sep:
            # End of a sentence: emit it and consider a span next.
            flush_pending()
            span_candidate = True
    flush_pending()
def _convert_body_file(srcpath, dstpath, sep, rxbody):
    """Copy *srcpath* to *dstpath*, rewriting the lines of the document body.

    Lines before the first one matching *rxbody* (the ``<body>`` start tag)
    are copied unchanged; that line and all following lines are passed
    through ``out_each_sentence`` with *sep*.
    """
    # Both files are opened in binary mode: out_each_sentence writes encoded
    # bytes, and text mode would rewrite line endings on Windows.
    with open(dstpath, "wb") as outport:
        with open(srcpath, "rb") as inport:
            inbody = False
            for line in inport:
                if not inbody and rxbody.search(line):
                    inbody = True
                if inbody:
                    out_each_sentence(line, outport, sep)
                else:
                    outport.write(line)


def main(srcname, dstname, sep=SENTENCE_END[0]):
    """Rewrite the epub *srcname* into *dstname* with ``<span />`` inserted.

    The source archive is unpacked into a temporary directory and every
    entry is written to the new archive in the original order.  Entries
    whose name matches AOZORAEPUB3_BODYFILE are converted first; all other
    entries are copied as they are.

    srcname -- path of the input epub (zip) file
    dstname -- path of the output epub file (encoded to UTF-8 for zipfile)
    sep     -- collection of separator characters, see SENTENCE_END
    """
    rxname = re.compile(AOZORAEPUB3_BODYFILE, re.I)
    rxbody = re.compile(r'''<body\s*>|<body\s[^<>]*>''', re.I)
    with TempdirCtx() as tmpdir:
        with FileCtx(zipfile.ZipFile(srcname, "r")) as inzp:
            infolist = inzp.infolist()
            # NOTE: extractall() on an untrusted archive can write outside
            # tmpdir on Python versions that do not sanitize member names.
            inzp.extractall(tmpdir)
        # The input archive is closed before the output is opened.
        with FileCtx(zipfile.ZipFile(U(dstname).encode("utf-8", "replace"),
                                     "w", zipfile.ZIP_STORED)) as outzp:
            for zinfo in infolist:
                # Keep each entry's compression method; fall back to
                # deflate for methods other than deflate/stored.
                compression = zinfo.compress_type
                if compression not in (zipfile.ZIP_DEFLATED,
                                       zipfile.ZIP_STORED):
                    compression = zipfile.ZIP_DEFLATED
                arcname = zinfo.filename
                if arcname.endswith("/"):
                    # Directory entry: nothing to copy, and not an error.
                    continue
                srcpath = os.path.join(tmpdir, arcname)
                if not os.path.isfile(srcpath):
                    eprif("ERROR: BUG?, No such a file - %s\n", srcpath)
                    continue
                if not rxname.search(arcname):
                    outzp.write(srcpath, arcname, compression)
                    continue
                with TempfileCtx() as convname:
                    _convert_body_file(srcpath, convname, sep, rxbody)
                    # The converted file is closed (flushed) by now.
                    outzp.write(convname, arcname, compression)
if __name__ == '__main__':
    # Command-line entry point: parse the options, then convert the epub.
    def usage():
        """Write the usage text to stderr and exit with status 1."""
        # Decode the program name so a non-ASCII script path cannot break
        # the formatting of the unicode usage text.
        name = U(os.path.basename(sys.argv[0]))
        sys.stderr.write( (u'''\
Usage: %s {OPTIONS} input_epub3 output_epub3
input_epub3 --- AozoraEpub3 が出力する epub3 ファイル名
output_epub3 -- 変換後の epub3 ファイル名
OPTIONS:
-s 0|1 --- 0 ならば、句点(。) の後に <span /> を加える
1 ならば、句点(。) 及び、読点(、) の後に <span /> を加える
(デフォルトは、1)
-V --- バージョン番号の表示
-h --- Help表示
''' % name).encode("utf-8", "replace"))
        # sys.exit() instead of the site-provided exit(), which is meant
        # for interactive use and is missing under "python -S".
        sys.exit(1)

    (srcname, dstname, sep) = (None, None, SENTENCE_END[1])
    arglist = sys.argv[1:]
    while arglist:
        arg = arglist.pop(0)
        if arg.startswith("-"):
            if arg.startswith("-h"):
                usage()
            elif arg == "-s":
                try:
                    index = int(arglist.pop(0))
                except (ValueError, IndexError):
                    # Missing or non-numeric option value.
                    usage()
                # Reject out-of-range and negative values explicitly; a
                # negative index would otherwise be accepted silently.
                if not 0 <= index < len(SENTENCE_END):
                    usage()
                sep = SENTENCE_END[index]
            elif arg == "-V":
                prif("version %s\n", _version)
                sys.exit(0)
            else:
                usage()
        else:
            # Positional arguments: input file first, then output file.
            if not srcname:
                srcname = expand_path(arg)
                if not os.path.isfile(srcname):
                    eprif("No such a file - %s\n", srcname)
                    sys.exit(1)
            elif not dstname:
                dstname = expand_path(arg)
            else:
                usage()
    if srcname and dstname:
        #prif("src=%s dst=%s sep=%s\n", srcname, dstname, sep) # debug
        main(srcname, dstname, sep)
    else:
        usage()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment