Last active
December 27, 2015 08:29
-
-
Save te223/7296698 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# AozoraEpub3 + kindlegen で作成した mobi テキストで、部分的に辞書が | |
# 引けなくなる時の対策スクリプト。 | |
# | |
# epub の 本文 xhtml (0001.xhtmlなど)について、句読点の後に、おそらく | |
# 無害であろうと思われる、空の <span /> タグを挿入している。 | |
# 但し、句読点の後が、ascii文字(white-space, <ruby>タグなど)だった場合は、 | |
# <span/>の挿入は行わない。 | |
# Script version string; printed by the -V command-line option.
_version = "0.9.1"
import shutil, tempfile, zipfile, os, sys, re | |
import StringIO | |
# Selectable sets of sentence-ending punctuation (chosen by index with the
# -s command-line option).
SENTENCE_END = (
    (u"。",),        # index 0: the normal case -- full stop only
    (u"。", u"、"),  # index 1: for badly affected books -- full stop and comma
)

# Pattern for the archive names of the body-text files inside the epub.
AOZORAEPUB3_BODYFILE = r"OPS/xhtml/[\d]+\.xhtml$"

# Characters after which/before which no <span /> is inserted: when the
# character following a sentence end is one of these, the span is skipped.
# NOTE(review): judging by the names, these are "do not separate" characters
# (BUNRI_KINSOKU), closing brackets (OWARI_KAKKO) and characters that may not
# start a line (GYOUTOU_KINSOKU) -- confirm with the author.
# An earlier BUNRI_KINSOKU lacked the trailing u"○"; an earlier OWARI_KAKKO
# lacked the trailing u"〟’”".
BUNRI_KINSOKU = u"—…‥〳〴〵○"
OWARI_KAKKO = u"」』)]}〕〉》】〙〗⦆〟’”"
GYOUTOU_KINSOKU = u"、。" + OWARI_KAKKO

# Set to True to have the temp-file/temp-dir helpers print the paths they use.
_isdebug = False
def expand_path(path):
    """Return ``path`` with a leading ``~`` expanded, then environment variables expanded."""
    home_expanded = os.path.expanduser(path)
    return os.path.expandvars(home_expanded)
def U(s):
    """Coerce ``s`` to a ``unicode`` object (Python 2).

    A byte string (``str``) is decoded as UTF-8 with undecodable bytes
    replaced; any other value is handed to ``unicode()``.
    """
    if not isinstance(s, str):
        return unicode(s)
    return s.decode("utf-8", "replace")
def oprif(port, fmt, *args):
    """Write a %-formatted message to ``port`` as UTF-8 bytes.

    ``fmt`` is converted with ``U()``.  When ``args`` are given, every byte
    string among them is converted with ``U()`` as well (other values are
    left as they are) and the tuple is applied to ``fmt`` with ``%``.  With no
    ``args`` the format string is written without ``%`` processing.
    Characters that cannot be encoded are replaced rather than raising.
    """
    text = U(fmt)
    if args:
        converted = []
        for arg in args:
            if isinstance(arg, str):
                converted.append(U(arg))
            else:
                converted.append(arg)
        text = text % tuple(converted)
    port.write(text.encode("utf-8", "replace"))
def prif(fmt, *args):
    """Format ``fmt`` with ``args`` via ``oprif`` and write it to ``sys.stdout``."""
    oprif(sys.stdout, fmt, *args)
def eprif(fmt, *args):
    """Format ``fmt`` with ``args`` via ``oprif`` and write it to ``sys.stderr``."""
    oprif(sys.stderr, fmt, *args)
def isascii(uc):
    """Return True when the single character ``uc`` has a code point below 128."""
    code_point = ord(uc)
    return code_point <= 0x7F
# Written for zipfile.ZipFile() on Python 2.6, so that it can be used in a
# ``with`` statement.
class FileCtx(object):
    """Context manager that yields a file-like object and closes it on exit.

    The wrapped object is returned unchanged by ``with ... as``.  On exit it
    is closed unless it compares equal to ``sys.stdin``.  Exceptions raised in
    the ``with`` body are never suppressed.
    """

    def __init__(self, fobj):
        self.fobj = fobj

    def __enter__(self):
        return self.fobj

    def __exit__(self, exc_type, exc_value, traceback):
        wrapped = self.fobj
        if wrapped == sys.stdin:
            # Leave standard input open for the rest of the program.
            return False
        wrapped.close()
        return False  # let any exception propagate
class TempfileCtx(object):
    """Context manager that owns one temporary ``.xhtml`` file.

    The (empty) file is created when the object is constructed.  Entering the
    context yields the file's path; leaving it deletes the file.  Exceptions
    raised in the ``with`` body are never suppressed.
    """

    def __init__(self):
        handle, path = tempfile.mkstemp(".xhtml", "_aoepub3hack_")
        self.tmpfile = path
        # Only the path is kept; the OS-level descriptor is closed right away.
        os.close(handle)  # Fix me
        if _isdebug:
            prif("tmpfile = %s\n", self.tmpfile)

    def __enter__(self):
        return self.tmpfile

    def __exit__(self, exc_type, exc_value, traceback):
        os.unlink(self.tmpfile)
        return False  # let any exception propagate
class TempdirCtx(object):
    """Context manager that owns one temporary directory.

    The directory is created when the object is constructed.  Entering the
    context yields its path; leaving it removes the directory and everything
    inside.  Exceptions raised in the ``with`` body are never suppressed.
    """

    def __init__(self):
        self.tmpdir = tempfile.mkdtemp(".tmp", "_aoepub3hack_")
        if _isdebug:
            sys.stdout.write("tmpdir = %s\n" % self.tmpdir)

    def __enter__(self):
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, traceback):
        # Author's memo: on a Mac, leftover directories can be searched with
        #   find /var/folders -type d -iname '_zipdir*'
        # NOTE(review): that pattern does not match the "_aoepub3hack_" prefix
        # used above -- presumably it predates it.
        shutil.rmtree(self.tmpdir)
        return False  # let any exception propagate
def out(uline, port=sys.stdout):
    """Write ``uline`` to ``port`` encoded as UTF-8.

    If strict encoding raises ``UnicodeError``, the text is encoded again with
    the ``"replace"`` error handler, a warning showing the text before and
    after is printed through ``eprif``, and the lossy bytes are written.

    ``port`` is any object with a ``write()`` method accepting bytes; the
    default is the ``sys.stdout`` that was current when this function was
    defined.
    """
    try:
        encoded = uline.encode("utf-8", "strict")
    except UnicodeError:
        encoded = uline.encode("utf-8", "replace")
        eprif("WARNING: fail to unicode encoding\n")
        eprif(" %s\n", uline.rstrip())
        eprif(" ->%s\n", encoded.decode("utf-8", "replace").rstrip())
    port.write(encoded)
def out_each_sentence(line, port=sys.stdout, sep=SENTENCE_END[0]):
    """Copy one line of body XHTML to ``port``, adding ``<span />`` after sentence ends.

    The line is scanned character by character.  Whenever a character from
    ``sep`` is met outside of a ``<...>`` tag, the text collected so far is
    written out, and an empty ``<span />`` is emitted in front of the next
    character -- unless that next character is ASCII (e.g. ``<`` or
    white-space) or is listed in ``GYOUTOU_KINSOKU`` / ``BUNRI_KINSOKU``.

    Args:
        line: one line of the file as a UTF-8 encoded byte string.  Bytes
            that cannot be decoded are replaced and a warning is printed.
        port: object with a ``write()`` method; receives UTF-8 bytes via
            ``out()``.
        sep: collection of sentence-ending punctuation characters.

    Tag tracking is per line: a tag that spans several lines is not
    recognised as such.
    """
    try:
        uline = line.decode("utf-8", "strict")
    except UnicodeError:
        uline = line.decode("utf-8", "replace")
        eprif("WARNING: fail to unicode decoding\n")
        eprif(" %s\n", uline.rstrip())

    pending = []  # characters collected since the last flush

    def flush():
        # Write the collected text (if any) in one piece and start afresh.
        if pending:
            out(u"".join(pending), port)
            del pending[:]

    intag = False      # True while between '<' and the matching '>'
    will_span = False  # True for exactly one character after a separator
    for ch in uline:
        if will_span:
            # Write <span /> only when the character right after the
            # punctuation is neither ASCII ('<', white-space, ...) nor one
            # of the characters that must stay attached to the punctuation.
            if not (isascii(ch) or (ch in GYOUTOU_KINSOKU)
                    or (ch in BUNRI_KINSOKU)):
                out(u"<span />", port)
            will_span = False
        pending.append(ch)
        if intag:
            if ch == ">":
                intag = False
        elif ch == "<":
            intag = True
        elif ch in sep:
            # End of a sentence: output it, then decide about the span when
            # the following character is seen.
            flush()
            will_span = True
    flush()
def main(srcname, dstname, sep=SENTENCE_END[0]):
    """Rewrite the epub ``srcname`` into ``dstname`` with ``<span />`` markers added.

    The source archive is unpacked into a temporary directory and a new
    archive is written member by member, in the original order and with the
    original compression type (anything other than stored/deflated falls back
    to deflated).  Members whose name matches ``AOZORAEPUB3_BODYFILE`` are
    filtered: lines before the opening ``<body>`` tag are copied verbatim,
    and from that line on every line goes through ``out_each_sentence``.  All
    other members are copied unchanged.

    Args:
        srcname: path of the input epub.
        dstname: path of the epub to create.  It is passed to ``zipfile``
            as given, exactly like ``srcname``.
        sep: collection of sentence-ending punctuation characters handed to
            ``out_each_sentence``.
    """
    # Matches the opening <body> tag, with or without attributes.
    rxbody = re.compile(r'''<body\s*>|<body\s[^<>]*>''', re.I)
    with TempdirCtx() as tmpdir:
        with FileCtx(zipfile.ZipFile(srcname, "r")) as inzp:
            infolist = inzp.infolist()
            # NOTE(review): before Python 2.7.4, extractall() does not
            # sanitise member names such as "../x"; only feed this script
            # archives from a trusted source.
            inzp.extractall(tmpdir)
        # The destination name is used untouched.  Re-encoding it as UTF-8
        # (as was done before) garbles names on systems whose file-name
        # encoding is not UTF-8, and was inconsistent with srcname.
        with FileCtx(zipfile.ZipFile(dstname, "w",
                                     zipfile.ZIP_STORED)) as outzp:
            for zinfo in infolist:
                arcname = zinfo.filename
                if arcname.endswith("/"):
                    # Directory entry: there is no file to copy, and it is
                    # not an error either.
                    continue
                compression = zinfo.compress_type
                if compression not in (zipfile.ZIP_DEFLATED,
                                       zipfile.ZIP_STORED):
                    compression = zipfile.ZIP_DEFLATED
                srcpath = os.path.join(tmpdir, arcname)
                if not os.path.isfile(srcpath):
                    eprif("ERROR: BUG?, No such a file - %s\n", srcpath)
                    continue
                if not re.search(AOZORAEPUB3_BODYFILE, arcname, re.I):
                    outzp.write(srcpath, arcname, compression)
                    continue
                # Body text: insert the <span /> markers after sentence ends.
                with TempfileCtx() as convname:
                    # Binary mode on both sides: the data is handled as raw
                    # UTF-8 bytes, and text mode would alter line endings on
                    # Windows.
                    with open(convname, "wb") as outport:
                        with open(srcpath, "rb") as inport:
                            inbody = False
                            for line in inport:
                                if not inbody and rxbody.search(line):
                                    inbody = True
                                if inbody:
                                    out_each_sentence(line, outport, sep)
                                else:
                                    outport.write(line)
                    # Both files are closed here, so the converted data is
                    # completely on disk before it is archived.
                    outzp.write(convname, arcname, compression)
if __name__ == '__main__':
    def usage():
        """Print the usage text to stderr and exit with status 1."""
        # Convert the program name first so that a non-ASCII script path
        # cannot make the %-formatting of the unicode text fail.
        name = U(os.path.basename(sys.argv[0]))
        sys.stderr.write( (u'''\
Usage: %s {OPTIONS} input_epub3 output_epub3
input_epub3 --- AozoraEpub3 が出力する epub3 ファイル名
output_epub3 -- 変換後の epub3 ファイル名
OPTIONS:
-s 0|1 --- 0 ならば、句点(。) の後に <span /> を加える
1 ならば、句点(。) 及び、読点(、) の後に <span /> を加える
(デフォルトは、1)
-V --- バージョン番号の表示
-h --- Help表示
''' % name).encode("utf-8", "replace"))
        # sys.exit() rather than the site-provided exit(), which is meant
        # for interactive use and is missing when site is not loaded.
        sys.exit(1)

    # Defaults: no file names yet, and the wider punctuation set (-s 1).
    (srcname, dstname, sep) = (None, None, SENTENCE_END[1])
    arglist = sys.argv[1:]
    while arglist:
        arg = arglist.pop(0)
        if arg.startswith("-"):
            if arg.startswith("-h"):
                usage()
            elif arg == "-s":
                try:
                    level = int( arglist.pop(0) )
                except (ValueError, IndexError):
                    # Missing or non-numeric value.
                    usage()
                # Reject out-of-range values, including negative ones that
                # would otherwise index SENTENCE_END from the end.
                if not (0 <= level < len(SENTENCE_END)):
                    usage()
                sep = SENTENCE_END[level]
            elif arg == "-V":
                prif("version %s\n", _version)
                sys.exit(0)
            else:
                usage()
        else:
            # Positional arguments: first the input, then the output file.
            if not srcname:
                srcname = expand_path( arg )
                if not os.path.isfile(srcname):
                    eprif("No such a file - %s\n", srcname)
                    sys.exit(1)
            elif not dstname:
                dstname = expand_path( arg )
            else:
                usage()
    if srcname and dstname:
        main(srcname, dstname, sep)
    else:
        usage()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment