# te223/aoepub3_hack.py

## aoepub3_hack.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# AozoraEpub3 + kindlegen で作成した mobi テキストで、部分的に辞書が
# 引けなくなる時の対策スクリプト。
#
# epub の 本文 xhtml (0001.xhtmlなど)について、句読点の後に、おそらく
# 無害であろうと思われる、空の <span /> タグを挿入している。
# 但し、句読点の後が、ascii文字(white-space, <ruby>タグなど)だった場合は、
# <span/>の挿入は行わない。

# Version string of this script; shown by the -V command-line option.
_version = "0.9.1"
import shutil, tempfile, zipfile, os, sys, re
import StringIO

# Sentence-terminator sets; the -s command-line option selects one by index.
SENTENCE_END = (
  (u"。",),        # 0: normal case -- full stop only
  (u"。", u"、"),  # 1: severe cases -- full stop and comma
)

# Pattern for the names of the body-text files inside the epub archive.
AOZORAEPUB3_BODYFILE = r"OPS/xhtml/[\d]+\.xhtml$"

# out_each_sentence() writes no <span /> in front of these characters.
# (An earlier revision used u"—…‥〳〴〵", i.e. without "○".)
BUNRI_KINSOKU = u"—…‥〳〴〵○"

# Closing brackets and closing quotation marks.
# (An earlier revision used u"」』）］｝〕〉》】〙〗｠", without the last three.)
OWARI_KAKKO = u"」』）］｝〕〉》】〙〗｠〟’”"

# Comma, full stop and every closing bracket; out_each_sentence() writes no
# <span /> in front of these characters either.
GYOUTOU_KINSOKU = u"".join((u"、。", OWARI_KAKKO))

# When True, the temporary file/directory paths are printed as they are created.
_isdebug = False

def expand_path(path):
  """Return *path* with a leading ``~`` and environment variables expanded."""
  home_expanded = os.path.expanduser(path)
  return os.path.expandvars(home_expanded)

def U(s):
  """Coerce *s* to a text (unicode) string.

  Byte strings are decoded as UTF-8, with undecodable bytes replaced by
  U+FFFD.  Any other object is passed to the text type's constructor.
  Works on Python 2 (where ``bytes`` is ``str``) and on Python 3.
  """
  if isinstance(s, bytes): return s.decode("utf-8", "replace")
  try:
    text_type = unicode  # Python 2
  except NameError:
    text_type = str      # Python 3: str is the text type
  return text_type(s)

def oprif(port, fmt, *args):
  """Write a message to *port* as UTF-8 encoded bytes.

  port -- object with a write() method.
  fmt  -- message text; used as a %-format string only when *args* is given.
  args -- values for the format; ``str`` values are converted with U() first.
  """
  text = U(fmt)
  if args:
    values = []
    for arg in args:
      if isinstance(arg, str): arg = U(arg)
      values.append(arg)
    text = text % tuple(values)
  port.write(text.encode("utf-8", "replace"))

def prif(fmt, *args):
  """Format a message with oprif() and write it to standard output."""
  oprif(sys.stdout, fmt, *args)
def eprif(fmt, *args):
  """Format a message with oprif() and write it to standard error."""
  oprif(sys.stderr, fmt, *args)

def isascii(uc):
  """Return True when the single character *uc* has a code point below 128."""
  return ord(uc) <= 0x7F

# Original note: for zipfile.ZipFile() in python2.6
class FileCtx(object):
  """Let a file-like object be used in a ``with`` statement.

  ``with`` yields the wrapped object itself.  On exit the object is closed,
  unless it is ``sys.stdin``.  Exceptions are never suppressed.
  """

  def __init__(self, fobj):
    self.fobj = fobj

  def __enter__(self):
    return self.fobj

  def __exit__(self, exc_type, exc_value, traceback):
    wrapped = self.fobj
    if wrapped == sys.stdin:
      return False  # leave standard input open
    wrapped.close()
    return False  # let any exception propagate

class TempfileCtx(object):
  """Context manager owning a temporary ``.xhtml`` file.

  The file is created when the object is constructed.  ``with`` yields its
  path, and the file is deleted on exit.  Exceptions are never suppressed.
  """

  def __init__(self):
    handle, path = tempfile.mkstemp(suffix=".xhtml", prefix="_aoepub3hack_")
    self.tmpfile = path
    os.close(handle)  # only the path is used (marked "Fix me" by the author)
    if _isdebug:
      prif("tmpfile = %s\n", self.tmpfile)

  def __enter__(self):
    return self.tmpfile

  def __exit__(self, exc_type, exc_value, traceback):
    os.unlink(self.tmpfile)
    return False  # let any exception propagate

class TempdirCtx(object):
  """Context manager owning a temporary directory.

  The directory is created when the object is constructed.  ``with`` yields
  its path, and the whole tree is removed on exit.  Exceptions are never
  suppressed.
  """

  def __init__(self):
    created = tempfile.mkdtemp(suffix=".tmp", prefix="_aoepub3hack_")
    self.tmpdir = created
    if _isdebug:
      sys.stdout.write("tmpdir = %s\n" % self.tmpdir)

  def __enter__(self):
    return self.tmpdir

  def __exit__(self, exc_type, exc_value, traceback):
    # Memo: on Mac, find /var/folders -type d -iname '_zipdir*'
    shutil.rmtree(self.tmpdir)
    return False  # let any exception propagate


def out(uline, port=sys.stdout):
  """Encode the text *uline* as UTF-8 and write the bytes to *port*.

  If strict encoding fails, the text is encoded with replacement characters
  instead and a warning showing both forms is written through eprif().
  """
  try:
    encoded = uline.encode("utf-8", "strict")
  except UnicodeError:
    encoded = uline.encode("utf-8", "replace")
    eprif("WARNING: fail to unicode encoding\n")
    eprif("    %s\n", uline.rstrip())
    lossy_text = encoded.decode("utf-8", "replace")
    eprif("  ->%s\n", lossy_text.rstrip())
  port.write(encoded)

def out_each_sentence(line, port=sys.stdout, sep=SENTENCE_END[0] ):
  """Copy one line of body xhtml to *port*, adding ``<span />`` after sentences.

  line -- one line of the file as a UTF-8 byte string.
  port -- object with a write() method; receives UTF-8 bytes through out().
  sep  -- container of the characters treated as sentence terminators.

  A terminator found outside of a ``<...>`` tag is followed by an empty
  ``<span />``, unless the next character is ASCII (e.g. ``<`` or white
  space) or is listed in GYOUTOU_KINSOKU or BUNRI_KINSOKU.  A terminator
  that is the last character of the line gets no ``<span />``.  Bytes that
  are not valid UTF-8 are replaced and a warning is written through eprif().
  """
  try: uline = line.decode("utf-8", "strict")
  except UnicodeError:
    uline = line.decode("utf-8", "replace")
    eprif("WARNING: fail to unicode decoding\n")
    eprif("  %s\n", uline.rstrip())

  pending = []  # characters read but not yet written to port

  def flush():
    if pending:
      out(u"".join(pending), port)
      del pending[:]

  intag = False      # True while between "<" and ">"
  will_span = False  # True when the previous character ended a sentence
  for s in uline:
    if will_span:
      # Write <span /> only when the character after the terminator is
      # neither ASCII nor one of the excluded characters.
      if not (isascii(s) or (s in GYOUTOU_KINSOKU) or (s in BUNRI_KINSOKU)):
        out(u"<span />", port)
      will_span = False

    pending.append(s)
    if intag:
      if s == ">": intag = False
    elif s == "<":
      intag = True
    elif s in sep:
      # Emit the finished sentence so a <span /> can be placed right after it.
      flush()
      will_span = True
  flush()

def _insert_spans_in_body(srcpath, convname, sep):
  """Copy the xhtml file *srcpath* to *convname*, adding <span /> tags.

  Lines before the opening <body> tag are copied unchanged; the <body>
  line and every line after it go through out_each_sentence().
  """
  rxbody = re.compile(r'''<body\s*>|<body\s[^<>]*>''', re.I)
  # Binary mode on both sides: the lines are read as bytes and written back
  # as encoded bytes, and text mode ("w") would translate newlines on Windows.
  with open(convname, "wb") as outport:
    with open(srcpath, "rb") as inport:
      inbody = False
      for line in inport:
        if not inbody and rxbody.search(line): inbody = True
        if inbody:
          out_each_sentence(line, outport, sep)
        else:
          outport.write(line)


def main(srcname, dstname, sep=SENTENCE_END[0]):
  """Rewrite the epub *srcname* into *dstname* with <span /> tags added.

  srcname -- path of the input epub (a zip archive).
  dstname -- path of the epub to create.
  sep     -- sentence-terminator characters handed to out_each_sentence().

  The archive is unpacked into a temporary directory, then its entries are
  re-added in their listed order with their own compression method (anything
  other than STORED/DEFLATED falls back to DEFLATED).  Entries whose name
  matches AOZORAEPUB3_BODYFILE are converted first; all others are copied
  unchanged.  An entry with no extracted regular file is reported through
  eprif() and left out.
  """
  with TempdirCtx() as tmpdir:
    with FileCtx(zipfile.ZipFile(srcname, "r")) as inzp:
      infolist = inzp.infolist()
      inzp.extractall(tmpdir)

      with FileCtx(zipfile.ZipFile(U(dstname).encode("utf-8", "replace"),
                                   "w", zipfile.ZIP_STORED)) as outzp:
        for zinfo in infolist:
          compression = zinfo.compress_type
          if compression not in (zipfile.ZIP_DEFLATED, zipfile.ZIP_STORED):
            compression = zipfile.ZIP_DEFLATED
          arcname = zinfo.filename
          srcpath = os.path.join(tmpdir, arcname)
          if not os.path.isfile(srcpath):
            eprif("ERROR: BUG?,  No such a file - %s\n", srcpath)
          elif re.search(AOZORAEPUB3_BODYFILE, arcname, re.I):
            with TempfileCtx() as convname:
              _insert_spans_in_body(srcpath, convname, sep)
              outzp.write(convname, arcname, compression)
          else:
            outzp.write(srcpath, arcname, compression)

if __name__ == '__main__':

  def usage():
    """Write the usage text to standard error and exit with status 1."""
    name = os.path.basename(sys.argv[0])
    text = (
      u"Usage:  %s {OPTIONS} input_epub3 output_epub3\n"
      u"  input_epub3 --- AozoraEpub3 が出力する epub3 ファイル名\n"
      u"  output_epub3 -- 変換後の epub3 ファイル名\n"
      u"  OPTIONS:\n"
      u"    -s 0|1  --- 0 ならば、句点(。) の後に <span /> を加える\n"
      u"                1 ならば、句点(。) 及び、読点(、) の後に <span /> を加える\n"
      u"                (デフォルトは、1)\n"
      u"    -V      --- バージョン番号の表示\n"
      u"    -h      --- Help表示\n"
    ) % U(name)  # decode first so a non-ASCII program name cannot fail here
    sys.stderr.write(text.encode("utf-8", "replace"))
    # sys.exit() instead of exit(): the latter is only installed by `site`.
    sys.exit(1)

  # Command line: options may appear anywhere; the first two non-option
  # arguments are the input and output file names.
  (srcname, dstname, sep) = (None, None, SENTENCE_END[1])
  arglist = sys.argv[1:]
  while arglist:
    arg = arglist.pop(0)
    if arg.startswith("-"):
      if arg.startswith("-h"):
        usage()
      elif arg == "-s":
        try:
          level = int( arglist.pop(0) )
          # A negative number would silently index from the end of the tuple.
          if level < 0: raise IndexError(level)
          sep = SENTENCE_END[level]
        except (ValueError, IndexError):
          usage()
      elif arg == "-V":
        prif("version %s\n", _version)
        sys.exit(0)
      else: usage()
    else:
      if not srcname:
        srcname = expand_path( arg )
        if not os.path.isfile(srcname):
          eprif("No such a file - %s\n", srcname)
          sys.exit(1)
      elif not dstname :
        dstname = expand_path( arg )
      else:
        usage()
  if srcname and dstname:
    main(srcname, dstname, sep)
  else:
    usage()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	# AozoraEpub3 + kindlegen で作成した mobi テキストで、部分的に辞書が
	# 引けなくなる時の対策スクリプト。
	#
	# epub の本文 xhtml (0001.xhtmlなど)について、句読点の後に、おそらく
	# 無害であろうと思われる、空の <span /> タグを挿入している。
	# 但し、句読点の後が、ascii文字(white-space, <ruby>タグなど)だった場合は、
	# <span/>の挿入は行わない。

	# Version string of this script; shown by the -V command-line option.
	_version = "0.9.1"
	import shutil, tempfile, zipfile, os, sys, re
	import StringIO

	# Sentence-terminator sets; the -s command-line option selects one by index.
	SENTENCE_END = (
	  (u"。",),        # 0: normal case -- full stop only
	  (u"。", u"、"),  # 1: severe cases -- full stop and comma
	)

	# Pattern for the names of the body-text files inside the epub archive.
	AOZORAEPUB3_BODYFILE = r"OPS/xhtml/[\d]+\.xhtml$"

	# out_each_sentence() writes no <span /> in front of these characters.
	# (An earlier revision used u"—…‥〳〴〵", i.e. without "○".)
	BUNRI_KINSOKU = u"—…‥〳〴〵○"

	# Closing brackets and closing quotation marks.
	# (An earlier revision used u"」』）］｝〕〉》】〙〗｠", without the last three.)
	OWARI_KAKKO = u"」』）］｝〕〉》】〙〗｠〟’”"

	# Comma, full stop and every closing bracket; out_each_sentence() writes no
	# <span /> in front of these characters either.
	GYOUTOU_KINSOKU = u"".join((u"、。", OWARI_KAKKO))

	# When True, the temporary file/directory paths are printed as they are created.
	_isdebug = False

	def expand_path(path):
	  """Return *path* with a leading ``~`` and environment variables expanded."""
	  return os.path.expandvars( os.path.expanduser(path) )

	def U(s):
	  """Coerce *s* to a text (unicode) string.

	  Byte strings are decoded as UTF-8, with undecodable bytes replaced by
	  U+FFFD.  Any other object is passed to the text type's constructor.
	  Works on Python 2 (where ``bytes`` is ``str``) and on Python 3.
	  """
	  if isinstance(s, bytes): return s.decode("utf-8", "replace")
	  try:
	    text_type = unicode  # Python 2
	  except NameError:
	    text_type = str      # Python 3: str is the text type
	  return text_type(s)

	def oprif(port, fmt, *args):
	  """Write a message to *port* as UTF-8 encoded bytes.

	  port -- object with a write() method.
	  fmt  -- message text; used as a %-format string only when *args* is given.
	  args -- values for the format; ``str`` values are converted with U() first.
	  """
	  if not args: port.write(U(fmt).encode("utf-8", "replace"))
	  else:
	    ustr = U(fmt) % tuple(
	      U(a) if isinstance(a, str) else a for a in args)
	    port.write(ustr.encode("utf-8", "replace"))

	def prif(fmt, *args):
	  """Format a message with oprif() and write it to standard output.

	  Takes any number of format values (``*args``), so a message without
	  values can be passed as the only argument.
	  """
	  oprif(sys.stdout, fmt, *args)
	def eprif(fmt, *args):
	  """Format a message with oprif() and write it to standard error.

	  Takes any number of format values (``*args``), so a message without
	  values can be passed as the only argument.
	  """
	  oprif(sys.stderr, fmt, *args)

	def isascii(uc):
	  """Return True when the single character *uc* has a code point below 128."""
	  return ord(uc) <= 0x7F

	# Original note: for zipfile.ZipFile() in python2.6
	class FileCtx(object):
	  """Let a file-like object be used in a ``with`` statement.

	  ``with`` yields the wrapped object itself.  On exit the object is closed,
	  unless it is ``sys.stdin``.  Exceptions are never suppressed.
	  """
	  def __init__(self, fobj): self.fobj = fobj
	  def __enter__(self): return self.fobj
	  def __exit__(self, exc_type, exc_value, traceback):
	    if not (self.fobj == sys.stdin):
	      self.fobj.close()
	    return False # let any exception propagate

	class TempfileCtx(object):
	  """Context manager owning a temporary ``.xhtml`` file.

	  The file is created when the object is constructed.  ``with`` yields its
	  path, and the file is deleted on exit.  Exceptions are never suppressed.
	  """
	  def __init__(self):
	    (fd, self.tmpfile) = tempfile.mkstemp(".xhtml", "_aoepub3hack_")
	    os.close(fd) # only the path is used (marked "Fix me" by the author)
	    if _isdebug: prif("tmpfile = %s\n", self.tmpfile)
	  def __enter__(self): return self.tmpfile
	  def __exit__(self, exc_type, exc_value, traceback):
	    os.unlink(self.tmpfile)
	    return False # let any exception propagate

	class TempdirCtx(object):
	  """Context manager owning a temporary directory.

	  The directory is created when the object is constructed.  ``with`` yields
	  its path, and the whole tree is removed on exit.  Exceptions are never
	  suppressed.
	  """
	  def __init__(self):
	    self.tmpdir = tempfile.mkdtemp(".tmp", "_aoepub3hack_")
	    if _isdebug: sys.stdout.write("tmpdir = %s\n" % self.tmpdir)
	  def __enter__(self): return self.tmpdir
	  def __exit__(self, exc_type, exc_value, traceback):
	    # Memo: on Mac, find /var/folders -type d -iname '_zipdir*'
	    shutil.rmtree(self.tmpdir)
	    return False # let any exception propagate


	def out(uline, port=sys.stdout):
	  """Encode the text *uline* as UTF-8 and write the bytes to *port*.

	  If strict encoding fails, the text is encoded with replacement characters
	  instead and a warning showing both forms is written through eprif().
	  """
	  try:
	    sline = uline.encode("utf-8", "strict")
	  except UnicodeError:
	    sline = uline.encode("utf-8", "replace")
	    eprif("WARNING: fail to unicode encoding\n")
	    eprif("    %s\n", uline.rstrip())
	    eprif("  ->%s\n", sline.decode("utf-8", "replace").rstrip())
	  port.write(sline)

	def out_each_sentence(line, port=sys.stdout, sep=SENTENCE_END[0] ):
	  """Copy one line of body xhtml to *port*, adding ``<span />`` after sentences.

	  line -- one line of the file as a UTF-8 byte string.
	  port -- object with a write() method; receives UTF-8 bytes through out().
	  sep  -- container of the characters treated as sentence terminators.

	  A terminator found outside of a ``<...>`` tag is followed by an empty
	  ``<span />``, unless the next character is ASCII (e.g. ``<`` or white
	  space) or is listed in GYOUTOU_KINSOKU or BUNRI_KINSOKU.  A terminator
	  that is the last character of the line gets no ``<span />``.  Bytes that
	  are not valid UTF-8 are replaced and a warning is written through eprif().
	  """
	  try: uline = line.decode("utf-8", "strict")
	  except UnicodeError:
	    uline = line.decode("utf-8", "replace")
	    eprif("WARNING: fail to unicode decoding\n")
	    eprif("  %s\n", uline.rstrip())

	  pending = []  # characters read but not yet written to port

	  def flush():
	    if pending:
	      out(u"".join(pending), port)
	      del pending[:]

	  intag = False      # True while between "<" and ">"
	  will_span = False  # True when the previous character ended a sentence
	  for s in uline:
	    if will_span:
	      # Write <span /> only when the character after the terminator is
	      # neither ASCII nor one of the excluded characters.
	      if not (isascii(s) or (s in GYOUTOU_KINSOKU) or (s in BUNRI_KINSOKU)):
	        out(u"<span />", port)
	      will_span = False

	    pending.append(s)
	    if intag:
	      if s == ">": intag = False
	    elif s == "<":
	      intag = True
	    elif s in sep:
	      # Emit the finished sentence so a <span /> can be placed right after it.
	      flush()
	      will_span = True
	  flush()

	def _insert_spans_in_body(srcpath, convname, sep):
	  """Copy the xhtml file *srcpath* to *convname*, adding <span /> tags.

	  Lines before the opening <body> tag are copied unchanged; the <body>
	  line and every line after it go through out_each_sentence().
	  """
	  rxbody = re.compile(r'''<body\s*>|<body\s[^<>]*>''', re.I)
	  # Binary mode on both sides: the lines are read as bytes and written back
	  # as encoded bytes, and text mode ("w") would translate newlines on Windows.
	  with open(convname, "wb") as outport:
	    with open(srcpath, "rb") as inport:
	      inbody = False
	      for line in inport:
	        if not inbody and rxbody.search(line): inbody = True
	        if inbody:
	          out_each_sentence(line, outport, sep)
	        else:
	          outport.write(line)


	def main(srcname, dstname, sep=SENTENCE_END[0]):
	  """Rewrite the epub *srcname* into *dstname* with <span /> tags added.

	  srcname -- path of the input epub (a zip archive).
	  dstname -- path of the epub to create.
	  sep     -- sentence-terminator characters handed to out_each_sentence().

	  The archive is unpacked into a temporary directory, then its entries are
	  re-added in their listed order with their own compression method (anything
	  other than STORED/DEFLATED falls back to DEFLATED).  Entries whose name
	  matches AOZORAEPUB3_BODYFILE are converted first; all others are copied
	  unchanged.  An entry with no extracted regular file is reported through
	  eprif() and left out.
	  """
	  with TempdirCtx() as tmpdir:
	    with FileCtx(zipfile.ZipFile(srcname, "r")) as inzp:
	      infolist = inzp.infolist()
	      inzp.extractall(tmpdir)

	      with FileCtx(zipfile.ZipFile(U(dstname).encode("utf-8", "replace"),
	                                   "w", zipfile.ZIP_STORED)) as outzp:
	        for zinfo in infolist:
	          compression = zinfo.compress_type
	          if compression not in (zipfile.ZIP_DEFLATED, zipfile.ZIP_STORED):
	            compression = zipfile.ZIP_DEFLATED
	          arcname = zinfo.filename
	          srcpath = os.path.join(tmpdir, arcname)
	          if not os.path.isfile(srcpath):
	            eprif("ERROR: BUG?,  No such a file - %s\n", srcpath)
	          elif re.search(AOZORAEPUB3_BODYFILE, arcname, re.I):
	            with TempfileCtx() as convname:
	              _insert_spans_in_body(srcpath, convname, sep)
	              outzp.write(convname, arcname, compression)
	          else:
	            outzp.write(srcpath, arcname, compression)

	if __name__ == '__main__':

	  def usage():
	    """Write the usage text to standard error and exit with status 1."""
	    name = os.path.basename(sys.argv[0])
	    text = (
	      u"Usage:  %s {OPTIONS} input_epub3 output_epub3\n"
	      u"  input_epub3 --- AozoraEpub3 が出力する epub3 ファイル名\n"
	      u"  output_epub3 -- 変換後の epub3 ファイル名\n"
	      u"  OPTIONS:\n"
	      u"    -s 0|1  --- 0 ならば、句点(。) の後に <span /> を加える\n"
	      u"                1 ならば、句点(。) 及び、読点(、) の後に <span /> を加える\n"
	      u"                (デフォルトは、1)\n"
	      u"    -V      --- バージョン番号の表示\n"
	      u"    -h      --- Help表示\n"
	    ) % U(name)  # decode first so a non-ASCII program name cannot fail here
	    sys.stderr.write(text.encode("utf-8", "replace"))
	    # sys.exit() instead of exit(): the latter is only installed by `site`.
	    sys.exit(1)

	  # Command line: options may appear anywhere; the first two non-option
	  # arguments are the input and output file names.
	  (srcname, dstname, sep) = (None, None, SENTENCE_END[1])
	  arglist = sys.argv[1:]
	  while arglist:
	    arg = arglist.pop(0)
	    if arg.startswith("-"):
	      if arg.startswith("-h"):
	        usage()
	      elif arg == "-s":
	        try:
	          level = int( arglist.pop(0) )
	          # A negative number would silently index from the end of the tuple.
	          if level < 0: raise IndexError(level)
	          sep = SENTENCE_END[level]
	        except (ValueError, IndexError):
	          usage()
	      elif arg == "-V":
	        prif("version %s\n", _version)
	        sys.exit(0)
	      else: usage()
	    else:
	      if not srcname:
	        srcname = expand_path( arg )
	        if not os.path.isfile(srcname):
	          eprif("No such a file - %s\n", srcname)
	          sys.exit(1)
	      elif not dstname :
	        dstname = expand_path( arg )
	      else:
	        usage()
	  if srcname and dstname:
	    main(srcname, dstname, sep)
	  else:
	    usage()