olsgaard/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Standalone Japanese reading generator based on Damien Elmes' Japanese Support
plugin for Anki: https://ankiweb.net/shared/info/3918629684
This script does not need Anki to run, but it does use the MeCab and Kakasi
applications included in the Japanese Support plugin. You are expected to
download that plugin and place this file inside your
<Documents/Anki/addons/japanese/> directory alongside reading.py and run it
from the command line.
Usage: at the command prompt, run:
python readingStandAlone.py inputFile [outputFile [formatter]]


inputFile should be UTF-8 encoded.


outputFile: optional. Output will be written to this file if provided (UTF-8-encoded), or written to screen if not provided.


formatter: optional. If omitted or with formatter="defaultFormatter", this script will put a space before Japanese words, and the hiragana reading in [square-brackets] immediately after the word. E.g., the following input sentence:


お父さんは？
becomes
お 父[とう]さんは？
With formatter="verboseFormatter", the following will be produced:
お_{父}[とう]さんは？
Note how the prefix space is replaced by an underscore "_", and the Japanese word (in this case, just one kanji, but potentially more) is put in {curly brackets}.
You can add other formatters to the source code: they should be functions of two arguments and one optional argument, i.e., with the following definition:
def newFormatter(kanji, reading, optionalReading=""):

Caveat: no HTML stripping available.

  
## readingStandAlone.py
# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
#
# Automatic reading generation with kakasi and mecab.
# See http://ichi2.net/anki/wiki/JapaneseSupport
#

"""

Standalone Japanese reading generator based on Damien Elmes' Japanese Support
plugin for Anki: https://ankiweb.net/shared/info/3918629684

This script does *not* need Anki to run, but it does use the MeCab and Kakasi
applications included in the Japanese Support plugin. You are expected to
download that plugin and place this file inside your
<Documents/Anki/addons/japanese/> directory alongside `reading.py` and run it
from the command line.


Usage: at the command prompt, run:

python readingStandAlone.py inputFile [outputFile [formatter]]

- inputFile should be UTF-8 encoded.

- outputFile: optional. Output will be written to this file if provided (UTF-8
encoded), or written to screen if not provided.

- formatter: optional. If omitted or with formatter="defaultFormatter", this
script will put a space before Japanese words, and the hiragana reading in
[square-brackets] immediately after the word. E.g., the following input
sentence:

お父さんは？

becomes

お 父[とう]さんは？

With formatter="verboseFormatter", the following will be produced:

お_{父}[とう]さんは？

Note how the prefix space is replaced by an underscore "_", and the Japanese
word (in this case, just one kanji, but potentially more) is put in {curly
brackets}.

You can add other formatters to the source code: they should be functions of
two arguments and one optional argument, i.e., with the following definition:

    def newFormatter(kanji, reading, optionalReading=""):

Caveat: no HTML stripping available.
"""

import sys, os, platform, re, subprocess
#from anki.utils import stripHTML, isWin, isMac
#from anki.hooks import addHook
# Local stand-ins for the anki.utils platform flags (that import is commented
# out above), derived directly from sys.platform.
isMac, isWin = (sys.platform.startswith(tag) for tag in ("darwin", "win32"))
def stripHTML(s):
    """Return *s* unchanged.

    Placeholder for the stripHTML helper that would otherwise come from
    anki.utils; this standalone script performs no HTML stripping.
    """
    return s

def verboseFormatter(kanji, reading, optionalReading=""):
    """Format a word as ``_{kanji}[reading]`` followed by *optionalReading*.

    An underscore marks the start of the word and the word itself is wrapped
    in curly brackets, which makes the annotated span unambiguous.
    """
    template = "_{%s}[%s]%s"
    return template % (kanji, reading, optionalReading)
def defaultFormatter(kanji, reading, optionalReading=""):
    """Format a word as `` kanji[reading]`` followed by *optionalReading*.

    The leading space separates the annotated word from whatever text
    precedes it.
    """
    template = " %s[%s]%s"
    return template % (kanji, reading, optionalReading)


# Flags for the bundled kakasi binary. Text is exchanged with it as Shift-JIS
# (KakasiController.reading encodes and decodes with "sjis").
# NOTE(review): -JH / -KH presumably select kanji->hiragana and
# katakana->hiragana conversion, and -u unbuffered output - confirm against
# the kakasi manual.
kakasiArgs = [
    "-isjis",
    "-osjis",
    "-u",
    "-JH",
    "-KH",
]

# Output templates for the bundled mecab binary. MecabController.reading
# expects every space-separated node to look like "word[reading]"; unknown
# words are emitted with empty brackets and each sentence ends with a newline.
mecabArgs = [
    '--node-format=%m[%f[7]] ',
    '--eos-format=\n',
    '--unk-format=%m[] ',
]

def escapeText(text):
    """Normalise *text* so that it can be fed to kakasi/mecab as one line.

    Newlines become spaces, the fullwidth tilde (U+FF5E) becomes "~", and
    ``<br>`` / ``<br />`` tags are carried across the stripHTML() call and
    come back out as ``<br>``.
    """
    # Placeholder that shields line-break tags from stripHTML().
    marker = "---newline---"
    flattened = text.replace("\n", " ").replace(u'\uff5e', "~")
    shielded = re.sub("<br( /)?>", marker, flattened)
    return stripHTML(shielded).replace(marker, "<br>")

# Startup info handed to every subprocess.Popen call below. On Windows the
# STARTF_USESHOWWINDOW flag is set (presumably so the helper binaries do not
# pop up a console window - confirm); on every other platform it is None.
if sys.platform == "win32":
    si = subprocess.STARTUPINFO()
    try:
        si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    except AttributeError:
        # The constant is not exposed on `subprocess` itself in this Python
        # build; fall back to the private _subprocess module. Only the
        # missing attribute is caught so unrelated errors still surface.
        si.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
else:
    si = None

# Mecab
##########################################################################

def mungeForPlatform(popen):
    """Adapt a command list to the binary naming used on this platform.

    Windows: every element is passed through os.path.normpath (into a new
    list) and ".exe" is appended to the executable. macOS: the list is
    returned untouched. Anything else: ".lin" is appended to the executable,
    modifying the list that was passed in.

    Returns the command list to hand to subprocess.Popen.
    """
    if isWin:
        popen = list(map(os.path.normpath, popen))
        suffix = ".exe"
    elif isMac:
        return popen
    else:
        suffix = ".lin"
    popen[0] += suffix
    return popen

class MecabController(object):
    """Annotate Japanese text with readings using the bundled MeCab binary.

    The MeCab process is started lazily on the first call to reading() and
    is then reused for every later call.
    """

    def __init__(self):
        # subprocess.Popen handle for MeCab; stays None until ensureOpen().
        self.mecab = None

    def setup(self):
        """Build the MeCab command line and export the library search paths.

        All paths are relative to the current working directory, so the
        script has to be started from a directory in which
        "../../addons/japanese/support/" resolves to the plugin's support
        folder.
        """
        base = "../../addons/japanese/support/"
        self.mecabCmd = mungeForPlatform(
            [base + "mecab"] + mecabArgs + [
                '-d', base, '-r', base + "mecabrc"])
        os.environ['DYLD_LIBRARY_PATH'] = base
        os.environ['LD_LIBRARY_PATH'] = base
        if not isWin:
            # Make sure the bundled binary is executable. 0o755 is the octal
            # spelling accepted by both Python 2.6+ and Python 3.
            os.chmod(self.mecabCmd[0], 0o755)

    def ensureOpen(self):
        """Start the MeCab subprocess if it is not running yet.

        Raises:
            Exception: if subprocess.Popen fails with an OSError.
        """
        if not self.mecab:
            self.setup()
            try:
                self.mecab = subprocess.Popen(
                    self.mecabCmd, bufsize=-1, stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                    startupinfo=si)
            except OSError:
                raise Exception("Please ensure your Linux system has 32 bit binary support.")

    def reading(self, expr, formatter=defaultFormatter):
        """Return *expr* with readings attached to the words that need one.

        Args:
            expr: unicode text; it is flattened to a single line before
                being sent to MeCab.
            formatter: callable ``(kanji, reading[, optionalReading])`` that
                renders one annotated word. It is called with two arguments
                when no trailing kana were split off, otherwise with three.

        Returns:
            The annotated text as a unicode string.
        """
        self.ensureOpen()
        expr = escapeText(expr)
        self.mecab.stdin.write(expr.encode("euc-jp", "ignore")+'\n')
        self.mecab.stdin.flush()
        expr = unicode(self.mecab.stdout.readline().rstrip('\r\n'), "euc-jp")
        out = []
        for node in expr.split(" "):
            if not node:
                break
            # NOTE(review): a node that is not of the form "word[reading]"
            # (e.g. an error message, since stderr is merged into stdout)
            # makes re.match return None and this line raise AttributeError.
            (kanji, reading) = re.match(r"(.+)\[(.*)\]", node).groups()
            # hiragana, punctuation, not japanese, or lacking a reading
            if kanji == reading or not reading:
                out.append(kanji)
                continue
            # Convert the reading to hiragana once. The original asked kakasi
            # for the same conversion twice and compared the result with
            # `kanji` both times, so the second comparison could never match.
            reading = kakasi.reading(reading)
            # katakana word: its hiragana reading is the word itself
            if reading == kanji:
                out.append(kanji)
                continue
            # don't add readings of numbers
            if kanji in u"一二三四五六七八九十０１２３４５６７８９":
                out.append(kanji)
                continue
            # strip matching characters at the beginning and end of reading
            # and kanji; reading should always be at least as long as the kanji
            placeL = 0
            placeR = 0
            for i in range(1, len(kanji)):
                if kanji[-i] != reading[-i]:
                    break
                placeR = i
            for i in range(0, len(kanji)-1):
                if kanji[i] != reading[i]:
                    break
                placeL = i+1
            # Only the differing middle part gets annotated; the shared
            # suffix is handed to the formatter as the optional third
            # argument and the shared prefix is emitted in front.
            if placeR:
                annotated = formatter(
                    kanji[placeL:-placeR], reading[placeL:-placeR],
                    reading[-placeR:])
            else:
                annotated = formatter(kanji[placeL:], reading[placeL:])
            if placeL:
                annotated = "%s%s" % (reading[:placeL], annotated)
            out.append(annotated)
        # Join the pieces, inserting a space in front of every purely
        # alphanumeric token.
        pieces = []
        last = len(out) - 1
        for c, s in enumerate(out):
            pieces.append(s)
            if c < last and re.match("^[A-Za-z0-9]+$", out[c+1]):
                pieces.append(" ")
        # The rule above also puts a space inside a "<br>" tag that was split
        # into "<", "br", ">"; undo that.
        return u"".join(pieces).strip().replace("< br>", "<br>")

# Kakasi
##########################################################################

class KakasiController(object):
    """Convert Japanese text to hiragana using the bundled kakasi binary.

    The kakasi process is started lazily on the first call to reading() and
    is then reused for every later call.
    """

    def __init__(self):
        # subprocess.Popen handle for kakasi; stays None until ensureOpen().
        self.kakasi = None

    def setup(self):
        """Build the kakasi command line and export its dictionary paths.

        All paths are relative to the current working directory, so the
        script has to be started from a directory in which
        "../../addons/japanese/support/" resolves to the plugin's support
        folder.
        """
        base = "../../addons/japanese/support/"
        self.kakasiCmd = mungeForPlatform(
            [base + "kakasi"] + kakasiArgs)
        os.environ['ITAIJIDICT'] = base + "itaijidict"
        os.environ['KANWADICT'] = base + "kanwadict"
        if not isWin:
            # Make sure the bundled binary is executable. 0o755 is the octal
            # spelling accepted by both Python 2.6+ and Python 3.
            os.chmod(self.kakasiCmd[0], 0o755)

    def ensureOpen(self):
        """Start the kakasi subprocess if it is not running yet.

        Raises:
            Exception: if subprocess.Popen fails with an OSError.
        """
        if not self.kakasi:
            self.setup()
            try:
                self.kakasi = subprocess.Popen(
                    self.kakasiCmd, bufsize=-1, stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                    startupinfo=si)
            except OSError:
                raise Exception("Please install kakasi")

    def reading(self, expr):
        """Return kakasi's conversion of *expr* as a unicode string.

        The text is flattened to one line, sent to kakasi as Shift-JIS
        (characters that cannot be encoded are dropped) and one line of
        output is read back.
        """
        self.ensureOpen()
        expr = escapeText(expr)
        self.kakasi.stdin.write(expr.encode("sjis", "ignore")+'\n')
        self.kakasi.stdin.flush()
        res = unicode(self.kakasi.stdout.readline().rstrip('\r\n'), "sjis")
        return res

# Init
##########################################################################

# Module-level singletons. Constructing them does not start any process; the
# external binaries are launched on first use.
mecab = MecabController()
kakasi = KakasiController()

# Tests
##########################################################################

if __name__ == "__main__":
    if len(sys.argv) == 1:
        # No arguments: run a few sample sentences, then show the usage line.
        expr = u"カリン、自分でまいた種は自分で刈り取れ"
        print(mecab.reading(expr).encode("utf-8"))
        for expr in (
                u"昨日、林檎を2個買った。",
                u"真莉、大好きだよん＾＾",
                u"彼２０００万も使った。",
                u"彼二千三百六十円も使った。",
                u"千葉"):
            print(mecab.reading(expr))

        print("\n")
        print("Usage: python readingStandAlone.py inputFile [outputFile [formatter]]")
    elif len(sys.argv) >= 2:
        import codecs

        # Formatters selectable by name from the command line; an unknown
        # name silently falls back to defaultFormatter.
        # If you make new formatters, add them to this table.
        formatters = {"verboseFormatter": verboseFormatter}
        formatter = defaultFormatter
        if len(sys.argv) >= 4:
            formatter = formatters.get(sys.argv[3], defaultFormatter)

        # Read the input before the output file is opened for writing, so a
        # missing input file no longer leaves a truncated output file behind.
        with codecs.open(sys.argv[1], "r", "utf8") as fid:
            allLines = fid.readlines()

        # True when the result goes to the screen rather than to a file.
        stdout = len(sys.argv) < 3
        if stdout:
            outfid = sys.stdout
        else:
            outfid = codecs.open(sys.argv[2], "w", "utf8")

        try:
            for s in allLines:
                outfid.write(mecab.reading(s, formatter))
                outfid.write('\n')
        finally:
            # Close only a file this script opened. The original condition
            # was inverted: it closed sys.stdout and leaked the output file.
            if not stdout:
                outfid.close()
	# -*- coding: utf-8 -*-
	# Copyright: Damien Elmes <anki@ichi2.net>
	# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
	#
	# Automatic reading generation with kakasi and mecab.
	# See http://ichi2.net/anki/wiki/JapaneseSupport
	#

	"""

	Standalone Japanese reading generator based on Damien Elmes' Japanese Support
	plugin for Anki: https://ankiweb.net/shared/info/3918629684

	This script does not need Anki to run, but it does use the MeCab and Kakasi
	applications included in the Japanese Support plugin. You are expected to
	download that plugin and place this file inside your
	<Documents/Anki/addons/japanese/> directory alongside `reading.py` and run it
	from the command line.



	Usage: at the command prompt, run:

	python readingStandAlone.py inputFile [outputFile [formatter]]

	- inputFile should be UTF-8 encoded.

	- outputFile: optional. Output will be written to this file if provided (UTF-8
	encoded), or written to screen if not provided.

	- formatter: optional. If omitted or with formatter="defaultFormatter", this
	script will put a space before Japanese words, and the hiragana reading in
	[square-brackets] immediately after the word. E.g., the following input
	sentence:

	お父さんは？

	becomes

	お 父[とう]さんは？

	With formatter="verboseFormatter", the following will be produced:

	お_{父}[とう]さんは？

	Note how the prefix space is replaced by an underscore "_", and the Japanese
	word (in this case, just one kanji, but potentially more) is put in {curly
	brackets}.

	You can add other formatters to the source code: they should be functions of
	two arguments and one optional argument, i.e., with the following definition:

	def newFormatter(kanji, reading, optionalReading=""):

	Caveat: no HTML stripping available.
	"""

	import sys, os, platform, re, subprocess
	#from anki.utils import stripHTML, isWin, isMac
	#from anki.hooks import addHook
	isMac = sys.platform.startswith("darwin")
	isWin = sys.platform.startswith("win32")
	def stripHTML(s): return s

	def verboseFormatter(kanji, reading, optionalReading=""):
	return "_{%s}[%s]%s" % (kanji, reading, optionalReading)
	def defaultFormatter(kanji, reading, optionalReading=""):
	return " %s[%s]%s" % (kanji, reading, optionalReading)


	kakasiArgs = ["-isjis", "-osjis", "-u", "-JH", "-KH"]
	mecabArgs = ['--node-format=%m[%f[7]] ', '--eos-format=\n',
	'--unk-format=%m[] ']

	def escapeText(text):
	# strip characters that trip up kakasi/mecab
	text = text.replace("\n", " ")
	text = text.replace(u'\uff5e', "~")
	text = re.sub("<br( /)?>", "---newline---", text)
	text = stripHTML(text)
	text = text.replace("---newline---", "<br>")
	return text

	if sys.platform == "win32":
	si = subprocess.STARTUPINFO()
	try:
	si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
	except:
	si.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
	else:
	si = None

	# Mecab
	##########################################################################

	def mungeForPlatform(popen):
	if isWin:
	popen = [os.path.normpath(x) for x in popen]
	popen[0] += ".exe"
	elif not isMac:
	popen[0] += ".lin"
	return popen

	class MecabController(object):

	def __init__(self):
	self.mecab = None

	def setup(self):
	base = "../../addons/japanese/support/"
	self.mecabCmd = mungeForPlatform(
	[base + "mecab"] + mecabArgs + [
	'-d', base, '-r', base + "mecabrc"])
	os.environ['DYLD_LIBRARY_PATH'] = base
	os.environ['LD_LIBRARY_PATH'] = base
	if not isWin:
	os.chmod(self.mecabCmd[0], 0755)

	def ensureOpen(self):
	if not self.mecab:
	self.setup()
	try:
	self.mecab = subprocess.Popen(
	self.mecabCmd, bufsize=-1, stdin=subprocess.PIPE,
	stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
	startupinfo=si)
	except OSError:
	raise Exception("Please ensure your Linux system has 32 bit binary support.")

	def reading(self, expr, formatter=defaultFormatter):
	self.ensureOpen()
	expr = escapeText(expr)
	self.mecab.stdin.write(expr.encode("euc-jp", "ignore")+'\n')
	self.mecab.stdin.flush()
	expr = unicode(self.mecab.stdout.readline().rstrip('\r\n'), "euc-jp")
	out = []
	for node in expr.split(" "):
	if not node:
	break
	(kanji, reading) = re.match("(.+)\[(.*)\]", node).groups()
	# hiragana, punctuation, not japanese, or lacking a reading
	if kanji == reading or not reading:
	out.append(kanji)
	continue
	# katakana
	if kanji == kakasi.reading(reading):
	out.append(kanji)
	continue
	# convert to hiragana
	reading = kakasi.reading(reading)
	# ended up the same
	if reading == kanji:
	out.append(kanji)
	continue
	# don't add readings of numbers
	if kanji in u"一二三四五六七八九十０１２３４５６７８９":
	out.append(kanji)
	continue
	# strip matching characters and beginning and end of reading and kanji
	# reading should always be at least as long as the kanji
	placeL = 0
	placeR = 0
	for i in range(1,len(kanji)):
	if kanji[-i] != reading[-i]:
	break
	placeR = i
	for i in range(0,len(kanji)-1):
	if kanji[i] != reading[i]:
	break
	placeL = i+1
	if placeL == 0:
	if placeR == 0:
	out.append(formatter(kanji, reading))
	else:
	out.append(formatter(
	kanji[:-placeR], reading[:-placeR], reading[-placeR:]))
	else:
	if placeR == 0:
	out.append("%s%s" % (
	reading[:placeL], formatter(kanji[placeL:], reading[placeL:])))
	else:
	out.append("%s%s" % (
	reading[:placeL], formatter(kanji[placeL:-placeR],
	reading[placeL:-placeR], reading[-placeR:])))
	fin = u""
	for c, s in enumerate(out):
	if c < len(out) - 1 and re.match("^[A-Za-z0-9]+$", out[c+1]):
	s += " "
	fin += s
	return fin.strip().replace("< br>", "<br>")

	# Kakasi
	##########################################################################

	class KakasiController(object):

	def __init__(self):
	self.kakasi = None

	def setup(self):
	base = "../../addons/japanese/support/"
	self.kakasiCmd = mungeForPlatform(
	[base + "kakasi"] + kakasiArgs)
	os.environ['ITAIJIDICT'] = base + "itaijidict"
	os.environ['KANWADICT'] = base + "kanwadict"
	if not isWin:
	os.chmod(self.kakasiCmd[0], 0755)

	def ensureOpen(self):
	if not self.kakasi:
	self.setup()
	try:
	self.kakasi = subprocess.Popen(
	self.kakasiCmd, bufsize=-1, stdin=subprocess.PIPE,
	stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
	startupinfo=si)
	except OSError:
	raise Exception("Please install kakasi")

	def reading(self, expr):
	self.ensureOpen()
	expr = escapeText(expr)
	self.kakasi.stdin.write(expr.encode("sjis", "ignore")+'\n')
	self.kakasi.stdin.flush()
	res = unicode(self.kakasi.stdout.readline().rstrip('\r\n'), "sjis")
	return res

	# Init
	##########################################################################

	kakasi = KakasiController()
	mecab = MecabController()

	# Tests
	##########################################################################

	if __name__ == "__main__":
	if len(sys.argv) == 1:
	expr = u"カリン、自分でまいた種は自分で刈り取れ"
	print mecab.reading(expr).encode("utf-8")
	expr = u"昨日、林檎を2個買った。"
	print mecab.reading(expr)
	expr = u"真莉、大好きだよん＾＾"
	print mecab.reading(expr)
	expr = u"彼２０００万も使った。"
	print mecab.reading(expr)
	expr = u"彼二千三百六十円も使った。"
	print mecab.reading(expr)
	expr = u"千葉"
	print mecab.reading(expr)

	print "\n"
	print "Usage: python readingStandAlone.py inputFile [outputFile [formatter]]"
	elif len(sys.argv) >= 2:
	import codecs

	stdout = True
	if len(sys.argv) >= 3:
	stdout = False
	outfid = codecs.open(sys.argv[2], "w", "utf8")
	else:
	outfid = sys.stdout

	formatter = defaultFormatter
	if len(sys.argv) >= 4:
	if sys.argv[3] == "verboseFormatter":
	formatter = verboseFormatter
	# If you make new formatters, add to this switch case

	with codecs.open(sys.argv[1], "r", "utf8") as fid:
	allLines = fid.readlines()

	for s in allLines:
	outfid.write(mecab.reading(s, formatter))
	outfid.write('\n')

	if stdout:
	outfid.close()