Skip to content

Instantly share code, notes, and snippets.

@olsgaard
Forked from fasiha/README.md
Last active August 29, 2015 14:19
Show Gist options
  • Save olsgaard/4d455f1dcd5867f313ef to your computer and use it in GitHub Desktop.
Save olsgaard/4d455f1dcd5867f313ef to your computer and use it in GitHub Desktop.

Standalone Japanese reading generator based on Damien Elmes' Japanese Support plugin for Anki: https://ankiweb.net/shared/info/3918629684

This script does not need Anki to run, but it does use the MeCab and Kakasi applications included in the Japanese Support plugin. You are expected to download that plugin, place this file inside your <Documents/Anki/addons/japanese/> directory alongside reading.py, and run it from the command line with that directory as the current working directory (the MeCab and Kakasi binaries are located through a path relative to the working directory).

Usage: at the command prompt, run:

python readingStandAlone.py inputFile [outputFile [formatter]]

  • inputFile should be UTF-8 encoded.

  • outputFile: optional. Output will be written to this file if provided (UTF-8-encoded), or written to screen if not provided.

  • formatter: optional. If omitted or with formatter="defaultFormatter", this script will put a space before Japanese words, and the hiragana reading in [square-brackets] immediately after the word. E.g., the following input sentence:

お父さんは?

becomes

お 父[とう]さんは?

With formatter="verboseFormatter", the following will be produced:

お_{父}[とう]さんは?

Note how the prefix space is replaced by an underscore "_", and the Japanese word (in this case, just one kanji, but potentially more) is put in {curly brackets}.

You can add other formatters to the source code: they should be functions of two arguments and one optional argument, i.e., with the following definition:

def newFormatter(kanji, reading, optionalReading=""):

Caveat: no HTML stripping available.

# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
#
# Automatic reading generation with kakasi and mecab.
# See http://ichi2.net/anki/wiki/JapaneseSupport
#
"""
Standalone Japanese reading generator based on Damien Elmes' Japanese Support
plugin for Anki: https://ankiweb.net/shared/info/3918629684
This script does *not* need Anki to run, but it does use the MeCab and Kakasi
applications included in the Japanese Support plugin. You are expected to
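download that plugin, place this file inside your
<Documents/Anki/addons/japanese/> directory alongside `reading.py`, and run it
from the command line with that directory as the current working directory
(the MeCab and Kakasi binaries are located through a relative path).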
Usage: at the command prompt, run:
python readingStandAlone.py inputFile [outputFile [formatter]]
- inputFile should be UTF-8 encoded.
- outputFile: optional. Output will be written to this file if provided
(UTF-8-encoded), or written to screen if not provided.
- formatter: optional. If omitted or with formatter="defaultFormatter", this
script will put a space before Japanese words, and the hiragana reading in
[square-brackets] immediately after the word. E.g., the following input
sentence:
お父さんは?
becomes
お 父[とう]さんは?
With formatter="verboseFormatter", the following will be produced:
お_{父}[とう]さんは?
Note how the prefix space is replaced by an underscore "_", and the Japanese
word (in this case, just one kanji, but potentially more) is put in {curly
brackets}.
You can add other formatters to the source code: they should be functions of
two arguments and one optional argument, i.e., with the following definition:
def newFormatter(kanji, reading, optionalReading=""):
Caveat: no HTML stripping available.
"""
import sys, os, platform, re, subprocess
#from anki.utils import stripHTML, isWin, isMac
#from anki.hooks import addHook
# Platform flags derived once from sys.platform. They stand in for the
# isWin/isMac names the commented-out import above took from anki.utils.
isWin = sys.platform.startswith("win32")
isMac = sys.platform.startswith("darwin")
def stripHTML(s):
    """Return *s* unchanged.

    Placeholder for the ``stripHTML`` helper that the commented-out import
    above took from ``anki.utils``; this standalone script performs no HTML
    stripping, so the input is handed back as-is.
    """
    return s
def verboseFormatter(kanji, reading, optionalReading=""):
    """Render a word as ``_{kanji}[reading]`` followed by *optionalReading*.

    kanji           -- the part of the word that receives the reading
    reading         -- the reading placed in square brackets
    optionalReading -- text appended verbatim after the closing bracket
    """
    # %-formatting is kept on purpose: it promotes to unicode on Python 2
    # when any argument is a unicode string.
    fields = (kanji, reading, optionalReading)
    return "_{%s}[%s]%s" % fields
def defaultFormatter(kanji, reading, optionalReading=""):
    """Render a word as `` kanji[reading]`` followed by *optionalReading*.

    The result starts with a single space so the annotated word is separated
    from whatever precedes it.

    kanji           -- the part of the word that receives the reading
    reading         -- the reading placed in square brackets
    optionalReading -- text appended verbatim after the closing bracket
    """
    # %-formatting is kept on purpose: it promotes to unicode on Python 2
    # when any argument is a unicode string.
    fields = (kanji, reading, optionalReading)
    return " %s[%s]%s" % fields
# Command-line options appended to the kakasi executable
# (see KakasiController.setup).
kakasiArgs = [
    "-isjis",
    "-osjis",
    "-u",
    "-JH",
    "-KH",
]
# Output templates appended to the mecab executable (see
# MecabController.setup). Each token is emitted as "surface[reading] ",
# which is the shape MecabController.reading parses.
mecabArgs = [
    '--node-format=%m[%f[7]] ',
    '--eos-format=\n',
    '--unk-format=%m[] ',
]
def escapeText(text):
    """Prepare *text* for being piped to kakasi/mecab as a single line.

    Newlines become spaces, the fullwidth tilde (U+FF5E) becomes an ASCII
    "~", and ``<br>`` / ``<br />`` tags are normalised to ``<br>`` while the
    rest of the text is passed through stripHTML.
    """
    # strip characters that trip up kakasi/mecab
    cleaned = text.replace("\n", " ").replace(u'\uff5e', "~")
    # Hide line-break tags behind a placeholder so stripHTML cannot remove
    # them, then put them back as plain <br>.
    placeholder = "---newline---"
    cleaned = re.sub("<br( /)?>", placeholder, cleaned)
    cleaned = stripHTML(cleaned)
    return cleaned.replace(placeholder, "<br>")
# Startup information passed to every subprocess.Popen call in this file via
# startupinfo=. Only Windows gets a STARTUPINFO object; elsewhere it is None.
if sys.platform == "win32":
    si = subprocess.STARTUPINFO()
    try:
        si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    except AttributeError:
        # Fallback for Python builds where the constant is only reachable
        # through the private _subprocess module. Catching AttributeError
        # only (instead of everything) lets unrelated errors surface.
        si.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
else:
    si = None
# Mecab
##########################################################################
def mungeForPlatform(popen):
    """Return a platform-specific copy of the command line *popen*.

    popen -- sequence of strings; the first item is the program path.

    On Windows every item is passed through os.path.normpath and ".exe" is
    appended to the program path. On other non-Mac platforms ".lin" is
    appended to the program path. On Mac the items are returned unchanged.

    The caller's sequence is never modified; a new list is always returned.
    """
    # Work on a copy so that calling this twice with the same list cannot
    # stack suffixes (".lin.lin") on the caller's data.
    popen = list(popen)
    if isWin:
        popen = [os.path.normpath(x) for x in popen]
        popen[0] += ".exe"
    elif not isMac:
        popen[0] += ".lin"
    return popen
class MecabController(object):
    """Add bracketed readings to Japanese text using a MeCab subprocess.

    The MeCab process is started lazily by ensureOpen() and then reused:
    every call to reading() writes one EUC-JP encoded line to its stdin and
    reads one line back from its stdout.
    """

    # One "surface[reading]" token, the shape produced by the node/unk
    # formats in mecabArgs.
    _NODE_RE = re.compile(r"(.+)\[(.*)\]")
    # A token made only of ASCII letters and digits.
    _ALNUM_RE = re.compile("^[A-Za-z0-9]+$")

    def __init__(self):
        # subprocess.Popen handle; stays None until ensureOpen() succeeds.
        self.mecab = None

    def setup(self):
        """Build the MeCab command line and set the environment it needs."""
        # NOTE: this path is relative to the current working directory.
        base = "../../addons/japanese/support/"
        self.mecabCmd = mungeForPlatform(
            [base + "mecab"] + mecabArgs + [
                '-d', base, '-r', base + "mecabrc"])
        os.environ['DYLD_LIBRARY_PATH'] = base
        os.environ['LD_LIBRARY_PATH'] = base
        if not isWin:
            # Make sure the bundled binary is executable.
            os.chmod(self.mecabCmd[0], 0o755)

    def ensureOpen(self):
        """Start the MeCab process if it is not running yet.

        Raises Exception if the executable cannot be launched.
        """
        if not self.mecab:
            self.setup()
            try:
                self.mecab = subprocess.Popen(
                    self.mecabCmd, bufsize=-1, stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                    startupinfo=si)
            except OSError:
                raise Exception("Please ensure your Linux system has 32 bit binary support.")

    @staticmethod
    def _attachReading(kanji, reading, formatter):
        """Format *kanji* with *reading*, annotating only the part that differs.

        Characters shared by both strings at the start are emitted as plain
        text before the formatted word, and characters shared at the end are
        passed to *formatter* as its third argument.
        """
        # strip matching characters at beginning and end of reading and kanji
        # The reading is normally at least as long as the kanji; the explicit
        # length checks keep a shorter reading from raising IndexError.
        placeR = 0
        for i in range(1, len(kanji)):
            if i > len(reading) or kanji[-i] != reading[-i]:
                break
            placeR = i
        placeL = 0
        for i in range(len(kanji) - 1):
            if i >= len(reading) or kanji[i] != reading[i]:
                break
            placeL = i + 1
        if placeR:
            core = formatter(kanji[placeL:-placeR],
                             reading[placeL:-placeR], reading[-placeR:])
        else:
            core = formatter(kanji[placeL:], reading[placeL:])
        if placeL:
            return "%s%s" % (reading[:placeL], core)
        return core

    def reading(self, expr, formatter=defaultFormatter):
        """Return *expr* with readings attached to the words that need one.

        expr      -- text to annotate; it is passed through escapeText first.
        formatter -- callable ``(kanji, reading[, optionalReading])`` that
                     renders one annotated word.

        Raises ValueError if MeCab returns a token that is not of the form
        ``surface[reading]``. Because stderr is merged into stdout, this is
        presumably also what a MeCab error message would trigger.
        """
        self.ensureOpen()
        expr = escapeText(expr)
        self.mecab.stdin.write(expr.encode("euc-jp", "ignore") + b'\n')
        self.mecab.stdin.flush()
        expr = self.mecab.stdout.readline().rstrip(b'\r\n').decode("euc-jp")
        out = []
        for node in expr.split(" "):
            if not node:
                break
            match = self._NODE_RE.match(node)
            if match is None:
                raise ValueError("Unexpected MeCab output: %r" % (node,))
            (kanji, reading) = match.groups()
            # hiragana, punctuation, not japanese, or lacking a reading
            if kanji == reading or not reading:
                out.append(kanji)
                continue
            # convert to hiragana (one kakasi round-trip per token)
            reading = kakasi.reading(reading)
            # katakana, or otherwise ended up the same
            if kanji == reading:
                out.append(kanji)
                continue
            # don't add readings of numbers
            # NOTE(review): this is a substring test, so a multi-digit token
            # such as "2000" is not caught here -- confirm that is intended.
            if kanji in u"一二三四五六七八九十0123456789":
                out.append(kanji)
                continue
            out.append(self._attachReading(kanji, reading, formatter))
        # Put a space in front of every purely alphanumeric token.
        parts = []
        for c, s in enumerate(out):
            if c < len(out) - 1 and self._ALNUM_RE.match(out[c+1]):
                s += " "
            parts.append(s)
        fin = u"".join(parts)
        # The spacing rule above presumably turns a tokenised "<br>" into
        # "< br>"; undo that.
        return fin.strip().replace("< br>", "<br>")
# Kakasi
##########################################################################
class KakasiController(object):
    """Convert text with a kakasi subprocess.

    The process is started lazily by ensureOpen() and then reused: every call
    to reading() writes one Shift-JIS encoded line to its stdin and reads one
    line back from its stdout.
    """

    def __init__(self):
        # subprocess.Popen handle; stays None until ensureOpen() succeeds.
        self.kakasi = None

    def setup(self):
        """Build the kakasi command line and point it at its dictionaries."""
        # NOTE: this path is relative to the current working directory.
        base = "../../addons/japanese/support/"
        command = [base + "kakasi"] + kakasiArgs
        self.kakasiCmd = mungeForPlatform(command)
        os.environ['ITAIJIDICT'] = base + "itaijidict"
        os.environ['KANWADICT'] = base + "kanwadict"
        if isWin:
            return
        # Make sure the bundled binary is executable.
        os.chmod(self.kakasiCmd[0], 0o755)

    def ensureOpen(self):
        """Start the kakasi process if it is not running yet.

        Raises Exception if the executable cannot be launched.
        """
        if self.kakasi:
            return
        self.setup()
        try:
            self.kakasi = subprocess.Popen(
                self.kakasiCmd,
                bufsize=-1,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                startupinfo=si)
        except OSError:
            raise Exception("Please install kakasi")

    def reading(self, expr):
        """Send *expr* through kakasi and return the converted line."""
        self.ensureOpen()
        payload = escapeText(expr).encode("sjis", "ignore") + b'\n'
        proc = self.kakasi
        proc.stdin.write(payload)
        proc.stdin.flush()
        answer = proc.stdout.readline().rstrip(b'\r\n')
        return answer.decode("sjis")
# Init
##########################################################################
# Module-level singletons. Constructing them starts no process; each
# controller launches its executable on first use (see ensureOpen).
# MecabController.reading relies on the global name `kakasi`.
mecab = MecabController()
kakasi = KakasiController()
# Tests
##########################################################################
if __name__ == "__main__":
    if len(sys.argv) == 1:
        # No arguments: annotate a few built-in samples, then show the usage.
        # NOTE(review): only the first sample is explicitly UTF-8 encoded;
        # the others rely on the encoding of stdout. Kept as in the original.
        expr = u"カリン、自分でまいた種は自分で刈り取れ"
        print(mecab.reading(expr).encode("utf-8"))
        for expr in (
                u"昨日、林檎を2個買った。",
                u"真莉、大好きだよん^^",
                u"彼2000万も使った。",
                u"彼二千三百六十円も使った。",
                u"千葉"):
            print(mecab.reading(expr))
        print("\n")
        print("Usage: python readingStandAlone.py inputFile [outputFile [formatter]]")
    elif len(sys.argv) >= 2:
        import codecs

        # Formatter names accepted as the third argument. Unknown names fall
        # back to defaultFormatter.
        # If you make new formatters, add them to this table.
        formatters = {
            "defaultFormatter": defaultFormatter,
            "verboseFormatter": verboseFormatter,
        }
        formatter = defaultFormatter
        if len(sys.argv) >= 4:
            formatter = formatters.get(sys.argv[3], defaultFormatter)

        # True when the result goes to the screen rather than to a file.
        stdout = len(sys.argv) < 3
        if stdout:
            outfid = sys.stdout
        else:
            outfid = codecs.open(sys.argv[2], "w", "utf8")
        try:
            with codecs.open(sys.argv[1], "r", "utf8") as fid:
                for s in fid:
                    outfid.write(mecab.reading(s, formatter))
                    outfid.write('\n')
        finally:
            # Close only a file this script opened; sys.stdout is left alone.
            # (The condition used to be inverted: it closed sys.stdout and
            # never closed the output file.)
            if not stdout:
                outfid.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment