Skip to content
Create a gist now

Instantly share code, notes, and snippets.

ZnK format furigana-izer by xfact2007, based on my work, which is in turn based on Damien Elmes' work: http://forum.koohii.com/viewtopic.php?pid=233782#p233782
# -*- coding: utf-8 -*-
# i) place this file inside the Documents/Anki/addons/japanese/ directory alongside these files
# * bulkreading.py
# * reading.py
# * ZnK_input.txt
# * etc...
# ii) open a command line window in that directory (in Windows Explorer, Shift+Right click on the directory name > Open command window here)
# iii) there should be a ZnK_input.txt file with all your Japanese text in ZnK's format
# * run c:/path/to/python/python.exe ZnK_plus_mecab.py ZnK_input.txt ZnK_output.txt
# iv) the script should finish the task in 3 minutes
#
# ZnK_input.txt example format, with or without blank lines
'''
#6Aあとは任せてほしいの!
これでアルゴールも大丈夫。
行こう、ナユタ……
'''
# original readingStandAlone.py found on http://gist.github.com/fasiha/9a8557026f94218cb922
import codecs
import os
import platform
import re
import subprocess
import sys
from string import maketrans
# the script remembers the furigana that are already present in the input file and reuses them
# * e.g. 魔導杖's furigana will always be オーバルスタッフ, no matter what MeCab says
# When True, MecabController.dictionary() does not collect furigana from the
# input file, so only MeCab's own readings are used.
DISABLE_AUTO_FURIGANA = False
isMac = sys.platform.startswith("darwin")
isWin = sys.platform.startswith("win32")
#kakasiArgs = ["-isjis", "-osjis", "-u", "-JH", "-KH"]
# MeCab output: one "surface[field 7] " entry per node, "surface[] " for
# unknown words, and a newline at the end of each sentence.
mecabArgs = ['--node-format=%m[%f[7]] ', '--eos-format=\n',
             '--unk-format=%m[] ']
# STARTUPINFO passed to subprocess.Popen; only built on Windows.
if isWin:
    si = subprocess.STARTUPINFO()
    try:
        si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    except AttributeError:
        # Some Python versions only expose the flag on the private
        # _subprocess module.
        si.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
else:
    si = None
# Mecab
##########################################################################
def mungeForPlatform(popen):
    """Return a platform-specific copy of the command line *popen*.

    On Windows every element is passed through os.path.normpath and ".exe"
    is appended to the executable (first element). On platforms that are
    neither Windows nor Mac, ".lin" is appended to the executable. On Mac
    the command is returned unchanged.

    The caller's list is never modified; a new list is always returned.
    """
    # Work on a copy so that every platform branch leaves the argument intact.
    command = list(popen)
    if isWin:
        command = [os.path.normpath(part) for part in command]
        command[0] += ".exe"
    elif not isMac:
        command[0] += ".lin"
    return command
class MecabController(object):
    """Add ZnK-format furigana to Japanese text with the help of MeCab.

    ZnK text carries inline "opcodes": a '#' followed by ASCII letters and
    digits. A furigana annotation has the form ``<kanji>#<n>R<reading>#R``
    where ``<n>`` is twice the number of annotated characters (see
    formatter() and opcode_restore()).

    Attributes:
        mecab: the MeCab subprocess, or None until ensureOpen() starts it.
        translation: dict mapping a word to the reading that overrides
            MeCab's reading for that word.
        kanji_number: characters that never receive a reading.
        s_hiragana / s_katakana: parallel tables used by translate().
    """

    # An opcode token: '#' followed by any run of ASCII letters, digits or '#'.
    _OPCODE_TOKEN = re.compile(r"#[#0-9a-zA-Z]*")
    # A furigana opcode: '#', a number, then 'R'.
    _FURIGANA_OPCODE = re.compile(r"(#[0-9]+R)")
    # The number at the start of an opcode.
    _OPCODE_NUMBER = re.compile(r"#([0-9]+)")
    # One MeCab output node: "<surface>[<reading>]".
    _MECAB_NODE = re.compile(r"(.+)\[(.*)\]")

    def __init__(self):
        self.mecab = None
        self.translation = {}
        self.kanji_number = u"一二三四五六七八九十0123456789"
        # katakana to hiragana converter
        self.s_hiragana = u"ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ"
        self.s_katakana = u"ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ"
        if len(self.s_hiragana) != len(self.s_katakana):
            raise Exception("Hiragana and katakana tables are not matching in length.")
        # Built once here instead of on every translate() call.
        self._katakana_to_hiragana = dict(
            zip([ord(char) for char in self.s_katakana], self.s_hiragana))

    def dictionary(self, input_file_name):
        """Collect the furigana already present in a UTF-8 input file.

        For every furigana opcode found, the text before the opcode becomes a
        key of self.translation and the text after it becomes the reading.
        Keys seen with two different readings are removed again. Does nothing
        when DISABLE_AUTO_FURIGANA is true.

        Raises:
            Exception: a furigana opcode has no text before or after it.
        """
        if DISABLE_AUTO_FURIGANA:
            return
        # NOTE(review): the key is the whole text since the previous opcode
        # (or the line start), and reading() only looks up whole MeCab tokens,
        # so a key is only ever used when those coincide - confirm whether the
        # key should be limited to the annotated characters.
        discarded = {}
        with codecs.open(input_file_name, "r", "utf8") as f:
            for line_number, line in enumerate(f, 1):
                opcodes = self.opcode_with_index(line)
                for position in sorted(opcodes):
                    entry = opcodes[position]
                    # only #<number>R opcodes carry furigana
                    if self.opcode_furigana(entry["opcode"]) is None:
                        continue
                    kanji = entry["pre"]
                    furigana = entry["post"]
                    if len(kanji) == 0 or len(furigana) == 0:
                        raise Exception(
                            "Kanji or furigana is missing at line %d" % line_number)
                    # remove predefined kanji with multiple readings
                    if kanji in self.translation and furigana != self.translation[kanji]:
                        discarded[kanji] = True
                    else:
                        self.translation[kanji] = furigana
        if len(discarded) > 0:
            print(str(len(discarded)) + " discard keys")
        for key in discarded:
            self.translation.pop(key, None)

    def ensureOpen(self):
        """Start the MeCab subprocess if it is not running yet.

        Raises:
            Exception: the MeCab process could not be started (OSError).
        """
        if not self.mecab:
            self.setup()
            try:
                self.mecab = subprocess.Popen(
                    self.mecabCmd, bufsize=-1, stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                    startupinfo=si)
            except OSError:
                raise Exception("Please ensure your Linux system has 32 bit binary support.")

    def formatter(self, kanji, reading, optionalReading=""):
        """Return ``kanji#<2*len(kanji)>R<reading>#R<optionalReading>``."""
        return "%s#%dR%s#R%s" % (kanji, len(kanji) * 2, reading, optionalReading)

    def opcode_with_index(self, line):
        """Locate every opcode token in *line*.

        Returns a dict keyed by the index of each token's '#'. Each value is a
        dict with:
            "opcode": the token itself (adjacent opcodes such as "#1W#2E" form
                a single token),
            "pre": the text between the end of the previous token (or the
                start of the line) and this token,
            "post": the text between this token and the next token (or the end
                of the line).
        A line without '#' yields an empty dict.
        """
        place = {}
        tokens = list(self._OPCODE_TOKEN.finditer(line))
        previous_end = 0
        for index, token in enumerate(tokens):
            if index + 1 < len(tokens):
                next_start = tokens[index + 1].start()
            else:
                next_start = len(line)
            place[token.start()] = {
                "opcode": token.group(0),
                "pre": line[previous_end:token.start()],
                "post": line[token.end():next_start],
            }
            previous_end = token.end()
        return place

    # is the opcode #<number>R
    def opcode_furigana(self, opcode):
        """Return a match object if *opcode* starts with ``#<number>R``, else None."""
        return self._FURIGANA_OPCODE.match(opcode)

    # mix the kanji-furigana pair with the ZnK-formatted text
    # * eg. 『魔導杖』といいます x [ ["魔導杖", "オーバルスタッフ"] ] = 『魔導杖#6Rオーバルスタッフ#』といいます
    def opcode_restore(self, original_line):
        """Return *original_line* with furigana added to words that lack one.

        Words reported by reading() are located left to right in the original
        line (opcodes included). A word that starts inside the characters
        covered by an existing furigana opcode is left as it is; every other
        word is replaced by its annotated form. A word that cannot be located
        in the original line is left untouched.
        """
        kanji_furigana = self.reading(original_line)
        # hiragana only sentence
        if len(kanji_furigana) == 0:
            return original_line
        # [start, end) index ranges of characters that already have furigana
        annotated = []
        opcodes = self.opcode_with_index(original_line)
        for position in opcodes:
            token = opcodes[position]["opcode"]
            if self.opcode_furigana(token):
                n_kanji = int(self._OPCODE_NUMBER.match(token).group(1)) // 2
                annotated.append((position - n_kanji, position))
        # kanji mixed with hiragana, katakana
        pieces = []
        cursor = 0
        for word, word_with_reading in kanji_furigana:
            word_start = original_line.find(word, cursor)
            if word_start == -1:
                # e.g. the word is interrupted by an opcode in the original
                # text; slicing with -1 would corrupt the output.
                continue
            has_furigana = any(start <= word_start < end for start, end in annotated)
            # generate furigana only for words without reading
            replacement = word if has_furigana else word_with_reading
            pieces.append(original_line[cursor:word_start] + replacement)
            cursor = word_start + len(word)
        if cursor < len(original_line):
            pieces.append(original_line[cursor:])
        return "".join(pieces)

    def setup(self):
        """Build the MeCab command line and environment for this platform."""
        base = "../../addons/japanese/support/"
        self.mecabCmd = mungeForPlatform(
            [base + "mecab"] + mecabArgs + [
                '-d', base, '-r', base + "mecabrc"])
        os.environ['DYLD_LIBRARY_PATH'] = base
        os.environ['LD_LIBRARY_PATH'] = base
        if not isWin:
            # make sure the bundled binary is executable
            os.chmod(self.mecabCmd[0], 0o755)

    def _split_okurigana(self, kanji, reading):
        """Split off the trailing characters shared by *kanji* and *reading*.

        Returns (stem, stem_reading, tail): the word without the shared tail,
        the reading without the shared tail, and the shared tail itself. The
        first character of *kanji* always stays in the stem.
        """
        n_reading = len(reading)
        n_kanji = len(kanji) - 1
        for i in range(len(kanji) - 1, -1, -1):
            n_kanji = i
            # n_reading == 0: the reading is used up; stop instead of
            # wrapping around to reading[-1].
            if n_reading == 0 or kanji[i] != reading[n_reading - 1]:
                break
            n_reading -= 1
        return kanji[:n_kanji + 1], reading[:n_reading], reading[n_reading:]

    def _analyze(self, line):
        """Send *line* (opcodes removed) to MeCab and parse one output line.

        Returns a list of (surface, annotated) tuples in output order, where
        annotated is the formatter() result for the node, or None when the
        node gets no furigana.
        """
        self.ensureOpen()
        text = self.text_only(line.rstrip('\r\n'))
        self.mecab.stdin.write(text.encode("euc-jp", "ignore") + b"\n")
        self.mecab.stdin.flush()
        output = self.mecab.stdout.readline().rstrip(b"\r\n").decode("euc-jp")
        nodes = []
        for node in output.split(" "):
            if not node:
                break
            # <kanji/hiragana>[<katakana>] or <number>[<kanji>] or <katakana>[]
            match = self._MECAB_NODE.match(node)
            if match is None:
                # not in the expected format; keep the text without a reading
                nodes.append((node, None))
                continue
            surface, katakana = match.groups()
            # convert to hiragana
            hiragana = self.translate(katakana)
            if (surface == katakana or not katakana    # katakana, punctuation, not japanese, or lacking a reading
                    or surface == hiragana             # hiragana
                    or surface in self.kanji_number):  # don't add readings of numbers
                nodes.append((surface, None))
                continue
            # predefined furigana wins over MeCab's reading
            reading = self.translation.get(surface, hiragana)
            stem, stem_reading, tail = self._split_okurigana(surface, reading)
            nodes.append((surface, self.formatter(stem, stem_reading, tail)))
        return nodes

    # return reading as kanji-furigana pairs
    def reading(self, line):
        """Return [word, annotated word] lists for the words that get furigana."""
        return [[surface, annotated]
                for surface, annotated in self._analyze(line)
                if annotated is not None]

    # return reading as text
    def reading_only(self, expr):
        """Return MeCab's nodes joined into one string, with furigana added."""
        return "".join(
            surface if annotated is None else annotated
            for surface, annotated in self._analyze(expr))

    # text without opcodes
    def text_only(self, line):
        """Return *line* with every ``#<letters/digits>`` opcode removed."""
        return re.sub(r"#[0-9a-zA-Z]+", "", line)

    # convert katakana to hiragana
    def translate(self, line):
        """Return *line* with each katakana in s_katakana replaced by its hiragana."""
        return line.translate(self._katakana_to_hiragana)
# Init
##########################################################################
# Shared controller instance used by the command line driver below.
mecab = MecabController()
# Tests
##########################################################################
if __name__ == "__main__":
    #--tests from dump file ; http://docs.google.com/spreadsheets/d/1gPDihUqhJuEgmknrPCcWeIqU5s7JtH2cqY8IH3VOiog/edit?pli=1#gid=36837976
    if len(sys.argv) == 1:
        # No arguments: run the sample sentences, then show the usage line.
        mecab.translation[u"自分"] = u"ジブン"
        samples = [
            u"カリン、自分でまいた種は自分で刈り取れ",
            u"昨日、林檎を2個買った。",
            u"真莉、大好きだよん^^",
            u"彼2000万も使った。",
            u"彼二千三百六十円も使った。",
            u"#6Aこれで、残るはあと一つですね。",
            u"彼奴#4Rきゃつ#Rらには届かん……",
            u"力を奪われた今の私では#0E彼奴#4Rきゃつ#Rらには届かん……",
            u"#1W#2E#030cおぬしら、ノイと人間か……#0E",
            u"#1W#2E#030c私もようやく『檻』から解放される時がきたようだ。#0E",
        ]
        for expr in samples:
            print(mecab.opcode_restore(expr))
        print("\n")
        print("Usage: python ZnK_plus_mecab.py inputFile [outputFile]")
    else:
        import codecs
        # Write to the file named by the second argument, else to stdout.
        stdout = len(sys.argv) < 3
        if stdout:
            outfid = sys.stdout
        else:
            outfid = codecs.open(sys.argv[2], "w", "utf8")
        try:
            # find predefined furiganas
            mecab.dictionary(sys.argv[1])
            print(str(len(mecab.translation)) + " keys")
            with codecs.open(sys.argv[1], "r", "utf8") as fid:
                allLines = fid.readlines()
            i = 0
            increment = 100
            for s in allLines:
                i = i + 1
                # progress report, less frequent as the count grows
                if i % increment == 0:
                    print(str(i) + "lines")
                if i > 1000:
                    increment = 1000
                if i > 10000:
                    increment = 10000
                # NOTE(review): s still ends with its line break and
                # opcode_restore() keeps it, so the '\n' written below adds a
                # blank line after every input line - confirm this is intended.
                outfid.write(mecab.opcode_restore(s))
                outfid.write('\n')
            print(str(i) + "lines")
        finally:
            # Close only a file opened here; sys.stdout must stay open.
            if not stdout:
                outfid.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.