# -*- coding: utf-8 -*-
# i) Place this file inside the Documents/Anki/addons/japanese/ directory, alongside these files:
#   * bulkreading.py
#   * reading.py
#   * ZnK_input.txt
#   * etc...
# ii) Open a command-line window in that directory (Windows Explorer: Ctrl+Right click on the directory name > "Open command window here").
# iii) There should be a ZnK_input.txt file containing all your Japanese text in ZnK's format.
#   * Run: c:/path/to/python/python.exe ZnK_plus_mecab.py ZnK_input.txt ZnK_output.txt
# iv) The script should finish the task within 3 minutes.
#
# ZnK_input.txt example format, with or without blank lines:
'''
#6Aあとは任せてほしいの!
これでアルゴールも大丈夫。
行こう、ナユタ……
'''
# The original readingStandAlone.py can be found at http://gist.github.com/fasiha/9a8557026f94218cb922
import codecs
import os
import platform
import re
import subprocess
import sys
from string import maketrans
# The script remembers furigana that already appear in the input file.
#   * e.g. 魔導杖's furigana will always be オーバルスタッフ, no matter what MeCab says.
# Set to True to skip that pre-scan (see MecabController.dictionary).
DISABLE_AUTO_FURIGANA = False

isMac = sys.platform.startswith("darwin")
isWin = sys.platform.startswith("win32")

#kakasiArgs = ["-isjis", "-osjis", "-u", "-JH", "-KH"]
# MeCab output format: every node is printed as "<surface>[<feature 7>] ",
# unknown words as "<surface>[] ", and each sentence ends with a newline.
mecabArgs = ['--node-format=%m[%f[7]] ', '--eos-format=\n',
             '--unk-format=%m[] ']

# Startup info handed to subprocess.Popen; only populated on Windows.
if isWin:
    si = subprocess.STARTUPINFO()
    try:
        si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    except AttributeError:
        # Some Python builds only expose the constant on the private
        # _subprocess module.
        si.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
else:
    si = None
# Mecab
##########################################################################
def mungeForPlatform(popen):
    """Adapt a command list to the current platform and return it.

    On Windows a new list is returned in which every element has been passed
    through os.path.normpath and the executable (element 0) gets an ".exe"
    suffix.  On non-Mac, non-Windows platforms ".lin" is appended to the
    executable in the list that was passed in.  On Mac the list is returned
    untouched.
    """
    if not isWin:
        if not isMac:
            popen[0] += ".lin"
        return popen
    normalised = [os.path.normpath(part) for part in popen]
    normalised[0] += ".exe"
    return normalised
class MecabController(object):
    """Talk to a MeCab subprocess and merge its readings into ZnK-formatted text.

    ZnK text contains "opcodes": a '#' followed by ASCII letters/digits.  A
    furigana annotation looks like ``<word>#<number>R<reading>#R`` where, as
    written by ``formatter``, ``<number>`` is twice the number of characters
    in ``<word>``.
    """

    # An opcode starts at '#' and extends over any following '#', ASCII
    # digits and ASCII letters (so adjacent opcodes are reported as one run).
    _OPCODE_RE = re.compile(r"#[#0-9a-zA-Z]*")
    # One MeCab output node: "<surface>[<reading>]".
    _NODE_RE = re.compile(r"(.+)\[(.*)\]")

    def __init__(self):
        # Popen handle of the MeCab process; started lazily by ensureOpen().
        self.mecab = None
        # Predefined readings (word -> reading) that override MeCab's output.
        self.translation = {}
        self.kanji_number = u"一二三四五六七八九十0123456789"
        # katakana to hiragana converter
        self.s_hiragana = u"ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ"
        self.s_katakana = u"ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ"
        if len(self.s_hiragana) != len(self.s_katakana):
            raise Exception("Hiragana and katakana tables are not matching in length.")
        # Built once here instead of on every translate() call.
        self._katakana_to_hiragana = dict(
            (ord(kata), hira)
            for kata, hira in zip(self.s_katakana, self.s_hiragana))

    def dictionary(self, input_file_name):
        """Pre-scan a UTF-8 input file and remember the furigana it already contains.

        Every ``#<number>R`` annotation found is stored in ``self.translation``
        (text before the opcode -> text after it).  A key that shows up with
        two different readings is removed again once the whole file has been
        read.  Does nothing when DISABLE_AUTO_FURIGANA is set.

        Raises:
            Exception: when an annotation has no text before or after it; the
                message carries the 1-based line number.
        """
        if DISABLE_AUTO_FURIGANA:
            return
        discarded = {}
        with codecs.open(input_file_name, "r", "utf8") as f:
            for line_number, line in enumerate(f, 1):
                self._remember_furigana(line, line_number, discarded)
        if discarded:
            print(str(len(discarded)) + " discard keys")
            for key in discarded:
                self.translation.pop(key, None)

    def _remember_furigana(self, line, line_number, discarded):
        """Record the furigana annotations of one line; mark conflicts in `discarded`."""
        opcodes = self.opcode_with_index(line)
        for position in sorted(opcodes):
            entry = opcodes[position]
            # Only #<number>R opcodes carry a reading.
            if self.opcode_furigana(entry["opcode"]) is None:
                continue
            word, furigana = entry["pre"], entry["post"]
            # NOTE(review): `word` is all the text since the previous opcode,
            # not just the <number>/2 characters the opcode refers to.
            if not word or not furigana:
                raise Exception(
                    "Kanji or furigana is missing at line %d" % line_number)
            if word in self.translation and furigana != self.translation[word]:
                # Same word with a different reading: ambiguous, drop it later.
                discarded[word] = True
            else:
                self.translation[word] = furigana

    def ensureOpen(self):
        """Start the MeCab subprocess if it is not running yet.

        Raises:
            Exception: when the MeCab binary cannot be started (OSError).
        """
        if self.mecab:
            return
        self.setup()
        try:
            self.mecab = subprocess.Popen(
                self.mecabCmd, bufsize=-1, stdin=subprocess.PIPE,
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                startupinfo=si)
        except OSError:
            raise Exception("Please ensure your Linux system has 32 bit binary support.")

    def formatter(self, kanji, reading, optionalReading=""):
        """Return ``<kanji>#<2*len(kanji)>R<reading>#R<optionalReading>``."""
        return "%s#%dR%s#R%s" % (kanji, len(kanji) * 2, reading, optionalReading)

    def opcode_with_index(self, line):
        """Locate every opcode in `line`.

        Returns:
            dict mapping the start index of each opcode to a dict with keys
            "opcode" (the matched text), "pre" (the text between the end of
            the previous opcode, or the line start, and this opcode) and
            "post" (the text between this opcode and the next one, or the
            line end).  Lines without '#' yield an empty dict.
        """
        place = {}
        matches = list(self._OPCODE_RE.finditer(line))
        previous_end = 0
        for index, match in enumerate(matches):
            start, end = match.span()
            if index + 1 < len(matches):
                next_start = matches[index + 1].start()
            else:
                next_start = len(line)
            place[start] = {
                "opcode": match.group(0),
                "pre": line[previous_end:start],
                "post": line[end:next_start],
            }
            previous_end = end
        return place

    # is the opcode #<number>R
    def opcode_furigana(self, opcode):
        """Return a match object when `opcode` starts with ``#<digits>R``, else None."""
        return re.match("(#[0-9]+R)", opcode)

    # mix the kanji-furigana pair with the ZnK-formatted text
    # * eg. 『魔導杖』といいます x [ ["魔導杖", "オーバルスタッフ"] ] = 『魔導杖#6Rオーバルスタッフ#』といいます
    def opcode_restore(self, original_line):
        """Return `original_line` with generated furigana inserted.

        Words whose start falls inside the span already covered by an
        existing ``#<number>R`` annotation are left untouched, as are words
        that cannot be located in the line.
        """
        kanji_furigana = self.reading(original_line)
        # hiragana only sentence
        if not kanji_furigana:
            return original_line

        # [start, end) index ranges that already carry a furigana opcode:
        # the <number>/2 characters in front of each #<number>R.
        annotated_spans = []
        opcodes = self.opcode_with_index(original_line)
        for position in opcodes:
            code = opcodes[position]["opcode"]
            if self.opcode_furigana(code):
                n_kanji = int(re.match("#([0-9]+)", code).group(1)) // 2
                annotated_spans.append((position - n_kanji, position))

        pieces = []
        cursor = 0
        for word, word_with_reading in kanji_furigana:
            word_start = original_line.find(word, cursor)
            if word_start == -1:
                # MeCab saw the line without opcodes, so a word that straddles
                # an opcode is not present verbatim; leave that text as it is.
                continue
            has_reading = any(
                lo <= word_start < hi for lo, hi in annotated_spans)
            pieces.append(original_line[cursor:word_start])
            # generate furigana only for words without reading
            pieces.append(word if has_reading else word_with_reading)
            cursor = word_start + len(word)
        pieces.append(original_line[cursor:])
        return "".join(pieces)

    def setup(self):
        """Build the MeCab command line and environment for this platform."""
        base = "../../addons/japanese/support/"
        self.mecabCmd = mungeForPlatform(
            [base + "mecab"] + mecabArgs + [
                '-d', base, '-r', base + "mecabrc"])
        os.environ['DYLD_LIBRARY_PATH'] = base
        os.environ['LD_LIBRARY_PATH'] = base
        if not isWin:
            # Make sure the bundled binary is executable.
            os.chmod(self.mecabCmd[0], 0o755)

    def _query_mecab(self, text):
        """Send one line (opcodes stripped) to MeCab and return its decoded reply."""
        self.ensureOpen()
        text = text.rstrip('\r\n')
        # Characters that EUC-JP cannot represent are dropped ("ignore").
        self.mecab.stdin.write(
            self.text_only(text).encode("euc-jp", "ignore") + b'\n')
        self.mecab.stdin.flush()
        reply = self.mecab.stdout.readline().rstrip(b'\r\n')
        return reply.decode("euc-jp")

    def _annotate_nodes(self, mecab_line):
        """Yield ``(surface, annotated)`` for each node of a MeCab output line.

        `annotated` is the formatter() result for words that receive a
        reading and None for everything else (kana, punctuation, numbers,
        unknown words, nodes that do not look like ``surface[reading]``).
        Processing stops at the first empty node.
        """
        for node in mecab_line.split(" "):
            if not node:
                break
            match = self._NODE_RE.match(node)
            if match is None:
                # Not "<surface>[<reading>]"; pass it through unannotated
                # instead of crashing on it.
                yield node, None
                continue
            # <kanji/hiragana>[<katakana>] or <number>[<kanji>] or <katakana>[]
            kanji, reading = match.groups()
            # convert to hiragana
            hiragana = self.translate(reading)
            if (not reading                      # unknown / lacking a reading
                    or kanji == reading          # katakana, punctuation
                    or kanji == hiragana         # hiragana
                    or kanji in self.kanji_number):  # don't add readings of numbers
                yield kanji, None
                continue
            # predefined furigana wins over MeCab's reading
            reading = self.translation.get(kanji, hiragana)
            # Strip the trailing characters shared by word and reading, so
            # only the leading part of the word is annotated.
            n_reading = len(reading)
            n_kanji = len(kanji) - 1
            for i in range(len(kanji) - 1, -1, -1):
                n_kanji = i
                # Stop when the reading is used up, so index -1 never wraps
                # around to the end of the reading.
                if n_reading == 0 or kanji[i] != reading[n_reading - 1]:
                    break
                n_reading -= 1
            yield kanji, self.formatter(
                kanji[:n_kanji + 1], reading[:n_reading], reading[n_reading:])

    # return reading as kanji-furigana pairs
    def reading(self, line):
        """Return ``[word, annotated_word]`` pairs for the words MeCab gave a reading."""
        return [
            [surface, annotated]
            for surface, annotated in self._annotate_nodes(self._query_mecab(line))
            if annotated is not None
        ]

    # return reading as text
    def reading_only(self, expr):
        """Return MeCab's tokens joined back together, annotated where a reading exists."""
        return "".join(
            surface if annotated is None else annotated
            for surface, annotated in self._annotate_nodes(self._query_mecab(expr))
        )

    # text without opcodes
    def text_only(self, line):
        """Return `line` with every ``#<letters/digits>`` opcode removed."""
        return re.sub("#[0-9a-zA-Z]+", "", line)

    # convert katakana to hiragana
    def translate(self, line):
        """Return `line` with each katakana of s_katakana replaced by its hiragana."""
        return line.translate(self._katakana_to_hiragana)
# Init
##########################################################################
mecab = MecabController()


# Tests
##########################################################################
def _run_samples():
    """Print the annotated form of a few built-in sample lines, then the usage."""
    #--tests from dump file ; http://docs.google.com/spreadsheets/d/1gPDihUqhJuEgmknrPCcWeIqU5s7JtH2cqY8IH3VOiog/edit?pli=1#gid=36837976
    mecab.translation[u"自分"] = u"ジブン"
    samples = [
        u"カリン、自分でまいた種は自分で刈り取れ",
        u"昨日、林檎を2個買った。",
        u"真莉、大好きだよん^^",
        u"彼2000万も使った。",
        u"彼二千三百六十円も使った。",
        u"#6Aこれで、残るはあと一つですね。",
        u"彼奴#4Rきゃつ#Rらには届かん……",
        u"力を奪われた今の私では#0E彼奴#4Rきゃつ#Rらには届かん……",
        u"#1W#2E#030cおぬしら、ノイと人間か……#0E",
        u"#1W#2E#030c私もようやく『檻』から解放される時がきたようだ。#0E",
    ]
    for expr in samples:
        print(mecab.opcode_restore(expr))
    print("\n")
    print("Usage: python ZnK_plus_mecab.py inputFile [outputFile]")


def _convert_file(input_path, output_path=None):
    """Annotate every line of `input_path` (UTF-8).

    The result is written to `output_path` (UTF-8) or, when no output path
    is given, to standard output.  Progress is printed to standard output.
    """
    # find predefined furiganas
    mecab.dictionary(input_path)
    print(str(len(mecab.translation)) + " keys")
    with codecs.open(input_path, "r", "utf8") as fid:
        allLines = fid.readlines()

    to_file = output_path is not None
    outfid = codecs.open(output_path, "w", "utf8") if to_file else sys.stdout
    try:
        count = 0
        increment = 100
        for count, s in enumerate(allLines, 1):
            if count % increment == 0:
                print(str(count) + "lines")
            # Report less often as the line count grows.
            if count > 1000:
                increment = 1000
            if count > 10000:
                increment = 10000
            restored = mecab.opcode_restore(s)
            outfid.write(restored)
            # Lines read from the file keep their own line ending; only add
            # one when it is missing (e.g. the last line of the file), so the
            # output has the same line structure as the input.
            if not restored.endswith("\n"):
                outfid.write("\n")
        print(str(count) + "lines")
    finally:
        # Close only what was opened here; never close sys.stdout.
        if to_file:
            outfid.close()


if __name__ == "__main__":
    if len(sys.argv) == 1:
        _run_samples()
    else:
        _convert_file(sys.argv[1],
                      sys.argv[2] if len(sys.argv) >= 3 else None)
# (Web-page residue from the GitHub gist view; not part of the script.)
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment