Skip to content

Instantly share code, notes, and snippets.

@takehiko
Created August 13, 2015 22:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save takehiko/816f735182780b690f94 to your computer and use it in GitHub Desktop.
Save takehiko/816f735182780b690f94 to your computer and use it in GitHub Desktop.
Romanization Rule into Trie
#!/usr/bin/env ruby
# kanatrie.rb : ローマ字の綴りをトライ木に(ヘボン式・訓令式・日本式に対応)
# by takehikom
# required: KAKASI, Graphviz
# Builds romaji <-> kana lookup tables by romanizing a fixed list of hiragana
# with the external `kakasi` command.
#
# The kana list (the 71 single kana below plus every "ゃ/ゅ/ょ" combination of
# the 12 kana in str2) is written to "kana.txt" in the current directory,
# the shell command converts it into "romaji.txt", and both scratch files are
# removed again before returning, even when an error is raised.
#
# option - Hash of flags, all optional:
#          :label  - printed as a "==== label ====" banner when present.
#          :kunrei - when truthy, "-r" in the kakasi command becomes "-rkunrei".
#          :di     - consulted only together with :kunrei; unless it is truthy,
#                    "ぢ" is forced to "zi" and "づ" to "zu".
#          :wo     - unless truthy, "を" is forced to "o".
#          :ji     - passed by one caller in this file but never read here.
#
# Returns a two-element Array [romaji_to_kana, kana_to_romaji]. When several
# kana share one romaji spelling, the romaji_to_kana value lists them joined
# with "," in kana-list order.
# Raises RuntimeError when the kakasi command does not succeed or when it
# returns a different number of lines than kana were sent.
def create_romaji_kana_hash(option = {})
  puts "==== #{option[:label]} ====" if option[:label]

  str1 = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをんがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽ"
  str2 = "きしちにひみりぎじぢびぴ"
  # Keep the kana list in memory instead of re-reading kana.txt afterwards, so
  # the comparisons below never depend on the locale's default file encoding.
  kana_a = str1.chars + str2.chars.flat_map { |c| ["ゃ", "ゅ", "ょ"].map { |y| c + y } }

  roman_flag = option[:kunrei] ? "-rkunrei" : "-r"
  command = "kakasi -Ha #{roman_flag} -iutf8 <kana.txt >romaji.txt"

  begin
    File.write("kana.txt", kana_a.join("\n") + "\n")
    puts command
    # system returns false/nil when the command fails or cannot be started;
    # continuing would only produce nil romaji and a confusing error later.
    raise "kakasi failed: #{command}" unless system(command)
    romaji_a = File.read("romaji.txt", encoding: "UTF-8").split("\n")
  ensure
    # Remove the scratch files on every exit path.
    ["kana.txt", "romaji.txt"].each { |path| File.unlink(path) if File.exist?(path) }
  end

  if romaji_a.length != kana_a.length
    raise "kakasi returned #{romaji_a.length} lines for #{kana_a.length} kana"
  end

  r2c_h = {}
  c2r_h = {}
  kana_a.zip(romaji_a) do |c1, c2|
    if option[:kunrei] && !option[:di]
      c2 = "zi" if c1 == "ぢ"
      c2 = "zu" if c1 == "づ"
    end
    c2 = "o" if !option[:wo] && c1 == "を"

    puts "#{c2} => #{c1}"
    if r2c_h.key?(c2)
      # Same spelling seen before: append this kana to the existing entry.
      r2c_h[c2] = "#{r2c_h[c2]},#{c1}"
      puts "(#{c2} => #{r2c_h[c2]})"
    else
      r2c_h[c2] = c1
    end
    c2r_h[c1] = c2
  end

  [r2c_h, c2r_h]
end
# Lookup tables for each romanization style, keyed by style name. Every value
# is the [romaji => kana, kana => romaji] pair returned by
# create_romaji_kana_hash; the tables are built in the order listed.
rk_h = {
  hepburn: create_romaji_kana_hash(ji: true, label: "hepburn"),
  kunrei: create_romaji_kana_hash(kunrei: true, label: "kunrei"),
  nippon: create_romaji_kana_hash(kunrei: true, di: true, wo: true, label: "nippon")
}
# exit  # uncomment to stop here, after the tables have been printed
# Writes "<basename>.dot", a Graphviz digraph describing a trie of romaji
# spellings, then runs `dot` to render it to "<basename>.png".
#
# Each romaji string contributes a chain of prefix nodes hanging off a shared
# node named "empty" (drawn with a blank label). The complete spelling is
# joined by a dotted, arrowless edge to a box node labelled with its kana, and
# all kana boxes are placed in one "rank = same" group.
#
# c2r_h    - Hash mapping kana (String) to romaji (String). Entries whose
#            romaji is nil or empty are skipped with a warning on stderr.
# basename - String used for the output file names and, unquoted, as the
#            graph ID and inside the shell command, so it should be a plain
#            identifier-like name.
# label    - Graph caption. Double quotes in it are escaped for DOT.
#
# Returns the value of Kernel#system for the dot command (true on success).
def generate_trie(c2r_h, basename, label)
  puts "start: #{label}"
  # Escape bare double quotes so the caption cannot end the DOT string early;
  # quotes the caller already escaped with a backslash are left alone.
  dot_label = label.to_s.gsub(/(?<!\\)"/) { '\\"' }

  File.open("#{basename}.dot", "w") do |f_out|
    f_out.puts "digraph #{basename} {"
    f_out.puts " graph [rankdir = LR, label = \"#{dot_label}\"];"

    node_a = ["empty [label = \"\"];"]
    path_a = []
    romend_a = []

    # Order kana by group (group 1 for the first character class below, 2 for
    # the second, 3 for the third, 9 for anything else), then by the kana.
    c_keys = c2r_h.keys.sort_by do |key|
      group =
        case key[0]
        when /[あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをん]/ then "1"
        when /[がぎぐげござじずぜぞだぢづでどばびぶべぼ]/ then "2"
        when /[ぱぴぷぺぽ]/ then "3"
        else "9"
        end
      group + key
    end

    c_keys.each do |c|
      rom = c2r_h[c]
      if rom.nil? || rom.empty?
        # Without a spelling there is no path to draw; emitting one would
        # produce the invalid edge "empty -> ;".
        warn "skip: no romaji for #{c}"
        next
      end

      # All prefixes of the spelling, shortest first: "kya" -> k, ky, kya.
      romseq_a = (1..rom.length).map { |i| rom[0, i] }
      # Node name for the kana box: "x_" plus the hex dump of the kana's bytes.
      romend = "x_" + c.unpack("H*").first
      romend_a << romend
      node_a << "#{romend} [shape = box, label = \"#{c}\"];"
      path_a << "empty -> #{romseq_a.first};"
      romseq_a.each_cons(2) { |from, to| path_a << "#{from} -> #{to};" }
      path_a << "#{rom} -> #{romend} [dir = none, style = dotted];"
    end

    # Shared prefixes yield the same statement repeatedly; keep one of each.
    node_a.uniq!
    path_a.uniq!

    f_out.puts
    f_out.puts node_a.map { |item| " " + item }
    f_out.puts
    f_out.puts path_a.map { |item| " " + item }
    f_out.puts
    f_out.puts " {rank = same; #{romend_a.join('; ')}}"
    f_out.puts "}"
  end

  command = "dot -Tpng #{basename}.dot -o #{basename}.png"
  puts command
  rendered = system(command)
  # Report the failure but still return, so later graphs are attempted.
  warn "dot failed: #{command}" unless rendered
  rendered
end
# Render one trie per romanization style from the kana => romaji table
# (second element of each rk_h entry), in the order listed.
[
  [:hepburn, "kanatrie_h", "ヘボン式ローマ字のトライ木"],
  [:kunrei, "kanatrie_k", "訓令式ローマ字のトライ木"],
  [:nippon, "kanatrie_n", "日本式ローマ字のトライ木"]
].each do |style, basename, label|
  generate_trie(rk_h[style][1], basename, label)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment