Skip to content

Instantly share code, notes, and snippets.

@mwgamera
Last active January 1, 2020 08:57
Show Gist options
  • Save mwgamera/523f976635e4286fc7713886e112ec67 to your computer and use it in GitHub Desktop.
Save mwgamera/523f976635e4286fc7713886e112ec67 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# klg, Dec 2019
require 'MeCab'
require 'erb'
# Return a copy of +str+ with katakana replaced by the matching hiragana.
#
# Only the letters ァ..ヴ and the iteration marks ヽ and ヾ are converted;
# each of them sits exactly 96 code points above its hiragana counterpart
# (ぁ..ゔ, ゝ, ゞ). Every other character is passed through untouched.
def to_hira(str)
  str.tr('ァ-ヴヽヾ', 'ぁ-ゔゝゞ')
end
# Try to match lexeme-internal okurigana
# and return array of [rb, rt] pairs.
#
# rb - base text (may mix kana with other characters).
# rt - reading of the whole of rb.
#
# rb is cut into alternating runs of kana and non-kana characters. A kana
# run must appear in rt either verbatim or as its hiragana form; a non-kana
# run may additionally stand for any stretch of rt. When the whole of rt can
# be distributed over the runs this way, one entry per run is returned:
# [run] when the run reads as itself, [run, reading] otherwise.
# When rt cannot be distributed, the single pair [[rb, rt]] is returned.
def ruby_match(rb, rt)
  runs = rb.scan(/[ぁ-ゔァ-ヴ]+|[^ぁ-ゔァ-ヴ]+/)
  groups = runs.map do |run|
    # For non-kana runs both quoted forms are identical, hence the uniq.
    alternatives = [Regexp.quote(run), Regexp.quote(to_hira(run))].uniq
    # Only non-kana runs may absorb an arbitrary (lazy) part of the reading.
    alternatives << '.*?' if run !~ /\A[ぁ-ゔァ-ヴ]/
    "(#{alternatives.join('|')})"
  end
  # \A and \z anchor the whole string. With ^ and $ a reading containing a
  # newline could match on its first line only and lose the remainder.
  matched = Regexp.new("\\A#{groups.join}\\z").match(rt)
  return [[rb, rt]] if matched.nil?

  matched.captures.zip(runs).map do |reading, run|
    to_hira(run) == to_hira(reading) ? [run] : [run, reading]
  end
end
# Render a list of [rb] / [rb, rt] entries as HTML.
#
# A list consisting of one entry that has no reading is returned as plain
# escaped text. Anything else becomes a single <ruby> element holding one
# <rb>/<rt> pair per entry; entries without a reading get an empty <rt>.
# All text is HTML-escaped.
def ruby_html(rbrt)
  escape = ERB::Util.method(:html_escape)
  bare_text = rbrt.length == 1 && rbrt[0].length == 1
  return escape.call(rbrt[0][0]) if bare_text

  annotated = rbrt.map do |entry|
    base, reading = entry.map(&escape)
    "<rb>#{base}</rb><rt>#{reading}</rt>"
  end
  "<ruby>#{annotated.join}</ruby>"
end
# Command-line driver: print one line of ruby-annotated HTML per argument.
#
# Two MeCab taggers are used: one created with '-Owakati', whose output is
# treated below as whitespace-separated tokens, and one created with
# '-Oyomi', whose output is treated as the matching readings.
@mw = MeCab::Tagger.new('-Owakati')
@my = MeCab::Tagger.new('-Oyomi')
ARGV.each do |arg|
  # The reading tagger is fed the already segmented text so that both
  # outputs can be split on whitespace and paired up token by token.
  segmented = @mw.parse(arg)
  words = segmented.split
  readings = @my.parse(segmented).split
  unless words.length == readings.length
    # Pairing by position is meaningless if the two taggers disagree.
    raise "token/reading count mismatch for #{arg.inspect}: " \
          "#{words.length} tokens vs #{readings.length} readings"
  end
  # Readings are normalised to hiragana before being matched against the
  # token (presumably MeCab emits them in katakana; confirm for the
  # dictionary in use).
  annotated = words.zip(readings).map do |word, reading|
    ruby_html(ruby_match(word, to_hira(reading)))
  end
  puts(annotated.join)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment