Skip to content

Instantly share code, notes, and snippets.

@saki7
Last active December 17, 2015 04:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save saki7/5550158 to your computer and use it in GitHub Desktop.
Save saki7/5550158 to your computer and use it in GitHub Desktop.
m.rb ― 辞書に読みがなが登録されていない単語でも読みがなが取れるクラス Yomigana を追加したMeCabのラッパーモジュール
# coding: utf-8
require 'MeCab'
# require 'lanx/str'
# Lanx::Str::Normalizer.katakana は、tr("ぁ-ん", "ァ-ン")してるだけです。自前で用意して下さい。
module Lanx
module M
# Feature string MeCab is asked to emit for unknown words (TaggerBase passes
# it via --unk-feature); Node compares a node's feature against it to detect
# words missing from the dictionary.
UNK_FEATURE = '未知語'
# Wraps one MeCab::Node and exposes the fields of its comma-separated
# feature string (part of speech, base form, reading, ...) through named
# accessors.
#
# Depends on two names this file does not define itself:
# Lanx::Str::Normalizer.katakana and Object#blank? (presumably provided by
# ActiveSupport -- confirm in the host application).
class Node
  # Surface form of the node. For unknown words this is the
  # katakana-normalized surface, otherwise the surface reported by MeCab.
  attr_reader :surface

  # @param mecab_node [MeCab::Node] the node to wrap
  # @param options [Hash] when :force_yomi is truthy, a missing reading is
  #   filled in from the katakana-normalized surface
  # @raise [ArgumentError] if mecab_node is not a MeCab::Node
  def initialize(mecab_node, options = {})
    raise ArgumentError, 'MeCab::Node' unless mecab_node.kind_of?(::MeCab::Node)
    @options = options
    @mecab_node = mecab_node
    raw_surface = @mecab_node.surface

    feature = @mecab_node.feature
    if feature == UNK_FEATURE
      # Unknown word: synthesize a full 9-field feature whose base form,
      # reading and pronunciation are all the katakana-normalized surface.
      @surface = Lanx::Str::Normalizer.katakana(raw_surface)
      feature = "#{UNK_FEATURE},#{UNK_FEATURE},*,*,*,*,#{@surface},#{@surface},#{@surface}"
    end
    @feature = feature.split(',')
    @surface ||= raw_surface

    # Nodes made up solely of ASCII characters keep their surface as base
    # form, reading and pronunciation. The pattern is anchored so that a
    # surface that merely contains an ASCII character (e.g. mixed
    # ASCII/Japanese text) does not lose its dictionary reading.
    if raw_surface =~ /\A[[:ascii:]]+\z/
      @feature[6] = raw_surface
      @feature[7] = @feature[6]
      @feature[8] = @feature[6]
    end

    # Special case: only nodes that still have no reading are filled in.
    if @options[:force_yomi] && @feature[7].blank?
      @feature[7] = Lanx::Str::Normalizer.katakana(@surface)
    end

    # Fall back to the reading when no pronunciation is available.
    @feature[8] = @feature[7] if @feature[8].blank?
  end

  # Field layout, see:
  # http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html#parse
  def pos; @feature[0]; end
  def pos_detail1; @feature[1]; end
  def pos_detail2; @feature[2]; end
  def pos_detail3; @feature[3]; end
  def katsuyou_shape; @feature[4]; end
  def katsuyou_type; @feature[5]; end
  def genkei; @feature[6]; end
  def yomi; @feature[7]; end
  def yomi=(new_yomi); @feature[7] = new_yomi; end
  def hatsuon; @feature[8]; end

  # @return [String] the base form, or the surface when the base form is
  #   the "*" placeholder
  def word
    genkei == "*" ? surface : genkei
  end
end # Node
# One parse result: the chain of MeCab nodes starting at a given head,
# converted into an array of Node wrappers with BOS/EOS nodes left out.
class Result
  # @param mecab_node_head [MeCab::Node] first node of the chain to wrap
  # @param options [Hash] handed unchanged to every Node
  # @raise [ArgumentError] if mecab_node_head is not a MeCab::Node
  def initialize(mecab_node_head, options = {})
    raise ArgumentError, 'MeCab::Node' unless mecab_node_head.kind_of?(::MeCab::Node)
    @options = options
    @node_array = []
    @mecab_node_head = mecab_node_head
    each_mecab_node do |raw_node|
      wrapped = Node.new(raw_node, @options)
      @node_array << wrapped if wrapped.pos != 'BOS/EOS'
    end
  end

  # @return [Array<Node>] the wrapped nodes, in chain order
  def to_a
    @node_array
  end

  private

  # Yields each raw node of the chain, following #next until it is falsy.
  def each_mecab_node
    cursor = @mecab_node_head
    while cursor
      yield cursor
      cursor = cursor.next
    end
  end
end # Result
# Shared setup for the taggers: keeps the options hash and creates the
# underlying MeCab::Tagger.
class TaggerBase
  # @param options [Hash] stored in @options for use by subclasses
  def initialize(options = {})
    @options = options
    # Have MeCab report unknown words with the UNK_FEATURE marker.
    mecab_args = "--unk-feature=#{UNK_FEATURE}"
    @tagger = ::MeCab::Tagger.new(mecab_args)
  end
end
# Tagger that returns the single parse MeCab produces for a text.
class Tagger < TaggerBase
  # @param options [Hash] forwarded to TaggerBase
  def initialize(options = {})
    super
  end

  # @param text [String] text to analyze
  # @return [Array<Node>] wrapped nodes of the parse; [] when text is blank
  def parse(text)
    if text.blank?
      []
    else
      # Start from the node after the one parseToNode returns
      # (presumably the BOS sentinel -- Result drops BOS/EOS nodes anyway).
      first_node = @tagger.parseToNode(text).next
      Result.new(first_node, @options).to_a
    end
  end
end # Tagger
# Tagger that collects up to n candidate parses via MeCab's N-best API.
class TaggerNBest < TaggerBase
  # @param n [Integer] maximum number of candidates requested per parse
  # @param options [Hash] forwarded to TaggerBase
  def initialize(n = 10, options = {})
    super(options)
    @n = n
  end

  # @param text [String] text to analyze
  # @return [Array<Result>] non-empty candidate results in the order MeCab
  #   yields them; [] when text is blank or parseNBestInit reports failure
  def parse(text)
    return [] if text.blank?
    return [] unless @tagger.parseNBestInit(text)

    @n.times.each_with_object([]) do |_, candidates|
      head = @tagger.nextNode
      # Stop early once MeCab has no further candidate.
      break candidates unless head
      candidate = Result.new(head.next, @options)
      candidates << candidate unless candidate.to_a.empty?
    end
  end
end # TaggerNBest
# Obtains a reading (yomigana) for a text, including words that have no
# reading registered in the dictionary.
class Yomigana
  # Returns the joined readings of the first N-best candidate in which
  # every node has a reading. If no candidate qualifies, parses once more
  # with :force_yomi and joins those readings instead.
  #
  # @param text [String] text to read
  # @param try [Integer] number of N-best candidates to examine
  # @return [String] the concatenated readings
  def self.get(text, try = 20)
    fully_read = TaggerNBest.new(try).parse(text).find do |result|
      result.to_a.all? { |node| node.yomi.present? }
    end
    return fully_read.to_a.map(&:yomi).join if fully_read

    # No candidate had a complete reading.
    Tagger.new(force_yomi: true).parse(text).map(&:yomi).join
  end
end
end # M
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment