-
-
Save saki7/5550158 to your computer and use it in GitHub Desktop.
m.rb ― 辞書に読みがなが登録されていない単語でも読みがなが取れるクラス Yomigana を追加したMeCabのラッパーモジュール
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
require 'MeCab' | |
# require 'lanx/str'
# Lanx::Str::Normalizer.katakana は、tr("ぁ-ん", "ァ-ン")してるだけです。自前で用意して下さい。
module Lanx | |
module M | |
UNK_FEATURE = '未知語' | |
# Wraps a single MeCab::Node and exposes the comma-separated fields of its
# feature string through named readers. Field order follows the MeCab output
# format, see: http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html#parse
#
# Unknown words (feature equal to UNK_FEATURE) receive a synthetic feature
# list whose base form, reading and pronunciation are the katakana-normalized
# surface. Surfaces made up only of ASCII characters use the surface itself
# for those three fields.
#
# NOTE(review): relies on `blank?`, which is not core Ruby; presumably
# ActiveSupport is loaded by the host application - confirm.
class Node
  # Matches a surface that consists solely of ASCII characters. Anchored so
  # that mixed surfaces such as "Tシャツ" keep their dictionary reading.
  ASCII_ONLY = /\A[[:ascii:]]+\z/

  # Surface form; katakana-normalized when the word is unknown to MeCab.
  attr_reader :surface

  # mecab_node - a ::MeCab::Node; anything else raises ArgumentError.
  # options    - Hash. When :force_yomi is truthy, a blank reading is filled
  #              with the katakana form of the surface.
  def initialize(mecab_node, options = {})
    raise ArgumentError, 'MeCab::Node' unless mecab_node.kind_of?(::MeCab::Node)

    @options = options
    @mecab_node = mecab_node
    raw_surface = @mecab_node.surface
    raw_feature = @mecab_node.feature

    if raw_feature == UNK_FEATURE
      @surface = Lanx::Str::Normalizer.katakana(raw_surface)
      # Build the columns directly instead of joining and re-splitting on
      # ',' so that a comma inside the surface cannot shift the fields.
      @feature = [UNK_FEATURE, UNK_FEATURE, '*', '*', '*', '*',
                  @surface.dup, @surface.dup, @surface.dup]
    else
      @surface = raw_surface
      @feature = raw_feature.split(',')
    end

    # ASCII-only nodes always read as their own surface.
    if raw_surface =~ ASCII_ONLY
      @feature[6] = @feature[7] = @feature[8] = raw_surface.dup
    end

    # Special case: only touch nodes that have no reading yet.
    if @options[:force_yomi] && @feature[7].blank?
      @feature[7] = Lanx::Str::Normalizer.katakana(@surface)
    end

    # The pronunciation falls back to the reading when it is missing.
    @feature[8] = @feature[7] if @feature[8].blank?
  end

  # Part of speech and its three sub-classifications.
  def pos; @feature[0]; end
  def pos_detail1; @feature[1]; end
  def pos_detail2; @feature[2]; end
  def pos_detail3; @feature[3]; end

  # Conjugation columns (feature fields 4 and 5).
  def katsuyou_shape; @feature[4]; end
  def katsuyou_type; @feature[5]; end

  # Base form, reading and pronunciation (feature fields 6, 7 and 8).
  def genkei; @feature[6]; end
  def yomi; @feature[7]; end
  def yomi=(new_yomi); @feature[7] = new_yomi; end
  def hatsuon; @feature[8]; end

  # Returns the base form, or the surface when the base form is "*".
  def word
    genkei == "*" ? surface : genkei
  end
end # Node
# One parse result: the linked list of MeCab nodes starting at the given
# head, wrapped as Node objects with the BOS/EOS markers left out.
class Result
  # mecab_node_head - first ::MeCab::Node of the list; anything else raises
  #                   ArgumentError.
  # options         - Hash handed unchanged to every Node.
  def initialize(mecab_node_head, options = {})
    unless mecab_node_head.kind_of?(::MeCab::Node)
      raise ArgumentError, 'MeCab::Node'
    end

    @options = options
    @mecab_node_head = mecab_node_head
    @node_array = wrap_chain(mecab_node_head)
  end

  # Returns the wrapped nodes in list order.
  def to_a
    @node_array
  end

  private

  # Follows `next` from +head+ until it yields a falsy value, wrapping each
  # MeCab node and keeping those whose part of speech is not 'BOS/EOS'.
  def wrap_chain(head)
    kept = []
    cursor = head
    while cursor
      wrapped = Node.new(cursor, @options)
      kept << wrapped if wrapped.pos != 'BOS/EOS'
      cursor = cursor.next
    end
    kept
  end
end # Result
# Shared setup for the tagger classes: stores the options and creates the
# underlying MeCab tagger.
class TaggerBase
  # options - Hash kept in @options for use by subclasses.
  def initialize(options = {})
    @options = options
    # Unknown words are reported with the UNK_FEATURE marker as their feature.
    mecab_args = "--unk-feature=#{UNK_FEATURE}"
    @tagger = ::MeCab::Tagger.new(mecab_args)
  end
end
# Tagger that returns the single best analysis of a text.
# Construction is inherited unchanged from TaggerBase (options = {}).
class Tagger < TaggerBase
  # Parses +text+ and returns an Array of Node objects.
  # Blank text yields an empty Array.
  def parse(text)
    return [] if text.blank?

    # Start from the node after the one parseToNode returns.
    first_node = @tagger.parseToNode(text).next
    Result.new(first_node, @options).to_a
  end
end # Tagger
# Tagger that returns up to n candidate analyses of a text.
class TaggerNBest < TaggerBase
  # n       - maximum number of candidates to request (default 10).
  # options - Hash forwarded to TaggerBase.
  def initialize(n = 10, options = {})
    super(options)
    @n = n
  end

  # Parses +text+ and returns an Array of Result objects, one per candidate
  # that contains at least one node. Returns an empty Array for blank text or
  # when parseNBestInit returns a falsy value.
  def parse(text)
    return [] if text.blank? || !@tagger.parseNBestInit(text)

    candidates = []
    @n.times do
      head = @tagger.nextNode
      # Stop as soon as nextNode yields nothing.
      break unless head

      candidate = Result.new(head.next, @options)
      candidates << candidate unless candidate.to_a.empty?
    end
    candidates
  end
end # TaggerNBest
# Produces a reading (yomigana) for a text, including words for which the
# dictionary supplies no reading.
class Yomigana
  # text - the text to read.
  # try  - how many N-best candidates to examine (default 20).
  #
  # Returns the joined readings of the first candidate in which every node
  # has a reading. If no candidate qualifies, the text is parsed once more
  # with :force_yomi and those readings are joined instead.
  def self.get(text, try = 20)
    complete = TaggerNBest.new(try).parse(text).find do |result|
      result.to_a.all? { |node| node.yomi.present? }
    end
    return complete.to_a.map(&:yomi).join if complete

    # No candidate had a reading for every word.
    Tagger.new(force_yomi: true).parse(text).map(&:yomi).join
  end
end
end # M | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment