Skip to content

Instantly share code, notes, and snippets.

@tily
Created August 1, 2011 03:53
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save tily/1117548 to your computer and use it in GitHub Desktop.
Save tily/1117548 to your computer and use it in GitHub Desktop.
日本語テキストから脚韻を抽出するスクリプト (まとめ)
#!/usr/bin/env ruby
# Usage: ruby extract_rhyme.rb [-m (vowel|vowel_with_help|surface)] -n N /path/to/file.txt
# Example: ruby extract_rhyme.rb -m vowel -n 3 坊ちゃん.txt
%w|optparse MeCab|.each{|x| require x}
# Entry point: reads the text file named by the first positional argument,
# groups the surface forms of its morphemes by the last N sound units of
# their reading, and prints every group holding more than two distinct words.
#
# args - Array of command-line arguments (options plus the input file path).
#
# Options (see parse_args):
#   -m MODE  'vowel' or 'vowel_with_help' reduce each sound unit before
#            grouping; any other value (or none) groups on the raw units.
#   -n NUM   number of trailing sound units that must match (required).
#
# Raises ArgumentError when -n was not supplied.
def main(args)
  opts, args = parse_args(args)
  num = opts[:num]
  # Without this check a missing -n only surfaces later as an opaque
  # "comparison of Integer with nil failed" in the middle of processing.
  raise ArgumentError, 'missing required option: -n NUM' if num.nil?

  # tail (joined trailing sounds) => surfaces seen with it, in first-seen order
  rhyme = Hash.new { |table, tail| table[tail] = [] }

  # Block form so the file handle is closed even if processing raises.
  File.open(args[0]) do |file|
    file.each do |line|
      get_node_list(line).each do |node|
        surface = node[:surface]
        # The reading is taken from the last field of the feature string.
        yomi = node[:feature].split(',').last
        # No reading available: fall back to the surface if it is pure kana.
        yomi = surface if yomi == '*' && JaSound.only_kana?(surface)

        sound_list = JaSound.split(yomi)
        case opts[:mode]
        when 'vowel'
          sound_list = sound_list.map { |s| JaSound.to_vowel(s) }
        when 'vowel_with_help'
          sound_list = sound_list.map { |s| JaSound.to_vowel_with_help(s) }
        end

        # Words shorter than the requested rhyme length cannot take part.
        next if sound_list.size < num

        tail = sound_list.last(num).join('')
        words = rhyme[tail]
        words.push(surface) unless words.include?(surface)
      end
    end
  end

  rhyme.keys.sort.each do |tail|
    words = rhyme[tail]
    # Only groups of three or more distinct words are reported.
    puts "#{tail}: #{words.join(' | ')}" if words.size > 2
  end
end
# Parses the command-line switches out of +args+.
#
# args - Array of argument strings; recognised options are removed from it
#        in place, leaving only the positional arguments.
#
# Returns a two-element Array: a Hash that may contain :mode (String, from
# -m) and :num (Integer, from -n), followed by the same +args+ array.
def parse_args(args)
  opts = {}
  parser = OptionParser.new
  parser.on("-m MODE", String) { |mode| opts[:mode] = mode }
  parser.on("-n NUM", Integer) { |num| opts[:num] = num }
  parser.parse!(args)
  [opts, args]
end
# Runs MeCab over +text+ and copies each resulting node into a plain Hash.
#
# text - String to analyse.
#
# Returns an Array of Hashes with :surface and :feature keys, one for every
# node after the first one returned by the tagger (the head node is skipped,
# presumably because it is MeCab's beginning-of-sentence sentinel).
def get_node_list(text)
  # Reuse one tagger across calls: the caller invokes this once per input
  # line, and constructing a new Tagger each time is needless repeated work.
  @mecab_tagger ||= MeCab::Tagger.new
  list = []
  node = @mecab_tagger.parseToNode(text)
  # Advance before collecting, so the head node itself is never recorded.
  while (node = node.next)
    list << {:surface => node.surface, :feature => node.feature}
  end
  list
end
# Helpers for working with katakana readings: splitting a reading into sound
# units and reducing each unit to its vowel.
class JaSound
  # Full-size katakana grouped by the vowel they are reduced to.
  LARGE_MAP = {
    %w|ア カ サ タ ナ ハ マ ヤ ラ ワ ガ ザ ダ バ パ| => 'ア',
    %w|イ キ シ チ ニ ヒ ミ リ ヰ ギ ジ ヂ ビ ピ| => 'イ',
    %w|ウ ク ス ツ ヌ フ ム ユ ル ヴ グ ズ ヅ ブ プ| => 'ウ',
    %w|エ ケ セ テ ネ ヘ メ レ ヱ ゲ ゼ デ ベ ペ| => 'エ',
    %w|オ コ ソ ト ノ ホ モ ヨ ロ ヲ ゴ ゾ ド ボ ポ| => 'オ'
  }.freeze
  # Small katakana grouped by the vowel they are reduced to.
  SMALL_MAP = {
    %w|ァ ャ ヮ| => 'ア',
    %w|ィ | => 'イ',
    %w|ゥ ュ | => 'ウ',
    %w|ェ | => 'エ',
    %w|ォ ョ | => 'オ'
  }.freeze
  LARGE_STR = LARGE_MAP.keys.flatten.join('').freeze
  SMALL_STR = SMALL_MAP.keys.flatten.join('').freeze
  # Characters that may trail a sound unit and carry no vowel of their own here.
  HELP_STR = %w|ッ ー ン|.join('').freeze

  # Flat kana => vowel table. The large and small sets share no characters,
  # so a single table replaces scanning LARGE_MAP / SMALL_MAP on every lookup.
  VOWEL_OF = {}
  [LARGE_MAP, SMALL_MAP].each do |map|
    map.each do |kana_list, vowel|
      kana_list.each { |kana| VOWEL_OF[kana] = vowel }
    end
  end
  VOWEL_OF.freeze

  # Regexps are built once here; as interpolated literals inside the methods
  # they would be recompiled on every call.
  SOUND_RE     = /[#{LARGE_STR}][#{SMALL_STR}#{HELP_STR}]*/u
  # \A / \z anchor the whole string; ^ / $ would accept a multi-line string
  # as soon as any single line of it is pure kana.
  KANA_ONLY_RE = /\A[#{LARGE_STR}#{SMALL_STR}#{HELP_STR}]+\z/u
  HELP_RE      = /[#{HELP_STR}]/u
  HEAD_RE      = /([#{LARGE_STR}])([#{SMALL_STR}]?)/u

  # Splits +text+ into sound units: one full-size kana followed by any run of
  # small kana and help characters. Characters that fit no unit are dropped.
  #
  #   JaSound.split("ボッチャン") # => ["ボッ", "チャン"]
  #
  # Returns an Array of Strings (empty when nothing matches).
  def self.split(text)
    text.scan(SOUND_RE)
  end

  # Returns true when +text+ consists solely of the katakana known to this
  # class (full-size, small and help characters), false otherwise.
  def self.only_kana?(text)
    !!(KANA_ONLY_RE =~ text)
  end

  # Reduces a sound unit to its bare vowel, dropping help characters.
  #
  #   JaSound.to_vowel("チャン") # => "ア"
  def self.to_vowel(sound)
    to_vowel_with_help(sound).gsub(HELP_RE, '')
  end

  # Replaces the leading kana of +sound+ (together with one following small
  # kana, if present) by its vowel and keeps the rest of the string as is.
  #
  #   JaSound.to_vowel_with_help("チャン") # => "アン"
  def self.to_vowel_with_help(sound)
    sound.sub(HEAD_RE) do
      large = Regexp.last_match(1)
      small = Regexp.last_match(2)
      # A small kana, when present, decides the vowel (e.g. チ + ャ => ア).
      VOWEL_OF[small.empty? ? large : small]
    end
  end
end
# Run the command-line entry point only when this file is executed directly,
# so that loading it from another script does not start parsing ARGV.
main(ARGV) if __FILE__ == $PROGRAM_NAME
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment