Created
March 31, 2012 00:40
-
-
Save hitode909/2258253 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
require 'httpclient' | |
require 'nokogiri' | |
# Sends +text+ to the Yahoo! Japan MAService "parse" endpoint and hands the
# response body to Nokogiri().
#
# text - the sentence to submit as the :sentence query parameter.
#
# Returns whatever Nokogiri() builds from the response body.
#
# NOTE(review): the application id is hard-coded in source; consider moving
# it to configuration before sharing this script.
def get(text)
  service_url = 'http://jlp.yahooapis.jp/MAService/V1/parse'
  query = {
    :appid => 'RZv4ed6xg67n15cyyGFF8Io0r3i.o0uwISXfrFOZYyMghbdeNA10_M6KemHLqz0laQ--',
    :sentence => text,
  }
  response = HTTPClient.new.get(service_url, query)
  Nokogiri(response.body)
end
# One word taken from the parse result: its surface text, its reading and
# its part-of-speech label.
class Word
  # Part-of-speech labels for which #is_suffix answers true
  # (adnominal, conjunction, suffix, particle, auxiliary verb, special).
  SUFFIX_POS = %w{ 連体詞 接続詞 接尾辞 助詞 助動詞 特殊 }.freeze

  # Part-of-speech label ("special") for which #is_special answers true.
  SPECIAL_POS = '特殊'.freeze

  # Small kana that are left out of the count in #reading_length.
  SMALL_KANA = /[ぁぃぅぇぉゎゃゅょ]/

  attr_accessor :surface, :reading, :pos

  # Builds a Word from +node+, which must respond to #at(name) and return an
  # object responding to #text for 'surface', 'reading' and 'pos'.
  def self.new_from_node(node)
    new.tap do |word|
      word.surface = node.at('surface').text
      word.reading = node.at('reading').text
      word.pos = node.at('pos').text
    end
  end

  # True when this word's part of speech is one of SUFFIX_POS.
  def is_suffix
    SUFFIX_POS.include?(pos)
  end

  # True when this word's part of speech is SPECIAL_POS.
  def is_special
    pos == SPECIAL_POS
  end

  # Number of characters in the reading, not counting the small kana in
  # SMALL_KANA (presumably an approximation of the mora count). Words whose
  # part of speech is SPECIAL_POS always count as 0.
  def reading_length
    if is_special
      0
    else
      reading.length - reading.scan(SMALL_KANA).length
    end
  end
end
# Core extension used by the script to treat an Array of word-like objects
# as a single phrase.
class Array
  # Concatenates the #surface of every element, in order, with no separator.
  def body
    map(&:surface).join('')
  end

  # Adds up the #reading_length of every element; 0 for an empty array.
  def reading_length
    map(&:reading_length).inject(0, :+)
  end
end
# Script entry point.
#
# Takes the text to analyse from the first command-line argument, fetches its
# parse result via `get`, groups the returned words into phrases, prints each
# phrase as "<reading length>\t<phrase text>", and finally prints a histogram
# of phrase reading lengths.
text = ARGV.first
unless text
  warn "usage: #{$0} (text)"
  exit 1
end

res = get text
buffer = []
length_count = Hash.new 0

# Prints one finished phrase and records its reading length in the histogram.
flush = lambda do |phrase|
  len = phrase.reading_length
  puts "#{len}\t#{phrase.body}"
  length_count[len] += 1
end

# True while the previously processed word answered true to #is_suffix.
after_suffix = false

words = res.search 'word'
words.each_with_index do |node, index|
  word = Word.new_from_node node
  suffix = word.is_suffix

  # A phrase ends where a run of suffix-like words is followed by a word that
  # is not suffix-like; that word starts the next phrase.
  if after_suffix && !suffix
    flush.call buffer
    buffer = []
  end
  after_suffix = suffix

  buffer << word

  # The last word never sees a following boundary, so flush explicitly.
  flush.call buffer if index + 1 == words.length
end

puts

# Histogram: one row per length from 1 up to the longest phrase seen. When no
# phrase was recorded the maximum is nil; skip the histogram in that case
# rather than building a range with a nil end (an error on older Rubies, an
# endless range on Ruby 2.6+).
longest = length_count.keys.max
if longest
  (1..longest).each do |key|
    puts "#{key}\t#{'*' * length_count[key]}"
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% ruby reading_length.rb "「この味がいいね」と君が言ったから七月六日はサラダ記念日" | |
2 「この | |
3 味が | |
4 いいね」と | |
3 君が | |
5 言ったから | |
3 七月 | |
4 六日は | |
7 サラダ記念日 | |
1 | |
2 * | |
3 *** | |
4 ** | |
5 * | |
6 | |
7 * |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment