Created
November 17, 2010 12:56
-
-
Save tily/703361 to your computer and use it in GitHub Desktop.
日本語文書に出現するカタカナ語の TF/IDF を取得する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Usage: ruby katakana_important_words.rb < source.txt > target.csv
require 'cgi'
require 'open-uri'
require 'rubygems'
require 'nokogiri'
# Total document count used as the numerator of the IDF ratio in tfidf.
# NOTE(review): presumably the size of the search index when the script was
# written -- confirm before relying on the absolute scores.
DOC_ALL = 19_200_000_000

# Application id substituted into the first %s of API_URL.
# The value below is a placeholder that must be replaced before running.
APP_ID = 'your yahoo api id here'.freeze

# Format string for the web search request: first %s is the application id,
# second %s is the (already URL-escaped) query term.
API_URL = 'http://search.yahooapis.jp/WebSearchService/V1/webSearch?appid=%s&query=%s'.freeze
# Script entry point.
#
# Counts the katakana words read from STDIN (via freq), computes a TF-IDF
# weight for each distinct word (via tfidf), and prints one "term,weight"
# line per word to standard output, ordered by ascending weight.
def main
  weights = freq(STDIN).each_with_object({}) do |(term, tf), acc|
    acc[term] = tfidf(term, tf)
    # Pause one second after every lookup -- presumably to throttle the
    # remote search API that tfidf ends up calling.
    sleep 1
  end

  ranked = weights.sort_by { |_term, weight| weight }
  ranked.each { |term, weight| puts "#{term},#{weight}" }
end
# Counts how often each katakana word occurs in the given input.
#
# io - any object that yields lines from #each_line (an IO such as STDIN,
#      a StringIO, or a plain String).
#
# Returns a Hash mapping each matched word to its number of occurrences.
# The hash has a default of 0, so looking up an unseen word yields 0.
def freq(io)
  counts = Hash.new(0)
  io.each_line do |line|
    # A "word" is a maximal run of characters from the bracketed class.
    line.scan(/[ァ-ヾ==]+/u) { |word| counts[word] += 1 }
  end
  counts
end
# Computes the TF-IDF weight of a term.
#
# term - the word whose document frequency is looked up through df.
# tf   - the term frequency (number of occurrences in the source text).
#
# Returns tf * ln(DOC_ALL / document_frequency) as a Float. A reported
# document frequency of 0 is treated as 1 so the ratio stays finite.
def tfidf(term, tf)
  doc_freq = df(term)
  doc_freq = 1 if doc_freq.zero?
  # fdiv keeps the fractional part of the ratio; plain Integer division
  # would truncate it before the logarithm (and give log(0) = -Infinity
  # whenever the document frequency exceeds DOC_ALL).
  tf * Math.log(DOC_ALL.fdiv(doc_freq))
end
# Looks up the document frequency of a term through the web search API.
#
# term - the search word; it is URL-escaped before being placed in the query.
#
# Returns the Integer obtained by converting the response's
# totalResultsAvailable attribute value(s) with to_s.to_i, which yields 0
# when the attribute is missing or not numeric.
# Network or HTTP failures raised by URI.open are not rescued here.
def df(term)
  url = API_URL % [APP_ID, CGI.escape(term)]
  # Kernel#open no longer fetches URLs on Ruby 3.0+; URI.open (from open-uri)
  # does, and its block form closes the response once parsing is finished.
  xml = URI.open(url) { |response| Nokogiri::XML(response) }
  xml.xpath('//@totalResultsAvailable').to_s.to_i
end
# Run the script only when this file is executed directly, so that loading it
# from another file does not start reading STDIN and querying the API.
main if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment