tily/katakana_important_words.rb

## katakana_important_words.rb
#!/usr/bin/env ruby
# Usage: ruby katakana_important_words.rb < source.txt > target.csv
%w(cgi open-uri rubygems nokogiri).each{|x| require x }

# Total document count used as the numerator of the IDF ratio in tfidf.
# NOTE(review): presumably an estimate of the search index size - confirm
# where this figure comes from.
DOC_ALL = 19200000000
# Application id substituted into the appid parameter of API_URL.
# The value here is a placeholder and must be replaced before running.
APP_ID  = 'your yahoo api id here'
# Request template; df fills the two %s slots with [APP_ID, escaped term].
API_URL = 'http://search.yahooapis.jp/WebSearchService/V1/webSearch?appid=%s&query=%s'

# Scores every term found on standard input and prints the result as CSV.
#
# Term counts come from freq(STDIN); each term is weighted with tfidf, with a
# one-second pause after every lookup. The "term,score" lines are written to
# standard output in ascending score order.
def main
  counts = freq(STDIN)
  weights = counts.each_with_object({}) do |(word, count), acc|
    acc[word] = tfidf(word, count)
    sleep 1
  end
  weights.sort_by { |_word, weight| weight }.each do |word, weight|
    puts "#{word},#{weight}"
  end
end

# Counts term occurrences in the text read from +io+.
#
# A term is a maximal run of characters in the range ァ-ヾ (katakana) plus
# the ASCII '=' and full-width '＝' signs.
#
# io - an IO-like object that is read line by line until exhausted.
#
# Returns a Hash mapping each term to its count; the Hash has a default
# value of 0 for terms that were never seen.
def freq(io)
  counts = Hash.new(0)
  io.each_line do |line|
    line.scan(/[ァ-ヾ=＝]+/u) { |word| counts[word] += 1 }
  end
  counts
end

# Computes the TF-IDF weight of +term+.
#
# term - the word whose document frequency is looked up with df.
# tf   - number of occurrences of the term in the input text.
#
# Returns tf * ln(DOC_ALL / df) as a Float. A document frequency of zero is
# treated as one so the ratio stays defined.
def tfidf(term, tf)
  doc_freq = df(term)
  doc_freq = 1 if doc_freq.zero?
  # Float division on purpose: Integer#/ truncates the ratio (1.92 becomes 1,
  # anything below 1 becomes 0), which turns the logarithm into 0 or -Infinity.
  tf * Math.log(DOC_ALL.fdiv(doc_freq))
end

# Looks up the document frequency of +term+ through the web search API.
#
# term - the raw search term; it is URL-escaped here before being placed in
#        the query string.
#
# Returns an Integer parsed with String#to_i from the text of the matched
# totalResultsAvailable attribute(s), so a missing or non-numeric value
# yields 0. Errors raised while fetching the URL are not rescued here.
def df(term)
  url = API_URL % [APP_ID, CGI.escape(term)]
  # URI.open rather than Kernel#open: opening URLs through Kernel#open was
  # deprecated in Ruby 2.7 and removed in 3.0. The block form also closes the
  # response handle once the document has been parsed.
  xml = URI.open(url) { |io| Nokogiri::XML(io) }
  xml.xpath('//@totalResultsAvailable').to_s.to_i
end

# Script entry point: the pipeline runs as soon as this file is loaded.
main
	#!/usr/bin/env ruby
	# Usage: ruby katakana_important_words.rb < source.txt > target.csv
	%w(cgi open-uri rubygems nokogiri).each{|x| require x }

	# Total document count used as the numerator of the IDF ratio in tfidf.
	# NOTE(review): presumably an estimate of the search index size - confirm
	# where this figure comes from.
	DOC_ALL = 19200000000
	# Application id substituted into the appid parameter of API_URL.
	# The value here is a placeholder and must be replaced before running.
	APP_ID = 'your yahoo api id here'
	# Request template; df fills the two %s slots with [APP_ID, escaped term].
	API_URL = 'http://search.yahooapis.jp/WebSearchService/V1/webSearch?appid=%s&query=%s'

	# Scores every term found on standard input and prints the result as CSV.
	#
	# Term counts come from freq(STDIN); each term is weighted with tfidf, with
	# a one-second pause after every lookup. The "term,score" lines are written
	# to standard output in ascending score order.
	def main
	  counts = freq(STDIN)
	  weights = counts.each_with_object({}) do |(word, count), acc|
	    acc[word] = tfidf(word, count)
	    sleep 1
	  end
	  weights.sort_by { |_word, weight| weight }.each do |word, weight|
	    puts "#{word},#{weight}"
	  end
	end

	# Counts term occurrences in the text read from +io+.
	#
	# A term is a maximal run of characters in the range ァ-ヾ (katakana) plus
	# the ASCII '=' and full-width '＝' signs.
	#
	# io - an IO-like object that is read line by line until exhausted.
	#
	# Returns a Hash mapping each term to its count; the Hash has a default
	# value of 0 for terms that were never seen.
	def freq(io)
	  counts = Hash.new(0)
	  io.each_line do |line|
	    line.scan(/[ァ-ヾ=＝]+/u) { |word| counts[word] += 1 }
	  end
	  counts
	end

	# Computes the TF-IDF weight of +term+.
	#
	# term - the word whose document frequency is looked up with df.
	# tf   - number of occurrences of the term in the input text.
	#
	# Returns tf * ln(DOC_ALL / df) as a Float. A document frequency of zero is
	# treated as one so the ratio stays defined.
	def tfidf(term, tf)
	  doc_freq = df(term)
	  doc_freq = 1 if doc_freq.zero?
	  # Float division on purpose: Integer#/ truncates the ratio (1.92 becomes
	  # 1, anything below 1 becomes 0), which turns the logarithm into 0 or
	  # -Infinity.
	  tf * Math.log(DOC_ALL.fdiv(doc_freq))
	end

	# Looks up the document frequency of +term+ through the web search API.
	#
	# term - the raw search term; it is URL-escaped here before being placed in
	#        the query string.
	#
	# Returns an Integer parsed with String#to_i from the text of the matched
	# totalResultsAvailable attribute(s), so a missing or non-numeric value
	# yields 0. Errors raised while fetching the URL are not rescued here.
	def df(term)
	  url = API_URL % [APP_ID, CGI.escape(term)]
	  # URI.open rather than Kernel#open: opening URLs through Kernel#open was
	  # deprecated in Ruby 2.7 and removed in 3.0. The block form also closes
	  # the response handle once the document has been parsed.
	  xml = URI.open(url) { |io| Nokogiri::XML(io) }
	  xml.xpath('//@totalResultsAvailable').to_s.to_i
	end

	# Script entry point: the pipeline runs as soon as this file is loaded.
	main