Skip to content

Instantly share code, notes, and snippets.

@umaz
Created February 5, 2019 12:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save umaz/1bc9c4ab41953b5c63fef32512fce0cd to your computer and use it in GitHub Desktop.
Save umaz/1bc9c4ab41953b5c63fef32512fce0cd to your computer and use it in GitHub Desktop.
RubyによるTFIDF値の算出
require "csv"
require 'natto'
# Input corpus: one String per document. The script below splits each entry
# on spaces, so every document is expected to be already tokenized into
# space-separated words.
data = [
# Array of documents
]
# Compute the term frequency (TF) of every token in one document.
#
# @param list [#each] the tokens of a single document
# @return [Hash] token => (occurrences / total number of tokens) as a Float;
#   the returned hash keeps a default value of 0 for tokens it has not seen
def tf(list)
  frequencies = Hash.new(0) # unseen tokens start at 0
  list.each { |token| frequencies[token] += 1 }

  # Total number of tokens in the document (sum of all occurrence counts).
  total = frequencies.values.sum.to_f

  # Overwrite each raw count with its relative frequency.
  frequencies.each_key { |token| frequencies[token] /= total }
  frequencies
end
# Convert per-document TF tables into TF-IDF tables sorted by score.
#
# Each TF value is multiplied by (log(N / df) + 1), where N is the number of
# documents in +tfidf+ and df is how often the word occurs in +word_list+.
# The inner hashes of +tfidf+ are updated in place; words that never appear
# in +word_list+ are left untouched.
#
# @param tfidf [Hash] {document id => {word => tf}}
# @param word_list [Array] words to count for the document frequency; the
#   caller is expected to push each word once per document containing it
# @return [Hash] a new hash {document id => {word => tfidf}} whose inner
#   hashes are ordered by descending tfidf
def idf(tfidf, word_list)
  doc_total = tfidf.length.to_f # number of documents

  # Document frequency: how many times each word occurs in word_list.
  doc_freq = word_list.each_with_object(Hash.new(0)) { |word, acc| acc[word] += 1 }

  # Scale every known word of every document by its idf weight.
  tfidf.each_value do |scores|
    scores.keys.each do |word|
      next unless doc_freq.key?(word)

      scores[word] *= Math.log(doc_total / doc_freq[word]) + 1
    end
  end

  # Sort each document's words by descending score; sort returns an array of
  # pairs, so convert it back into a hash.
  tfidf.each_with_object({}) do |(doc_id, scores), ranked|
    ranked[doc_id] = scores.sort { |a, b| b.last <=> a.last }.to_h
  end
end
# Driver: compute TF, IDF and TF-IDF for every document in `data` and write
# the result to tfidf.csv.
#
# tfidf     - {document number => {word => value}}; filled with TF first and
#             replaced by the TF-IDF tables returned from idf.
# tf_values - same shape, keeps the plain TF so the IDF factor can be
#             recovered afterwards. (Named tf_values so it does not shadow
#             the tf method.)
# word_list - every word once per document that contains it; idf counts
#             these entries to get the document frequency.
tfidf = Hash.new { |h, k| h[k] = {} }
tf_values = Hash.new { |h, k| h[k] = {} }
word_list = []

data.each_with_index do |item, i|
  project_id = i + 1 # document numbers are 1-based

  # Whitespace split: unlike split(/ /) it never yields empty-string tokens
  # for leading/repeated spaces and does not keep a trailing newline as a
  # "word", both of which would distort the TF and DF counts.
  tf(item.split).each do |word, value|
    tfidf[project_id][word] = value     # still TF at this point
    tf_values[project_id][word] = value
    word_list.push(word)                # tf keys are unique per document
  end
end

tfidf = idf(tfidf, word_list) # now {document number => {word => tfidf}}

# One CSV row per (document, word). The IDF factor is recovered as
# tfidf / tf; tf is never zero because every stored word occurred at least
# once. Direct hash lookups replace the former scan over all documents/words.
rows = tfidf.flat_map do |project_id, scores|
  scores.map do |word, score|
    term_freq = tf_values[project_id][word]
    [project_id, word, term_freq, score / term_freq, score]
  end
end

CSV.open("tfidf.csv", "wb") do |csv|
  csv << ["企画番号", "単語", "tf", "idf", "tfidf"]
  rows.each { |row| csv << row }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment