Created
February 5, 2019 12:51
-
-
Save umaz/1bc9c4ab41953b5c63fef32512fce0cd to your computer and use it in GitHub Desktop.
RubyによるTFIDF値の算出
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "csv"
require 'natto'
# Input documents. Each entry is one document given as a single string whose
# words are separated by spaces; the script below splits every entry into
# words before counting them.
data = [
  # array of documents goes here
]
# Computes the term frequency (tf) of every distinct element of +list+.
#
# @param list [#each] the words of a single document
# @return [Hash] word => (occurrences of the word / total number of words),
#   as a Float. The hash is created with a default value of 0, so looking up
#   a word that never occurred yields 0. An empty +list+ yields an empty hash.
def tf(list)
  word_store = Hash.new(0) # missing words start at a count of 0
  list.each { |word| word_store[word] += 1 }

  # Total number of counted words; a Float so the division below is not
  # truncated to an Integer.
  total = word_store.values.sum.to_f

  # Overwrite each raw count with its relative frequency. Only existing keys
  # are reassigned, which is safe while iterating.
  word_store.each_key { |word| word_store[word] /= total }
  word_store
end
# Turns per-project term frequencies into tf-idf scores and sorts them.
#
# Each score is multiplied by log(N / df) + 1, where N is the number of
# projects in +tfidf+ and df is how many times the word appears in
# +word_list+. Words that do not appear in +word_list+ keep their value.
#
# NOTE: the inner hashes of +tfidf+ are updated in place; callers that still
# need the raw tf values must keep their own copy.
#
# @param tfidf [Hash] project id => { word => tf }
# @param word_list [Array] words collected once per project in which they
#   occur, so the number of occurrences of a word is its document frequency
# @return [Hash] a new hash, project id => { word => tf-idf }, where each
#   inner hash is ordered by descending score
def idf(tfidf, word_list)
  project_count = tfidf.length.to_f # number of projects (documents)

  # Count in how many projects each word occurs.
  document_frequency = Hash.new(0)
  word_list.each { |word| document_frequency[word] += 1 }

  # Weight every score of every project. Walking each project's own words
  # avoids scanning all projects once per vocabulary entry.
  tfidf.each_value do |scores|
    scores.each_key do |word|
      next unless document_frequency.key?(word)

      scores[word] *= Math.log(project_count / document_frequency[word]) + 1
    end
  end

  # Sort each project's words by descending score; sort returns an array of
  # pairs, so convert it back into a hash.
  tfidf.each_with_object({}) do |(project_id, scores), sorted|
    sorted[project_id] = scores.sort { |(_, a), (_, b)| b <=> a }.to_h
  end
end
# Nested hashes {project id => {word => value}}. Two copies are kept because
# idf may rewrite the hashes it is given, and the raw tf values are still
# needed afterwards to derive the idf column.
tfidf = Hash.new { |h, k| h[k] = {} }
tf_by_project = Hash.new { |h, k| h[k] = {} }
word_list = [] # every word, added once per project in which it occurs

data.each_with_index do |item, i|
  project_id = i + 1 # project ids are 1-based

  # Split on any run of whitespace so that repeated spaces or a trailing
  # newline do not produce empty or whitespace-only "words".
  tf(item.split).each do |word, frequency|
    tfidf[project_id][word] = frequency # still the plain tf at this point
    tf_by_project[project_id][word] = frequency
    word_list.push(word)
  end
end

tfidf = idf(tfidf, word_list) # now holds tf-idf scores, sorted per project

# Build one CSV row per (project, word). The idf factor is recovered as
# tf-idf / tf, looked up directly instead of scanning both hashes for
# matching keys.
rows = []
tfidf.each do |project_id, scores|
  frequencies = tf_by_project.fetch(project_id)
  scores.each do |word, score|
    frequency = frequencies.fetch(word)
    rows.push([project_id, word, frequency, score / frequency, score])
  end
end

CSV.open("tfidf.csv", "wb") do |csv|
  csv << ["企画番号", "単語", "tf", "idf", "tfidf"]
  rows.each { |row| csv << row }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment