Last active
October 6, 2019 14:03
-
-
Save rixwwd/b6a7314d912af80a38d859ca2ff07402 to your computer and use it in GitHub Desktop.
tf-idf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# coding: UTF-8
require 'logger'
require 'mecab'
# Script-wide logger writing to standard output. The level is INFO, so the
# debug-level messages emitted elsewhere in this script are suppressed unless
# the level is lowered to Logger::DEBUG.
@logger = Logger.new(STDOUT)
@logger.level = Logger::INFO
# Counts how often each noun and verb occurs in a document.
#
# Every non-empty line of the file is tokenized with MeCab. A token is counted
# when the first field of its comma-separated feature string is '名詞' (noun)
# or '動詞' (verb); it is tallied under the seventh field of that string.
# NOTE(review): the seventh field is presumably the dictionary (base) form in
# the dictionary layout in use -- confirm.
#
# doc :: file name of the target document
#
# Returns a Hash with default value 0 mapping word => occurrence count.
def count_word(doc)
  # NOTE(review): the model is kept in its own local on purpose; the tagger
  # presumably depends on the model staying alive -- confirm before inlining.
  model = MeCab::Model.new
  tagger = model.createTagger
  word_list = Hash.new(0)

  # File.foreach closes the file once iteration finishes (the handle used to
  # be left open).
  File.foreach(doc) do |line|
    line.chomp!
    next if line.empty?

    node = tagger.parseToNode(line)
    while node
      hinshi, tango = node.feature.split(',').values_at(0, 6)
      # values_at yields nil for tango when the feature string has fewer than
      # seven fields; such tokens are skipped instead of raising.
      if %w[名詞 動詞].include?(hinshi) && tango && !tango.empty?
        word_list[tango] += 1
      end
      node = node.next
    end
  end

  word_list
end
# Merges per-document word counts into a single tally.
#
# docs :: list of Hashes (word => occurrence count), i.e. count_word results
#
# Returns a new Hash mapping each word to the sum of its counts over all
# given hashes. The input hashes are not modified.
def merge_word(docs)
  docs.each_with_object({}) do |doc, totals|
    # The block only runs for words already present in totals.
    totals.merge!(doc) { |_word, sum, count| sum + count }
  end
end
# Computes the term frequency of every word in one document.
#
# doc :: Hash mapping word => occurrence count
#
# Returns a new Hash mapping each word to its count divided by the sum of all
# counts in doc (as a Float). An empty doc yields an empty Hash.
def tf(doc)
  total = doc.values.sum.to_f
  doc.each_with_object({}) do |(word, count), freq|
    freq[word] = count / total
  end
end
# Computes the inverse document frequency of every word.
#
# docs :: list of Hashes (word => occurrence count), one per document
#
# Returns a Hash mapping each word to log(N / df), where N is the number of
# documents and df is the number of documents whose hash has the word as a
# key. Math.log is the natural logarithm.
def idf(docs)
  all_doc = docs.length.to_f

  # Number of documents containing each word (counts are ignored here).
  doc_freq = Hash.new(0)
  docs.each do |doc|
    doc.each_key { |word| doc_freq[word] += 1 }
  end

  doc_freq.each_with_object({}) do |(word, df), scores|
    scores[word] = Math.log(all_doc / df)
  end
end
# Computes tf-idf scores for a set of documents.
#
# files :: list of file names of the target documents
#
# Returns an Array with one entry per file, in the same order as files. Each
# entry is an Array of [word, score] pairs where score is the word's term
# frequency in that file multiplied by its inverse document frequency over
# all files. Intermediate results are logged at debug level.
def tfidf(files)
  @logger.debug("tf")
  docs = files.map { |f| count_word(f) }
  kekka_tf = docs.map { |d| tf(d) }
  @logger.debug(kekka_tf)

  @logger.debug("idf")
  kekka_idf = idf(docs)
  @logger.debug(kekka_idf)

  @logger.debug("tf-idf")
  # The block parameter is named freq rather than tf so it does not shadow
  # the tf method.
  kekka_tfidf = kekka_tf.map do |doc_tf|
    doc_tf.map { |word, freq| [word, freq * kekka_idf[word]] }
  end
  @logger.debug(kekka_tfidf)

  kekka_tfidf
end
# Script body: score the four sample documents and print, for each one, its
# five highest-scoring words as "word<TAB>score", followed by a separator.
kekka_tfidf = tfidf(["a.txt", "b.txt", "c.txt", "d.txt"])
kekka_tfidf.each do |scores|
  # Highest score first; each element is a [word, score] pair.
  scores.sort_by! { |_word, score| -score }
  scores.first(5).each { |pair| puts pair.join("\t") }
  puts "----"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment