Created
November 24, 2015 10:43
-
-
Save sasamijp/aded239e101de6d622c9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8
require 'natto'
require 'pp'

# Shared MeCab tagger (natto binding) used below to tokenize corpus sentences.
@natto = Natto::MeCab.new
# Computes the TF-IDF weight of +word+ within a single document.
#
#   tf  = (occurrences of word in doc_words) / doc_words.length
#   idf = natural log of (all_docs.length / number of docs that include word)
#
# @param all_docs [Array<#include?>] every document in the collection
# @param doc_words [Array] the terms of the document being scored
# @param word [Object] the term to weigh
# @return [Float] tf * idf; 0.0 when doc_words is empty or when no document
#   in all_docs includes word (both would otherwise divide by zero and
#   produce NaN or Infinity)
def tf_idf(all_docs, doc_words, word)
  return 0.0 if doc_words.empty?

  df = all_docs.count { |doc| doc.include?(word) }
  # A term contained in no document has no defined idf; score it as zero.
  return 0.0 if df.zero?

  tf = doc_words.count(word).to_f / doc_words.length
  idf = Math.log(all_docs.length.to_f / df)
  tf * idf
end
# Read the corpus data: consume every input line (ARGF, i.e. the same source
# Kernel#gets reads from), strip the line terminator and split it on '|||'.
corpus = ARGF.each_line.map { |line| line.chomp.split('|||') }
# Load the bigram list, one entry per line of all_bigram.csv. The position of
# an entry in this array is the index printed next to its tf-idf value below.
bigrams = File.readlines("all_bigram.csv").map(&:chomp)
# Convert the corpus into a sequence-of-bigrams structure: for each entry,
# tokenize its first '|||' field and join adjacent token surfaces pairwise.
ncorpus = corpus.map do |sent|
  surfaces = []
  @natto.parse(sent[0]) { |node| surfaces << node.surface }

  # The last parsed node never takes part in a bigram (same pairs as the
  # earlier nd[0..-3] index loop) -- presumably it is MeCab's end-of-sentence
  # node with an empty surface; TODO confirm.
  surfaces[0..-2].each_cons(2).map { |pair| pair.join('') }
end
# Print one sparse vector per document as a comma-separated line of
# "value,index,value,index,...": for every known bigram that occurs in the
# document, its tf-idf value followed by the bigram's index in +bigrams+.
ncorpus.each do |doc_bigrams|
  pairs = []
  bigrams.each_with_index do |bigram, index|
    next unless doc_bigrams.include?(bigram)

    # Emit the index explicitly so the value's position in the vector is known.
    pairs << [tf_idf(ncorpus, doc_bigrams, bigram), index]
  end
  puts pairs.flatten.join(',')
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment