Skip to content

Instantly share code, notes, and snippets.

@sasamijp
Last active November 24, 2015 11:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sasamijp/3c90fc55a176fe3b9aaa to your computer and use it in GitHub Desktop.
Save sasamijp/3c90fc55a176fe3b9aaa to your computer and use it in GitHub Desktop.
# encoding: utf-8
require 'natto'
require 'pp'
require 'matrix'
# Shared MeCab tagger (natto gem); to_bigrams calls its #parse to tokenize a sentence.
@natto = Natto::MeCab.new
# Cosine similarity between two vectors.
#
# @param v1 [Vector] first vector
# @param v2 [Vector] second vector; must have the same size as v1
#   (Vector#inner_product raises on a size mismatch)
# @return [Float] inner product divided by the product of the two norms,
#   or 0.0 when either vector has zero norm (the ratio is undefined there
#   and would otherwise evaluate to NaN, which cannot be sorted or compared)
def simcos(v1, v2)
  dot = v1.inner_product(v2).to_f
  scale = v1.norm * v2.norm
  return 0.0 if scale.zero?

  dot / scale
end
# Splits a sentence into tokens with the shared @natto tagger and returns the
# word bigrams: each token's surface string concatenated with the next one's.
#
# Only tokens up to the third-from-last start a pair, so the final token never
# appears in any bigram. NOTE(review): presumably the last node yielded by
# natto is MeCab's end-of-sentence marker with an empty surface -- confirm.
#
# @param sent [String] sentence to tokenize
# @return [Array<String>] bigram strings in sentence order; empty when fewer
#   than three tokens are yielded
def to_bigrams(sent)
  surfaces = []
  @natto.parse(sent) { |node| surfaces << node.surface }

  pair_count = [surfaces.size - 2, 0].max
  Array.new(pair_count) { |pos| [surfaces[pos], surfaces[pos + 1]].join('') }
end
# List of all 2-grams, one per line; a bigram's position in this list is used
# as its vector dimension further down.
all_bigrams = File.foreach("all_bigram.csv").map { |row| row.chomp }

# Vectors: every line is split on commas and kept as an array of string fields.
vecs = File.foreach("vecs.csv").map { |row| row.chomp.split(',') }

# Source corpus: keep the second '|||'-separated field of every line
# (nil when a line has no second field).
sents = File.foreach("amami_n.csv").map { |row| row.chomp.split('|||')[1] }
# Collect input lines via Kernel#gets until it returns nil (end of input),
# with the trailing line terminator removed, then discard empty lines.
test_case = []
while (input = gets)
  test_case << input.chomp
end
test_case.reject! { |entry| entry == '' }
# For every input sentence, print "<sentence> - <corpus sentence>" where the
# corpus sentence is the one whose stored vector is most similar (cosine) to
# the sentence's binary bag-of-bigrams vector.

# Map each bigram to every position at which it occurs in all_bigrams, so a
# lookup is a hash access instead of a scan of the whole list per bigram.
bigram_rows = {}
all_bigrams.each_with_index { |bigram, row| (bigram_rows[bigram] ||= []) << row }

# Length of the dense vectors. 40355 is the size the script was written for
# (presumably the line count of all_bigram.csv -- TODO confirm); taking the
# larger value keeps both vectors the same length if the bigram list grows.
dimension = [40355, all_bigrams.size].max

test_case.each do |sent|
  print sent

  # Dimensions (positions in all_bigrams) of the bigrams found in the sentence.
  dims = to_bigrams(sent).flat_map { |bigram| bigram_rows.fetch(bigram, []) }
  in_sentence = {}
  dims.each { |d| in_sentence[d] = true }

  # Binary vector: 1 on every dimension whose bigram occurs in the sentence.
  vecsent = Array.new(dimension, 0)
  dims.each { |d| vecsent[d] = 1 }
  vecsent = Vector.elements(vecsent)

  scored = []
  vecs.each_with_index do |vec, row|
    # Each row is a flat list: value, index, value, index, ...
    # A trailing value without an index is ignored.
    pairs = vec.each_slice(2).select { |pair| pair.size == 2 }

    # Only rows sharing at least one dimension with the sentence are scored.
    next unless pairs.any? { |_value, index| in_sentence.key?(index.to_i) }

    nvec = Array.new(dimension, 0)
    pairs.each { |value, index| nvec[index.to_i] = value.to_f }

    score = simcos(Vector.elements(nvec), vecsent)
    score = 0.0 if score.nan? # an all-zero row has no direction; rank it last

    # Pair the score with the corpus line of the same row number (looking the
    # row up by value would return the first of any duplicate vectors).
    scored << [score, sents[row]]
  end

  # max_by rather than sort_by(...).first: sort_by orders ascending, which
  # would select the least similar candidate.
  best = scored.max_by { |score, _sentence| score }

  print " - "
  puts(best && best[1]) # prints an empty line when no candidate was found
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment