Created
December 20, 2014 01:27
入力テキストとして270次元のベクトルファイルが必要になる
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Interactive nearest-neighbour lookup over word vectors by cosine similarity.

Usage: python <this script> <vector-file>

The vector file is plain text.  Its first line is skipped (it is treated as a
header, not a vector); every following non-blank line holds a word followed by
its whitespace-separated vector components.
"""
import sys
import math

# Number of neighbours printed for each query.
N = 20

# Number of prompts served before the script exits on its own.
MAX_QUERIES = 100

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3


def parse_vectors(lines):
    """Build the vocabulary table from the lines of a vector file.

    Args:
        lines: iterable of text lines.  The first line is skipped; blank
            lines are ignored.

    Returns:
        dict mapping each word to a list ``[norm, c1, c2, ...]`` where
        ``norm`` is the Euclidean length of the components that follow.

    Raises:
        ValueError: if a component cannot be converted with ``float``.
    """
    vocab = {}
    for index, raw in enumerate(lines):
        if index == 0:
            continue  # first line is not a vector
        fields = raw.split()
        if not fields:
            continue  # a blank line used to raise IndexError
        components = [float(token) for token in fields[1:]]
        norm = math.sqrt(sum(value * value for value in components))
        # Slot 0 caches the norm so it is computed once per word, not per query.
        vocab[fields[0]] = [norm] + components
    return vocab


def load_vectors(path):
    """Read ``path`` and return the table produced by ``parse_vectors``."""
    with open(path, "r") as handle:  # closed even if parsing fails
        return parse_vectors(handle)


def cosine(first, second):
    """Return the cosine similarity of two ``[norm, c1, c2, ...]`` entries.

    All components present in both entries are used (no fixed dimension).
    If either norm is zero the similarity is defined as 0.0 instead of
    dividing by zero.
    """
    scale = first[0] * second[0]
    if scale == 0:
        return 0.0
    # Accumulate every component product (the dot product), skipping slot 0.
    dot = sum(x * y for x, y in zip(first[1:], second[1:]))
    return dot / scale


def nearest(vocab, query, limit):
    """Rank the words most similar to ``query``.

    Args:
        vocab: table as returned by ``parse_vectors``; must contain ``query``.
        query: the word to compare against; it is excluded from the result.
        limit: maximum number of neighbours to keep.

    Returns:
        ``(ranked, insertions)``: ``ranked`` is a list of ``(score, word)``
        tuples, best first, holding at most ``limit`` entries;
        ``insertions`` counts how many candidates entered the running
        top list during the scan (a diagnostic figure).
    """
    target = vocab[query]
    ranked = []
    insertions = 0
    for word, entry in vocab.items():
        if word == query:
            continue
        score = cosine(target, entry)
        # Walk up from the tail; a strict comparison keeps earlier ties ahead.
        position = len(ranked)
        while position > 0 and score > ranked[position - 1][0]:
            position -= 1
        if position < limit:
            ranked.insert(position, (score, word))
            del ranked[limit:]
            insertions += 1
    return ranked, insertions


def main(argv):
    """Load the vectors named by ``argv[0]`` and answer similarity queries.

    Prompts up to ``MAX_QUERIES`` times and stops early on end of input.
    Returns a process exit status (0 on success, 2 on a usage error).
    """
    if not argv:
        sys.stderr.write('usage: python <script> <vector-file>\n')
        return 2

    print('now reading vectors')
    vocab = load_vectors(argv[0])

    for _ in range(MAX_QUERIES):
        try:
            query = read_line('please input word >')
        except EOFError:
            break  # stdin closed: finish quietly instead of a traceback
        print(query)
        if query not in vocab:
            print('the word does not exist in my dictionary.')
            continue
        print(' Word Cosine distance')
        print('------------------------------------------------------------------------')
        ranked, insertions = nearest(vocab, query, N)
        print(str(insertions))
        for score, word in ranked:
            print('%50s\t\t%f' % (word, score))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment