Skip to content

Instantly share code, notes, and snippets.

@asSqr
Created May 10, 2019 07:03
Show Gist options
  • Save asSqr/86838056ae89d585858dd476175201a2 to your computer and use it in GitHub Desktop.
Save asSqr/86838056ae89d585858dd476175201a2 to your computer and use it in GitHub Desktop.
# coding: utf-8
from pykakasi import kakasi
import Levenshtein
import gensim
import MeCab
import numpy as np
from scipy import spatial
# Load the pre-trained word vectors from a text-format (binary=False) word2vec file.
# This runs at import time and requires the model file to exist at this relative path.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('model/model_neologd.vec', binary=False)
# MeCab tagger backed by the mecab-ipadic-neologd dictionary; -Owakati requests
# space-separated token output, which avg_feature_vector splits into words.
mecab = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd -Owakati")
# NOTE: this rebinds the imported `kakasi` name to an instance, so the class
# itself is no longer reachable under that name after this line.
kakasi = kakasi()
# Conversion mode "J" -> "H"; in pykakasi's setMode API this is presumably
# kanji-to-hiragana — confirm against the installed pykakasi version.
kakasi.setMode( "J", "H" )
# Converter object; strConvert calls conv.do(text) on its input.
conv = kakasi.getConverter()
# Average the feature vectors of the words used in a sentence.
def avg_feature_vector(sentence, model, num_features):
    """Return the mean word vector of ``sentence``.

    The sentence is tokenised with the module-level ``mecab`` tagger and every
    token is looked up in ``model``. Tokens the model does not contain are
    skipped rather than letting a single unknown word abort the whole
    computation with ``KeyError``; the mean is taken over the tokens that were
    actually found.

    Args:
        sentence: Text to tokenise.
        model: Object indexable by token (``model[word]``) that yields a
            vector of length ``num_features`` and raises ``KeyError`` for
            unknown tokens.
        num_features: Length of the vectors stored in ``model``.

    Returns:
        An array of shape ``(num_features,)``. It is all zeros when no token
        of the sentence is present in ``model``.
    """
    # The wakati output ends with a trailing " \n"; strip it before splitting.
    words = mecab.parse(sentence).replace(' \n', '').split()
    # Accumulator for the sum of the word vectors.
    feature_vec = np.zeros((num_features,), dtype="float32")
    known_words = 0
    for word in words:
        try:
            vector = model[word]
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the average.
            continue
        feature_vec = np.add(feature_vec, vector)
        known_words += 1
    # Divide by the number of vectors actually summed so unknown tokens do
    # not drag the mean towards zero.
    if known_words > 0:
        feature_vec = np.divide(feature_vec, known_words)
    return feature_vec
# Compute the similarity of two sentences.
def sentence_similarity(sentence_1, sentence_2):
    """Return the cosine similarity of the averaged word vectors of two sentences.

    Both sentences are reduced to one vector each with ``avg_feature_vector``
    using the module-level ``word2vec_model``.

    Args:
        sentence_1: First sentence.
        sentence_2: Second sentence.

    Returns:
        ``1 - cosine_distance`` of the two averaged vectors; larger values
        mean the sentences are more alike.
    """
    # Take the dimensionality from the loaded model instead of hard-coding
    # 300, so swapping in a model of another size keeps working.
    num_features = word2vec_model.vector_size
    sentence_1_avg_vector = avg_feature_vector(sentence_1, word2vec_model, num_features)
    sentence_2_avg_vector = avg_feature_vector(sentence_2, word2vec_model, num_features)
    # NOTE(review): if either averaged vector is all zeros the cosine distance
    # is undefined — confirm how the installed scipy reports that case.
    # Subtracting the cosine distance from 1 yields the cosine similarity.
    return 1 - spatial.distance.cosine(sentence_1_avg_vector, sentence_2_avg_vector)
def strConvert(str, distFunc):
    """Map an utterance to the two-character code of the closest known command.

    The input is first passed through the module-level converter ``conv``.
    ``distFunc`` is then evaluated against every candidate phrase and the
    candidate with the strictly smallest score wins (the earliest one on
    ties). ``distFunc`` is therefore treated as a distance: smaller means
    closer.

    Args:
        str: Utterance to classify. The name shadows the builtin but is kept
            because it is part of the signature.
        distFunc: Callable ``distFunc(source, target)`` returning a
            comparable score.

    Returns:
        The winning code followed by a newline. When no candidate scores
        below 6 the code of the last candidate (``'00'``) is returned.
    """
    reading = conv.do(str)
    # (phrase, code) pairs, in the order they are evaluated.
    candidates = (
        (u'みぎをあげて', '02'),
        (u'みぎをさげて', '01'),
        (u'ひだりをあげて', '20'),
        (u'ひだりをさげて', '10'),
        (u'しゅうりょう', '33'),
        (u'認識不能', '00'),
    )
    # Fallback when nothing beats the threshold: the last candidate's code.
    best_code = candidates[-1][1]
    best_score = 6
    for phrase, code in candidates:
        score = distFunc(reading, phrase)
        if best_score > score:
            best_score = score
            best_code = code
    return best_code + "\n"
def _sentence_distance(sentence_1, sentence_2):
    """Return the cosine distance between two sentences (smaller is closer).

    strConvert keeps the candidate with the smallest score, whereas
    sentence_similarity grows as sentences become more alike. Passing the
    similarity directly would select the least similar command, so it is
    converted to a distance here.
    """
    return 1 - sentence_similarity(sentence_1, sentence_2)


# Classify a sample utterance and write the resulting command code to codes.txt.
# NOTE(review): strConvert's threshold of 6 looks tuned for edit distances;
# cosine distances stay well below it, so the "unrecognised" fallback will
# rarely trigger with this metric — confirm the intended metric.
with open('./codes.txt', mode='w') as f:
    f.write(strConvert('みきあげて', _sentence_distance))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment