Created
May 10, 2019 07:03
-
-
Save asSqr/86838056ae89d585858dd476175201a2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
from pykakasi import kakasi
import Levenshtein
import gensim
import MeCab
import numpy as np
from scipy import spatial

# Word vectors are read from a text-format (non-binary) word2vec file.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'model/model_neologd.vec',
    binary=False,
)

# MeCab tagger using the NEologd dictionary; -Owakati selects the
# space-separated (wakati) output that avg_feature_vector splits on.
mecab = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd -Owakati")

# NOTE: this rebinds the imported ``kakasi`` class name to an instance.
# ``conv`` is the converter used by strConvert; mode ("J", "H") is presumably
# kanji -> hiragana (confirm against the pykakasi documentation).
kakasi = kakasi()
kakasi.setMode("J", "H")
conv = kakasi.getConverter()
def avg_feature_vector(sentence, model, num_features):
    """Return the average feature vector of the words used in *sentence*.

    The sentence is tokenized with the module-level MeCab tagger ``mecab``,
    every token is looked up in *model*, and the vectors that were found are
    averaged.

    Args:
        sentence: Text to tokenize and embed.
        model: Object indexable by token that returns a vector of length
            *num_features* and raises ``KeyError`` for unknown tokens.
        num_features: Length of the vectors stored in *model*.

    Returns:
        A numpy array of shape ``(num_features,)``. It is all zeros when the
        sentence yields no token known to *model*.
    """
    # MeCab's wakati output ends with a newline; strip it before splitting.
    words = mecab.parse(sentence).replace(' \n', '').split()
    # Accumulator for the sum of the word vectors.
    feature_vec = np.zeros((num_features,), dtype="float32")
    known_words = 0
    for word in words:
        try:
            word_vec = model[word]
        except KeyError:
            # Out-of-vocabulary token: skip it instead of aborting the whole
            # sentence, and leave it out of the average.
            continue
        feature_vec = np.add(feature_vec, word_vec)
        known_words += 1
    if known_words:
        # Divide by the number of vectors actually summed so skipped tokens
        # do not shrink the mean.
        feature_vec = np.divide(feature_vec, known_words)
    return feature_vec
def sentence_similarity(sentence_1, sentence_2):
    """Return the cosine similarity of two sentences.

    Each sentence is reduced to its average word vector with
    ``avg_feature_vector`` (using the module-level ``word2vec_model``), and
    the two averages are compared.

    Args:
        sentence_1: First sentence.
        sentence_2: Second sentence.

    Returns:
        ``1 - cosine distance`` between the two average vectors.
    """
    # The Word2Vec model used here was built with 300-dimensional feature
    # vectors, so num_features is fixed at 300 as well.
    num_features = 300
    avg_vectors = [
        avg_feature_vector(text, word2vec_model, num_features)
        for text in (sentence_1, sentence_2)
    ]
    # Subtracting the distance between the vectors from 1 gives the cosine
    # similarity.
    cosine_distance = spatial.distance.cosine(*avg_vectors)
    return 1 - cosine_distance
def strConvert(str, distFunc):
    """Map an utterance to the code of the closest known command phrase.

    The input is first passed through the module-level converter ``conv``,
    then scored against every known phrase with *distFunc*; the phrase with
    the smallest score wins (the first one on ties).

    Args:
        str: Text to classify. (The name shadows the built-in but is kept so
            existing callers keep working.)
        distFunc: Callable ``distFunc(source, target)`` returning a score
            where smaller means closer.

    Returns:
        The two-character code of the winning phrase followed by a newline.
        When no phrase scores below 6 the last code, '00', is returned.
    """
    reading = conv.do(str)
    # Each known phrase paired with the code emitted for it.
    candidates = (
        (u'みぎをあげて', '02'),
        (u'みぎをさげて', '01'),
        (u'ひだりをあげて', '20'),
        (u'ひだりをさげて', '10'),
        (u'しゅうりょう', '33'),
        (u'認識不能', '00'),
    )
    # Fallback when nothing beats the initial bound: the last entry's code.
    best_code = candidates[-1][1]
    best_distance = 6
    for phrase, code in candidates:
        distance = distFunc(reading, phrase)
        if best_distance > distance:
            best_distance = distance
            best_code = code
    return best_code + "\n"
# strConvert keeps the candidate with the SMALLEST score, so it needs a
# distance. sentence_similarity grows as sentences get closer; passing it
# directly would select the least similar phrase. Use the cosine distance
# (1 - similarity) instead.
with open('./codes.txt', mode='w') as f:
    f.write(strConvert(
        'みきあげて',
        lambda sentence_1, sentence_2: 1 - sentence_similarity(sentence_1, sentence_2),
    ))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment