Created
November 8, 2017 19:42
-
-
Save yoneda/8f2ae88ff1484669f107870b81c60975 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import math | |
def calc_cos(dictA, dictB):
    """
    Compute the cosine similarity of two sparse term-weight vectors.

    Each argument maps a term to its numeric weight (for example a word
    frequency); a term that is absent from a mapping has weight 0.

    @param dictA term -> weight mapping for the first document
    @param dictB term -> weight mapping for the second document
    @return the cosine similarity as a float. With non-negative weights the
            result lies between 0 and 1, and values closer to 1 mean the
            documents are more similar. Returns 0.0 when either vector has
            zero length (empty or all-zero weights), where the cosine is
            mathematically undefined.
    """
    # Euclidean length (L2 norm) of each vector.
    lengthA = math.sqrt(sum(value * value for value in dictA.values()))
    lengthB = math.sqrt(sum(value * value for value in dictB.values()))

    # A zero-length vector shares nothing with any document; bail out here
    # instead of dividing by zero below. Checking the product also covers
    # the case where two tiny norms underflow to 0.0 when multiplied.
    normProduct = lengthA * lengthB
    if normProduct == 0.0:
        return 0.0

    # Only terms present in both documents contribute to the dot product.
    # Look each term of A up in B directly rather than comparing every
    # (keyA, keyB) pair.
    dotProduct = sum(
        valueA * dictB[keyA]
        for keyA, valueA in dictA.items()
        if keyA in dictB
    )

    return dotProduct / normProduct
def words_to_freqdict(words):
    """
    Convert a sequence of words into a word -> occurrence-count dictionary.

    Example: ["X","X","Y","Z","X"] => {"X":3, "Y":1, "Z":1}

    @param words iterable of words (any hashable items)
    @return dict mapping each distinct word to the number of times it occurs,
            with keys in order of first appearance
    """
    counts = {}
    for token in words:
        # A word seen for the first time starts from an implicit count of 0.
        counts[token] = counts.get(token, 0) + 1
    return counts
def main():
    """
    Demo: compare document A against documents B and C and print both
    cosine similarities (A-B first, then A-C), one per line.
    """
    doc_a = ["リンゴ", "ぶどう", "リンゴ", "パイナップル", "リンゴ"]
    doc_b = ["バスケ", "サッカー", "野球", "ぶどう", "テニス"]
    doc_c = ["ぶどう", "リンゴ", "マンゴー", "ぶどう"]

    # Resulting word counts:
    #   A: {"リンゴ":3, "ぶどう":1, "パイナップル":1}
    #   B: {"バスケ":1, "サッカー":1, "野球":1, "ぶどう":1, "テニス":1}
    #   C: {"ぶどう":2, "リンゴ":1, "マンゴー":1}
    freq_a, freq_b, freq_c = [
        words_to_freqdict(doc) for doc in (doc_a, doc_b, doc_c)
    ]

    # Compute both similarities before printing anything.
    similarities = [calc_cos(freq_a, other) for other in (freq_b, freq_c)]
    for similarity in similarities:
        print(similarity)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment