Skip to content

Instantly share code, notes, and snippets.

@yoneda
Created November 8, 2017 19:42
Show Gist options
  • Save yoneda/8f2ae88ff1484669f107870b81c60975 to your computer and use it in GitHub Desktop.
Save yoneda/8f2ae88ff1484669f107870b81c60975 to your computer and use it in GitHub Desktop.
# coding: utf-8
import math
def calc_cos(dictA, dictB):
    """Compute the cosine similarity of two sparse term-weight vectors.

    Each argument maps a term to a numeric weight (for example a word
    frequency). A term that is missing from a dict counts as weight 0.

    @param dictA term -> weight mapping for the first document
    @param dictB term -> weight mapping for the second document
    @return the cosine similarity as a float. For non-negative weights the
            result lies between 0 and 1, and a value closer to 1 means the
            documents are more similar. Returns 0.0 when either vector has
            zero length (e.g. an empty document), because the angle is
            undefined in that case.
    """
    # Euclidean length of each document vector.
    lengthA = math.sqrt(sum(value * value for value in dictA.values()))
    lengthB = math.sqrt(sum(value * value for value in dictB.values()))

    # An empty or all-zero vector has no direction; report "no similarity"
    # instead of dividing by zero.
    denominator = lengthA * lengthB
    if denominator == 0.0:
        return 0.0

    # Only terms present in both documents contribute to the dot product,
    # so walk the smaller dict and look each term up in the larger one
    # rather than comparing every pair of keys.
    if len(dictA) <= len(dictB):
        smaller, larger = dictA, dictB
    else:
        smaller, larger = dictB, dictA
    dotProduct = sum(
        weight * larger[term]
        for term, weight in smaller.items()
        if term in larger
    )

    return dotProduct / denominator
def words_to_freqdict(words):
    """Convert a sequence of words into a word -> occurrence-count dict.

    Example: ["X", "X", "Y", "Z", "X"] => {"X": 3, "Y": 1, "Z": 1}

    @param words the words to count
    @return dict mapping each distinct word to how many times it appeared
    """
    counts = {}
    for token in words:
        # A word seen for the first time starts from an implicit count of 0.
        counts[token] = counts.get(token, 0) + 1
    return counts
def main():
    """Print the cosine similarity of sample document A against B and C.

    Document A shares one distinct word with document B and two distinct
    words with document C.
    """
    doc_a = ["リンゴ", "ぶどう", "リンゴ", "パイナップル", "リンゴ"]
    doc_b = ["バスケ", "サッカー", "野球", "ぶどう", "テニス"]
    doc_c = ["ぶどう", "リンゴ", "マンゴー", "ぶどう"]

    # Turn each word list into a word -> frequency dict.
    freq_a = words_to_freqdict(doc_a)  # three distinct words, one appearing 3 times
    freq_b = words_to_freqdict(doc_b)  # five distinct words, each appearing once
    freq_c = words_to_freqdict(doc_c)  # three distinct words, one appearing twice

    # Compute both similarities first, then print A-vs-B followed by A-vs-C.
    similarities = [calc_cos(freq_a, other) for other in (freq_b, freq_c)]
    for similarity in similarities:
        print(similarity)


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment