Skip to content

Instantly share code, notes, and snippets.

@Kasahs
Last active December 17, 2020 17:40
Show Gist options
  • Save Kasahs/a1cd720e3c9d2f66bff0ee9a81210b9e to your computer and use it in GitHub Desktop.
Save Kasahs/a1cd720e3c9d2f66bff0ee9a81210b9e to your computer and use it in GitHub Desktop.
Cosine similarity of two string vectors
import math
import re
import sys
from collections import Counter
# Matches each maximal run of word characters (\w); used to tokenize text.
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Mapping from term to numeric weight (for example a ``Counter``).
        vec2: Mapping from term to numeric weight.

    Returns:
        The dot product over the terms present in both mappings divided by
        the product of the two Euclidean norms, as a float. Returns 0.0 when
        that product is zero (for example when either mapping is empty).
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = vec1.keys() & vec2.keys()
    numerator = sum(vec1[term] * vec2[term] for term in shared_terms)

    sum1 = sum(weight ** 2 for weight in vec1.values())
    sum2 = sum(weight ** 2 for weight in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    # Guard against division by zero for empty / all-zero vectors.
    if not denominator:
        return 0.0
    return numerator / denominator
def text_to_vector(text):
    """Tokenize *text* with the module-level WORD pattern and count each token.

    Returns a ``Counter`` mapping every matched token to its number of
    occurrences in *text*.
    """
    return Counter(WORD.findall(text))
def main():
    """Rank every line of an input file against each query by cosine similarity.

    Command-line arguments:
        sys.argv[1]: path of the file whose lines are scored.
        sys.argv[2]: path of the file containing one query per line.

    For each query this prints the query prefixed with its 1-based line
    number, then every input line as ``<line number>: <score>, <line>``
    ordered by descending score, then a blank separator.

    Raises:
        IndexError: if fewer than two command-line arguments are given.
        OSError: if either file cannot be opened.
    """
    infile_path = sys.argv[1]
    query_file_path = sys.argv[2]
    with open(query_file_path, "r") as query_file, open(infile_path, "r") as infile:
        # Read and vectorize the input once. A file object is exhausted after
        # a single pass, so iterating it inside the query loop would leave
        # every query after the first with nothing to compare against.
        corpus = [(line, text_to_vector(line)) for line in infile]

        for query_no, query in enumerate(query_file, start=1):
            # The query vector does not depend on the input line; build it once.
            query_vector = text_to_vector(query)
            scored = [
                (get_cosine(query_vector, line_vector), line, line_no)
                for line_no, (line, line_vector) in enumerate(corpus, start=1)
            ]
            # sorted() is stable, so equally scored lines keep file order.
            ranked = sorted(scored, key=lambda item: item[0], reverse=True)

            print(str(query_no) + ": " + query)
            # Use the line number stored with each entry, not a loop leftover.
            print("\n".join(
                str(line_no) + ": " + str(score) + ", " + line
                for score, line, line_no in ranked
            ))
            print("\n")
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment