December 9, 2015
experiments with disco
from operator import add
from itertools import combinations
from math import sqrt
def emit_pairs(words):
    """Yield ``(pair, 1)`` for every 2-combination of the items in *words*.

    The pairs are intended to be used as keys whose counts are summed
    later, so each pair is emitted in a canonical (sorted) order: the same
    two words always produce the same key, no matter how the input
    collection happens to iterate.

    Args:
        words: iterable of mutually comparable items (the script passes
            the set of distinct words found on one line of text).

    Yields:
        ``((first, second), 1)`` tuples with ``first <= second``.
    """
    # Sets have no guaranteed iteration order; without sorting, one line
    # could emit ('a', 'b') and another ('b', 'a') for the same word pair.
    for pair in combinations(sorted(words), 2):
        yield pair, 1
def cosine_similarity(pair, cross_product, magnitudes):
    """Return ``((w1, w2), similarity)`` for one pair of words.

    The similarity is ``cross_product / sqrt(m[w1]) / sqrt(m[w2])`` where
    ``m`` is ``magnitudes.value``.

    Args:
        pair: two-item sequence ``(w1, w2)``; passed positionally, exactly
            as with the former tuple-unpacking parameter.
        cross_product: numerator of the similarity for this pair.
        magnitudes: object whose ``.value`` attribute maps each word to
            its squared magnitude (the script passes a Spark broadcast of
            per-word counts).

    Raises:
        KeyError: if either word is missing from ``magnitudes.value``.
        ZeroDivisionError: if either stored magnitude is zero.
    """
    # Tuple parameters in the signature are Python-2-only syntax
    # (PEP 3113), so the pair is unpacked here instead.
    w1, w2 = pair
    norms = magnitudes.value
    similarity = cross_product / sqrt(norms[w1]) / sqrt(norms[w2])
    return (w1, w2), similarity
# NOTE(review): `sc` is not defined in this file; presumably it is the
# SparkContext supplied by the pyspark shell -- confirm before running as a
# standalone script.

# Each input line becomes the set of distinct lower-cased words on it.
# The RDD is cached because two separate jobs below both read it.
text = (sc.textFile('text.txt')
          .map(lambda s: s.lower())
          .map(lambda s: set(s.split()))
          .cache())

# Number of lines each word appears on, collected to the driver as a dict
# and broadcast so every task can look magnitudes up locally.
magnitudes = (text.flatMap(lambda s: s)
                  .map(lambda w: (w, 1))
                  .reduceByKey(add)
                  .collectAsMap())
broadcasted_magnitudes = sc.broadcast(magnitudes)

# Sum the per-line (pair, 1) records first, so that each pair reaches
# cosine_similarity exactly once with its total co-occurrence count as the
# cross product (rather than once per line with a count of 1).
similarities = (text.flatMap(emit_pairs)
                    .reduceByKey(add)
                    .map(lambda kv: cosine_similarity(
                        kv[0], kv[1], broadcasted_magnitudes)))

print(similarities.collect())
