Skip to content

Instantly share code, notes, and snippets.

@usami
Created March 3, 2012 16:21
Show Gist options
  • Save usami/1966856 to your computer and use it in GitHub Desktop.
Save usami/1966856 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from stemming.porter2 import stem
from math import sqrt
class Docsim:
    """Compute set- and vector-based similarity measures between two documents.

    Each document is reduced to a mapping of stemmed term -> occurrence
    count.  The set-based measures (matching, dice, jaccard, overlap,
    b_cosine) only look at which terms are present; ``cosine`` also uses
    the occurrence counts.
    """

    # Names accepted by measure(); each one is also the name of the method
    # that computes it.
    _MEASURES = ('matching', 'dice', 'jaccard', 'overlap', 'b_cosine', 'cosine')

    def __init__(self):
        # Term -> count mappings of the two documents being compared.
        self.x = {}
        self.y = {}
        # Number of distinct terms in each document.
        self.xlen = 0
        self.ylen = 0
        # Cached size of the term intersection; None means "not computed yet".
        self.cap = None

    def measure(self, document1, document2,
                types=('matching', 'dice', 'jaccard', 'overlap', 'b_cosine', 'cosine')):
        """Return the requested similarity measures for two documents.

        document1, document2 -- strings of whitespace-separated words.
        types -- iterable of measure names; names that are not known
                 measures are ignored.

        Returns the bare value when exactly one measure was computed,
        otherwise a list of values in the order requested (an empty list
        when nothing was computed).
        """
        self.x = self.count(document1)
        self.y = self.count(document2)
        self.xlen = len(self.x)
        self.ylen = len(self.y)
        # The cached intersection size belongs to the previous pair.
        self.cap = None

        results = [getattr(self, name)() for name in types
                   if name in self._MEASURES]
        if len(results) == 1:
            return results[0]
        return results

    def count(self, document):
        """Return a dict mapping each stemmed word of *document* to its count."""
        counts = {}
        # split() without an argument splits on any run of whitespace, so
        # repeated spaces, tabs and newlines never yield an empty "word".
        for word in document.split():
            term = stem(word)
            counts[term] = counts.get(term, 0) + 1
        return counts

    def matching(self):
        """Return the number of distinct terms shared by both documents."""
        if self.cap is None:
            self.cap = len(set(self.x) & set(self.y))
        return self.cap

    def dice(self):
        """Return the Dice coefficient, 2|X & Y| / (|X| + |Y|), or 0.0 if both are empty."""
        total = self.xlen + self.ylen
        if not total:
            return 0.0
        return 2.0 * self.matching() / total

    def jaccard(self):
        """Return the Jaccard coefficient, |X & Y| / |X | Y|, or 0.0 if both are empty."""
        union = len(set(self.x) | set(self.y))
        if not union:
            return 0.0
        return 1.0 * self.matching() / union

    def overlap(self):
        """Return the overlap coefficient, |X & Y| / min(|X|, |Y|), or 0.0 if either is empty."""
        smaller = min(self.xlen, self.ylen)
        if not smaller:
            return 0.0
        return 1.0 * self.matching() / smaller

    def b_cosine(self):
        """Return the binary cosine, |X & Y| / sqrt(|X| * |Y|), or 0.0 if either is empty."""
        product = self.xlen * self.ylen
        if not product:
            return 0.0
        return 1.0 * self.matching() / sqrt(product)

    def cosine(self):
        """Return the cosine of the two term-count vectors, or 0.0 if either has zero norm."""
        denominator = self.norm(self.x) * self.norm(self.y)
        if not denominator:
            return 0.0
        return 1.0 * self.dot_product() / denominator

    def dot_product(self):
        """Return the dot product of the two term-count vectors."""
        return sum(value * self.y[term]
                   for term, value in self.x.items()
                   if term in self.y)

    def norm(self, dic):
        """Return the Euclidean norm of the values of *dic*."""
        return sqrt(sum(value * value for value in dic.values()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment