Skip to content

Instantly share code, notes, and snippets.

@krishpop
Created April 16, 2019 03:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save krishpop/6c7da5e47acccc0037cd7bc2b2806cd7 to your computer and use it in GitHub Desktop.
Save krishpop/6c7da5e47acccc0037cd7bc2b2806cd7 to your computer and use it in GitHub Desktop.
Similarity metrics for Sparse Matrices
def jaccard_metric(x, y):
    """
    Weighted Jaccard similarity of two sparse row vectors.

    x: scipy.sparse CSR matrix shape (1, n)
    y: scipy.sparse CSR matrix shape (1, n)
    returns: sum of element-wise minima divided by sum of element-wise maxima
    """
    overlap = x.minimum(y).sum()
    union = x.maximum(y).sum()
    return overlap / union
def l2_metric(x, y):
    """
    Negated Euclidean distance between two sparse row vectors.

    x: scipy.sparse CSR matrix shape (1, n)
    y: scipy.sparse CSR matrix shape (1, n)
    returns: l2 similarity, i.e. minus the norm of (x - y), so larger is closer
    """
    from scipy.sparse import linalg

    difference = x - y
    distance = linalg.norm(difference)
    return -distance
def cos_metric(x, y):
    """
    Cosine similarity between two sparse row vectors.

    x: scipy.sparse CSR matrix shape (1, n)
    y: scipy.sparse CSR matrix shape (1, n); an (n, 1) column is also accepted
    returns: cosine similarity as a scalar (nan if either vector is all zeros)
    """
    from scipy.sparse import linalg

    # Two (1, n) rows cannot be multiplied directly: x.dot(y) raises a
    # dimension mismatch. Transpose y so the product is the 1x1 inner product.
    y_col = y.T if y.shape == x.shape else y
    # .sum() collapses the 1x1 sparse result into a plain scalar.
    inner = x.dot(y_col).sum()
    return inner / (linalg.norm(x) * linalg.norm(y))
def jaccard_pdist(X, Y):
    """
    Pairwise weighted Jaccard similarity between the rows of X and the rows of Y.

    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: dense array of shape (m1, m2); entry (i, j) is
        sum(min(X[i], Y[j])) / sum(max(X[i], Y[j])).
        An entry is nan when both rows are entirely zero (0 / 0).
    """
    import numpy as np

    m1, m2 = X.shape[0], Y.shape[0]
    if m2 == 0:
        # np.hstack cannot stack an empty list; there are simply no columns.
        return np.empty((m1, 0))

    columns = []
    for j in range(m2):
        # Repeat row j of Y once per row of X (m1 times, not m2) so the
        # element-wise min/max operate on equally shaped matrices.
        Y_row = Y[np.full(m1, j)]
        # reshape keeps each result an (m1, 1) column so hstack yields (m1, m2).
        num = np.asarray(X.minimum(Y_row).sum(1)).reshape(-1, 1)
        denom = np.asarray(X.maximum(Y_row).sum(1)).reshape(-1, 1)
        columns.append(num / denom)
    return np.hstack(columns)
def l2_pdist(X, Y):
    """
    Pairwise negated Euclidean distance between the rows of X and the rows of Y.

    help from https://stackoverflow.com/a/37903795
    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: dense array of shape (m1, m2); entry (i, j) is -||X[i] - Y[j]||_2,
        so larger (closer to zero) means more similar.
    """
    import numpy as np

    m1, m2 = X.shape[0], Y.shape[0]
    if m2 == 0:
        # np.hstack cannot stack an empty list; there are simply no columns.
        return np.empty((m1, 0))

    columns = []
    for j in range(m2):
        # Repeat row j of Y once per row of X (m1 times, not m2) so the
        # subtraction operates on equally shaped matrices.
        Y_row = Y[np.full(m1, j)]
        squared = np.asarray((X - Y_row).power(2).sum(1)).reshape(-1, 1)
        columns.append(-np.sqrt(squared))
    return np.hstack(columns)
def cos_pdist(X, Y):
    """
    Pairwise cosine similarity between the rows of X and the rows of Y.

    help from https://stackoverflow.com/a/43493487
    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: dense array, shape (m1, m2), of dot products scaled by both row norms
    """
    # Dense (m1, m2) matrix of raw inner products.
    inner = X.dot(Y.T).toarray()
    # Row norms: X's stay a column so they scale rows, Y's are flat so they scale columns.
    x_norms = np.sqrt(np.asarray(X.power(2).sum(1)))
    y_norms = np.sqrt(np.asarray(Y.power(2).sum(1)).flatten())
    scaled_by_x = inner / x_norms
    return scaled_by_x / y_norms
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment