farhanhubble/cosine-similarity-matrices.py

## cosine-similarity-matrices.py
## Computes the cosine distance (1 - cosine similarity) between the rows of two matrices.
## distance[i][j] = 1 - (A[i,:] . B[j,:]) / (|A[i,:]| * |B[j,:]|)
def cosine_similarity(A, B):
    """Return the pairwise cosine distance between the rows of A and B.

    Despite the function's name, the value returned is
    ``1 - cosine_similarity``: entry ``[i][j]`` is
    ``1 - (A[i,:] . B[j,:]) / (|A[i,:]| * |B[j,:]|)``.
    It is therefore 0 for rows pointing in the same direction, 1 for
    orthogonal rows and 2 for rows pointing in opposite directions.

    Parameters:
        A: 2-D array, one vector per row.
        B: 2-D array, one vector per row, with the same number of
           columns as A.

    Returns:
        Array with one row per row of A and one column per row of B.

    A row of all zeros has norm 0, which makes the division 0/0; the
    corresponding entries come out as NaN.
    """
    # Dot product of every row of A with every row of B.
    dots = np.matmul(A, B.T)

    # Euclidean length of each row of A (as a column) and of each row
    # of B (as a row), shaped so their matrix product is the table of
    # all |A[i,:]| * |B[j,:]| combinations.
    lengths_a = np.linalg.norm(A, axis=1).reshape(-1, 1)
    lengths_b = np.linalg.norm(B.T, axis=0).reshape(1, -1)
    length_products = lengths_a @ lengths_b

    return 1 - dots / length_products

## For every row of A, count the rows of B whose cosine distance
## to that row is at most `threshold`. When A or B are very large
## the counts are accumulated block by block (`stride` rows at a
## time) to avoid memory overruns. The default threshold = 0.0
## counts duplicates (rows pointing in exactly the same direction).
def similarity_count(A, B, stride=5000, threshold=0.0):
    """Count, for every row of A, the rows of B within a cosine distance.

    The work is done on blocks of at most ``stride`` rows of A against
    at most ``stride`` rows of B, so only one block of the distance
    matrix is held in memory at a time.

    Parameters:
        A: 2-D array, one vector per row.
        B: 2-D array, one vector per row.
        stride: maximum number of rows per block; must be positive.
        threshold: a row of B is counted when the value returned by
            ``cosine_similarity`` for the pair is ``<= threshold``.

    Returns:
        1-D float array of length ``A.shape[0]`` holding the counts.

    Raises:
        ValueError: if ``stride`` is not positive.

    NOTE(review): with the default ``threshold=0.0`` the comparison is
    exact, so floating-point rounding in the distance may decide
    whether a matching row is counted -- confirm whether a small
    positive threshold is wanted by callers.
    """
    # A negative stride would make both ranges empty and silently
    # report zero matches for every row; fail loudly instead.
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    nrowsA = A.shape[0]
    nrowsB = B.shape[0]

    # np.zeros already starts every count at 0, so no per-block reset
    # is needed before accumulating.
    result = np.zeros(nrowsA)

    for startA in range(0, nrowsA, stride):
        endA = min(startA + stride, nrowsA)
        blockA = A[startA:endA]

        for startB in range(0, nrowsB, stride):
            endB = min(startB + stride, nrowsB)

            distances = cosine_similarity(blockA, B[startB:endB])
            # Number of rows in this block of B close enough to each
            # row in this block of A.
            result[startA:endA] += np.sum(distances <= threshold, axis=1)

    return result
	## Computes the cosine distance (1 - cosine similarity) between the rows of two matrices.
	## distance[i][j] = 1 - (A[i,:] . B[j,:]) / (|A[i,:]| * |B[j,:]|)
	def cosine_similarity(A,B):
	    """Return the pairwise cosine distance between the rows of A and B.

	    Despite the function's name, the value returned is
	    ``1 - cosine_similarity``: entry ``[i][j]`` is
	    ``1 - (A[i,:] . B[j,:]) / (|A[i,:]| * |B[j,:]|)``.

	    Parameters:
	        A: 2-D array, one vector per row.
	        B: 2-D array, one vector per row, with the same number of
	           columns as A.

	    Returns:
	        Array with one row per row of A and one column per row of B.

	    A row of all zeros has norm 0, which makes the division 0/0; the
	    corresponding entries come out as NaN.
	    """
	    B_T = B.T

	    # Dot product of every row of A with every row of B.
	    product = np.matmul(A,B_T)

	    # Row norms of A as a column vector.
	    normA = np.linalg.norm(A,axis=1)
	    normA = normA.reshape(normA.size,1)

	    # Row norms of B (column norms of B_T) as a row vector.
	    normB_T = np.linalg.norm(B_T,axis=0)
	    normB_T = normB_T.reshape(1,normB_T.size)

	    # Table of |A[i,:]| * |B[j,:]| for every pair (i, j).
	    product_norms = np.matmul(normA,normB_T)

	    distance = np.subtract(1,np.divide(product,product_norms))

	    return distance

	## For every row of A, count the rows of B whose cosine distance
	## to that row is at most `threshold`. When A or B are very large
	## the counts are accumulated block by block (`stride` rows at a
	## time) to avoid memory overruns. The default threshold = 0.0
	## counts duplicates (rows pointing in exactly the same direction).
	def similarity_count(A,B,stride=5000,threshold=0.0):
	    """Count, for every row of A, the rows of B within a cosine distance.

	    The work is done on blocks of at most ``stride`` rows of A against
	    at most ``stride`` rows of B, so only one block of the distance
	    matrix is held in memory at a time.

	    Parameters:
	        A: 2-D array, one vector per row.
	        B: 2-D array, one vector per row.
	        stride: maximum number of rows per block; must be positive.
	        threshold: a row of B is counted when the value returned by
	            ``cosine_similarity`` for the pair is ``<= threshold``.

	    Returns:
	        1-D float array of length ``A.shape[0]`` holding the counts.

	    Raises:
	        ValueError: if ``stride`` is not positive.
	    """
	    # A negative stride would make both ranges empty and silently
	    # report zero matches for every row; fail loudly instead.
	    if stride <= 0:
	        raise ValueError("stride must be a positive integer")

	    nrowsA = A.shape[0]
	    nrowsB = B.shape[0]

	    # np.zeros already starts every count at 0, so no per-block reset
	    # is needed before accumulating.
	    result = np.zeros(nrowsA)

	    for rowA in range(0,nrowsA,stride):
	        startA = rowA
	        endA = min(rowA+stride,nrowsA)

	        for rowB in range(0,nrowsB,stride):
	            startB = rowB
	            endB = min(rowB+stride,nrowsB)

	            # Add the number of rows in this block of B that are
	            # close enough to each row in this block of A.
	            result[startA:endA] = result[startA:endA] + \
	                np.sum(cosine_similarity(A[startA:endA],B[startB:endB]) <= threshold, axis=1)

	    return result