Created
December 31, 2016 17:41
-
-
Save farhanhubble/f0f9c9f026620428ba0a9b450cdea327 to your computer and use it in GitHub Desktop.
Vectorized implementation of cosine similarity between every pair of rows taken from two matrices. I wrote this code for finding duplicate images between two image datasets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def cosine_similarity(A, B):
    """Return the pairwise cosine *distance* between rows of A and rows of B.

    Despite its name, this function returns ``1 - cosine similarity``:

        result[i, j] = 1 - (A[i, :] . B[j, :]) / (|A[i, :]| * |B[j, :]|)

    so 0 means the two rows point in the same direction, 1 means they are
    orthogonal and 2 means they point in opposite directions.

    Args:
        A: 2-D array-like of shape (m, d), one vector per row.
        B: 2-D array-like of shape (n, d), one vector per row.

    Returns:
        Array of shape (m, n) holding the cosine distances.

    Notes:
        * Integer and boolean inputs (e.g. uint8 image data) are promoted to
          float64 before the dot products are taken; otherwise ``np.matmul``
          would accumulate in the narrow integer type and overflow.
        * A row with zero norm has no direction. Its cosine similarity with
          any row is taken to be 0 (distance 1) instead of dividing by zero.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    # np.linalg.norm already works in floating point for integer input;
    # make the dot products below use floating point as well.
    if not np.issubdtype(A.dtype, np.inexact):
        A = A.astype(np.float64)
    if not np.issubdtype(B.dtype, np.inexact):
        B = B.astype(np.float64)

    # dots[i, j] = A[i, :] . B[j, :]
    dots = np.matmul(A, B.T)

    # norm_products[i, j] = |A[i, :]| * |B[j, :]|, built by broadcasting an
    # (m, 1) column of norms against an (n,) row of norms.
    norm_products = (np.linalg.norm(A, axis=1, keepdims=True)
                     * np.linalg.norm(B, axis=1))

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0 they were initialised with.
    cosines = np.zeros(dots.shape, dtype=np.result_type(dots, norm_products))
    np.divide(dots, norm_products, out=cosines, where=norm_products != 0)

    return np.subtract(1, cosines)
def similarity_count(A, B, stride=5000, threshold=0.0):
    """Count, for every row of A, the rows of B within a distance threshold.

    Distances come from ``cosine_similarity``, which returns
    ``1 - cosine similarity`` for every pair of rows. A row of B is counted
    for a row of A when that value is ``<= threshold``. With the default
    ``threshold=0.0`` only rows pointing in the same direction (duplicates,
    up to scaling) are counted.

    To keep memory bounded when A or B is very large, the distance matrix is
    never built in full: A and B are walked in chunks of at most ``stride``
    rows, so at most ``stride x stride`` distances exist at a time.

    Args:
        A: 2-D array of shape (m, d), one vector per row.
        B: 2-D array of shape (n, d), one vector per row.
        stride: Maximum number of rows taken from A and from B per chunk.
            Must be a positive integer.
        threshold: Largest distance that still counts as a match.

    Returns:
        1-D float array of length m; element i is the number of rows of B
        whose distance to ``A[i]`` is ``<= threshold``.

    Raises:
        ValueError: If ``stride`` is not positive.

    Note:
        Distances are floating-point values, so rows that are mathematically
        identical may come out marginally above 0 due to rounding. Pass a
        small positive ``threshold`` if such pairs must be counted.
    """
    # A non-positive stride would either make range() fail with an unrelated
    # message (0) or silently skip every chunk and return all zeros (< 0).
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    nrowsA = A.shape[0]
    nrowsB = B.shape[0]
    result = np.zeros(nrowsA)

    for startA in range(0, nrowsA, stride):
        endA = min(startA + stride, nrowsA)
        chunkA = A[startA:endA]  # invariant across the inner loop
        for startB in range(0, nrowsB, stride):
            endB = min(startB + stride, nrowsB)
            distances = cosine_similarity(chunkA, B[startB:endB])
            # Add this B-chunk's matches to the running per-row totals.
            result[startA:endA] += np.sum(distances <= threshold, axis=1)
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment