Created
August 1, 2017 09:24
-
-
Save victorkohler/e02f58329ddf1492c3ca0521e417313a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#------------------------
# ITEM-ITEM CALCULATIONS
#------------------------

# As a first step we normalize the user vectors (one per row) to unit vectors.
# magnitude = sqrt(x^2 + y^2 + z^2 + ...), computed across each row.
magnitude = np.sqrt(np.square(data_items).sum(axis=1))

# A row containing only zeros has magnitude 0; dividing it by 0 would turn the
# whole row into NaN (0 / 0) and those NaNs would propagate into the
# similarity step. Dividing by 1 instead leaves such a row as all zeros.
magnitude = magnitude.replace(0, 1)

# unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
# axis='index' aligns the magnitudes with the rows, so each row is divided by
# its own magnitude.
data_items = data_items.divide(magnitude, axis='index')
def calculate_similarity(data_items):
    """Compute the pairwise cosine similarity between the columns of a frame.

    The frame is converted to a sparse CSR matrix and transposed, so that
    each original column becomes a row handed to ``cosine_similarity``.

    Returns a new square DataFrame whose index and columns are both the
    column labels of ``data_items``.
    """
    labels = data_items.columns
    # Transpose: the similarity is wanted between columns, not between rows.
    columns_as_rows = sparse.csr_matrix(data_items).transpose()
    scores = cosine_similarity(columns_as_rows)
    return pd.DataFrame(scores, index=labels, columns=labels)
# Build the similarity matrix
data_matrix = calculate_similarity(data_items)

# Let's get the top 11 similar artists for Beyonce.
# print(...) with a single parenthesised argument behaves the same under
# Python 2 and Python 3, whereas the bare `print expr` statement is a syntax
# error on Python 3.
print(data_matrix.loc['beyonce'].nlargest(11))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment