Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Fast NumPy implementation of collaborative filtering
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise
def get_recommendation_matrix(listening_history, n_similar=20):
    """Compute collaborative-filtering scores using artist cosine similarity.

    Args:
        listening_history: 2-D array-like, shape = (n users, n artists).
            A truthy entry means the user listened to that artist.
        n_similar: Number of most similar artists kept per artist; all other
            similarities are zeroed before scoring.

    Returns:
        Array of shape (n users, n artists). Each entry is the summed
        similarity between that artist and the artists the user listened to,
        divided by the artist's total retained similarity.

    Note:
        A tiny amount of random noise is added to break ties, so results are
        only reproducible if the NumPy global random state is seeded.
    """
    history = np.asarray(listening_history)
    # Get similarity matrix, shape = (n artists, n artists)
    sim_matrix = sklearn.metrics.pairwise.cosine_similarity(history.T)
    # Add minuscule noise so the top-n threshold below is not shared by
    # duplicate similarity values
    sim_matrix += np.random.randn(*sim_matrix.shape) / 10000000000
    # Set artist similarity to self to zero (np.bool no longer exists in
    # NumPy 1.24+, so avoid building a boolean identity mask with it)
    np.fill_diagonal(sim_matrix, 0)
    # Similarity score of the n_similar-th most similar artist per artist,
    # shape = (n artists)
    similarity_thresholds = np.sort(sim_matrix)[:, -n_similar]
    # Set all artist similarities not in the n top most similar to zero
    sim_matrix[sim_matrix < similarity_thresholds.reshape(-1, 1)] = 0
    # Summed similarity value per artist (for standardization),
    # shape = (n artists)
    tot_sim = np.sum(sim_matrix, axis=1)
    # Per user, per artist: sum the similarities to the artists the user
    # listened to. This is the matrix product history @ sim_matrix.T, which
    # avoids materialising an (n users, n artists, n artists) intermediate.
    summed_similarities = history.astype(float) @ sim_matrix.T
    # Standardize by total similarity, shape = (n users, n artists)
    recomm_all = summed_similarities / tot_sim
    return recomm_all
def tidy_recommondations(recommendation_matrix,
                         history_df,
                         filter_out_listened_to=True,
                         top_n=10):
    """Turn a recommendation matrix into a tidy per-user top-n dataframe.

    The (misspelled) function name is kept for backward compatibility.

    Args:
        recommendation_matrix: 2-D array of recommendation scores with one row
            per row of ``history_df`` and one column per column of
            ``history_df``.
        history_df: Wide listening-history dataframe whose index is named
            ``'user'`` and whose columns are artists; values are interpreted
            as booleans (listened to or not).
        filter_out_listened_to: If True, drop artists the user already
            listened to. If False, keep every artist.
        top_n: Maximum number of recommendations returned per user.

    Returns:
        Dataframe indexed by ``user`` with columns ``artist``,
        ``recommendation_score`` and ``listened_to``, sorted per user by
        descending score and limited to ``top_n`` rows per user.
    """
    # Reshape to 3 column dataframe with columns: user, artist, recommendation_score
    tidy_recommendations_df = (
        pd.DataFrame(recommendation_matrix,
                     columns=history_df.columns,
                     index=history_df.index)
        .assign(user=lambda x: x.index)
        .melt(id_vars='user', var_name='artist', value_name='recommendation_score')
    )
    # Reshape to 3 column dataframe with columns: user, artist, listened_to (True / False)
    # Builtin bool is used because the np.bool alias was removed in NumPy 1.24.
    tidy_listening_history_df = (
        history_df
        .reset_index()
        .melt(id_vars='user', var_name='artist', value_name='listened_to')
        .assign(listened_to=lambda x: x.listened_to.astype(bool))
    )
    # Combine both datasets to allow for filtering on artists the user has not listened to.
    merged_df = pd.merge(tidy_recommendations_df,
                         tidy_listening_history_df,
                         on=['user', 'artist'])
    # Only filter when asked to; interpolating the flag into a query string
    # would make False select *only* the listened-to artists.
    if filter_out_listened_to:
        merged_df = merged_df[~merged_df['listened_to']]
    # Select the top_n artists with highest recommendation scores per user
    per_user_recommendations_df = (
        merged_df
        .sort_values(['user', 'recommendation_score'], ascending=[True, False])
        .groupby('user')
        .head(top_n)
        .set_index('user')
    )
    return per_user_recommendations_df
# Get raw data: one row per user, one column per artist
csv_url = 'https://raw.githubusercontent.com/jeroenboeye/ml-tutorial/master/source/1/data/lastfm-matrix-germany.csv'
# Builtin bool is used because the np.bool alias was removed in NumPy 1.24
listening_history_df = pd.read_csv(csv_url).set_index('user').astype(bool)
# Calculate the full recommendation matrix
full_recomm = get_recommendation_matrix(listening_history_df.values, n_similar=10)
# Transform to tidy pandas dataframe and filter on top most recommended
per_user_recommendations_df = tidy_recommondations(full_recomm, listening_history_df)
# Recommendations for user 642 (a bare expression: only displayed in a notebook / REPL)
per_user_recommendations_df.loc[642]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.