Skip to content

Instantly share code, notes, and snippets.

@jeroenboeye
Created December 12, 2019 09:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeroenboeye/835995e8fabc35cb3fc27252d404365e to your computer and use it in GitHub Desktop.
Save jeroenboeye/835995e8fabc35cb3fc27252d404365e to your computer and use it in GitHub Desktop.
Fast NumPy implementation of collaborative filtering
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise
def get_recommendation_matrix(listening_history, n_similar=20):
    """Item-based collaborative filtering using cosine similarity.

    Parameters
    ----------
    listening_history : array-like, shape (n_users, n_artists)
        Listening matrix; truthy / non-zero means the user listened to the
        artist. Boolean input is accepted and converted to float.
    n_similar : int, default 20
        Number of most similar artists kept per artist. Values larger than
        the number of artists are clamped to the number of artists.

    Returns
    -------
    numpy.ndarray, shape (n_users, n_artists)
        Recommendation score per user and artist: the summed similarity of
        the artist's retained neighbours the user listened to, divided by
        the summed similarity of all retained neighbours. Artists without
        any similar artist get a score of 0.

    Raises
    ------
    ValueError
        If ``listening_history`` is not 2-dimensional or ``n_similar < 1``.
    """
    history = np.asarray(listening_history, dtype=float)
    if history.ndim != 2:
        raise ValueError("listening_history must be 2-dimensional (users x artists)")
    if n_similar < 1:
        raise ValueError("n_similar must be at least 1")
    n_artists = history.shape[1]

    # Cosine similarity between artist columns, shape = (n artists, n artists).
    # Columns without any listener have norm 0; dividing by 1 instead keeps
    # them as all-zero vectors (similarity 0) rather than producing NaN.
    norms = np.linalg.norm(history, axis=0)
    norms[norms == 0] = 1.0
    normalized = history / norms
    sim_matrix = normalized.T @ normalized

    # An artist must not count as its own neighbour.
    np.fill_diagonal(sim_matrix, 0.0)

    # Keep exactly the n_similar largest similarities per artist by zeroing
    # the (n_artists - n_keep) smallest entries of every row. Selecting by
    # position is deterministic and needs no random tie-breaking noise.
    n_keep = min(n_similar, n_artists)
    n_drop = n_artists - n_keep
    if n_drop > 0:
        drop_idx = np.argpartition(sim_matrix, n_drop, axis=1)[:, :n_drop]
        np.put_along_axis(sim_matrix, drop_idx, 0.0, axis=1)

    # Summed similarity per artist (for standardization), shape = (n artists,)
    tot_sim = sim_matrix.sum(axis=1)

    # scores[u, i] = sum_j history[u, j] * sim_matrix[i, j]; a single matrix
    # product avoids materializing an (n users, n artists, n artists) array.
    scores = history @ sim_matrix.T

    # Guard against division by zero for artists with no similar artists.
    safe_tot_sim = np.where(tot_sim != 0, tot_sim, 1.0)
    return scores / safe_tot_sim
def tidy_recommondations(recommendation_matrix,
                         history_df,
                         filter_out_listened_to=True,
                         top_n=10):
    """Turn a recommendation matrix into a tidy top-N table per user.

    Parameters
    ----------
    recommendation_matrix : array-like, shape (n_users, n_artists)
        Recommendation scores, aligned with the rows and columns of
        ``history_df``.
    history_df : pandas.DataFrame
        Listening history with one row per user (index) and one column per
        artist; values are interpreted as booleans (listened to or not).
    filter_out_listened_to : bool, default True
        If True, artists a user already listened to are removed before the
        top-N selection. If False, no filtering is applied.
    top_n : int, default 10
        Maximum number of recommendations kept per user.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user`` with columns ``artist``,
        ``recommendation_score`` and ``listened_to``, sorted by user and by
        descending score within each user.
    """
    # Reshape to 3 column dataframe with columns: user, artist, recommendation_score
    tidy_recommendations_df = (
        pd.DataFrame(recommendation_matrix,
                     columns=history_df.columns,
                     index=history_df.index)
        .assign(user=lambda x: x.index)
        .melt(id_vars='user', var_name='artist', value_name='recommendation_score')
    )
    # Reshape to 3 column dataframe with columns: user, artist, listened_to (True / False).
    # The user column is taken from the index explicitly so this also works
    # when the index of history_df is not named 'user'.
    tidy_listening_history_df = (
        history_df
        .assign(user=lambda x: x.index)
        .melt(id_vars='user', var_name='artist', value_name='listened_to')
        # builtin bool: the `np.bool` alias was removed in NumPy 1.24
        .assign(listened_to=lambda x: x.listened_to.astype(bool))
    )
    # Combine both datasets to allow filtering on artists the user has not listened to
    merged_df = pd.merge(tidy_recommendations_df,
                         tidy_listening_history_df,
                         on=['user', 'artist'])
    if filter_out_listened_to:
        merged_df = merged_df[~merged_df['listened_to']]
    # Select the top_n artists with the highest recommendation scores per user
    per_user_recommendations_df = (
        merged_df
        .sort_values(['user', 'recommendation_score'], ascending=[True, False])
        .groupby('user')
        .head(top_n)
        .set_index('user')
    )
    return per_user_recommendations_df
# Get raw data: one row per user, one column per artist
csv_url = 'https://raw.githubusercontent.com/jeroenboeye/ml-tutorial/master/source/1/data/lastfm-matrix-germany.csv'
# Use the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24
listening_history_df = pd.read_csv(csv_url).set_index('user').astype(bool)
# Calculate the full recommendation matrix
full_recomm = get_recommendation_matrix(listening_history_df.values, n_similar=10)
# Transform to tidy pandas dataframe and filter on top most recommended
per_user_recommendations_df = tidy_recommondations(full_recomm, listening_history_df)
# Look up the recommendations for the user with id 642
per_user_recommendations_df.loc[642]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment