Created
December 12, 2019 09:33
-
-
Save jeroenboeye/835995e8fabc35cb3fc27252d404365e to your computer and use it in GitHub Desktop.
Fast NumPy implementation of collaborative filtering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import sklearn.metrics.pairwise | |
def get_recommendation_matrix(listening_history, n_similar = 20):
    """Collaborative filtering using cosine similarity.

    Args:
        listening_history: 2-D array of shape (n users, n artists); truthy
            entries mark the artists a user listened to.
        n_similar: Number of most similar artists kept per artist. Values
            larger than the number of artists are clamped to that number.

    Returns:
        Array of shape (n users, n artists) holding, per user and artist, the
        summed similarity to the artists the user listened to divided by the
        artist's total retained similarity.

    Raises:
        ValueError: If ``n_similar`` is smaller than 1.
    """
    if n_similar < 1:
        # -0 would index the *smallest* similarity and silently keep everything.
        raise ValueError(f"n_similar must be >= 1, got {n_similar}")
    listening_history = np.asarray(listening_history)

    # Get similarity matrix, shape = (n artists, n artists)
    sim_matrix = sklearn.metrics.pairwise.cosine_similarity(listening_history.T)
    n_artists = sim_matrix.shape[0]
    # Add minuscule noise so that sorting never sees duplicate values
    sim_matrix += np.random.randn(n_artists, n_artists) / 10000000000
    # Set artist similarity to self to zero
    # (np.fill_diagonal avoids the removed ``np.bool`` alias)
    np.fill_diagonal(sim_matrix, 0)

    # Similarity score of the n-th most similar artist per artist, shape = (n artists)
    n_kept = min(n_similar, n_artists)
    similarity_thresholds = np.sort(sim_matrix)[:, -n_kept]
    # Set all artist similarities outside the top n most similar to zero
    sim_matrix[~(sim_matrix >= similarity_thresholds.reshape(-1, 1))] = 0
    # Summed similarity value per artist (for standardization), shape = (n artists)
    tot_sim = np.sum(sim_matrix, axis=1)

    # Per user, per artist, sum the similarities to the artists the user
    # listened to. A matrix product yields the same sums as broadcasting to a
    # (n users, n artists, n artists) array, without allocating that array.
    summed_similarity = listening_history.astype(sim_matrix.dtype) @ sim_matrix.T

    # Standardize by the total similarity; artists without any retained
    # similarity get a score of zero instead of dividing by zero.
    recomm_all = np.divide(summed_similarity,
                           tot_sim,
                           out=np.zeros_like(summed_similarity),
                           where=tot_sim != 0)
    return recomm_all
def tidy_recommondations(recommendation_matrix,
                         history_df,
                         filter_out_listened_to=True,
                         top_n=10):
    """Turn a recommendation matrix into a tidy top-n table per user.

    Args:
        recommendation_matrix: 2-D array of scores whose rows line up with
            ``history_df.index`` and whose columns line up with
            ``history_df.columns``.
        history_df: Listening history with one row per user and one column per
            artist. Its index must be named ``'user'``.
        filter_out_listened_to: When True, artists a user already listened to
            are dropped before ranking. When False, no filtering is applied.
        top_n: Number of highest scoring artists kept per user.

    Returns:
        DataFrame indexed by ``user`` with columns ``artist``,
        ``recommendation_score`` and ``listened_to``, sorted per user by
        descending score.
    """
    # Reshape to 3 column dataframe with columns: user, artist, recommendation_score
    tidy_recommendations_df = (pd.DataFrame(recommendation_matrix,
                                            columns=history_df.columns,
                                            index=history_df.index)
                               .assign(user=lambda x: x.index)
                               .melt(id_vars='user', var_name='artist', value_name='recommendation_score'))
    # Reshape to 3 column dataframe with columns: user, artist, listened_to (True / False)
    # (builtin ``bool`` replaces the removed ``np.bool`` alias)
    tidy_listening_history_df = (history_df
                                 .reset_index()
                                 .melt(id_vars='user', var_name='artist', value_name='listened_to')
                                 .assign(listened_to=lambda x: x.listened_to.astype(bool)))
    # Combine both datasets to allow for filtering on artists the user has not listened to.
    combined_df = pd.merge(tidy_recommendations_df,
                           tidy_listening_history_df,
                           on=['user', 'artist'])
    # Only filter when asked to; the flag set to False must keep every artist
    # rather than keeping only the ones already listened to.
    if filter_out_listened_to:
        combined_df = combined_df[~combined_df['listened_to']]
    # Select the top_n artists with highest recommendation scores per user
    per_user_recommendations_df = (combined_df
                                   .sort_values(['user', 'recommendation_score'], ascending=[True, False])
                                   .groupby('user')
                                   .head(top_n)
                                   .set_index('user'))
    return per_user_recommendations_df
# Get raw data
csv_url = 'https://raw.githubusercontent.com/jeroenboeye/ml-tutorial/master/source/1/data/lastfm-matrix-germany.csv'
# Builtin ``bool`` replaces the ``np.bool`` alias, which no longer exists in current NumPy
listening_history_df = pd.read_csv(csv_url).set_index('user').astype(bool)
# Calculate the full recommendation matrix
full_recomm = get_recommendation_matrix(listening_history_df.values, n_similar = 10)
# Transform to tidy pandas dataframe and filter on top most recommended
per_user_recommendations_df = tidy_recommondations(full_recomm, listening_history_df)
# Look up the recommendations for user 642 (only displayed in an interactive session)
per_user_recommendations_df.loc[642]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment