# jeroenboeye/recommender.py

## recommender.py
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise

def get_recommendation_matrix(listening_history, n_similar = 20):
    """Score every artist for every user with item-based collaborative filtering.

    Artists are compared through the cosine similarity of their listener
    columns. For each artist only its ``n_similar`` most similar neighbours
    are kept. A user's score for an artist is the summed similarity of the
    kept neighbours the user listened to, divided by the artist's total kept
    similarity.

    Args:
        listening_history: 2-D array-like of shape (n users, n artists); a
            non-zero / True cell marks that the user listened to the artist.
        n_similar: Number of nearest neighbours kept per artist. Values larger
            than the number of artists are clamped to it.

    Returns:
        Array of shape (n users, n artists) holding the recommendation scores.
        A tiny random noise term is added to the similarities, so scores are
        not bit-for-bit reproducible between calls.
    """
    listening_history = np.asarray(listening_history)

    # Similarity between artist columns, shape = (n artists, n artists)
    sim_matrix = sklearn.metrics.pairwise.cosine_similarity(listening_history.T)
    n_artists = sim_matrix.shape[0]

    # Add minuscule noise so that no two similarities tie when ranking neighbours
    sim_matrix += np.random.randn(n_artists, n_artists) / 10000000000

    # An artist must never count as its own neighbour
    # (fill_diagonal avoids a boolean identity mask built with the np.bool
    # alias, which is missing from several NumPy releases)
    np.fill_diagonal(sim_matrix, 0)

    # Similarity of the n_similar-th closest neighbour per artist, shape = (n artists,)
    n_kept = min(n_similar, n_artists)
    similarity_thresholds = np.sort(sim_matrix)[:, -n_kept]

    # Zero every similarity that does not reach the artist's own threshold
    sim_matrix[~(sim_matrix >= similarity_thresholds.reshape(-1, 1))] = 0

    # Total kept similarity per artist (for standardization), shape = (n artists,)
    tot_sim = np.sum(sim_matrix, axis=1)

    # Per user, per artist: summed similarity of the kept neighbours the user
    # listened to, shape = (n users, n artists). The matrix product equals
    # summing the broadcast (n users, n artists, n artists) tensor over its
    # last axis, without ever allocating that tensor.
    listened_similarity = listening_history.astype(sim_matrix.dtype) @ sim_matrix.T

    # Standardize by the total similarity of each artist
    recomm_all = listened_similarity / tot_sim

    return recomm_all

def tidy_recommondations(recommendation_matrix,
                         history_df,
                         filter_out_listened_to=True,
                         top_n=10):
    """Turn a recommendation matrix into a tidy top-N table per user.

    Args:
        recommendation_matrix: Array of shape (n users, n artists) whose rows
            and columns line up with the index and columns of ``history_df``.
        history_df: DataFrame with one row per user (users in the index) and
            one column per artist; truthy cells mark artists listened to.
        filter_out_listened_to: If True, artists a user already listened to
            are dropped before ranking. If False, every artist is ranked.
        top_n: Maximum number of rows kept per user.

    Returns:
        DataFrame indexed by ``user`` with the columns ``artist``,
        ``recommendation_score`` and ``listened_to``, ordered by user and then
        by descending recommendation score.
    """
    # Reshape to 3 column dataframe with columns: user, artist, recommendation_score.
    # The index is named explicitly so reset_index() always yields a 'user' column.
    scores_df = pd.DataFrame(recommendation_matrix,
                             columns=history_df.columns,
                             index=history_df.index)
    tidy_recommendations_df = (scores_df
                               .rename_axis('user')
                               .reset_index()
                               .melt(id_vars='user', var_name='artist', value_name='recommendation_score'))

    # Reshape to 3 column dataframe with columns: user, artist, listened_to (True / False)
    tidy_listening_history_df = (history_df
                                 .rename_axis('user')
                                 .reset_index()
                                 .melt(id_vars='user', var_name='artist', value_name='listened_to')
                                 .assign(listened_to=lambda x: x.listened_to.astype(bool)))

    # Combine both datasets so the scores can be filtered on listening history
    merged_df = pd.merge(tidy_recommendations_df,
                         tidy_listening_history_df,
                         on=['user', 'artist'])

    # Only drop listened-to artists when asked to; otherwise rank every artist
    if filter_out_listened_to:
        merged_df = merged_df[~merged_df['listened_to']]

    # Select the top_n artists with the highest recommendation scores per user
    per_user_recommendations_df = (merged_df
                                   .sort_values(['user', 'recommendation_score'], ascending=[True, False])
                                   .groupby('user')
                                   .head(top_n)
                                   .set_index('user'))

    return per_user_recommendations_df

# Get raw data: one row per user, one column per artist
csv_url = 'https://raw.githubusercontent.com/jeroenboeye/ml-tutorial/master/source/1/data/lastfm-matrix-germany.csv'
# The builtin bool is used because the np.bool alias is missing from several NumPy releases
listening_history_df = pd.read_csv(csv_url).set_index('user').astype(bool)

# Calculate the full recommendation matrix
full_recomm = get_recommendation_matrix(listening_history_df.values, n_similar = 10)

# Transform to tidy pandas dataframe and filter on top most recommended
per_user_recommendations_df = tidy_recommondations(full_recomm, listening_history_df)
# Look up the recommendations of user 642 (only displayed in an interactive session)
per_user_recommendations_df.loc[642]
	import numpy as np
	import pandas as pd
	import sklearn.metrics.pairwise

	def get_recommendation_matrix(listening_history, n_similar = 20):
		"""Score every artist for every user with item-based collaborative filtering.

		Artists are compared through the cosine similarity of their listener
		columns. For each artist only its ``n_similar`` most similar neighbours
		are kept. A user's score for an artist is the summed similarity of the
		kept neighbours the user listened to, divided by the artist's total kept
		similarity.

		Args:
			listening_history: 2-D array-like of shape (n users, n artists); a
				non-zero / True cell marks that the user listened to the artist.
			n_similar: Number of nearest neighbours kept per artist. Values larger
				than the number of artists are clamped to it.

		Returns:
			Array of shape (n users, n artists) holding the recommendation scores.
			A tiny random noise term is added to the similarities, so scores are
			not bit-for-bit reproducible between calls.
		"""
		listening_history = np.asarray(listening_history)

		# Similarity between artist columns, shape = (n artists, n artists)
		sim_matrix = sklearn.metrics.pairwise.cosine_similarity(listening_history.T)
		n_artists = sim_matrix.shape[0]

		# Add minuscule noise so that no two similarities tie when ranking neighbours
		sim_matrix += np.random.randn(n_artists, n_artists) / 10000000000

		# An artist must never count as its own neighbour
		# (fill_diagonal avoids a boolean identity mask built with the np.bool
		# alias, which is missing from several NumPy releases)
		np.fill_diagonal(sim_matrix, 0)

		# Similarity of the n_similar-th closest neighbour per artist, shape = (n artists,)
		n_kept = min(n_similar, n_artists)
		similarity_thresholds = np.sort(sim_matrix)[:, -n_kept]

		# Zero every similarity that does not reach the artist's own threshold
		sim_matrix[~(sim_matrix >= similarity_thresholds.reshape(-1, 1))] = 0

		# Total kept similarity per artist (for standardization), shape = (n artists,)
		tot_sim = np.sum(sim_matrix, axis=1)

		# Per user, per artist: summed similarity of the kept neighbours the user
		# listened to, shape = (n users, n artists). The matrix product equals
		# summing the broadcast (n users, n artists, n artists) tensor over its
		# last axis, without ever allocating that tensor.
		listened_similarity = listening_history.astype(sim_matrix.dtype) @ sim_matrix.T

		# Standardize by the total similarity of each artist
		recomm_all = listened_similarity / tot_sim

		return recomm_all

	def tidy_recommondations(recommendation_matrix,
			history_df,
			filter_out_listened_to=True,
			top_n=10):
		"""Turn a recommendation matrix into a tidy top-N table per user.

		Args:
			recommendation_matrix: Array of shape (n users, n artists) whose rows
				and columns line up with the index and columns of ``history_df``.
			history_df: DataFrame with one row per user (users in the index) and
				one column per artist; truthy cells mark artists listened to.
			filter_out_listened_to: If True, artists a user already listened to
				are dropped before ranking. If False, every artist is ranked.
			top_n: Maximum number of rows kept per user.

		Returns:
			DataFrame indexed by ``user`` with the columns ``artist``,
			``recommendation_score`` and ``listened_to``, ordered by user and then
			by descending recommendation score.
		"""
		# Reshape to 3 column dataframe with columns: user, artist, recommendation_score.
		# The index is named explicitly so reset_index() always yields a 'user' column.
		scores_df = pd.DataFrame(
			recommendation_matrix,
			columns=history_df.columns,
			index=history_df.index)
		tidy_recommendations_df = (
			scores_df
			.rename_axis('user')
			.reset_index()
			.melt(id_vars='user', var_name='artist', value_name='recommendation_score'))

		# Reshape to 3 column dataframe with columns: user, artist, listened_to (True / False)
		tidy_listening_history_df = (
			history_df
			.rename_axis('user')
			.reset_index()
			.melt(id_vars='user', var_name='artist', value_name='listened_to')
			.assign(listened_to=lambda x: x.listened_to.astype(bool)))

		# Combine both datasets so the scores can be filtered on listening history
		merged_df = pd.merge(
			tidy_recommendations_df,
			tidy_listening_history_df,
			on=['user', 'artist'])

		# Only drop listened-to artists when asked to; otherwise rank every artist
		if filter_out_listened_to:
			merged_df = merged_df[~merged_df['listened_to']]

		# Select the top_n artists with the highest recommendation scores per user
		per_user_recommendations_df = (
			merged_df
			.sort_values(['user', 'recommendation_score'], ascending=[True, False])
			.groupby('user')
			.head(top_n)
			.set_index('user'))

		return per_user_recommendations_df

	# Get raw data: one row per user, one column per artist
	csv_url = 'https://raw.githubusercontent.com/jeroenboeye/ml-tutorial/master/source/1/data/lastfm-matrix-germany.csv'
	# The builtin bool is used because the np.bool alias is missing from several NumPy releases
	listening_history_df = pd.read_csv(csv_url).set_index('user').astype(bool)

	# Calculate the full recommendation matrix
	full_recomm = get_recommendation_matrix(listening_history_df.values, n_similar = 10)

	# Transform to tidy pandas dataframe and filter on top most recommended
	per_user_recommendations_df = tidy_recommondations(full_recomm, listening_history_df)
	# Look up the recommendations of user 642 (only displayed in an interactive session)
	per_user_recommendations_df.loc[642]