Last active
July 1, 2019 02:23
-
-
Save acdick/fb76dcde98cd622f7756e9af9d54d464 to your computer and use it in GitHub Desktop.
Creating a Content-Based Product Similarity Matrix for a Recommender System
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
# Calculates the content-based product similarity matrix.
# items is a pandas DataFrame containing all product details available for comparison.
# Returns the dummy-encoded feature matrix and the product-by-product Pearson correlation matrix.
def product_similarity(items):
    """Build the content-based product similarity matrix.

    Args:
        items: pandas DataFrame of product details. It must contain the
            columns 'Item', 'Style', 'Product' and 'On Sale' (dropped), and
            'URL', 'Original', 'Discount', 'Gender', 'Made In' and
            'Category' (used as similarity features). The caller's frame is
            not modified.

    Returns:
        A tuple ``(similarity_features, similarity_matrix)``.
        ``similarity_features`` is indexed by 'URL' and holds 'Original',
        'Discount' and one float 0/1 indicator column per distinct value of
        'Gender', 'Made In' and 'Category'. ``similarity_matrix`` is the
        square Pearson correlation matrix between products, labelled by
        'URL' on both axes.

    Raises:
        KeyError: if any of the expected columns is missing from ``items``.
    """
    # drop multicollinear columns and columns not considered for similarity
    items = items.drop(['Item', 'Style', 'Product', 'On Sale'], axis=1)
    # diagnostic output: number of distinct values per remaining column
    print(items.nunique())

    # select continuous and categorical features for correlation
    similarity_features = items[
        ['URL', 'Original', 'Discount', 'Gender', 'Made In', 'Category']]
    similarity_features = similarity_features.set_index('URL')

    # dummy encoding of categorical variables; dtype=float keeps every
    # indicator numeric (pandas >= 2.0 would otherwise emit booleans, and
    # transposing bool + float columns degrades the frame to object dtype)
    similarity_features = pd.get_dummies(
        similarity_features,
        columns=['Gender', 'Made In', 'Category'],
        dtype=float,
    )

    # corr() correlates columns, so transpose to put one product per column
    similarity_matrix = similarity_features.T.corr(method='pearson')

    return similarity_features, similarity_matrix
# Requests a content-based recommendation.
# similarity_matrix is the pre-computed correlation matrix.
# top_favorite is the user-rated favorite product (a label of similarity_matrix).
# i is the zero-based rank of the recommendation to return (0 = most correlated).
# Returns the i-th product most correlated with top_favorite.
def content_based_similarity(similarity_matrix, top_favorite, i):
    """Return the i-th product most correlated with ``top_favorite``.

    Args:
        similarity_matrix: pre-computed square correlation matrix whose
            row and column labels identify products.
        top_favorite: label of the user-rated favorite product; it must be
            a column (and row) label of ``similarity_matrix``.
        i: zero-based rank of the recommendation to return, where 0 is the
            most-correlated product other than ``top_favorite`` itself.

    Returns:
        The label of the selected product.

    Raises:
        KeyError: if ``top_favorite`` is not a label of the matrix.
        IndexError: if ``i`` is beyond the number of other products.
    """
    # rank every product by its correlation with top_favorite; a stable
    # sort keeps equally-correlated products in matrix order, so the i-th
    # pick is reproducible instead of depending on the sort implementation
    ranked = similarity_matrix[top_favorite].sort_values(
        ascending=False, kind='mergesort')

    # drop auto-correlation of top_favorite
    candidates = ranked.drop([top_favorite], axis=0).index

    # return the product at rank i among the remaining candidates
    return candidates[i]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment