Skip to content

Instantly share code, notes, and snippets.

@nathancooperjones
Last active May 4, 2021 20:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nathancooperjones/33e9c7ce1a3733d6dbfb06809c131253 to your computer and use it in GitHub Desktop.
Save nathancooperjones/33e9c7ce1a3733d6dbfb06809c131253 to your computer and use it in GitHub Desktop.
Code accompanying the blog post "Fetching Better Beer Recommendations with Collie"

Code for Fetching Better Beer Recommendations with Collie Blog Post

This GitHub Gist is intended to accompany the blog posts below:

Part 1: https://medium.com/shoprunner/fetching-better-beer-recommendations-with-collie-part-1-18c73ab30fbd
Part 2: https://medium.com/shoprunner/fetching-better-beer-recommendations-with-collie-part-2-27930a421459
Part 3: https://medium.com/shoprunner/fetching-better-beer-recommendations-with-collie-part-3-6aaae9bad169

Get the Data

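Data for this blog post can be found here, specifically in the Beeradvocate.txt.gz and Ratebeer.txt.gz files. In total, these files should be 3.29 GB. Decompress both archives so that the code below can read them from ../data/Beeradvocate.txt and ../data/Ratebeer.txt.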

Run this Code

It is highly recommended that you run this code on a GPU for the fastest execution time. If you have extra time and patience, all of the code below will still work on the CPU.

I ran the code below using collie_recs==0.1.3 in the pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel base Docker image on a p3.2xlarge EC2 instance that is equipped with a single Tesla V100, 16GB GPU.

from itertools import groupby, chain
def chunk_text_data_generator(filepath: str):
    """
    Lazily parse a beer review text file into one dictionary per review.

    A review record begins at a line starting with ``beer/name`` and runs until the next such
    line or the end of the file. Every non-blank line in a record is split at its first ``:``
    into a key and a value, with each ``/`` in the key replaced by ``_`` (so ``beer/name``
    becomes ``beer_name``). Values are kept exactly as they follow the colon, which means they
    retain any leading whitespace that came after it.

    Lines that appear before the first ``beer/name`` line are ignored, as are blank lines. The
    final record is yielded whether or not the file ends with a blank separator line.

    Parameters
    ----------
    filepath: str
        Path to the text file to parse

    Yields
    ------
    chunk_dict: dict
        Mapping of field name to raw string value for a single review

    Raises
    ------
    ValueError
        If a non-blank line inside a record does not contain a ``:``

    """
    with open(filepath, encoding='utf8', errors='ignore') as f:
        chunk_dict = None

        for line in f:
            if line.startswith('beer/name'):
                # luckily, both files start every record with the same ``beer/name`` pattern,
                # so seeing it again means the previous record is complete
                if chunk_dict is not None:
                    yield chunk_dict
                chunk_dict = {}
            elif chunk_dict is None:
                # nothing to attach this line to until the first record begins
                continue

            formatted_line = line.strip()
            if not formatted_line:
                # blank separator lines carry no field
                continue

            key, value = formatted_line.split(':', maxsplit=1)
            chunk_dict[key.replace('/', '_')] = value

        # flush the last record, which has no following ``beer/name`` line to trigger a yield
        if chunk_dict is not None:
            yield chunk_dict
# locations of the two raw review text files
beeradvocates_filepath = '../data/Beeradvocate.txt'
ratebeer_filepath = '../data/Ratebeer.txt'

# exhaust each generator so every parsed review is held in memory as a dictionary
beeradvocates_dicts = list(chunk_text_data_generator(filepath=beeradvocates_filepath))
ratebeer_dicts = list(chunk_text_data_generator(filepath=ratebeer_filepath))
import pandas as pd
# one DataFrame per source, built from the parsed review dictionaries
beeradvocates_df = pd.DataFrame(data=beeradvocates_dicts)
ratebeer_df = pd.DataFrame(data=ratebeer_dicts)

# the rating columns that must be numeric and share a scale across both sources
review_number_columns = [
    'review_appearance',
    'review_aroma',
    'review_palate',
    'review_taste',
    'review_overall',
]

for col in review_number_columns:
    beeradvocates_df[col] = beeradvocates_df[col].astype(float)
    # ``ratebeer_df`` ratings carry a ``/denominator`` suffix: keep only what precedes it
    ratebeer_df[col] = ratebeer_df[col].str.split('/').str.get(0).astype(float)

# bring the ``ratebeer_df`` columns that use a wider scale in line with ``beeradvocates_df``:
# aroma and taste are halved and overall is quartered
# NOTE(review): the divisor of 4 suggests ``review_overall`` is on a 20-point scale - confirm
ratebeer_df['review_aroma'] = ratebeer_df['review_aroma'] / 2
ratebeer_df['review_taste'] = ratebeer_df['review_taste'] / 2
ratebeer_df['review_overall'] = ratebeer_df['review_overall'] / 4

# stack both sources into a single DataFrame with a fresh index
beer_df = pd.concat([beeradvocates_df, ratebeer_df], ignore_index=True)

# encode beer names and reviewer names as integer category codes
beer_df['unique_beer_id'] = beer_df['beer_name'].astype('category').cat.codes
beer_df['unique_reviewer_id'] = beer_df['review_profileName'].astype('category').cat.codes
from collie_recs.utils import convert_to_implicit, remove_users_with_fewer_than_n_interactions
# convert explicit ratings to implicit interactions using a minimum overall rating of 3, then
# drop users left with fewer than 2 interactions
implicit_beer_df = remove_users_with_fewer_than_n_interactions(
    df=convert_to_implicit(
        explicit_df=beer_df,
        min_rating_to_keep=3,
        user_col='unique_reviewer_id',
        item_col='unique_beer_id',
        ratings_col='review_overall',
    ),
    min_num_of_interactions=2,
    user_col='unique_reviewer_id',
)

# report how much of the original data survived the filtering
print(
    f'Reduced shape from {len(beer_df)} to {len(implicit_beer_df)},'
    f' or {round((len(implicit_beer_df) / len(beer_df) * 100), 3)}% the size.'
)
from collie_recs.interactions import Interactions
# wrap the implicit user-item pairs in an ``Interactions`` dataset, seeded for reproducibility
beer_interactions = Interactions(
    users=implicit_beer_df['unique_reviewer_id'],
    items=implicit_beer_df['unique_beer_id'],
    num_negative_samples=15,
    allow_missing_ids=True,
    seed=42,
)
from collie_recs.cross_validation import stratified_split
# hold out a ``test_p=0.1`` share of the interactions as the validation set
train, val = stratified_split(
    interactions=beer_interactions,
    test_p=0.1,
    seed=42,
)
from collie_recs.model import CollieTrainer, MatrixFactorizationModel
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
# use a single negative sample per interaction for this first baseline
train.num_negative_samples = 1
val.num_negative_samples = 1

# baseline matrix factorization model; no separate bias optimizer is configured
model = MatrixFactorizationModel(
    train=train,
    val=val,
    embedding_dim=30,
    lr=1e-3,
    optimizer='adam',
    bias_optimizer=None,
    loss='hinge',
)

# train for at most 30 epochs, with early stopping monitoring ``val_loss_epoch``
trainer = CollieTrainer(
    model=model,
    max_epochs=30,
    deterministic=True,
    benchmark=True,
    callbacks=[EarlyStopping(monitor='val_loss_epoch')],
)
trainer.fit(model)
from collie_recs.metrics import evaluate_in_batches, mapk
# score the trained model on the validation interactions with MAP at ``k=10``
mapk_score = evaluate_in_batches(
    metric_list=[mapk],
    test_interactions=val,
    model=model,
    k=10,
)
print(f'MAP@10: {mapk_score}')
# pick one reviewer ID at random from the implicit interactions
random_user_id = implicit_beer_df['unique_reviewer_id'].sample(1).item()
print(f'Random user ID selected: {random_user_id}')

# every distinct beer that reviewer has already interacted with
beers_user_interacted_with = (
    implicit_beer_df
    .loc[implicit_beer_df['unique_reviewer_id'] == random_user_id]
    .drop_duplicates(subset='unique_beer_id')
)
print(beers_user_interacted_with)

# the first 5 item IDs from the model's predictions for this user, unseen items only...
recommended_beer_ids = (
    model
    .get_item_predictions(user_id=random_user_id, unseen_items_only=True)
    .index[:5]
)

# ... looked up in our DataFrame, one row per recommended beer
(
    implicit_beer_df
    .loc[implicit_beer_df['unique_beer_id'].isin(recommended_beer_ids)]
    .drop_duplicates(subset='unique_beer_id')
)
from collie_recs.model import NeuralCollaborativeFiltering
# neural collaborative filtering model trained on the same train / validation split
neucf_model = NeuralCollaborativeFiltering(
    train=train,
    val=val,
    embedding_dim=8,
    num_layers=3,
    final_layer='sigmoid',
    dropout_p=0.2,
    lr=1e-3,
    optimizer='adam',
    loss='hinge',
)

# train for at most 30 epochs, with early stopping monitoring ``val_loss_epoch``
neucf_trainer = CollieTrainer(
    model=neucf_model,
    max_epochs=30,
    deterministic=True,
    benchmark=True,
    callbacks=[EarlyStopping(monitor='val_loss_epoch')],
)
neucf_trainer.fit(neucf_model)
# raise the number of negative samples per interaction to 15 and retrain the baseline
train.num_negative_samples = 15
val.num_negative_samples = 15

model = MatrixFactorizationModel(
    train=train,
    val=val,
    embedding_dim=30,
    lr=1e-3,
    optimizer='adam',
    bias_optimizer=None,
    loss='hinge',
)

# train for at most 30 epochs, with early stopping monitoring ``val_loss_epoch``
trainer = CollieTrainer(
    model=model,
    max_epochs=30,
    deterministic=True,
    benchmark=True,
    callbacks=[EarlyStopping(monitor='val_loss_epoch')],
)
trainer.fit(model)
# same matrix factorization setup, but with a dedicated SGD optimizer and learning rate
# configured for the bias terms
model = MatrixFactorizationModel(
    train=train,
    val=val,
    embedding_dim=30,
    lr=1e-3,
    bias_lr=1e-2,
    optimizer='adam',
    bias_optimizer='sgd',
    loss='hinge',
)

# train for at most 30 epochs, with early stopping monitoring ``val_loss_epoch``
trainer = CollieTrainer(
    model=model,
    max_epochs=30,
    deterministic=True,
    benchmark=True,
    callbacks=[EarlyStopping(monitor='val_loss_epoch')],
)
trainer.fit(model)
from collie_recs.model import HybridPretrainedModel
# dummify our beer styles, one row per item sorted by item ID. The styles are derived straight
# from ``beer_df`` because ``beer_item_df`` is only created further down in this file, so
# referencing it here would raise a ``NameError`` when the file is run top to bottom
item_beer_styles = (
    beer_df
    .drop_duplicates(subset=['unique_beer_id'], ignore_index=True)
    .sort_values(by='unique_beer_id', ignore_index=True)['beer_style']
    .str.strip()
)
beer_item_dummies = pd.get_dummies(item_beer_styles, prefix='beer_style')

# we will apply two linear layers to the metadata with ``metadata_layers_dims`` and
# two linear layers to the combined embeddings + metadata data with ``combined_layers_dims``
hybrid_model = HybridPretrainedModel(
    train=train,
    val=val,
    item_metadata=beer_item_dummies,
    trained_model=model,
    metadata_layers_dims=[256, 16],
    combined_layers_dims=[32, 8],
    lr=1e-3,
    optimizer='adam',
    loss='hinge',
)

# train for at most 30 epochs, with early stopping monitoring ``val_loss_epoch``
trainer = CollieTrainer(
    model=hybrid_model,
    max_epochs=30,
    deterministic=True,
    benchmark=True,
    callbacks=[EarlyStopping(monitor='val_loss_epoch')],
)
trainer.fit(hybrid_model)
# now we can unfreeze our embeddings and fine-tune, using a much smaller learning rate
hybrid_model_unfrozen = HybridPretrainedModel(
    train=train,
    val=val,
    item_metadata=beer_item_dummies,
    trained_model=model,
    metadata_layers_dims=[256, 16],
    combined_layers_dims=[32, 8],
    lr=1e-5,
    optimizer='adam',
    loss='hinge',
)

# start from the hybrid model trained above before unfreezing the embeddings
hybrid_model_unfrozen.load_from_hybrid_model(hybrid_model)
hybrid_model_unfrozen.unfreeze_embeddings()

# the trainer is built from the model it is going to fit: the unfrozen one, not the original
# ``hybrid_model``
trainer_unfrozen = CollieTrainer(
    model=hybrid_model_unfrozen,
    max_epochs=30,
    deterministic=True,
    benchmark=True,
    callbacks=[EarlyStopping(monitor='val_loss_epoch')],
)
trainer_unfrozen.fit(hybrid_model_unfrozen)
import torch
# build a one-row-per-item DataFrame ordered by item ID, with the beer style stripped of
# surrounding whitespace and also encoded as an integer category code
beer_item_df = (
    beer_df
    .drop_duplicates(subset=['unique_beer_id'], ignore_index=True)
    .sort_values(by='unique_beer_id', ignore_index=True)
    .assign(beer_style=lambda df: df['beer_style'].str.strip())
    .assign(beer_style_id=lambda df: df['beer_style'].astype('category').cat.codes)
)

# hand the style IDs to ``torch`` as a tensor
beer_styles_tensor = torch.tensor(beer_item_df['beer_style_id'])
# matrix factorization model whose loss is also given the beer style of each item, weighted
# at 0.15
model = MatrixFactorizationModel(
    train=train,
    val=val,
    embedding_dim=30,
    lr=1e-3,
    bias_lr=1e-2,
    optimizer='adam',
    bias_optimizer='sgd',
    loss='hinge',
    metadata_for_loss={'beer_style': beer_styles_tensor},
    metadata_for_loss_weights={'beer_style': 0.15},
)

# train for at most 30 epochs, with early stopping monitoring ``val_loss_epoch``
trainer = CollieTrainer(
    model=model,
    max_epochs=30,
    deterministic=True,
    benchmark=True,
    callbacks=[EarlyStopping(monitor='val_loss_epoch')],
)
trainer.fit(model)
# item ID used for Stella Artois
stella_artois_beer_id = 136889

# take positions 1 through 5 of the similarity results, skipping the first entry
similar_beer_ids = (
    model
    .item_item_similarity(stella_artois_beer_id)
    .index[1:6]
)

# look those items up in the one-row-per-item DataFrame
beer_item_df.loc[beer_item_df['unique_beer_id'].isin(similar_beer_ids)]
from collie_recs.utils import _create_sparse_ratings_matrix_helper
import numpy as np
# Stella Artois, Corona Light, and PBR (of course)
beers_ids_I_like = [136889, 8200, 108327]

# build a sparse ratings matrix with the same user and item counts as ``train`` in which
# user 0 has interacted with exactly these beers...
new_mat = _create_sparse_ratings_matrix_helper(
    users=np.zeros_like(beers_ids_I_like),
    items=beers_ids_I_like,
    num_users=train.num_users,
    num_items=train.num_items,
)

# ... and wrap it in a very small ``Interactions`` object
new_interactions = Interactions(
    mat=new_mat,
    num_negative_samples=5,
    allow_missing_ids=True,
    check_num_negative_samples_is_valid=False,
    seed=42,
)
# a fresh model over the tiny ``new_interactions`` dataset (no validation data), matching the
# embedding size of the previously-trained model
fine_tune_model = MatrixFactorizationModel(
    train=new_interactions,
    embedding_dim=model.hparams.embedding_dim,
    lr=1e-2,
    bias_lr=1e-1,
)

# copy over the previously-trained model's embeddings...
fine_tune_model.item_embeddings.weight.data.copy_(model.item_embeddings.weight.data)
fine_tune_model.item_biases.weight.data.copy_(model.item_biases.weight.data)

# ... and freeze them so we keep that model's item information intact
fine_tune_model.item_embeddings.weight.requires_grad = False
fine_tune_model.item_biases.weight.requires_grad = False

# the trainer is built from the model it is going to fit: ``fine_tune_model``, not the
# previously-trained ``model``. With no validation data, early stopping monitors the
# training loss instead
fine_tune_trainer = CollieTrainer(
    model=fine_tune_model,
    max_epochs=30,
    deterministic=True,
    benchmark=True,
    callbacks=[EarlyStopping(monitor='train_loss_epoch')],
)
fine_tune_trainer.fit(fine_tune_model)
# the first 5 item IDs from the fine-tuned model's predictions for user 0, unseen items only
recommended_beer_ids = (
    fine_tune_model
    .get_item_predictions(user_id=0, unseen_items_only=True)
    .index[:5]
)
beer_item_df.loc[beer_item_df['unique_beer_id'].isin(recommended_beer_ids)]

# the 5 item IDs with the largest learned item bias, i.e. the highest appeal
mass_appeal_beer_ids = (
    model.item_biases.weight.data
    .flatten()
    .argsort(descending=True)
    .cpu()
    .numpy()[:5]
)
beer_item_df.loc[beer_item_df['unique_beer_id'].isin(mass_appeal_beer_ids)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment