Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import pandas as pd
from collections import Counter
import tensorflow as tf
from tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
# Loading datasets'
buys = open('yoochoose-buys.dat', 'r')
clicks = open('yoochoose-clicks.dat', 'r')
initial_buys_df = pd.read_csv(buys, names=['Session ID', 'Timestamp', 'Item ID', 'Category', 'Quantity'],
dtype={'Session ID': 'float32', 'Timestamp': 'str', 'Item ID': 'float32',
'Category': 'str'})
initial_buys_df.set_index('Session ID', inplace=True)
initial_clicks_df = pd.read_csv(clicks, names=['Session ID', 'Timestamp', 'Item ID', 'Category'],
dtype={'Category': 'str'})
initial_clicks_df.set_index('Session ID', inplace=True)
# We won't use timestamps in this example
initial_buys_df = initial_buys_df.drop('Timestamp', 1)
initial_clicks_df = initial_clicks_df.drop('Timestamp', 1)
# For illustrative purposes, we will only use a subset of the data: top 10000 buying users,
x = Counter(initial_buys_df.index).most_common(10000)
top_k = dict(x).keys()
initial_buys_df = initial_buys_df[initial_buys_df.index.isin(top_k)]
initial_clicks_df = initial_clicks_df[initial_clicks_df.index.isin(top_k)]
# Create a copy of the index, since we will also apply one-hot encoding on the index
initial_buys_df['_Session ID'] = initial_buys_df.index
# One-hot encode all columns for clicks and buys
transformed_buys = pd.get_dummies(initial_buys_df)
transformed_clicks = pd.get_dummies(initial_clicks_df)
# Aggregate historical data for Items and Categories
filtered_buys = transformed_buys.filter(regex="Item.*|Category.*")
filtered_clicks = transformed_clicks.filter(regex="Item.*|Category.*")
historical_buy_data = filtered_buys.groupby(filtered_buys.index).sum()
historical_buy_data = historical_buy_data.rename(columns=lambda column_name: 'buy history:' + column_name)
historical_click_data = filtered_clicks.groupby(filtered_clicks.index).sum()
historical_click_data = historical_click_data.rename(columns=lambda column_name: 'click history:' + column_name)
# Merge historical data of every user_id
merged1 = pd.merge(transformed_buys, historical_buy_data, left_index=True, right_index=True)
merged2 = pd.merge(merged1, historical_click_data, left_index=True, right_index=True)
# Create the MF model, you can play around with the parameters
model = TFFMRegressor(
order=2,
rank=7,
optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
n_epochs=100,
batch_size=-1,
init_std=0.001,
input_type='dense'
)
merged2.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID'], 1, inplace=True)
X = np.array(merged2)
X = np.nan_to_num(X)
y = np.array(merged2['Quantity'].as_matrix())
# Split data into train, test
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
#Split testing data in half: Full information vs Cold-start
X_te, X_te_cs, y_te, y_te_cs = train_test_split(X_te, y_te, test_size=0.5)
X_te_cs = pd.DataFrame(X_te_cs, columns=merged2.columns)
# What happens if we only have access to categories and no historical click/purchase data?
# Let's delete historical click and purchasing data for the cold_start test set
for column in X_te_cs.columns:
if ('buy' in column or 'click' in column) and ('Category' not in column):
X_te_cs[column] = 0
# Compute the mean squared error for both test sets
model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)
cold_start_predictions = model.predict(X_te_cs)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))
print('Cold-start MSE: {}'.format(mean_squared_error(y_te_cs, predictions)))
model.destroy()
# Fun fact: Dropping the category columns in the training dataset makes the MSE even smaller
# but doing so means that we cannot tackle the cold-start recommendation problem
# Full Information MSE: 0.00644224764093
# Cold-start MSE: 3.09448346431
@teoguso

This comment has been minimized.

Copy link

commented Oct 15, 2018

Hello there, thank you for the nice example. I'm just a little confused by your comment about the category column, and beside that, I can't replicate your result using your parameters. Here are my questions: why would you keep the category (which you use as label) in the feature matrix? Why removing it would hinder the cold-start problem treatment? Thanks.

@jujaryu

This comment has been minimized.

Copy link

commented Jan 14, 2019

Hello, how do i get "Full Information MSE: 0.00644224764093" loss?
I used your parameters and 10000 common buy index but only got 0.5 MSE.
did you used full data for training?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.