Skip to content

Instantly share code, notes, and snippets.

View dpoulopoulos's full-sized avatar
🏠
Working from home

Dimitris Poulopoulos dpoulopoulos

🏠
Working from home
View GitHub Profile
@dpoulopoulos
dpoulopoulos / incremental_recommender_1.py
Last active February 22, 2020 18:41
Load the MovieLens 1M dataset ratings file.
# read the raw '::'-separated ratings file; the multi-character separator
# is why the python parsing engine is requested
col_names = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_csv(
    '/tmp/ratings.dat',
    sep='::',
    names=col_names,
    engine='python',
)
# turn both id columns into categoricals in a single pass
ratings_df = ratings_df.astype({'user_id': 'category', 'movie_id': 'category'})
# the categorical codes double as integer ids, so no separate vocabulary is built
user_codes = ratings_df['user_id'].cat.codes
ratings_df['user_code'] = user_codes.astype(int)
@dpoulopoulos
dpoulopoulos / incremental_recommender_5.py
Last active February 22, 2020 18:44
Create Step model.
# run on the GPU when one is visible to torch, otherwise stay on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def objective(pred, target):
    """Return the signed difference between the target and the prediction."""
    return target - pred


# collaborative-filtering network with normally initialised weights
net = SimpleCF(
    n_users,
    n_movies,
    factors=128,
    init=torch.nn.init.normal_,
    mean=0.,
    std=.1,
)
optimizer = SGD(net.parameters(), lr=6e-2)
# Step bundles the network, objective and optimizer behind one interface
model = Step(net, objective, optimizer, device=device)
@dpoulopoulos
dpoulopoulos / incremental_recommender_6.py
Last active February 22, 2020 18:44
Batch training using Step.
# columns fed to the model and the column used as the label
features = ['user_code', 'movie_code', 'rating']
target = ['preference']

# the first 20% of the rows are used to bootstrap the model
pct = int(len(data_df_cleaned) * .2)
bootstrapping_data = data_df_cleaned.iloc[:pct]

# wrap inputs and labels as tensors, then serve them in order (no shuffling)
feature_tensor = torch.tensor(bootstrapping_data[features].to_numpy())
target_tensor = torch.tensor(bootstrapping_data[target].to_numpy())
data_set = TensorDataset(feature_tensor, target_tensor)
data_loader = DataLoader(data_set, batch_size=BATCH_SIZE, shuffle=False)

model.batch_fit(data_loader)
@dpoulopoulos
dpoulopoulos / incremental_recommender_7.py
Last active February 22, 2020 18:44
Create the streaming data for Step evaluation.
# every row that was not used for bootstrapping becomes the stream,
# re-indexed from zero
data_df_step = (
    data_df_cleaned
    .drop(index=bootstrapping_data.index)
    .reset_index(drop=True)
)

# one sample per batch, served in order, to mimic a stream of events
stream_features = torch.tensor(data_df_step[features].to_numpy())
stream_targets = torch.tensor(data_df_step[target].to_numpy())
stream_data_set = TensorDataset(stream_features, stream_targets)
stream_data_loader = DataLoader(stream_data_set, batch_size=1, shuffle=False)
@dpoulopoulos
dpoulopoulos / incremental_recommender_8.py
Last active February 22, 2020 18:44
Incremental training using Step.
# Walk the streaming loader one batch at a time, collecting recall values
# and the users seen so far.
# NOTE(review): the indentation of the `with`/`for` bodies appears to have been
# lost in this listing, and the loop body is cut off after the last line shown.
k = 10 # we keep only the top 10 recommendations
# accumulators filled while streaming; how they are updated is not visible here
recalls = []
known_users = []
# progress bar sized to the number of batches in the stream
with tqdm(total=len(stream_data_loader)) as pbar:
# NOTE(review): this loop rebinds the name `features` to the batch tensor
for idx, (features, preferences) in enumerate(stream_data_loader):
# 1-based iteration counter
itr = idx + 1
# first and second columns of the batch tensor
# presumably user_code and movie_code, per the feature column order used to
# build the dataset — confirm
user = features[:, 0]
item = features[:, 1]
@dpoulopoulos
dpoulopoulos / incremental_recommender_9.py
Last active February 22, 2020 18:49
Visualize the recall@10 for the MovieLens dataset, using Step.
# smooth the per-iteration recall with a 5000-wide moving average
avgs = moving_avg(recalls, 5000)

# draw the curve first, then decorate the axes
plt.plot(avgs)
plt.ylim(0., .1)
plt.ylabel('Metric')
plt.xlabel('Iterations')
plt.title('Recall@10')
plt.show()
@dpoulopoulos
dpoulopoulos / incremental_recommender_2.py
Last active February 22, 2020 18:41
Count the unique users and items in the MovieLens 1M dataset.
# codes start at zero, so the largest code plus one is the number of distinct ids
n_users, n_movies = (
    ratings_df[code_col].max() + 1
    for code_col in ('user_code', 'movie_code')
)
@dpoulopoulos
dpoulopoulos / incremental_recommender_3.py
Last active February 22, 2020 18:42
Sort the movielens 1m dataset by timestamp.
# order the ratings chronologically (ascending timestamp) into a new frame
data_df = ratings_df.sort_values('timestamp')
@dpoulopoulos
dpoulopoulos / incremental_recommender_4.py
Last active February 22, 2020 18:44
Filter the movielens dataset based on the rating value.
# a rating strictly above 4 is a positive preference (1); anything else is 0
is_liked = data_df['rating'] > 4
data_df['preference'] = np.where(is_liked, 1, 0)
# retain only the positive interactions
positive_rows = data_df['preference'] == 1
data_df_cleaned = data_df[positive_rows]
# preview the first rows of the filtered frame
data_df_cleaned.head()
@dpoulopoulos
dpoulopoulos / metaflow_1.py
Created February 22, 2020 17:11
Linear metaflow transition.
import numpy as np
from metaflow import FlowSpec, step
class CalculateMean(FlowSpec):
@step
def start(self):
"""
Initializes a random dataset.