Skip to content

Instantly share code, notes, and snippets.

View dpoulopoulos's full-sized avatar
🏠
Working from home

Dimitris Poulopoulos dpoulopoulos

🏠
Working from home
View GitHub Profile
@dpoulopoulos
dpoulopoulos / metaflow_1.py
Created February 22, 2020 17:11
Linear metaflow transition.
import numpy as np
from metaflow import FlowSpec, step
class CalculateMean(FlowSpec):
@step
def start(self):
"""
Initializes a random dataset.
@dpoulopoulos
dpoulopoulos / metaflow_2.py
Created February 22, 2020 17:42
Convenience functions to check whether a number is prime, odd or even.
def check_prime(x):
    """
    Convenience function that checks if a number is prime.

    Args:
        x: The integer to test.

    Returns:
        True if ``x`` is a prime number, False otherwise. Every value
        ``x <= 1`` (0, 1 and negative numbers) is reported as not prime.
    """
    # Numbers below 2 are not prime by definition; answer explicitly
    # instead of falling off the end of the function.
    if x <= 1:
        return False
    # A composite number always has a divisor no larger than its square
    # root, so trial division can stop as soon as i * i exceeds x.
    i = 2
    while i * i <= x:
        if x % i == 0:
            return False
        i += 1
    return True
@dpoulopoulos
dpoulopoulos / metaflow_4.py
Last active February 22, 2020 18:26
Nesting branches.
import numpy as np
from metaflow import FlowSpec, Parameter, step
class CheckNumbers(FlowSpec):
cores = Parameter('cores',
help="Parallelize the operation in that many CPU cores.",
default=4)
@dpoulopoulos
dpoulopoulos / metaflow_3.py
Last active February 22, 2020 18:27
Find prime, odd and even numbers in the dataset.
import numpy as np
from metaflow import FlowSpec, step
class CheckNumbers(FlowSpec):
@step
def start(self):
"""
Initializes a random dataset.
@dpoulopoulos
dpoulopoulos / incremental_recommender_1.py
Last active February 22, 2020 18:41
Load the movielens 1m dataset ratings file.
# Load the ratings file. Column names are supplied explicitly through `names`,
# and fields are split on the two-character '::' delimiter, which is why the
# slower but more flexible 'python' parsing engine is requested.
col_names = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_csv('/tmp/ratings.dat', delimiter='::', names=col_names, engine='python')
# Convert the user and movie identifiers to pandas categorical columns.
ratings_df['user_id'] = ratings_df['user_id'].astype('category')
ratings_df['movie_id'] = ratings_df['movie_id'].astype('category')
# Use the integer category codes as the encoded ids, which avoids building
# separate vocabularies by hand.
ratings_df['user_code'] = ratings_df['user_id'].cat.codes.astype(int)
@dpoulopoulos
dpoulopoulos / incremental_recommender_2.py
Last active February 22, 2020 18:41
Count the unique users and items in the MovieLens 1M dataset.
# Size of each id space, taken as the largest code plus one.
# NOTE(review): assumes the codes start at 0 and are contiguous (as pandas
# category codes are) - confirm against the snippet that creates them.
n_users = ratings_df['user_code'].max() + 1
n_movies = ratings_df['movie_code'].max() + 1
@dpoulopoulos
dpoulopoulos / incremental_recommender_3.py
Last active February 22, 2020 18:42
Sort the movielens 1m dataset by timestamp.
# Order the ratings by ascending timestamp. sort_values returns a new
# DataFrame here, so ratings_df itself keeps its original row order.
data_df = ratings_df.sort_values(by='timestamp')
@dpoulopoulos
dpoulopoulos / incremental_recommender_4.py
Last active February 22, 2020 18:44
Filter the movielens dataset based on the rating value.
# Binarise the ratings: strictly greater than 4 becomes 1, anything else 0.
data_df['preference'] = np.where(data_df['rating'].gt(4), 1, 0)
# Retain only the rows flagged as a positive preference.
data_df_cleaned = data_df.loc[data_df['preference'].eq(1)]
# Preview the first rows of the filtered frame.
data_df_cleaned.head()
@dpoulopoulos
dpoulopoulos / incremental_recommender_5.py
Last active February 22, 2020 18:44
Create Step model.
# Network with 128 factors whose weights are initialised by
# torch.nn.init.normal_ using mean 0 and standard deviation 0.1.
net = SimpleCF(n_users, n_movies, factors=128, init=torch.nn.init.normal_, mean=0., std=.1)


def objective(pred, target):
    """Return the signed difference between the target and the prediction."""
    return target - pred


optimizer = SGD(net.parameters(), lr=6e-2)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bundle network, objective and optimizer into the Step wrapper.
model = Step(net, objective, optimizer, device=device)
@dpoulopoulos
dpoulopoulos / incremental_recommender_6.py
Last active February 22, 2020 18:44
Batch training using Step.
# Number of rows that make up 20% of the filtered data (truncated to an int).
pct = int(data_df_cleaned.shape[0] * .2)
# The first `pct` rows form the bootstrapping set.
bootstrapping_data = data_df_cleaned[:pct]
# Input columns and target column handed to the model.
# NOTE(review): 'rating' is passed as an input feature next to the two id
# codes - confirm this is what the network expects.
features = ['user_code', 'movie_code', 'rating']
target = ['preference']
# Wrap the feature and target values as tensors in a single dataset.
data_set = TensorDataset(torch.tensor(bootstrapping_data[features].values), torch.tensor(bootstrapping_data[target].values))
# shuffle=False keeps the batches in the row order of bootstrapping_data.
# BATCH_SIZE is expected to be defined elsewhere.
data_loader = DataLoader(data_set, batch_size=BATCH_SIZE, shuffle=False)
# Fit the model on the bootstrapping batches.
model.batch_fit(data_loader)