This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the ratings file. The '::' separator is longer than one character,
# which pandas only supports with the (slower) pure-Python parsing engine.
col_names = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_csv('/tmp/ratings.dat', delimiter='::', names=col_names, engine='python')

# Transform users and movies to categorical features.
ratings_df['user_id'] = ratings_df['user_id'].astype('category')
ratings_df['movie_id'] = ratings_df['movie_id'].astype('category')

# Use the category codes as integer ids to avoid creating separate vocabularies.
ratings_df['user_code'] = ratings_df['user_id'].cat.codes.astype(int)
# 'movie_code' is read elsewhere in this file (model sizing and feature
# tensors), so it is created here alongside 'user_code'.
ratings_df['movie_code'] = ratings_df['movie_id'].cat.codes.astype(int)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the network with 128 factors. `init`, `mean` and `std` are forwarded to
# SimpleCF -- presumably they control weight initialisation (confirm against
# the SimpleCF implementation).
net = SimpleCF(n_users, n_movies, factors=128, init=torch.nn.init.normal_, mean=0., std=.1)


def objective(pred, target):
    """Return the signed error ``target - pred``."""
    return target - pred


optimizer = SGD(net.parameters(), lr=6e-2)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Step bundles the network, objective and optimizer; it is the object the
# rest of this file calls `batch_fit` on.
model = Step(net, objective, optimizer, device=device)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Use the first 20% of the rows (by position) to bootstrap the model.
pct = int(data_df_cleaned.shape[0] * .2)
bootstrapping_data = data_df_cleaned.iloc[:pct]

# Columns fed to the model and the column used as the training target.
features = ['user_code', 'movie_code', 'rating']
target = ['preference']

# Wrap the bootstrap rows as tensors. shuffle=False keeps the rows in their
# existing order while batching.
data_set = TensorDataset(
    torch.tensor(bootstrapping_data[features].values),
    torch.tensor(bootstrapping_data[target].values),
)
data_loader = DataLoader(data_set, batch_size=BATCH_SIZE, shuffle=False)

model.batch_fit(data_loader)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get the remaining data: every row that was not used for bootstrapping.
data_df_step = data_df_cleaned.drop(bootstrapping_data.index)
data_df_step = data_df_step.reset_index(drop=True)

# Create the DataLoader. batch_size=1 with shuffle=False yields the remaining
# rows one at a time, in their existing order.
stream_data_set = TensorDataset(
    torch.tensor(data_df_step[features].values),
    torch.tensor(data_df_step[target].values),
)
stream_data_loader = DataLoader(stream_data_set, batch_size=1, shuffle=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
k = 10  # we keep only the top 10 recommendations
recalls = []
known_users = []

with tqdm(total=len(stream_data_loader)) as pbar:
    # NOTE(review): the loop variable `features` shadows the module-level
    # `features` column list for the rest of the script; rename one of them if
    # the column list is needed again after this loop.
    for idx, (features, preferences) in enumerate(stream_data_loader):
        itr = idx + 1  # 1-based iteration counter
        # Columns follow the order ['user_code', 'movie_code', 'rating'].
        user = features[:, 0]
        item = features[:, 1]
        # NOTE: the remainder of the loop body is not part of this excerpt.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smooth the per-step recall values before plotting; 5000 is passed as the
# second argument of moving_avg (presumably the window size -- confirm against
# its definition).
avgs = moving_avg(recalls, 5000)

plt.title('Recall@10')
plt.xlabel('Iterations')
plt.ylabel('Metric')
plt.ylim(0., .1)  # fix the y-axis to the range [0, 0.1]
plt.plot(avgs)
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Category codes start at 0, so the largest code plus one is the number of
# distinct users / movies.
n_users = ratings_df['user_code'].max() + 1
n_movies = ratings_df['movie_code'].max() + 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Order the ratings by ascending timestamp; sort_values returns a new frame,
# so ratings_df itself is left unchanged.
data_df = ratings_df.sort_values(by='timestamp')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Binarise the ratings: rating > 4 -> preference 1, everything else -> 0.
data_df['preference'] = np.where(data_df['rating'] > 4, 1, 0)

# Keep only the rows with preference 1 and discard the others.
data_df_cleaned = data_df[(data_df['preference'] == 1)]
data_df_cleaned.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from metaflow import FlowSpec, step | |
class CalculateMean(FlowSpec): | |
@step | |
def start(self): | |
""" | |
Initializes a random dataset. |
OlderNewer