oscar-defelice / df_ratings.py
Last active May 14, 2020 06:54
snippet of code for recommender systems
# Build the user-item matrix: one row per UserId, one column per MovieId.
df_matrix = df_rating.pivot(index='UserId', columns='MovieId', values='Rating')
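As a quick illustration of what this pivot produces (the sample ratings below are ours, not from the gist):

import pandas as pd

df_rating = pd.DataFrame({'UserId': [1, 1, 2],
                          'MovieId': [10, 20, 10],
                          'Rating': [4.0, 3.0, 5.0]})
df_matrix = df_rating.pivot(index='UserId', columns='MovieId', values='Rating')
print(df_matrix)
# MovieId   10   20
# UserId
# 1        4.0  3.0
# 2        5.0  NaN   <- pairs without a rating become NaN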
from keras import backend as K
from keras.layers import Layer

class TripletLossLayer(Layer):
    """
    Layer object to minimise the triplet loss.
    Here we implement the Bayesian Personalised Ranking (BPR) triplet loss.
    """
    def __init__(self, **kwargs):
        super(TripletLossLayer, self).__init__(**kwargs)
    def bpr_triplet_loss(self, inputs):
        """1 - sigmoid(anchor.positive - anchor.negative), averaged over the batch."""
        anchor, positive, negative = inputs
        return K.mean(1.0 - K.sigmoid(
            K.sum(anchor * positive, axis=-1) - K.sum(anchor * negative, axis=-1)))
    def call(self, inputs):
        loss = self.bpr_triplet_loss(inputs)
        self.add_loss(loss)
        return loss
from keras.models import Sequential
from keras.layers import Dense

def build_embedding(df, features, emb_dim=10, name='embedding_layer'):
    '''
    Define the embedding neural network to encode features in an emb_dim-dimensional vector.
    Parameters
    ----------
    df : pandas DataFrame
        dataframe containing input metadata
    features : list of str
        names of the df columns to encode
    '''
    # The gist preview is truncated here; a minimal dense encoder as completion.
    model = Sequential(name=name)
    model.add(Dense(emb_dim, activation='relu', input_shape=(len(features),)))
    return model
def build_model(n_users, n_items, emb_dim=30):
    '''
    Define the Keras Model for training.
    Parameters
    ----------
    n_users : int
        number of users
    n_items : int
        number of items
    emb_dim : int
        dimension of the user/item embedding space
    '''
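The gist preview cuts off before the function body. A minimal sketch of how such a triplet model could be assembled from the pieces above; the function name build_model_sketch, the layer choices and the shared item embedding are our assumptions, not the gist's actual implementation:

from keras.models import Model
from keras.layers import Input, Embedding, Flatten

def build_model_sketch(n_users, n_items, emb_dim=30):
    # user (anchor), positive-item and negative-item ids as inputs
    user, pos, neg = Input(shape=(1,)), Input(shape=(1,)), Input(shape=(1,))
    user_vec = Flatten()(Embedding(n_users, emb_dim)(user))
    item_emb = Embedding(n_items, emb_dim)  # shared by positive and negative items
    loss = TripletLossLayer()([user_vec, Flatten()(item_emb(pos)), Flatten()(item_emb(neg))])
    return Model(inputs=[user, pos, neg], outputs=loss)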
import numpy as np

def get_posneg(df, anchor):
    """
    Given a user id anchor, it gives back the max number of triplets
    [anchor, positive, negative] available.
    Triplets are randomly shuffled to better feed the training network.
    Parameters
    ----------
    df : pandas DataFrame
        Dataframe containing ratings, with user ids as rows and movie ids as columns
    """
    # Truncated in the preview; minimal completion: rated items are positives,
    # unrated items are negatives, both shuffled.
    ratings = df.loc[anchor]
    pos = ratings[ratings.notna()].index.to_numpy()
    neg = ratings[ratings.isna()].index.to_numpy()
    np.random.shuffle(pos); np.random.shuffle(neg)
    return pos, neg
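Before the training loop below can run, the triplet network has to be built and compiled. A hedged sketch of that step; the hyperparameter values are illustrative and build_model_sketch refers to the hypothetical model above, not to code shown in the gist:

from keras.optimizers import Adam

# Illustrative hyperparameters; the preview does not show the real ones.
n_iter, batch_size, evaluate_every = 1000, 64, 100

# TripletLossLayer registers its loss via add_loss, hence loss=None here.
network_train = build_model_sketch(n_users, n_items)
network_train.compile(optimizer=Adam(lr=1e-3), loss=None)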
print("Starting training process!")
print("-------------------------------------")
t_start = time.time()
for i in range(1, n_iter+1):
triplets = get_triplets_hard(batch_size, X_usr, X_item, df_matrix)
loss = network_train.train_on_batch(triplets, None)
n_iteration += 1
if i % evaluate_every == 0:
print("\n ------------- \n")
def get_triplets_hard(batch_size, X_usr, X_item, df, return_cache=False):
    """
    Returns the list of three arrays to feed the model.
    Parameters
    ----------
    batch_size : int
        size of the batch.
    X_usr : numpy array of shape (n_users, n_user_features)
        user feature matrix; X_item is the analogous item feature matrix.
    """
    # The preview truncates here; the full gist mines hard triplets.
    # Simplified sketch: random anchors, positives/negatives via get_posneg.
    anchors = np.random.choice(df.index, size=batch_size)
    pos_neg = [get_posneg(df, a) for a in anchors]
    triplets = [(a, np.random.choice(p), np.random.choice(n)) for a, (p, n) in zip(anchors, pos_neg)]
    return [np.array(t) for t in zip(*triplets)]
import pandas as pd

train_data = pd.read_csv('train.csv', engine='python', encoding='utf-8',
                         header=None, names=['Class Index', 'Title', 'Text'])
test_data = pd.read_csv('test.csv', engine='python', encoding='utf-8',
                        header=None, names=['Class Index', 'Title', 'Text'])
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_len = 100  # maximum sequence length (illustrative value; not shown in the preview)

tokeniser = Tokenizer()
tokeniser.fit_on_texts(train_data['Text'])
tokenised_text = tokeniser.texts_to_sequences(train_data['Text'])
tokenised_text = pad_sequences(tokenised_text, maxlen=max_len)
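The embedding layer of whatever model follows also needs the vocabulary size; a small addition of ours, not shown in the preview:

# Keras Tokenizer indexes words from 1; index 0 is reserved for padding.
vocab_size = len(tokeniser.word_index) + 1
print(tokenised_text.shape)  # (n_documents, max_len)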
from sklearn import preprocessing
from keras.utils import to_categorical

encoded_labels = preprocessing.LabelEncoder()
y = encoded_labels.fit_transform(train_data['Class Index'])  # column named in read_csv above
y = to_categorical(y)
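A minimal sketch of a classifier this preprocessing could feed, assuming the vocab_size and max_len defined above; the architecture and hyperparameters are illustrative, not from the gist:

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential([
    Embedding(vocab_size, 64, input_length=max_len),  # learn 64-d word vectors
    LSTM(64),                                         # encode each padded sequence
    Dense(y.shape[1], activation='softmax'),          # one output unit per class
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(tokenised_text, y, epochs=3, validation_split=0.1)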