Skip to content

Instantly share code, notes, and snippets.

@victorkohler
Last active June 11, 2019 19:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save victorkohler/e7e6a11ada6d2e6616f841f4a7a53536 to your computer and use it in GitHub Desktop.
Save victorkohler/e7e6a11ada6d2e6616f841f4a7a53536 to your computer and use it in GitHub Desktop.
def get_negatives(uids, iids, items, df_test, n_negatives=100):
    """Sample negative (non-interacted) items for every pair in df_test.

    For each (user, item) row of ``df_test`` this draws ``n_negatives``
    random item ids that the user has no known interaction with and that
    are not the held-out test item itself. Sampling is done with
    replacement, so the same negative may appear more than once in a row.

    Candidate ids are drawn as integers in ``[0, len(items))`` and used
    directly as item ids, so this assumes item ids are integer codes in
    that range.

    Args:
        uids (np.array): User id of every known interaction.
        iids (np.array): Item id of every known interaction, aligned
            element-wise with ``uids``.
        items (list): List of all unique items. Only its length is used.
        df_test (dataframe): Our test set, with 'user_id' and 'item_id'
            columns.
        n_negatives (int): Number of negatives to draw per test pair.
            Defaults to 100.

    Returns:
        df_neg (dataframe): One row per (u, i) pair in df_test. Column 0
            holds the (u, i) tuple, the remaining ``n_negatives`` columns
            hold the sampled negative item ids.

    Raises:
        ValueError: If negatives are requested for a user who has no
            item left to sample from.
    """
    n_items = len(items)

    # Group the known interactions per user so each membership test in
    # the sampling loop only looks at that user's items.
    seen_by_user = {}
    for u, i in zip(uids, iids):
        seen_by_user.setdefault(u, set()).add(i)

    test_u = df_test['user_id'].values.tolist()
    test_i = df_test['item_id'].values.tolist()

    rows = []
    for u, i in zip(test_u, test_i):
        # The held-out item is a positive for this user even when it is
        # missing from uids/iids (e.g. when those come from the training
        # split), so it must never be returned as a negative.
        excluded = seen_by_user.get(u, set()) | {i}

        # Rejection sampling would spin forever if every candidate id is
        # excluded. The cheap length comparison skips the full scan in
        # the common case.
        if (n_negatives > 0 and len(excluded) >= n_items
                and all(j in excluded for j in range(n_items))):
            raise ValueError(
                "No negative items available for user {!r}".format(u))

        row = [(u, i)]
        for _ in range(n_negatives):
            j = np.random.randint(n_items)  # Draw a random item id.
            while j in excluded:  # Redraw until it is a true negative.
                j = np.random.randint(n_items)
            row.append(j)
        rows.append(row)

    df_neg = pd.DataFrame(rows)
    return df_neg
def mask_first(x):
    """
    Build a mask shaped like x that is 0 at the first position
    and 1 at every other position.
    """
    keep = np.ones_like(x)
    keep[0] = 0  # Only the leading entry is masked out.
    return keep
def train_test_split(df):
    """
    Splits our original data into one test and one
    training set.

    The test set is made up of one item for each user. This is
    our holdout item used to compute Top@K later.
    The training set is the same as our original data but
    without the first row of each user (the holdout row).

    Args:
        df (dataframe): Our original data, with 'user_id',
            'item_id' and 'plays' columns. It is not modified.

    Returns:
        df_train (dataframe): All of our data except holdout items
        df_test (dataframe): Only our holdout items, one row per
            user, with columns ['user_id', 'item_id', 'plays'] and
            indexed by user id (unnamed index).
    """
    # Group by user_id and select only the first item for
    # each user (our holdout). groupby returns a new frame, so no
    # defensive copy of df is needed.
    # NOTE(review): GroupBy.first() skips missing values per column by
    # default, so this matches each user's first row only if the data
    # has no NaNs in item_id/plays - confirm against the input data.
    df_test = df.groupby('user_id').first()
    df_test['user_id'] = df_test.index
    df_test = df_test[['user_id', 'item_id', 'plays']]

    # Drop the index name so it does not clash with the 'user_id'
    # column. Assigning None works where `del index.name` raises
    # AttributeError on current pandas.
    df_test.index.name = None

    # Remove each user's first row (the one held out for the test set)
    # from the training set. cumcount() numbers rows within each user
    # starting at 0, so > 0 keeps everything except the first row.
    mask = df.groupby('user_id').cumcount() > 0
    df_train = df.loc[mask]

    return df_train, df_test
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment