# victorkohler/CFDNN_data_02.py

## CFDNN_data_02.py
def get_negatives(uids, iids, items, df_test, num_negatives=100):
    """Sample random negative (non-interacted) item ids for each test pair.

    For every (user_id, item_id) row of ``df_test`` this draws
    ``num_negatives`` item ids uniformly from ``range(len(items))`` and
    rejects any id the user already has an interaction with according to
    ``uids``/``iids``. Sampling is with replacement, so the same negative
    id may appear more than once in a row.

    Args:
        uids (np.array): User id of every known interaction.
        iids (np.array): Item id of every known interaction, aligned
            element-wise with ``uids``.
        items (list): List of all unique items. Only its length is used;
            candidate ids are the integers ``0 .. len(items) - 1``.
        df_test (dataframe): Our test set, with ``user_id`` and
            ``item_id`` columns.
        num_negatives (int): Number of negatives to draw for each test
            pair. Defaults to 100.

    Returns:
        df_neg (dataframe): One row per row of ``df_test``. Column 0 holds
            the ``(user_id, item_id)`` tuple and the following
            ``num_negatives`` columns hold the sampled item ids.

    Raises:
        ValueError: If ``num_negatives`` is negative, or if a test user has
            already interacted with every candidate id, in which case no
            negative exists and rejection sampling could never finish.
    """
    if num_negatives < 0:
        raise ValueError("num_negatives must be non-negative")

    n_items = len(items)

    # Group the known interactions per user so that each rejection test is
    # a single set lookup on that user's items.
    seen_by_user = {}
    for user, item in zip(uids, iids):
        seen_by_user.setdefault(user, set()).add(item)

    no_history = frozenset()
    test_pairs = zip(df_test['user_id'].values.tolist(),
                     df_test['item_id'].values.tolist())

    rows = []
    for u, i in test_pairs:
        seen = seen_by_user.get(u, no_history)

        # The sampling loop below only ends when a free id exists. The
        # cheap length test short-circuits the exact scan for nearly
        # every user.
        if (num_negatives
                and len(seen) >= n_items
                and all(j in seen for j in range(n_items))):
            raise ValueError(
                "user %r has interacted with every item; "
                "no negative can be sampled" % (u,))

        row = [(u, i)]
        for _ in range(num_negatives):
            j = np.random.randint(n_items)  # Candidate item id.
            while j in seen:  # Redraw until the user has not seen it.
                j = np.random.randint(n_items)
            row.append(j)
        rows.append(row)

    return pd.DataFrame(rows)

def mask_first(x):
    """Build a mask that is 0 at the first position and 1 everywhere else.

    Args:
        x (array-like): Values of one group. Only its shape and dtype are
            used, not its contents.

    Returns:
        result (np.array): Array of ones with the shape and dtype of ``x``
            whose first entry is set to 0. An empty input gives an empty
            array.
    """
    result = np.ones_like(x)

    # An empty input has no first element; indexing it would raise
    # IndexError, so the empty array is returned untouched.
    if result.size:
        result[0] = 0

    return result

def train_test_split(df):
    """Split the data into a leave-one-out test set and a training set.

    The test set is made up of one row for each user: the first row of
    that user in the order of ``df``. This is our holdout item used to
    compute Top@K later.

    The training set is the original data without those holdout rows.
    The holdout rows are selected with the very same mask that removes
    them from the training data, so the two sets can never disagree,
    even when a column contains missing values.

    Args:
        df (dataframe): Our original data, with ``user_id``, ``item_id``
            and ``plays`` columns. It is not modified.

    Returns:
        df_train (dataframe): All rows of ``df`` except the holdout rows,
            with their original index and columns.
        df_test (dataframe): Only the holdout rows, restricted to the
            ``user_id``, ``item_id`` and ``plays`` columns, indexed by
            user id (unnamed index) and sorted by it.
    """
    # Position of each row inside its user's group; 0 marks the first row.
    position = df.groupby('user_id').cumcount()
    is_holdout = position == 0

    # Take the real first row of every user rather than GroupBy.first(),
    # which returns the first non-null value of each column and could
    # therefore stitch together values from different rows.
    df_test = df.loc[is_holdout, ['user_id', 'item_id', 'plays']].copy()

    # Index by user id with an unnamed index. Assigning raw values leaves
    # the index name unset, so no `del index.name` is needed (that
    # statement fails on pandas versions where the name is a property).
    df_test.index = df_test['user_id'].values
    df_test = df_test.sort_index()

    # Everything that is not a holdout row is training data.
    df_train = df.loc[~is_holdout]

    return df_train, df_test
	def get_negatives(uids, iids, items, df_test):
	"""Returns a pandas dataframe of 100 negative interactions
	based for each user in df_test.

	Args:
	uids (np.array): Numpy array of all user ids.
	iids (np.array): Numpy array of all item ids.
	items (list): List of all unique items.
	df_test (dataframe): Our test set.

	Returns:
	df_neg (dataframe): dataframe with 100 negative items
	for each (u, i) pair in df_test.
	"""

	negativeList = []
	test_u = df_test['user_id'].values.tolist()
	test_i = df_test['item_id'].values.tolist()

	test_ratings = list(zip(test_u, test_i))
	zipped = set(zip(uids, iids))

	for (u, i) in test_ratings:
	negatives = []
	negatives.append((u, i))
	for t in range(100):
	j = np.random.randint(len(items)) # Get random item id.
	while (u, j) in zipped: # Check if there is an interaction
	j = np.random.randint(len(items)) # If yes, generate a new item id
	negatives.append(j) # Once a negative interaction is found we add it.
	negativeList.append(negatives)

	df_neg = pd.DataFrame(negativeList)

	return df_neg

	def mask_first(x):
	"""
	Return a list of 0 for the first item and 1 for all others
	"""
	result = np.ones_like(x)
	result[0] = 0

	return result

	def train_test_split(df):
	"""
	Splits our original data into one test and one
	training set.

	The test set is made up of one item for each user. This is
	our holdout item used to compute Top@K later.

	The training set is the same as our original data but
	without any of the holdout items.

	Args:
	df (dataframe): Our original data

	Returns:
	df_train (dataframe): All of our data except holdout items
	df_test (dataframe): Only our holdout items.
	"""

	# Create two copies of our dataframe that we can modify
	df_test = df.copy(deep=True)
	df_train = df.copy(deep=True)

	# Group by user_id and select only the first item for
	# each user (our holdout).
	df_test = df_test.groupby(['user_id']).first()
	df_test['user_id'] = df_test.index
	df_test = df_test[['user_id', 'item_id', 'plays']]
	del df_test.index.name

	# Remove the same items as we for our test set in our training set.
	mask = df.groupby(['user_id'])['user_id'].transform(mask_first).astype(bool)
	df_train = df.loc[mask]

	return df_train, df_test