Skip to content

Instantly share code, notes, and snippets.

@micaleel
Created November 23, 2020 12:25
Show Gist options
  • Save micaleel/c028217f936460aa097875b42be07644 to your computer and use it in GitHub Desktop.
Save micaleel/c028217f936460aa097875b42be07644 to your computer and use it in GitHub Desktop.
For splitting recommendation data loaded into a Pandas DataFrame
from typing import List, Tuple
import numpy as np
import os
import pandas as pd
# Left as `...` in the source: must be replaced with the directory that
# contains the "ml-100k" folder before this script can run.
DATA_DIR = ...

# Load the tab-separated ratings file, naming its four columns explicitly.
data = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, "ml-100k", "u.data"),
    names=["user_id", "item_id", "rating", "timestamp"],
    sep="\t",
)
def split(
    data: pd.DataFrame,
    chrono: bool = True,
    loo: bool = True,
    min_ratings: int = 0,
    shuffle: bool = False,
    train_size: float = 0.8,
) -> Tuple[np.ndarray, np.ndarray]:
    """Split per-user interaction rows into train and test arrays.

    The frame is grouped by ``user_id`` (in order of first appearance) and
    every user's rows are split independently; the per-user pieces are then
    stacked into two NumPy arrays whose columns follow ``data.columns``.

    Args:
        data: Interactions with a ``user_id`` column, plus a ``timestamp``
            column when ``chrono`` is true.
        chrono: Sort each user's rows by ``timestamp`` before splitting.
        loo: Leave-one-out. The last row of each user (after the optional
            sort/shuffle) goes to test, everything else to train.
        min_ratings: Users with fewer rows than this are dropped entirely.
        shuffle: Randomly permute each user's rows (after the optional sort)
            using NumPy's global random state.
        train_size: Fraction of each user's rows placed in train when ``loo``
            is false. The rows are sampled at random (global NumPy random
            state) regardless of ``chrono``; only the row order inside each
            part reflects the sort.

    Returns:
        ``(train, test)`` as 2-D arrays. When no user qualifies, both are
        empty arrays of shape ``(0, n_columns)``.

    Raises:
        ValueError: If ``chrono`` is true and ``data`` has no ``timestamp``
            column.
    """
    # Loop-invariant check: fail once, up front, instead of inside the loop.
    if chrono and "timestamp" not in data.columns:
        raise ValueError("timestamp column missing in DataFrame")

    train, test = [], []  # per-user np.ndarray chunks
    for _, df in data.groupby("user_id", sort=False):
        if len(df) < min_ratings:
            continue

        X = df.sort_values("timestamp").values if chrono else df.values
        if shuffle:
            # permutation() returns a shuffled copy, so the buffer behind
            # `.values` (possibly shared or read-only) is never mutated.
            X = np.random.permutation(X)

        n_samples = X.shape[0]
        # Mask convention: True -> train row, False -> test row.
        if loo:
            mask = np.ones(n_samples, dtype=bool)
            mask[-1] = False
        else:
            train_idxs = np.random.choice(
                a=n_samples, size=int(n_samples * train_size), replace=False
            )
            mask = np.zeros(n_samples, dtype=bool)
            mask[train_idxs] = True

        train.append(X[mask])
        test.append(X[~mask])

    if not train:
        # np.concatenate rejects an empty list; return well-shaped empties.
        empty = data.values[:0]
        return empty.copy(), empty.copy()
    return np.concatenate(train), np.concatenate(test)
# Split the loaded ratings using split()'s defaults (chrono=True, loo=True).
train_arrs, test_arrs = split(data=data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment