Skip to content

Instantly share code, notes, and snippets.

@morrisalp
Last active May 14, 2021 07:49
Show Gist options
  • Save morrisalp/b6cb81bf73411c40f861f865a2b12fa9 to your computer and use it in GitHub Desktop.
Save morrisalp/b6cb81bf73411c40f861f865a2b12fa9 to your computer and use it in GitHub Desktop.
loading MovieLens 100K dataset with Surprise & Pandas
import surprise
import pandas as pd
# Ratings: the 'ml-100k' dataset that ships as a Surprise built-in.
data = surprise.Dataset.load_builtin('ml-100k')

# Directory where Surprise keeps its dataset files on disk.
ddir = surprise.get_dataset_dir()

# Item lookup table: only the first two columns of the pipe-delimited
# u.item file are read, and they are named 'iid' and 'item_name'.
item_data = pd.read_csv(
    f'{ddir}/ml-100k/ml-100k/u.item',
    sep='|',
    header=None,
    encoding='ISO-8859-1',
    usecols=[0, 1],
    names=['iid', 'item_name'],
)
def dataset2df(ds, train=True, *, items=None):
    """Convert a Surprise trainset or testset into a DataFrame with item names.

    Args:
        ds: When ``train`` is true, an object exposing ``all_ratings()``,
            ``to_raw_uid()`` and ``to_raw_iid()`` (e.g. a Surprise trainset).
            Otherwise an iterable of ``(uid, iid, rating)`` rows.
        train: Whether ``ds`` is a trainset whose ids must first be
            converted with ``to_raw_uid`` / ``to_raw_iid``.
        items: Optional DataFrame with an ``iid`` column to join against.
            Defaults to the module-level ``item_data``.

    Returns:
        A DataFrame with ``uid``, ``iid`` and ``rating`` columns plus the
        remaining columns of ``items``, left-joined on ``iid`` so that rows
        without a matching item are kept.
    """
    rows = ds.all_ratings() if train else ds
    df = pd.DataFrame(rows, columns=['uid', 'iid', 'rating'])
    if train:
        # Convert ids with the trainset that was actually passed in, rather
        # than relying on a global that happens to be named ``trainset``.
        df['uid'] = df['uid'].apply(ds.to_raw_uid)
        df['iid'] = df['iid'].apply(ds.to_raw_iid)
    # Cast ids to int so that ``iid`` can be joined against the item table.
    df['uid'] = df['uid'].astype(int)
    df['iid'] = df['iid'].astype(int)
    if items is None:
        items = item_data
    return pd.merge(df, items, how='left', on='iid')
def datasets2dfs(trainset, testset):
    """Build the train and test DataFrames for a trainset/testset pair.

    Both inputs are passed through ``dataset2df``; the testset is converted
    with ``train=False``.

    Returns:
        A ``(df_train, df_test)`` tuple.
    """
    return dataset2df(trainset), dataset2df(testset, train=False)
# # To get test set of all unrated items:
# trainset = data.build_full_trainset()
# testset = trainset.build_anti_testset()
# df_train, df_test = datasets2dfs(trainset, testset)
# # To get test set of some rated items:
# trainset, testset = surprise.model_selection.train_test_split(data, test_size=.25)
# df_train, df_test = datasets2dfs(trainset, testset)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment