Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
This is a very basic data generator for testing recommender systems. A future version may simulate the actual sparseness of ratings data with a simple bootstrap function, but for now NumPy's random integer generator does the job.

RecData

To use this snippet, install faker:

pip install faker

Generate 100 users with 200 title features (default settings)

rec_data = RecData()
df = rec_data.generate_df()
df

Generate 20 users with 12 title features

rec_data = RecData(n_users=20, n_features=12)
df = rec_data.generate_df()
df

Generate 500 users with 3000 title features and a ratings scale of 0-5

rec_data = RecData(n_users=500, n_features=3000, ratings_scale=6)
df = rec_data.generate_df()
df

Share this with anyone you want 😉

import numpy as np, pandas as pd
from faker import Factory
class RecData:
    """Generate a random user-by-title ratings table for testing recommenders.

    Rows are labelled with fake person names and columns with fake
    three-word titles, both produced by Faker. Cell values are random
    integers drawn by ``np.random.randint``.

    Recognised keyword options (any other keys in ``opts`` are ignored):

    * ``n_users`` -- number of rows (default 100)
    * ``n_features`` -- number of title columns (default 200)
    * ``ratings_scale`` -- exclusive upper bound of the ratings, so values
      fall in ``0 .. ratings_scale - 1`` (default 2, i.e. binary 0 or 1)
    """

    # Class-level defaults; __init__ falls back to these for missing options.
    ratings_scale = 2  # 2 = binary 1 or 0
    n_users = 100
    n_features = 200

    def __init__(self, **opts):
        """Store the generator settings and create the Faker instance."""
        # Reading the fallback from the class attribute keeps each default
        # defined in exactly one place.
        self.n_users = opts.get('n_users', self.n_users)
        self.n_features = opts.get('n_features', self.n_features)
        self.ratings_scale = opts.get('ratings_scale', self.ratings_scale)
        # Don't forget to `pip install faker`.
        self.faker = Factory.create()

    @staticmethod
    def _unique_labels(labels):
        """Return ``labels`` with repeats renamed so every entry is distinct.

        The first occurrence of a label is kept as-is; later occurrences get
        a `` (2)``, `` (3)``, ... suffix. Order and length are preserved.
        """
        taken = set()
        next_suffix = {}  # label -> next numeric suffix to try
        unique = []
        for label in labels:
            candidate = label
            suffix = next_suffix.get(label, 2)
            # Keep bumping the suffix until the candidate collides with
            # nothing emitted so far (including pre-suffixed originals).
            while candidate in taken:
                candidate = "{} ({})".format(label, suffix)
                suffix += 1
            next_suffix[label] = suffix
            taken.add(candidate)
            unique.append(candidate)
        return unique

    def generate_names(self):
        """Return ``n_users`` distinct fake person names.

        Faker may produce the same name twice; repeats are suffixed so the
        result can be used as a unique DataFrame index.
        """
        names = [self.faker.name() for _ in range(self.n_users)]
        return self._unique_labels(names)

    def generate_fake_title(self):
        """Return one fake title: up to three title-cased words of Faker text."""
        words = self.faker.text().split()[:3]
        return " ".join([word.title() for word in words])

    def generate_titles(self):
        """Return ``n_features`` distinct fake titles.

        Three-word titles collide easily, and duplicate column labels make
        ``df[title]`` return several columns, so repeats are suffixed.
        """
        titles = [self.generate_fake_title() for _ in range(self.n_features)]
        return self._unique_labels(titles)

    def generate_df(self):
        """Return an ``n_users`` x ``n_features`` DataFrame of random ratings.

        Values are integers in ``0 .. ratings_scale - 1``; the index holds
        the generated names and the columns hold the generated titles.
        """
        ratings = np.random.randint(
            self.ratings_scale,
            size=(self.n_users, self.n_features),
        )
        return pd.DataFrame(
            ratings,
            columns=self.generate_titles(),
            index=self.generate_names(),
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.