Skip to content

Instantly share code, notes, and snippets.

@devforfu
Created March 6, 2019 13:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save devforfu/18cee41301b931b722fbadf881a084c6 to your computer and use it in GitHub Desktop.
Save devforfu/18cee41301b931b722fbadf881a084c6 to your computer and use it in GitHub Desktop.
A simple random dataset generating script
def generate_dataset(n_rows, num_count, cat_count, max_nan=0.1, max_cat_size=100):
    """Randomly generate a dataset with numerical and categorical features.

    The numerical features are drawn from the normal distribution
    X ~ N(0, 1). The categorical features are random uuid4 strings with
    cardinality C where 2 <= C <= max_cat_size.

    In every column a random number of entries (at least one and at most
    ``int(max_nan * n_rows)``) is replaced with NaN. When that upper bound
    is zero (tiny ``n_rows`` or ``max_nan <= 0``) the column is left
    without NaN values instead of raising.

    Args:
        n_rows: Number of rows in the generated dataset.
        num_count: Number of numerical columns, named ``n0``, ``n1``, ...
        cat_count: Number of categorical columns, named ``c0``, ``c1``, ...
        max_nan: Maximal proportion of NaN values per column.
        max_cat_size: Maximal (inclusive) number of distinct categories
            in a categorical column; must be at least 2.

    Returns:
        A ``(DataFrame, types)`` tuple where ``types`` maps each column
        name to a dtype label (``'float32'`` or ``'object'``).

    Raises:
        ValueError: If ``max_cat_size`` is less than 2.
    """
    from uuid import uuid4

    if max_cat_size < 2:
        raise ValueError(f'max_cat_size must be at least 2, got {max_cat_size}')

    # Upper bound on NaNs per column, clamped to [0, n_rows] so that
    # sampling indices without replacement can never request too many.
    max_nan_count = min(max(int(max_nan * n_rows), 0), n_rows)

    def generate_categories():
        # randint's upper bound is exclusive, hence the +1 to honour the
        # documented 2 <= C <= max_cat_size contract.
        category_size = np.random.randint(2, max_cat_size + 1)
        return [str(uuid4()) for _ in range(category_size)]

    def inject_nans(values):
        # Overwrite a random subset of positions with NaN (in place).
        if max_nan_count < 1:
            return values
        nan_cnt = np.random.randint(1, max_nan_count + 1)
        index = np.random.choice(n_rows, nan_cnt, replace=False)
        values[index] = np.nan
        return values

    dataset, types = {}, {}

    for col in range(num_count):
        name = f'n{col}'
        dataset[name] = inject_nans(np.random.normal(0, 1, n_rows))
        # The label is what gets reported to the caller; the generated
        # values themselves are not cast here.
        types[name] = 'float32'

    for col in range(cat_count):
        name = f'c{col}'
        cats = generate_categories()
        # dtype=object so that NaN can be stored next to the strings.
        values = np.array(np.random.choice(cats, n_rows, replace=True), dtype=object)
        dataset[name] = inject_nans(values)
        types[name] = 'object'

    return pd.DataFrame(dataset), types
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment