# devforfu/gendata.py

## gendata.py
def generate_dataset(n_rows, num_count, cat_count, max_nan=0.1, max_cat_size=100):
    """Randomly generate a dataset with numerical and categorical features.

    Numerical features are drawn from the normal distribution X ~ N(0, 1).
    Categorical features are sampled with replacement from a per-column pool
    of random uuid4 strings whose size C satisfies 2 <= C <= max_cat_size.

    In every column a random number of entries is replaced with NaN. That
    number is drawn from 1..int(max_nan * n_rows) inclusive (capped at
    n_rows); when the upper bound is below 1 the column gets no NaN values.

    Args:
        n_rows: Number of rows in the generated dataset.
        num_count: Number of numerical columns, named ``n0``, ``n1``, ...
        cat_count: Number of categorical columns, named ``c0``, ``c1``, ...
        max_nan: Maximal fraction of rows per column replaced with NaN.
        max_cat_size: Maximal (inclusive) size of a column's category pool.

    Returns:
        A ``(DataFrame, types)`` tuple. ``types`` maps every column name to
        the string ``'float32'`` (numerical) or ``'object'`` (categorical);
        the generated arrays themselves are not cast here.

    Raises:
        ValueError: If a categorical column is requested while
            ``max_cat_size`` is less than 2.
    """
    from uuid import uuid4

    # Upper bound on NaN entries per column; the cap keeps the sampling
    # without replacement below valid even when max_nan > 1.
    max_nan_cnt = min(int(max_nan * n_rows), n_rows)

    def generate_categories():
        if max_cat_size < 2:
            raise ValueError(
                f'max_cat_size must be at least 2, got {max_cat_size}')
        # randint excludes its upper bound, hence +1 to allow max_cat_size.
        category_size = np.random.randint(2, max_cat_size + 1)
        return [str(uuid4()) for _ in range(category_size)]

    def inject_nans(values):
        # Overwrite a random subset of positions with NaN, in place.
        if max_nan_cnt < 1:
            return values
        nan_cnt = np.random.randint(1, max_nan_cnt + 1)
        index = np.random.choice(n_rows, nan_cnt, replace=False)
        values[index] = np.nan
        return values

    dataset, types = {}, {}

    for col in range(num_count):
        name = f'n{col}'
        dataset[name] = inject_nans(np.random.normal(0, 1, n_rows))
        types[name] = 'float32'

    for col in range(cat_count):
        name = f'c{col}'
        cats = generate_categories()
        # object dtype lets string categories and float NaN share one array.
        values = np.array(
            np.random.choice(cats, n_rows, replace=True), dtype=object)
        dataset[name] = inject_nans(values)
        types[name] = 'object'

    return pd.DataFrame(dataset), types
	def generate_dataset(n_rows, num_count, cat_count, max_nan=0.1, max_cat_size=100):
	    """Randomly generate a dataset with numerical and categorical features.

	    Numerical features are drawn from the normal distribution X ~ N(0, 1).
	    Categorical features are sampled with replacement from a per-column pool
	    of random uuid4 strings whose size C satisfies 2 <= C <= max_cat_size.

	    In every column a random number of entries is replaced with NaN. That
	    number is drawn from 1..int(max_nan * n_rows) inclusive (capped at
	    n_rows); when the upper bound is below 1 the column gets no NaN values.

	    Args:
	        n_rows: Number of rows in the generated dataset.
	        num_count: Number of numerical columns, named ``n0``, ``n1``, ...
	        cat_count: Number of categorical columns, named ``c0``, ``c1``, ...
	        max_nan: Maximal fraction of rows per column replaced with NaN.
	        max_cat_size: Maximal (inclusive) size of a column's category pool.

	    Returns:
	        A ``(DataFrame, types)`` tuple. ``types`` maps every column name to
	        the string ``'float32'`` (numerical) or ``'object'`` (categorical);
	        the generated arrays themselves are not cast here.

	    Raises:
	        ValueError: If a categorical column is requested while
	            ``max_cat_size`` is less than 2.
	    """
	    from uuid import uuid4

	    # Upper bound on NaN entries per column; the cap keeps the sampling
	    # without replacement below valid even when max_nan > 1.
	    max_nan_cnt = min(int(max_nan * n_rows), n_rows)

	    def generate_categories():
	        if max_cat_size < 2:
	            raise ValueError(
	                f'max_cat_size must be at least 2, got {max_cat_size}')
	        # randint excludes its upper bound, hence +1 to allow max_cat_size.
	        category_size = np.random.randint(2, max_cat_size + 1)
	        return [str(uuid4()) for _ in range(category_size)]

	    def inject_nans(values):
	        # Overwrite a random subset of positions with NaN, in place.
	        if max_nan_cnt < 1:
	            return values
	        nan_cnt = np.random.randint(1, max_nan_cnt + 1)
	        index = np.random.choice(n_rows, nan_cnt, replace=False)
	        values[index] = np.nan
	        return values

	    dataset, types = {}, {}

	    for col in range(num_count):
	        name = f'n{col}'
	        dataset[name] = inject_nans(np.random.normal(0, 1, n_rows))
	        types[name] = 'float32'

	    for col in range(cat_count):
	        name = f'c{col}'
	        cats = generate_categories()
	        # object dtype lets string categories and float NaN share one array.
	        values = np.array(
	            np.random.choice(cats, n_rows, replace=True), dtype=object)
	        dataset[name] = inject_nans(values)
	        types[name] = 'object'

	    return pd.DataFrame(dataset), types