Forked from juanluisrto/generate_fake_dataframe.py
Created
September 6, 2021 09:54
-
-
Save koyanloshe/99ad4bad8c88114f6174f78c21c9e117 to your computer and use it in GitHub Desktop.
Function which generates dataframes with fake data in a controlled manner.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from itertools import cycle | |
def generate_fake_dataframe(size, cols, col_names=None, intervals=None, seed=None):
    """Build a DataFrame of random data with one column per code in *cols*.

    Parameters
    ----------
    size
        Number of rows to generate; forwarded as the ``size`` argument of the
        NumPy random generator methods.
    cols
        Sequence of one-character column type codes (a plain string such as
        ``"iifcd"`` works): ``"i"`` integer, ``"f"`` float, ``"c"`` category,
        ``"d"`` date.
    col_names
        Optional list with one name per column. When ``None``, names are
        generated as ``column_<position>_<int|float|cat|date>``.
    intervals
        Controls the range of values of each column. One of:

        * ``None`` -- use the defaults: ``(0, 10)`` for ``"i"``, ``(0, 100)``
          for ``"f"``, ``("names", 5)`` for ``"c"`` and
          ``("2020-01-01", "2020-12-31")`` for ``"d"``.
        * ``dict`` -- maps a type code to an interval overriding that default
          for every column of that type.
        * ``list`` -- one interval per column; a ``None`` entry falls back to
          the default for that column's type.

        An interval is a ``(start, end)`` pair for ``"i"``, ``"f"`` and
        ``"d"``. For ``"c"`` it is either ``(family, n)`` -- draw ``n``
        distinct values from the built-in family ``"animals"``, ``"names"``,
        ``"cities"`` or ``"colors"`` -- or a list of the category values to
        sample from.
    seed
        Seed passed to ``numpy.random.default_rng``.

    Returns
    -------
    pandas.DataFrame
        A frame with ``len(cols)`` columns of randomly generated values.

    Raises
    ------
    AssertionError
        If ``col_names`` or ``intervals`` (as lists) do not have one entry per
        column, if ``intervals`` (as a dict) has keys other than the four type
        codes, if an interval is neither a 2-tuple nor a list, or if a
        category family is unknown.
    KeyError
        If ``cols`` contains a code other than ``"i"``, ``"f"``, ``"c"`` or
        ``"d"``.
    """
    categories_dict = {
        'animals': ['cow', 'rabbit', 'duck', 'shrimp', 'pig', 'goat', 'crab', 'deer', 'bee', 'sheep', 'fish', 'turkey', 'dove', 'chicken', 'horse'],
        'names': ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'],
        'cities': ['Stockholm', 'Denver', 'Moscow', 'Marseille', 'Palermo', 'Tokyo', 'Lisbon', 'Oslo', 'Nairobi', 'Río de Janeiro', 'Berlin', 'Bogotá', 'Manila', 'Madrid', 'Milwaukee'],
        'colors': ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white'],
    }
    default_intervals = {"i": (0, 10), "f": (0, 100), "c": ("names", 5), "d": ("2020-01-01", "2020-12-31")}
    valid_codes = tuple(default_intervals)
    rng = np.random.default_rng(seed)

    # By default each successive categorical column draws from a different
    # family: the default family first, then the remaining ones, round-robin.
    first_c = default_intervals["c"][0]
    categories_names = cycle([first_c] + [c for c in categories_dict if c != first_c])
    default_intervals["c"] = (categories_names, default_intervals["c"][1])

    # NOTE: validation raises AssertionError explicitly (instead of using the
    # `assert` statement) so the checks survive `python -O` while callers
    # still see the same exception type and message.
    if isinstance(col_names, list):
        if len(col_names) != len(cols):
            raise AssertionError(f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements")
    elif col_names is None:
        suffix = {"c": "cat", "i": "int", "f": "float", "d": "date"}
        col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)]

    if isinstance(intervals, list):
        if len(intervals) != len(cols):
            raise AssertionError(f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements")
    elif isinstance(intervals, dict):
        if set(intervals) - set(default_intervals):
            raise AssertionError("The intervals parameter has invalid keys")
        default_intervals.update(intervals)

    # Reject unknown type codes up front: with a list of intervals they would
    # otherwise match no branch below and the column would silently vanish.
    unknown = [col for col in cols if col not in valid_codes]
    if unknown:
        raise KeyError(
            f"Unknown column type code(s) {unknown}; valid codes are "
            "'i' (int), 'f' (float), 'c' (category) and 'd' (date)"
        )

    if not isinstance(intervals, list):
        intervals = [default_intervals[col] for col in cols]

    df = pd.DataFrame()
    for col, col_name, interval in zip(cols, col_names, intervals):
        if interval is None:
            interval = default_intervals[col]
        if not ((len(interval) == 2 and isinstance(interval, tuple)) or isinstance(interval, list)):
            raise AssertionError(f"This interval {interval} is neither a tuple of two elements nor a list of strings.")

        if col == "c":
            if isinstance(interval, list):
                # Explicit list of category values supplied by the caller.
                categories = np.array(interval)
            else:
                cat_family, length = interval
                if isinstance(cat_family, cycle):
                    # Default case: advance to the next built-in family.
                    cat_family = next(cat_family)
                if cat_family not in categories_dict:
                    raise AssertionError(f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}")
                # Pick `length` distinct values from the family to act as the
                # column's categories.
                categories = rng.choice(categories_dict[cat_family], length, replace=False, shuffle=True)
            df[col_name] = rng.choice(categories, size, shuffle=True)
        else:
            start, end = interval
            if col == "i":
                # `end` is exclusive for integers.
                df[col_name] = rng.integers(start, end, size)
            elif col == "f":
                df[col_name] = rng.uniform(start, end, size)
            else:  # col == "d"
                # Sample from the date range between start and end.
                df[col_name] = rng.choice(pd.date_range(start, end), size)
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment