Last active
May 31, 2024 05:33
-
-
Save RyanJulyan/4b094b7c510409586c5d1cb24c7fc5a4 to your computer and use it in GitHub Desktop.
Generates fake data instances using attrs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Any, get_args, get_origin, Type, Dict, Optional, List | |
import random | |
from datetime import date, datetime | |
import attr | |
from faker import Faker | |
def data_factory(
    cls: Type,
    num_records: int,
    custom_attribute_mapping: Optional[Dict[str, str]] = None,
    seed: Optional[int] = None,
    list_min: int = 0,
    list_max: int = 10,
) -> List:
    """
    Generate instances of a given attrs class populated with Faker data.

    Output is reproducible when a `seed` is set.

    Args:
        cls (Type): The attrs class for which instances will be generated.
        num_records (int): The number of records to generate.
        custom_attribute_mapping (Optional[Dict[str, str]], optional): Custom
            mapping between attribute names and Faker provider method names.
            Defaults to None (no custom mapping).
        seed (Optional[int], optional): Seed for both Faker and the `random`
            module to ensure consistent output. Defaults to None.
        list_min (int, optional): Minimum number of elements in a list
            attribute. Defaults to 0.
        list_max (int, optional): Maximum number of elements in a list
            attribute. Defaults to 10.

    Returns:
        List: A list of instances of the specified class.
    """
    # Copy into a fresh dict per call; a `{}` default would be shared
    # across calls (mutable-default pitfall).
    attribute_mapping: Dict[str, str] = dict(custom_attribute_mapping or {})

    # Mapping between primitive attribute types and Faker provider methods.
    type_mapping = {
        int: "random_int",
        str: "word",
        float: "random_number",
        bool: "boolean",
        date: "date_between",
        datetime: "date_time_between",
    }

    fake = Faker()
    if seed is not None:
        # Seed Faker AND the `random` module (used for list lengths) so the
        # whole generation process is reproducible, not just Faker values.
        fake.seed_instance(seed)
        random.seed(seed)

    def generate_instance(cls: Type):
        """Generate one instance of `cls`, recursing into nested attrs classes."""
        kwargs = {}
        for attribute in attr.fields(cls):
            name = attribute.name
            attr_type = attribute.type
            # Attribute-name mapping takes precedence over type-based mapping.
            faker_method_name = attribute_mapping.get(name, name)
            if get_origin(attr_type) is list:
                element_type = get_args(attr_type)[0]
                if attr.has(element_type):
                    # List of nested attrs instances with a random length in
                    # [list_min, list_max]. (The original used
                    # `range(list_min, randint(list_min, list_max))`, which
                    # skewed the length distribution.)
                    kwargs[name] = [
                        generate_instance(element_type)
                        for _ in range(random.randint(list_min, list_max))
                    ]
                elif element_type in type_mapping:
                    # Lists of primitives also honour list_min/list_max
                    # (was hard-coded to exactly 2 elements).
                    faker_method = getattr(fake, type_mapping[element_type])
                    kwargs[name] = [
                        faker_method()
                        for _ in range(random.randint(list_min, list_max))
                    ]
                else:
                    kwargs[name] = None
            elif attr.has(attr_type):
                kwargs[name] = generate_instance(attr_type)
            else:
                # Prefer a Faker method named after the attribute, then fall
                # back to the type-based mapping.
                faker_method = getattr(fake, faker_method_name, None)
                if faker_method is None and attr_type in type_mapping:
                    faker_method = getattr(fake, type_mapping[attr_type])
                kwargs[name] = faker_method() if callable(faker_method) else None
        return cls(**kwargs)

    return [generate_instance(cls) for _ in range(num_records)]
def wrong_type(value: Any) -> Any:
    """
    Return a value whose type deliberately mismatches `value`'s type.

    Useful for emulating dirty data: given an `int` it may return a string,
    a float, a bool, a date, or a datetime — anything but a plain `int`.

    Args:
        value (Any): The original, well-typed value.

    Returns:
        Any: A randomly chosen value of a different type, or None when
        `value`'s exact type has no configured alternatives.
    """
    sample_date = date(1991, 9, 23)
    sample_datetime = datetime(1991, 9, 23, 12, 59, 59)
    # For each supported type, a pool of mismatched candidate values.
    mismatched_candidates = {
        int: ["wrong", 3.14, True, sample_date, sample_datetime],
        str: [42, 3.14, False, sample_date, sample_datetime],
        float: ["wrong", 42, False, sample_date, sample_datetime],
        bool: ["wrong", 42, 3.14, sample_date, sample_datetime],
        date: ["wrong", 42, True, 3.14, sample_datetime],
        datetime: ["wrong", 42, True, 3.14, sample_date],
    }
    # type() is an exact match on purpose: bool must not fall into int's
    # bucket, nor datetime into date's.
    candidates = mismatched_candidates.get(type(value), [None])
    return random.choice(candidates)
def emulate_bad_data(
    instances: List[Type],
    remove_prob: float = 0.1,
    wrong_type_prob: float = 0.15,
    null_prob: float = 0.2,
    null_values: Optional[List] = None,
    seed: Optional[int] = None,
) -> List[Dict]:
    """
    Emulate bad data by randomly removing, retyping, or nullifying attributes.

    Output is reproducible when a `seed` is set.

    Args:
        instances (List[Type]): List of attrs instances to be corrupted.
        remove_prob (float, optional): Probability of removing an attribute.
            Defaults to 0.1.
        wrong_type_prob (float, optional): Probability of replacing an
            attribute with a value of a different type. Defaults to 0.15.
        null_prob (float, optional): Probability of setting an attribute to a
            null value. Defaults to 0.2.
        null_values (Optional[List], optional): Pool of possible null values.
            Defaults to [None, "NULL", "<null>"].
        seed (Optional[int], optional): Seed for the random number generator
            for reproducibility. Defaults to None.

    Returns:
        List[Dict]: List of dictionaries, each representing an instance with
        emulated bad data.
    """
    # Fresh list per call instead of a shared mutable default argument.
    if null_values is None:
        null_values = [None, "NULL", "<null>"]
    if seed is not None:
        random.seed(seed)
    # Cumulative thresholds so each documented probability is independent.
    # The original compared rand_val against the raw probabilities, which made
    # the effective wrong-type probability only (wrong_type_prob - remove_prob)
    # and the effective null probability only (null_prob - wrong_type_prob).
    remove_threshold = remove_prob
    wrong_type_threshold = remove_threshold + wrong_type_prob
    null_threshold = wrong_type_threshold + null_prob
    bad_data = []
    for instance in instances:
        instance_dict = attr.asdict(instance)
        # Iterate over a snapshot of the keys because we delete during the loop.
        for key in list(instance_dict.keys()):
            rand_val = random.random()
            if rand_val < remove_threshold:
                # Randomly remove some attributes entirely.
                del instance_dict[key]
            elif rand_val < wrong_type_threshold:
                # Randomly replace some attributes with mismatched types.
                instance_dict[key] = wrong_type(instance_dict[key])
            elif rand_val < null_threshold:
                # Randomly null out some attributes.
                instance_dict[key] = random.choice(null_values)
        bad_data.append(instance_dict)
    return bad_data
if __name__ == "__main__":

    @attr.s
    class Hobby:
        tag: str = attr.ib()

    @attr.s
    class Address:
        street: str = attr.ib()
        city: str = attr.ib()

    @attr.s
    class Person:
        name: str = attr.ib()
        age: int = attr.ib()
        birth_date: date = attr.ib()
        address: Address = attr.ib()
        hobbies: List[Hobby] = attr.ib()
        investments: List[int] = attr.ib()
        last_login: datetime = attr.ib()

    demo_seed = 42
    record_count = 5

    # Build a deterministic batch of Person records.
    people = data_factory(Person, record_count, seed=demo_seed)
    print()
    print("List People Data:")
    for person in people:
        print(person)
    print()

    # Corrupt the clean records: drop keys, swap types, and inject nulls.
    bad_people_data = emulate_bad_data(
        instances=people,
        remove_prob=0.1,
        wrong_type_prob=0.15,
        null_prob=0.2,
        null_values=[None, "NULL", "<null>", 0],
        seed=demo_seed,
    )
    print()
    print("List People Bad Data:")
    for person in bad_people_data:
        print(person)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
from typing_extensions import Literal | |
from typing import List, Optional | |
from dataclasses import dataclass | |
import numpy as np | |
import pandas as pd | |
@dataclass
class SeasonalityRecurrencePeriods:
    """
    A seasonality pattern to superimpose on a generated time series.

    Attributes:
        periods (int): Length of the repeating window, in series steps. With a
            daily series and periods=7, the pattern repeats every 7 days.
        recurrence (int): How many times the pattern cycles inside one window.
            With periods=7 and recurrence=2, the pattern occurs twice within
            every 7-day period. Defaults to 1.
    """

    periods: int  # the window within which the pattern repeats
    recurrence: int = 1  # number of occurrences per window
def generate_time_series(
    n: int,
    freq: str = "D",
    start_date: str = "2022-01-01",
    seasonality_recurrence_periods: Optional[List[SeasonalityRecurrencePeriods]] = None,
    trend: Optional[Literal["up", "down", "random"]] = None,
    noise_std: float = 0.1,
    seed: Optional[int] = None,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
) -> pd.Series:
    """
    Generate synthetic time series data.

    Args:
        n (int): Number of data points.
        freq (str): Frequency of the time series ('D' for daily, 'H' for hourly, etc.).
        start_date (str): Start date of the time series.
        seasonality_recurrence_periods (Optional[List[SeasonalityRecurrencePeriods]]):
            List of SeasonalityRecurrencePeriods for multiple seasonality patterns.
        trend (Optional[Literal['up', 'down', 'random']]): Type of trend, or
            None for no trend. (The original annotation was a bare Literal
            with a None default, which is not a valid member of the Literal.)
        noise_std (float): Standard deviation of the Gaussian noise.
        seed (Optional[int]): Optional random seed for reproducibility.
        min_value (Optional[float]): Optional minimum value for the rescaled series.
        max_value (Optional[float]): Optional maximum value for the rescaled series.

    Returns:
        pd.Series: Generated time series indexed by a DatetimeIndex.
    """
    # Set seed for consistency in output.
    if seed is not None:
        np.random.seed(seed)
    # Generate the datetime index.
    time_index = pd.date_range(start=start_date, periods=n, freq=freq)
    # Initialize the series values.
    ts = np.zeros(n)
    # Superimpose each seasonality pattern as a sine wave.
    if seasonality_recurrence_periods:
        for season in seasonality_recurrence_periods:
            # Use a dedicated name: the original reassigned `freq` here,
            # clobbering the frequency parameter.
            recurrence = season.recurrence
            omega = 2 * np.pi / season.periods
            ts += np.sin(omega * np.arange(n) * recurrence)
    # Add trend.
    if trend == "up":
        ts += np.linspace(0, 1, n)
    elif trend == "down":
        ts += np.linspace(1, 0, n)
    elif trend == "random":
        ts += np.random.uniform(-1, 1, n).cumsum()
    # Add noise.
    ts += np.random.normal(scale=noise_std, size=n)
    # Rescale to fit within [min_value, max_value].
    if min_value is not None and max_value is not None:
        ts_min = np.min(ts)
        ts_max = np.max(ts)
        span = ts_max - ts_min
        if span > 0:
            ts = min_value + (max_value - min_value) * (ts - ts_min) / span
        else:
            # Constant series: avoid division by zero and pin to min_value.
            ts = np.full(n, min_value, dtype=float)
    return pd.Series(ts, index=time_index)
@dataclass
class Adjustment:
    """
    A single adjustment applied to a time series over a date window.

    Attributes:
        adjustment_name (str): Label used as the column name in the output DataFrame.
        start_date (str): Window start in 'YYYY-MM-DD' format (inclusive).
        end_date (str): Window end in 'YYYY-MM-DD' format (inclusive).
        adjustment_type (Literal["percentage", "value"]): How to apply the adjustment:
            - 'percentage': scales the existing value by a percentage change.
            - 'value': replaces the existing value with `adjustment_value`.
        adjustment_value (float): The percentage change (positive or negative)
            for 'percentage', or the replacement value for 'value'.
    """

    adjustment_name: str
    start_date: str
    end_date: str
    adjustment_type: Literal["percentage", "value"]  # how to apply the change
    adjustment_value: float
def apply_adjustments(ts: pd.Series, adjustments: list[Adjustment]) -> pd.DataFrame:
    """
    Apply adjustments to a time series, cumulatively and in order.

    If `Adjustment.adjustment_type` == "percentage", the existing value is
    scaled by (1 + adjustment_value); if "value", it is replaced outright.
    Each adjustment only takes effect inside its [start_date, end_date] window.

    Args:
        ts (pd.Series): Original time series (DatetimeIndex expected so the
            string date-window comparison works).
        adjustments (list[Adjustment]): Adjustment objects to apply in order.

    Returns:
        pd.DataFrame: DataFrame containing the original series, one column per
        adjustment (named after `Adjustment.adjustment_name`), and the final
        series as the last column.

    Raises:
        ValueError: If an adjustment has an unknown `adjustment_type`
        (the original fell through and failed later with an opaque KeyError).
    """
    df = pd.DataFrame({"original": ts, "final": ts})
    for adj in adjustments:
        name = adj.adjustment_name
        mask = (df.index >= adj.start_date) & (df.index <= adj.end_date)
        if adj.adjustment_type == "percentage":
            # (1 + v) already handles both signs: v = -0.1 gives 0.9.
            # The original's separate negative branch computed the same thing.
            df[name] = df["final"] * (1 + adj.adjustment_value)
        elif adj.adjustment_type == "value":
            df[name] = adj.adjustment_value
        else:
            raise ValueError(
                f"Unknown adjustment_type: {adj.adjustment_type!r}"
            )
        # Keep the adjustment only inside its date window. Plain assignment
        # instead of the deprecated `inplace=True` where() on a column, which
        # is chained assignment and may not write back in modern pandas.
        df[name] = df[name].where(mask, df["final"])
        df["final"] = df[name]
    # Reorder columns so 'final' is always the last column.
    column_order = [col for col in df.columns if col != "final"] + ["final"]
    return df[column_order]
# %%
if __name__ == "__main__":
    # Demo: synthesize ~7 months of daily data with weekly + monthly cycles.
    seasonal_patterns = [
        SeasonalityRecurrencePeriods(periods=30, recurrence=1),  # monthly cycle
        SeasonalityRecurrencePeriods(periods=7, recurrence=1),  # weekly cycle
    ]
    time_series_df = generate_time_series(
        n=210,
        freq="D",
        start_date="2022-01-01",
        seasonality_recurrence_periods=seasonal_patterns,
        trend="up",  # can be 'up', 'down', or 'random'
        noise_std=0.3,
        seed=42,
        min_value=0,  # optional lower bound for rescaling
        max_value=1,  # optional upper bound for rescaling
    )
    time_series_df.plot()

    # Layer three date-windowed adjustments on top of the base series.
    adjustments = [
        Adjustment("Holiday", "2022-12-01", "2022-12-31", "percentage", 0.1),
        Adjustment("Outage", "2022-06-01", "2022-06-30", "value", -0.5),
        Adjustment("Holiday2", "2022-06-01", "2022-06-30", "percentage", 0.9),
    ]
    adjusted_time_series_df = apply_adjustments(time_series_df, adjustments)
    adjusted_time_series_df.plot()
    print(adjusted_time_series_df)
# %%
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Any, get_args, get_origin, Type, Dict, Optional, List | |
import random | |
from datetime import date, datetime | |
from pydantic import BaseModel, Field | |
from faker import Faker | |
def pydantic_data_factory(
    cls: Type[BaseModel],
    num_records: int,
    custom_attribute_mapping: Optional[Dict[str, str]] = None,
    seed: Optional[int] = None,
    list_min: int = 0,
    list_max: int = 10,
) -> List[BaseModel]:
    """
    Generate instances of a pydantic model populated with Faker data.

    Output is reproducible when a `seed` is set.

    Args:
        cls (Type[BaseModel]): The pydantic model class to generate.
        num_records (int): The number of records to generate.
        custom_attribute_mapping (Optional[Dict[str, str]], optional): Custom
            mapping between field names and Faker provider method names.
            Defaults to None (no custom mapping).
        seed (Optional[int], optional): Seed for both Faker and the `random`
            module to ensure consistent output. Defaults to None.
        list_min (int, optional): Minimum number of elements in a list field.
            Defaults to 0.
        list_max (int, optional): Maximum number of elements in a list field.
            Defaults to 10.

    Returns:
        List[BaseModel]: A list of model instances.
    """
    # Copy into a fresh dict per call; a `{}` default would be shared
    # across calls (mutable-default pitfall).
    attribute_mapping: Dict[str, str] = dict(custom_attribute_mapping or {})

    # Mapping between primitive field types and Faker provider methods.
    type_mapping = {
        int: "random_int",
        str: "word",
        float: "random_number",
        bool: "boolean",
        date: "date_between",
        datetime: "date_time_between",
    }

    fake = Faker()
    if seed is not None:
        # Seed Faker AND the `random` module (used for list lengths) so the
        # whole generation process is reproducible.
        fake.seed_instance(seed)
        random.seed(seed)

    def _is_model(tp: Any) -> bool:
        """True when `tp` is a concrete pydantic model class. A bare
        issubclass() raises TypeError on typing constructs like Optional[int]."""
        return isinstance(tp, type) and issubclass(tp, BaseModel)

    def generate_instance(cls: Type[BaseModel]) -> BaseModel:
        """Generate one instance of `cls`, recursing into nested models."""
        kwargs = {}
        for name, field in cls.model_fields.items():
            attr_type = field.annotation
            # Field-name mapping takes precedence over type-based mapping.
            faker_method_name = attribute_mapping.get(name, name)
            if get_origin(attr_type) is list:
                element_type = get_args(attr_type)[0]
                if _is_model(element_type):
                    # Random list length in [list_min, list_max]; the original
                    # `range(list_min, randint(list_min, list_max))` skewed
                    # the length distribution.
                    kwargs[name] = [
                        generate_instance(element_type)
                        for _ in range(random.randint(list_min, list_max))
                    ]
                elif element_type in type_mapping:
                    # Primitive lists also honour list_min/list_max
                    # (was hard-coded to exactly 2 elements).
                    faker_method = getattr(fake, type_mapping[element_type])
                    kwargs[name] = [
                        faker_method()
                        for _ in range(random.randint(list_min, list_max))
                    ]
                else:
                    kwargs[name] = None
            elif _is_model(attr_type):
                kwargs[name] = generate_instance(attr_type)
            else:
                # Field-name mapped Faker method first, then type fallback.
                faker_method = getattr(fake, faker_method_name, None)
                if faker_method is None and attr_type in type_mapping:
                    faker_method = getattr(fake, type_mapping[attr_type])
                kwargs[name] = faker_method() if callable(faker_method) else None
        return cls(**kwargs)

    return [generate_instance(cls) for _ in range(num_records)]
def pydantic_wrong_type(value: Any) -> Any:
    """
    Return a value whose type deliberately mismatches `value`'s type.

    Used to emulate dirty data when corrupting pydantic model dumps: given an
    `int` it may return a string, a float, a bool, a date, or a datetime.

    Args:
        value (Any): The original, well-typed value.

    Returns:
        Any: A randomly chosen value of a different type, or None when
        `value`'s exact type has no configured alternatives.
    """
    sample_date = date(1991, 9, 23)
    sample_datetime = datetime(1991, 9, 23, 12, 59, 59)
    # For each supported type, a pool of mismatched candidate values.
    mismatched_candidates = {
        int: ["wrong", 3.14, True, sample_date, sample_datetime],
        str: [42, 3.14, False, sample_date, sample_datetime],
        float: ["wrong", 42, False, sample_date, sample_datetime],
        bool: ["wrong", 42, 3.14, sample_date, sample_datetime],
        date: ["wrong", 42, True, 3.14, sample_datetime],
        datetime: ["wrong", 42, True, 3.14, sample_date],
    }
    # type() is an exact match on purpose: bool must not fall into int's
    # bucket, nor datetime into date's.
    candidates = mismatched_candidates.get(type(value), [None])
    return random.choice(candidates)
def pydantic_emulate_bad_data(
    instances: List[BaseModel],
    remove_prob: float = 0.1,
    wrong_type_prob: float = 0.15,
    null_prob: float = 0.2,
    null_values: Optional[List] = None,
    seed: Optional[int] = None,
) -> List[Dict]:
    """
    Emulate bad data by randomly removing, retyping, or nullifying fields.

    Output is reproducible when a `seed` is set.

    Args:
        instances (List[BaseModel]): List of pydantic model instances to corrupt.
        remove_prob (float, optional): Probability of removing a field.
            Defaults to 0.1.
        wrong_type_prob (float, optional): Probability of replacing a field
            with a value of a different type. Defaults to 0.15.
        null_prob (float, optional): Probability of setting a field to a null
            value. Defaults to 0.2.
        null_values (Optional[List], optional): Pool of possible null values.
            Defaults to [None, "NULL", "<null>"].
        seed (Optional[int], optional): Seed for the random number generator
            for reproducibility. Defaults to None.

    Returns:
        List[Dict]: List of dictionaries, each representing an instance with
        emulated bad data.
    """
    # Fresh list per call instead of a shared mutable default argument.
    if null_values is None:
        null_values = [None, "NULL", "<null>"]
    if seed is not None:
        random.seed(seed)
    # Cumulative thresholds so each documented probability is independent.
    # The original compared rand_val against the raw probabilities, which made
    # the effective wrong-type probability only (wrong_type_prob - remove_prob)
    # and the effective null probability only (null_prob - wrong_type_prob).
    remove_threshold = remove_prob
    wrong_type_threshold = remove_threshold + wrong_type_prob
    null_threshold = wrong_type_threshold + null_prob
    bad_data = []
    for instance in instances:
        # pydantic v2 (which this file targets via `model_fields`) renamed
        # `.dict()` to `.model_dump()`; support both for compatibility.
        instance_dict = (
            instance.model_dump()
            if hasattr(instance, "model_dump")
            else instance.dict()
        )
        # Iterate over a snapshot of the keys because we delete during the loop.
        for key in list(instance_dict.keys()):
            rand_val = random.random()
            if rand_val < remove_threshold:
                del instance_dict[key]
            elif rand_val < wrong_type_threshold:
                instance_dict[key] = pydantic_wrong_type(instance_dict[key])
            elif rand_val < null_threshold:
                instance_dict[key] = random.choice(null_values)
        bad_data.append(instance_dict)
    return bad_data
if __name__ == "__main__":

    class Hobby(BaseModel):
        tag: str

    class Address(BaseModel):
        street: str
        city: str

    class Person(BaseModel):
        name: str
        age: int
        birth_date: date
        address: Address
        hobbies: List[Hobby]
        investments: List[int]
        last_login: datetime

    demo_seed = 42
    record_count = 5

    # Build a deterministic batch of Person models.
    people = pydantic_data_factory(Person, record_count, seed=demo_seed)
    print()
    print("List People Data:")
    for person in people:
        print(person)
    print()

    # Corrupt the clean records: drop keys, swap types, and inject nulls.
    bad_people_data = pydantic_emulate_bad_data(
        instances=people,
        remove_prob=0.1,
        wrong_type_prob=0.15,
        null_prob=0.2,
        null_values=[None, "NULL", "<null>", 0],
        seed=demo_seed,
    )
    print()
    print("List People Bad Data:")
    for person in bad_people_data:
        print(person)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
attrs | |
Faker | |
matplotlib | |
pandas | |
pydantic |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment