Skip to content

Instantly share code, notes, and snippets.

@RyanJulyan
Last active May 31, 2024 05:33
Show Gist options
  • Save RyanJulyan/4b094b7c510409586c5d1cb24c7fc5a4 to your computer and use it in GitHub Desktop.
Save RyanJulyan/4b094b7c510409586c5d1cb24c7fc5a4 to your computer and use it in GitHub Desktop.
Generates fake data instances using attrs and pydantic models, emulates bad/corrupted data, and builds synthetic time series with seasonality, trend, and adjustments.
from typing import Any, get_args, get_origin, Type, Dict, Optional, List
import random
from datetime import date, datetime
import attr
from faker import Faker
def data_factory(
    cls: Type,
    num_records: int,
    custom_attribute_mapping: Optional[Dict[str, str]] = None,
    seed: Optional[int] = None,
    list_min: int = 0,
    list_max: int = 10,
) -> List:
    """
    Generate ``num_records`` fake instances of an attrs class using Faker.

    Output is reproducible when a ``seed`` is provided: both Faker and the
    ``random`` module (used for list lengths) are seeded.

    Args:
        cls (Type): The attrs-decorated class to instantiate.
        num_records (int): The number of records to generate.
        custom_attribute_mapping (Optional[Dict[str, str]]): Mapping from
            attribute names to Faker method names; takes precedence over the
            type-based mapping. Defaults to None (empty mapping).
        seed (Optional[int]): Seed for reproducible output. Defaults to None.
        list_min (int): Minimum number of elements generated for list
            attributes. Defaults to 0.
        list_max (int): Maximum number of elements generated for list
            attributes. Defaults to 10.

    Returns:
        List: A list of ``num_records`` instances of ``cls``.
    """
    # Copy into a fresh dict: a mutable default argument would be shared
    # across calls.
    attribute_mapping: Dict[str, str] = dict(custom_attribute_mapping or {})

    # Fallback mapping from primitive attribute types to Faker methods.
    type_mapping = {
        int: "random_int",
        str: "word",
        float: "random_number",
        bool: "boolean",
        date: "date_between",
        datetime: "date_time_between",
    }

    fake = Faker()
    if seed is not None:
        # Seed both Faker and random so list lengths are reproducible too.
        fake.seed_instance(seed)
        random.seed(seed)

    def generate_instance(cls: Type) -> Any:
        """Recursively generate a single populated instance of ``cls``."""
        kwargs = {}
        for attribute in attr.fields(cls):
            name = attribute.name
            attr_type = attribute.type
            # Attribute-name mapping wins; otherwise try a Faker method with
            # the same name as the attribute.
            faker_method_name = attribute_mapping.get(name, name)
            if get_origin(attr_type) is list:
                element_type = get_args(attr_type)[0]
                # List length is drawn uniformly from [list_min, list_max].
                # (The original `range(list_min, randint(...))` produced
                # randint(...) - list_min elements, and primitive lists were
                # hard-coded to 2 elements.)
                length = random.randint(list_min, list_max)
                if attr.has(element_type):
                    # List of nested attrs classes: recurse per element.
                    kwargs[name] = [
                        generate_instance(element_type) for _ in range(length)
                    ]
                elif element_type in type_mapping:
                    # List of primitives: call the mapped Faker method.
                    faker_method = getattr(fake, type_mapping[element_type])
                    kwargs[name] = [faker_method() for _ in range(length)]
                else:
                    kwargs[name] = None
            elif attr.has(attr_type):
                # Nested attrs class: recurse.
                kwargs[name] = generate_instance(attr_type)
            else:
                faker_method = getattr(fake, faker_method_name, None)
                # Fall back to the type-based mapping when no attribute-named
                # Faker method exists.
                if faker_method is None and attr_type in type_mapping:
                    faker_method = getattr(fake, type_mapping[attr_type])
                if callable(faker_method):
                    kwargs[name] = faker_method()
                else:
                    kwargs[name] = None  # No suitable Faker method found
        return cls(**kwargs)

    return [generate_instance(cls) for _ in range(num_records)]
def wrong_type(value: Any) -> Any:
    """
    Return a randomly chosen value whose type differs from ``value``'s type.

    Args:
        value (Any): The original value.

    Returns:
        Any: A value of a mismatched type, or None when ``value``'s type has
        no configured alternatives.
    """
    sample_date = date(1991, 9, 23)
    sample_datetime = datetime(1991, 9, 23, 12, 59, 59)
    # For each supported type, the pool of wrongly-typed replacement values.
    candidates_by_type = {
        int: ["wrong", 3.14, True, sample_date, sample_datetime],
        str: [42, 3.14, False, sample_date, sample_datetime],
        float: ["wrong", 42, False, sample_date, sample_datetime],
        bool: ["wrong", 42, 3.14, sample_date, sample_datetime],
        date: ["wrong", 42, True, 3.14, sample_datetime],
        datetime: ["wrong", 42, True, 3.14, sample_date],
    }
    return random.choice(candidates_by_type.get(type(value), [None]))
def emulate_bad_data(
    instances: List[Any],
    remove_prob: float = 0.1,
    wrong_type_prob: float = 0.15,
    null_prob: float = 0.2,
    null_values: Optional[List] = None,
    seed: Optional[int] = None,
) -> List[Dict]:
    """
    Emulate bad data by randomly removing, corrupting, or nullifying
    attributes of attrs instances.

    Output is reproducible when a ``seed`` is set. Note that the probability
    arguments are *cumulative thresholds* on a single uniform draw ``r`` per
    attribute: the attribute is removed when ``r < remove_prob``, replaced by
    a wrongly-typed value when ``remove_prob <= r < wrong_type_prob``, and
    nulled when ``wrong_type_prob <= r < null_prob``.

    Args:
        instances (List[Any]): attrs instances to be corrupted.
        remove_prob (float, optional): Threshold below which an attribute is
            removed. Defaults to 0.1.
        wrong_type_prob (float, optional): Threshold below which (and above
            ``remove_prob``) an attribute gets a wrong-typed value.
            Defaults to 0.15.
        null_prob (float, optional): Threshold below which (and above
            ``wrong_type_prob``) an attribute is nulled. Defaults to 0.2.
        null_values (Optional[List]): Pool of null markers to choose from.
            Defaults to None, which means ``[None, "NULL", "<null>"]``.
        seed (Optional[int], optional): Seed for reproducibility.
            Defaults to None.

    Returns:
        List[Dict]: One dict per instance with emulated bad data.
    """
    if null_values is None:
        # Fresh list per call; a mutable default would be shared across calls.
        null_values = [None, "NULL", "<null>"]
    if seed is not None:
        random.seed(seed)
    bad_data = []
    for instance in instances:
        instance_dict = attr.asdict(instance)
        # Snapshot the keys, since entries may be deleted during iteration.
        for key in list(instance_dict.keys()):
            rand_val = random.random()
            if rand_val < remove_prob:
                del instance_dict[key]
            elif rand_val < wrong_type_prob:
                instance_dict[key] = wrong_type(instance_dict[key])
            elif rand_val < null_prob:
                instance_dict[key] = random.choice(null_values)
        bad_data.append(instance_dict)
    return bad_data
if __name__ == "__main__":
    # Demo: nested attrs models exercising scalar, nested-class, and list
    # attributes of data_factory.
    @attr.s
    class Hobby:
        tag: str = attr.ib()

    @attr.s
    class Address:
        street: str = attr.ib()
        city: str = attr.ib()

    @attr.s
    class Person:
        name: str = attr.ib()
        age: int = attr.ib()
        birth_date: date = attr.ib()
        address: Address = attr.ib()  # nested attrs class
        hobbies: List[Hobby] = attr.ib()  # list of nested attrs classes
        investments: List[int] = attr.ib()  # list of primitives
        last_login: datetime = attr.ib()

    # Fixed seed so repeated runs print identical data.
    seed = 42
    num_records = 5
    people = data_factory(Person, num_records, seed=seed)
    print()
    print("List People Data:")
    for person in people:
        print(person)
    print()
    # Emulate bad data with custom probabilities and seed
    remove_probability = 0.1
    wrong_type_prob = 0.15
    null_probability = 0.2
    bad_people_data = emulate_bad_data(
        instances=people,
        remove_prob=remove_probability,
        wrong_type_prob=wrong_type_prob,
        null_prob=null_probability,
        null_values=[None, "NULL", "<null>", 0],
        seed=seed,
    )
    # Print bad data
    print()
    print("List People Bad Data:")
    for person in bad_people_data:
        print(person)
# %%
from typing_extensions import Literal
from typing import List, Optional
from dataclasses import dataclass
import numpy as np
import pandas as pd
@dataclass
class SeasonalityRecurrencePeriods:
    """
    A seasonality pattern to be applied to a generated time series.

    Attributes:
        periods (int): Length of one seasonal cycle, in time-series steps.
            For example, with daily data and periods=7 the pattern repeats
            every 7 days.
        recurrence (int, optional): Number of times the pattern occurs within
            ``periods`` steps. Defaults to 1. For example, periods=7 with
            recurrence=2 makes the pattern occur twice per 7-day window.
    """

    periods: int  # length of one seasonal cycle, in steps
    recurrence: int = 1  # occurrences of the pattern within `periods`
def generate_time_series(
    n: int,
    freq: str = "D",
    start_date: str = "2022-01-01",
    seasonality_recurrence_periods: Optional[
        List["SeasonalityRecurrencePeriods"]
    ] = None,
    trend: Optional[Literal["up", "down", "random"]] = None,
    noise_std: float = 0.1,
    seed: Optional[int] = None,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
) -> pd.Series:
    """
    Generate synthetic time series data.

    Args:
        n (int): Number of data points.
        freq (str): Frequency of the time series ('D' for daily, 'H' for
            hourly, etc.).
        start_date (str): Start date of the time series.
        seasonality_recurrence_periods (Optional[List[SeasonalityRecurrencePeriods]]):
            List of SeasonalityRecurrencePeriods for multiple seasonality
            patterns.
        trend (Optional[Literal['up', 'down', 'random']]): Type of trend, or
            None for no trend.
        noise_std (float): Standard deviation of the Gaussian noise.
        seed (Optional[int]): Optional random seed for reproducibility.
        min_value (Optional[float]): Optional minimum value for the series.
        max_value (Optional[float]): Optional maximum value for the series.
            Rescaling is applied only when both bounds are given.

    Returns:
        pd.Series: Generated time series data indexed by date.
    """
    # Set seed for consistency in output.
    if seed is not None:
        np.random.seed(seed)
    # Generate the datetime index.
    time_index = pd.date_range(start=start_date, periods=n, freq=freq)
    # Initialize the series.
    ts = np.zeros(n)
    # Superimpose one sine wave per seasonality pattern. Note: use a local
    # name for the recurrence — the original code overwrote the `freq`
    # parameter (the pandas frequency string) inside this loop.
    if seasonality_recurrence_periods:
        for season in seasonality_recurrence_periods:
            omega = 2 * np.pi / season.periods
            ts += np.sin(omega * np.arange(n) * season.recurrence)
    # Add the trend component.
    if trend == "up":
        ts += np.linspace(0, 1, n)
    elif trend == "down":
        ts += np.linspace(1, 0, n)
    elif trend == "random":
        ts += np.random.uniform(-1, 1, n).cumsum()
    # Add Gaussian noise.
    ts += np.random.normal(scale=noise_std, size=n)
    # Linearly rescale to [min_value, max_value] when both bounds are given.
    if min_value is not None and max_value is not None:
        ts_min = np.min(ts)
        ts_max = np.max(ts)
        span = ts_max - ts_min
        if span > 0:
            ts = min_value + (max_value - min_value) * (ts - ts_min) / span
        else:
            # Constant series: avoid division by zero and map to the midpoint.
            ts = np.full(n, (min_value + max_value) / 2.0)
    return pd.Series(ts, index=time_index)
@dataclass
class Adjustment:
    """
    An adjustment to be applied to a time series.

    Attributes:
        adjustment_name (str): The name of the adjustment, used to label its
            column in the resulting DataFrame.
        start_date (str): The start date for the adjustment in 'YYYY-MM-DD'
            format (inclusive).
        end_date (str): The end date for the adjustment in 'YYYY-MM-DD'
            format (inclusive).
        adjustment_type (Literal["percentage", "value"]): The type of
            adjustment to apply.
            - 'percentage': Applies a percentage adjustment to the existing value.
            - 'value': Replaces the existing value with adjustment_value.
        adjustment_value (float): The value of the adjustment.
            - For 'percentage': A positive or negative float representing the
              fractional change (e.g. 0.1 for +10%).
            - For 'value': The value to replace the existing data with.
    """

    adjustment_name: str
    start_date: str
    end_date: str
    adjustment_type: Literal["percentage", "value"]  # 'percentage' or 'value'
    adjustment_value: float
def apply_adjustments(ts: pd.Series, adjustments: list["Adjustment"]) -> pd.DataFrame:
    """
    Apply adjustments to a time series, cumulatively and in load order.

    If ``Adjustment.adjustment_type`` is "percentage", the adjustment is a
    fractional change of the running value (e.g. 0.1 => +10%, -0.1 => -10%).
    If it is "value", the explicit value overrides the running value inside
    the adjustment's date window.

    Args:
        ts (pd.Series): Original time series (date-indexed).
        adjustments (list[Adjustment]): Adjustment objects to apply.

    Returns:
        pd.DataFrame: DataFrame containing the original series, one column
        per adjustment (named by ``adjustment_name``), and the cumulative
        "final" series as the last column.

    Raises:
        ValueError: If an adjustment has an unknown ``adjustment_type``.
    """
    df = pd.DataFrame({"original": ts, "final": ts})
    for adj in adjustments:
        name = adj.adjustment_name
        # Inclusive date window for this adjustment.
        mask = (df.index >= adj.start_date) & (df.index <= adj.end_date)
        if adj.adjustment_type == "percentage":
            # (1 + value) covers both signs: 1 + (-0.1) == 1 - abs(-0.1),
            # so the original sign-based branch was redundant.
            adjusted = df["final"] * (1 + adj.adjustment_value)
        elif adj.adjustment_type == "value":
            adjusted = pd.Series(adj.adjustment_value, index=df.index)
        else:
            raise ValueError(
                f"Unknown adjustment_type: {adj.adjustment_type!r}"
            )
        # Keep the adjusted values only inside the window; elsewhere carry the
        # running "final" series. Direct assignment replaces the deprecated
        # `df[name].where(..., inplace=True)`, which is unreliable under
        # pandas Copy-on-Write.
        df[name] = adjusted.where(mask, df["final"])
        df["final"] = df[name]
    # Reorder columns so 'final' is always the last column.
    column_order = [col for col in df.columns if col != "final"] + ["final"]
    return df[column_order]
# %%
if __name__ == "__main__":
    # Demo: generate ~7 months of daily data with two stacked seasonalities,
    # an upward trend, and noise, then apply date-windowed adjustments.
    n = 210
    freq = "D"
    start_date = "2022-01-01"
    seasonality_recurrence_periods = [
        SeasonalityRecurrencePeriods(periods=30, recurrence=1),  # Monthly seasonality
        SeasonalityRecurrencePeriods(periods=7, recurrence=1),  # Weekly seasonality
    ]
    trend = "up"  # Can be 'up', 'down', or 'random'
    noise_std = 0.3
    seed = 42  # fixed seed for reproducible output
    min_value = 0  # Optional minimum value
    max_value = 1  # Optional maximum value
    time_series_df = generate_time_series(
        n=n,
        freq=freq,
        start_date=start_date,
        seasonality_recurrence_periods=seasonality_recurrence_periods,
        trend=trend,
        noise_std=noise_std,
        seed=seed,
        min_value=min_value,
        max_value=max_value,
    )
    # Plotting requires matplotlib to be installed.
    time_series_df.plot()
    # Adjustments are applied cumulatively in this order; note "Outage" and
    # "Holiday2" share a window, so "Holiday2" scales the outage value.
    adjustments = [
        Adjustment("Holiday", "2022-12-01", "2022-12-31", "percentage", 0.1),
        Adjustment("Outage", "2022-06-01", "2022-06-30", "value", -0.5),
        Adjustment("Holiday2", "2022-06-01", "2022-06-30", "percentage", 0.9),
    ]
    adjusted_time_series_df = apply_adjustments(time_series_df, adjustments)
    adjusted_time_series_df.plot()
    print(adjusted_time_series_df)
# %%
from typing import Any, get_args, get_origin, Type, Dict, Optional, List
import random
from datetime import date, datetime
from pydantic import BaseModel, Field
from faker import Faker
def pydantic_data_factory(
    cls: Type[BaseModel],
    num_records: int,
    custom_attribute_mapping: Optional[Dict[str, str]] = None,
    seed: Optional[int] = None,
    list_min: int = 0,
    list_max: int = 10,
) -> List[BaseModel]:
    """
    Generate ``num_records`` fake instances of a pydantic model using Faker.

    Output is reproducible when a ``seed`` is provided: both Faker and the
    ``random`` module (used for list lengths) are seeded.

    Args:
        cls (Type[BaseModel]): The pydantic model class to instantiate.
        num_records (int): The number of records to generate.
        custom_attribute_mapping (Optional[Dict[str, str]]): Mapping from
            field names to Faker method names; takes precedence over the
            type-based mapping. Defaults to None (empty mapping).
        seed (Optional[int]): Seed for reproducible output. Defaults to None.
        list_min (int): Minimum number of elements generated for list fields.
            Defaults to 0.
        list_max (int): Maximum number of elements generated for list fields.
            Defaults to 10.

    Returns:
        List[BaseModel]: A list of ``num_records`` instances of ``cls``.
    """
    # Copy into a fresh dict: a mutable default argument would be shared
    # across calls.
    attribute_mapping: Dict[str, str] = dict(custom_attribute_mapping or {})

    # Fallback mapping from primitive field types to Faker methods.
    type_mapping = {
        int: "random_int",
        str: "word",
        float: "random_number",
        bool: "boolean",
        date: "date_between",
        datetime: "date_time_between",
    }

    fake = Faker()
    if seed is not None:
        # Seed both Faker and random so list lengths are reproducible too.
        fake.seed_instance(seed)
        random.seed(seed)

    def generate_instance(cls: Type[BaseModel]) -> BaseModel:
        """Recursively generate a single populated instance of ``cls``."""
        kwargs = {}
        for name, field in cls.model_fields.items():
            attr_type = field.annotation
            # Field-name mapping wins; otherwise try a same-named Faker method.
            faker_method_name = attribute_mapping.get(name, name)
            if get_origin(attr_type) is list:
                element_type = get_args(attr_type)[0]
                # List length is drawn uniformly from [list_min, list_max].
                # (The original produced randint(...) - list_min elements for
                # model lists and a hard-coded 2 for primitive lists.)
                length = random.randint(list_min, list_max)
                # Guard issubclass: element_type may be a typing construct
                # (e.g. Optional[int]) rather than a class, which would raise.
                if isinstance(element_type, type) and issubclass(
                    element_type, BaseModel
                ):
                    kwargs[name] = [
                        generate_instance(element_type) for _ in range(length)
                    ]
                elif element_type in type_mapping:
                    faker_method = getattr(fake, type_mapping[element_type])
                    kwargs[name] = [faker_method() for _ in range(length)]
                else:
                    kwargs[name] = None
            elif isinstance(attr_type, type) and issubclass(attr_type, BaseModel):
                # Nested pydantic model: recurse.
                kwargs[name] = generate_instance(attr_type)
            else:
                faker_method = getattr(fake, faker_method_name, None)
                # Fall back to the type-based mapping when no field-named
                # Faker method exists.
                if faker_method is None and attr_type in type_mapping:
                    faker_method = getattr(fake, type_mapping[attr_type])
                if callable(faker_method):
                    kwargs[name] = faker_method()
                else:
                    kwargs[name] = None  # No suitable Faker method found
        return cls(**kwargs)

    return [generate_instance(cls) for _ in range(num_records)]
def pydantic_wrong_type(value: Any) -> Any:
    """
    Return a randomly selected value whose type does not match ``value``'s.

    Args:
        value (Any): The original value.

    Returns:
        Any: A wrongly-typed value, or None when ``value``'s type has no
        configured alternatives.
    """
    bad_date = date(1991, 9, 23)
    bad_datetime = datetime(1991, 9, 23, 12, 59, 59)
    # Per supported type, the pool of wrongly-typed replacement values.
    alternatives = {
        int: ["wrong", 3.14, True, bad_date, bad_datetime],
        str: [42, 3.14, False, bad_date, bad_datetime],
        float: ["wrong", 42, False, bad_date, bad_datetime],
        bool: ["wrong", 42, 3.14, bad_date, bad_datetime],
        date: ["wrong", 42, True, 3.14, bad_datetime],
        datetime: ["wrong", 42, True, 3.14, bad_date],
    }
    return random.choice(alternatives.get(type(value), [None]))
def pydantic_emulate_bad_data(
    instances: List["BaseModel"],
    remove_prob: float = 0.1,
    wrong_type_prob: float = 0.15,
    null_prob: float = 0.2,
    null_values: Optional[List] = None,
    seed: Optional[int] = None,
) -> List[Dict]:
    """
    Emulate bad data by randomly removing, corrupting, or nullifying fields
    of pydantic model instances.

    Output is reproducible when a ``seed`` is set. Note that the probability
    arguments are *cumulative thresholds* on a single uniform draw ``r`` per
    field: the field is removed when ``r < remove_prob``, replaced by a
    wrongly-typed value when ``remove_prob <= r < wrong_type_prob``, and
    nulled when ``wrong_type_prob <= r < null_prob``.

    Args:
        instances (List[BaseModel]): Pydantic model instances to corrupt.
        remove_prob (float, optional): Threshold below which a field is
            removed. Defaults to 0.1.
        wrong_type_prob (float, optional): Threshold below which (and above
            ``remove_prob``) a field gets a wrong-typed value.
            Defaults to 0.15.
        null_prob (float, optional): Threshold below which (and above
            ``wrong_type_prob``) a field is nulled. Defaults to 0.2.
        null_values (Optional[List]): Pool of null markers to choose from.
            Defaults to None, which means ``[None, "NULL", "<null>"]``.
        seed (Optional[int], optional): Seed for reproducibility.
            Defaults to None.

    Returns:
        List[Dict]: One dict per instance with emulated bad data.
    """
    if null_values is None:
        # Fresh list per call; a mutable default would be shared across calls.
        null_values = [None, "NULL", "<null>"]
    if seed is not None:
        random.seed(seed)
    bad_data = []
    for instance in instances:
        # model_dump() is the pydantic v2 API, consistent with model_fields
        # used elsewhere in this file; .dict() is deprecated in v2.
        instance_dict = instance.model_dump()
        # Snapshot the keys, since entries may be deleted during iteration.
        for key in list(instance_dict.keys()):
            rand_val = random.random()
            if rand_val < remove_prob:
                del instance_dict[key]
            elif rand_val < wrong_type_prob:
                instance_dict[key] = pydantic_wrong_type(instance_dict[key])
            elif rand_val < null_prob:
                instance_dict[key] = random.choice(null_values)
        bad_data.append(instance_dict)
    return bad_data
if __name__ == "__main__":
    # Demo: nested pydantic models exercising scalar, nested-model, and list
    # fields of pydantic_data_factory.
    class Hobby(BaseModel):
        tag: str

    class Address(BaseModel):
        street: str
        city: str

    class Person(BaseModel):
        name: str
        age: int
        birth_date: date
        address: Address  # nested pydantic model
        hobbies: List[Hobby]  # list of nested pydantic models
        investments: List[int]  # list of primitives
        last_login: datetime

    # Fixed seed so repeated runs print identical data.
    seed = 42
    num_records = 5
    people = pydantic_data_factory(Person, num_records, seed=seed)
    print()
    print("List People Data:")
    for person in people:
        print(person)
    print()
    # Emulate bad data with custom probabilities and the same seed.
    remove_probability = 0.1
    wrong_type_prob = 0.15
    null_probability = 0.2
    bad_people_data = pydantic_emulate_bad_data(
        instances=people,
        remove_prob=remove_probability,
        wrong_type_prob=wrong_type_prob,
        null_prob=null_probability,
        null_values=[None, "NULL", "<null>", 0],
        seed=seed,
    )
    print()
    print("List People Bad Data:")
    for person in bad_people_data:
        print(person)
attrs
Faker
matplotlib
pandas
pydantic
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment