Skip to content

Instantly share code, notes, and snippets.

@prrao87
Last active January 27, 2024 00:04
Show Gist options
  • Save prrao87/8714908422c8cd472a904be810bcaca0 to your computer and use it in GitHub Desktop.
Save prrao87/8714908422c8cd472a904be810bcaca0 to your computer and use it in GitHub Desktop.
gen_fake_persons.py
"""
Generate a fake dataset (csv or parquet) of persons
Three columns: id (int), name (str) and age (int)
Ensure the faker library and polars are installed:
pip install faker polars
Usage:
python gen_fake_persons.py -n 1000000 -f csv
python gen_fake_persons.py -n 1000000 -f parquet
"""
import argparse
import polars as pl
import faker
# Module-level Faker instance; gen_person draws letters, first names and ages from it.
fake = faker.Faker()
# Fix the seed so that repeated runs are expected to generate the same data
faker.Faker.seed(24315)
def gen_person(index: int) -> dict[str, str | int]:
    """Build one fake person record.

    Args:
        index: Value stored under the ``"id"`` key of the record.

    Returns:
        A dict with keys ``"id"`` (the given index), ``"name"`` (a fake first
        name followed by an upper-cased random letter and a period) and
        ``"age"`` (from ``fake.random_int(min=18, max=65, step=1)``).
    """
    # Keep the draw order (initial, first name, age): with a seeded generator
    # the sequence of calls determines the values produced.
    initial = fake.random_letter().upper()
    first_name = fake.first_name()
    age = fake.random_int(min=18, max=65, step=1)
    return {"id": index, "name": f"{first_name} {initial}.", "age": age}
def main(num: int, file_type: str = "parquet") -> None:
    """Generate fake person records and write them to a file.

    Args:
        num: Number of records to generate; ids run from 1 to ``num``.
        file_type: ``"csv"`` writes ``data.csv``; any other value writes
            ``data.parquet``.
    """
    records = [gen_person(person_id) for person_id in range(1, num + 1)]
    frame = pl.DataFrame(records)
    if file_type != "csv":
        frame.write_parquet("data.parquet")
    else:
        frame.write_csv("data.csv")
    print(f"Generated {len(records)} records")
if __name__ == "__main__":
    # Command-line entry point: parse the record count and output format,
    # then hand both to main().
    parser = argparse.ArgumentParser(
        description="Generate a fake dataset of persons (id, name, age)."
    )
    parser.add_argument(
        "--num", "-n", type=int, default=10, help="Number of records to generate"
    )
    # Restrict to the two formats main() distinguishes; without `choices`, any
    # other value (e.g. a typo) would silently fall through to parquet output.
    parser.add_argument(
        "--file_type",
        "-f",
        type=str,
        default="csv",
        choices=("csv", "parquet"),
        help="Output file format",
    )
    args = parser.parse_args()
    main(args.num, args.file_type)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment