Skip to content

Instantly share code, notes, and snippets.

@BexTuychiev
Created March 22, 2023 05:27
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BexTuychiev/92a1fbbed96fa52cec47fe2cd725cf3e to your computer and use it in GitHub Desktop.
Save BexTuychiev/92a1fbbed96fa52cec47fe2cd725cf3e to your computer and use it in GitHub Desktop.
import os
import random
import time
import numpy as np
import pandas as pd
from faker import Faker
# Seed every random source the script draws from so that reruns start
# from the same state.
random.seed(42)
np.random.seed(42)

# Set up Faker to generate fake data. Faker draws from its own generator
# instance rather than the global `random` state seeded above, so it has
# to be seeded separately or the Faker-generated values differ per run.
fake = Faker()
Faker.seed(42)
# Build the synthetic census table: start from an empty frame with one
# row per record, then attach the columns one at a time.
num_records = 50_000_000
df = pd.DataFrame(index=range(num_records))

# Numeric columns: normally distributed draws cast to integers
# (the cast drops the fractional part).
df["age"] = np.random.normal(loc=40, scale=15, size=num_records).astype(int)
df["income"] = np.random.normal(
    loc=50000, scale=20000, size=num_records
).astype(int)

# Categorical columns: each value is picked uniformly from its label list.
# The dict order fixes both the column order and the order of RNG calls.
categorical_levels = {
    "gender": ["Male", "Female"],
    "education": ["High School", "College", "Graduate"],
    "marital_status": ["Single", "Married", "Divorced", "Widowed"],
    "occupation": [
        "Manager",
        "Professional",
        "Clerical",
        "Service",
        "Sales",
        "Skilled",
        "Unskilled",
    ],
}
for column, labels in categorical_levels.items():
    df[column] = np.random.choice(labels, size=num_records)

# Datetime column: one Faker birth date per record for ages 18 to 100.
# The list is built inline so it can be freed right after conversion.
df["birth_date"] = pd.to_datetime(
    [
        fake.date_of_birth(minimum_age=18, maximum_age=100)
        for _ in range(num_records)
    ]
)
# Save to CSV. The output directory is created first because to_csv does
# not create missing parent directories, and failing here would discard
# the whole generated dataset.
csv_path = "data/census_data.csv"
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)

# Print size of CSV, converting bytes to units of 1024 ** 3 bytes
csv_size_gb = os.path.getsize(csv_path) / (1024 ** 3)
print(f"Size of CSV: {round(csv_size_gb, 2)} GB")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment