Created
March 22, 2023 05:27
-
-
Save BexTuychiev/92a1fbbed96fa52cec47fe2cd725cf3e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Generate a large synthetic "census" dataset and write it to a CSV file.

The script builds a DataFrame with numeric, categorical and datetime columns
filled with random values, saves it to ``data/census_data.csv`` and prints the
resulting file size in GB.
"""
import os
import random
import time

import numpy as np
import pandas as pd
from faker import Faker

SEED = 42
NUM_RECORDS = 50_000_000
OUTPUT_PATH = "data/census_data.csv"

# Set seed for reproducibility
random.seed(SEED)
np.random.seed(SEED)

# Set up Faker to generate fake data
fake = Faker()
# Faker draws from its own random generator, so seeding `random` and NumPy
# alone would leave the birth_date column different on every run.
Faker.seed(SEED)

# Set up DataFrame
num_records = NUM_RECORDS
df = pd.DataFrame(index=range(num_records))

# Add numeric columns
# NOTE: normal draws are unbounded, so a small share of the generated ages
# and incomes can be zero or negative; astype(int) truncates toward zero.
df["age"] = np.random.normal(40, 15, num_records).astype(int)
df["income"] = np.random.normal(50000, 20000, num_records).astype(int)

# Add categorical columns (each value is picked uniformly at random)
df["gender"] = np.random.choice(["Male", "Female"], num_records)
df["education"] = np.random.choice(
    ["High School", "College", "Graduate"], num_records
)
df["marital_status"] = np.random.choice(
    ["Single", "Married", "Divorced", "Widowed"],
    num_records,
)
df["occupation"] = np.random.choice(
    [
        "Manager",
        "Professional",
        "Clerical",
        "Service",
        "Sales",
        "Skilled",
        "Unskilled",
    ],
    num_records,
)

# Add datetime column
# The birth dates are generated independently of the "age" column above, so
# the two are not consistent with each other.
# NOTE(review): Faker appears to derive these dates from the current day, so
# values may still shift with the run date even when seeded -- confirm.
df["birth_date"] = pd.to_datetime(
    [
        fake.date_of_birth(minimum_age=18, maximum_age=100)
        for _ in range(num_records)
    ]
)

# Save to CSV
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

# Print size of CSV
print(
    f"Size of CSV: {round(os.path.getsize(OUTPUT_PATH) / (1024 ** 3), 2)} GB"
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment