Skip to content

Instantly share code, notes, and snippets.

@justinmklam
Created August 16, 2021 23:12
Show Gist options
  • Save justinmklam/7d9e6981bb7adf020f74bc65c66a651b to your computer and use it in GitHub Desktop.
Save justinmklam/7d9e6981bb7adf020f74bc65c66a651b to your computer and use it in GitHub Desktop.
Script to generate fake data in JSONL format (e.g. for uploading to BigQuery)
import json
import os
import shutil
import sys
import time
from argparse import ArgumentParser
from uuid import uuid4

from faker import Faker  # pip install faker
# Module-level Faker instance; the script below uses it to produce the
# profile, phone number and credit card values of each generated row.
fake = Faker()
def progressbar(it, prefix="", file=sys.stdout):
    """Yield the items of ``it`` while drawing a text progress bar on ``file``.

    The bar is redrawn in place (every frame ends with a carriage return):
    once before the first item and once after each item has been consumed.
    A newline is written when the iterable is exhausted.

    Args:
        it: Sized iterable to walk; ``len(it)`` supplies the total count.
        prefix: Text written in front of the bar.
        file: Writable text stream that receives the bar.

    Yields:
        Each item of ``it``, in order.
    """
    count = len(it)
    # Fixed characters of a frame ("[", "]", " ", "/") plus one spare column.
    num_bar_chars = 5
    # shutil.get_terminal_size() falls back to a default size instead of
    # raising OSError when stdout is not attached to a terminal (pipe, CI).
    columns = shutil.get_terminal_size().columns
    reserved = len(prefix) + len(str(count)) * 2 + num_bar_chars
    # Clamp so a very narrow terminal simply yields an empty bar.
    width = max(columns - reserved, 0)

    def show(j):
        # An empty iterable has count == 0; avoid dividing by it.
        x = int(width * j / count) if count else 0
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#" * x, "." * (width - x), j, count))
        file.flush()

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    file.write("\n")
    file.flush()
def _build_row(row_id):
    """Return one fake record as a dict whose values are all JSON scalars.

    Args:
        row_id: Value stored under the ``"id"`` key.

    Returns:
        A dict with identity, contact and payment fields. The nested
        ``"personal"`` dict is serialized to a JSON string.
    """
    profile = fake.profile()
    row = {
        "id": row_id,
        "full_name": profile["name"],
        "address": profile["address"],
        "email": profile["mail"],
        # The profile carries a list of websites; only the first is kept.
        "website": profile["website"][0],
        "account_uuid": str(uuid4()),
        "personal": {
            "ssn": profile["ssn"],
            "sex": profile["sex"],
            "blood_group": profile["blood_group"],
        },
        "phone": fake.phone_number(),
        "credit_card": int(fake.credit_card_number()),
    }
    # Stringify any nested dicts so every top-level value is a scalar.
    # NOTE(review): presumably to match a flat table schema on import - confirm.
    return {
        key: json.dumps(val) if isinstance(val, dict) else val
        for key, val in row.items()
    }


def main():
    """Parse the command line and write ``size`` fake rows as JSON Lines."""
    # The description must be passed by keyword: ArgumentParser's first
    # positional parameter is ``prog``, not ``description``.
    parser = ArgumentParser(
        description="Generate fake data for importing into a database."
    )
    parser.add_argument("size", type=int, help="Number of rows/records to generate")
    parser.add_argument("filename", help="Example: fake_data.json")
    args = parser.parse_args()
    if args.size < 0:
        parser.error("size must be a non-negative integer")

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments during a long run.
    start_time = time.perf_counter()
    with open(args.filename, "w", encoding="utf-8") as f:
        for i in progressbar(range(args.size), "Rows created:"):
            # One JSON document per line (JSONL).
            f.write(json.dumps(_build_row(i)) + "\n")
    elapsed = time.perf_counter() - start_time
    print(f"Wrote {args.size} rows to {args.filename} in {elapsed:.1f} sec.")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment