Skip to content

Instantly share code, notes, and snippets.

@myui
Created May 9, 2024 09:47
Show Gist options
  • Save myui/2d5c9a4b81d9fed879f5e0ba5fcd3d8f to your computer and use it in GitHub Desktop.
Save myui/2d5c9a4b81d9fed879f5e0ba5fcd3d8f to your computer and use it in GitHub Desktop.
import random
from faker import Faker
import datetime
import uuid
import pandas as pd
from tqdm import tqdm
def randint_gauss(start, end):
    """
    Draw one integer from ``[start, end)`` with a bell-shaped distribution.

    A value is sampled from a normal distribution centred on the midpoint of
    the range, with a standard deviation of one sixth of the range width (so
    the range spans roughly three standard deviations on each side of the
    mean). The sample is floored to an integer and then clamped into the
    range, so outliers pile up on the two boundary values.

    Args:
        start (int): The starting value of the range (inclusive).
        end (int): The ending value of the range (exclusive).

    Returns:
        int: A single integer ``v`` with ``start <= v <= end - 1``. When the
        range is empty (``end <= start``), ``start`` is returned.
    """
    # Local import so this function does not depend on a file-level import.
    import math

    mean = (start + end) / 2
    std_dev = (end - start) / 6  # Adjust the standard deviation based on the range
    # Floor rather than int(): int() truncates toward zero, which would map
    # every sample in (-1, 1) to 0 and skew ranges that touch or cross zero.
    # For positive samples both give the same result.
    value = math.floor(random.gauss(mean, std_dev))
    # Ensure the generated value is within the specified range
    return max(start, min(value, end - 1))
def _upload_users(users, write_mode):
    """Bulk-import one batch of user rows into the ``myui.master_profile`` table."""
    master_df = pd.DataFrame(
        users,
        columns=['user_seq', 'user_id', 'name', 'sex', 'address', 'mail', 'birthdate', 'num_txn'],
    )
    client.load_table_from_dataframe(
        master_df,
        'myui.master_profile',
        writer='bulk_import',
        if_exists=write_mode,
        fmt='msgpack',
    )


def _upload_transactions(transactions, write_mode):
    """Bulk-import one batch of transaction rows into ``myui.transactions``, sorted by timestamp."""
    transaction_df = (
        pd.DataFrame(
            transactions,
            columns=['user_seq', 'user_id', 'tstamp', 'category', 'amount'],
        )
        # Shuffle before sorting; presumably so rows that share a timestamp
        # do not keep their per-user generation order.
        .sample(frac=1)
        .sort_values(by=['tstamp'])
        .reset_index(drop=True)
    )
    client.load_table_from_dataframe(
        transaction_df,
        'myui.transactions',
        writer='bulk_import',
        if_exists=write_mode,
        fmt='msgpack',
    )


def generate_fake_data():
    """
    Generate fake user profiles and purchase transactions and upload them.

    For each of ``NUM_USERS`` users a Faker profile is created together with a
    random number of transactions (drawn with ``randint_gauss`` from
    ``NUM_TXN_PER_USER``). Each transaction gets a random category from
    ``PRODUCT_CATEGORIES``, a past timestamp within the last nine years and an
    amount drawn uniformly from ``AMOUNT_RANGE``.

    Rows are buffered in memory and uploaded through the module-level
    ``client`` every 500,000 users, plus once more at the end for whatever is
    left. The first periodic upload uses ``if_exists='overwrite'`` and every
    later one uses ``'append'``.

    Returns:
        tuple: ``(users, transactions)`` -- the rows accumulated since the
        last periodic upload. These rows have already been uploaded by the
        final flush; earlier batches are not retained.
    """
    flush_every = 500_000  # number of users between periodic uploads
    write_mode = 'overwrite'
    fake = Faker()
    users = []
    transactions = []
    for i in tqdm(range(NUM_USERS)):
        user_id = uuid.uuid4()
        # Read profile fields by key instead of relying on the dict's value order.
        profile = fake.simple_profile()
        birthdate = profile['birthdate'].isoformat()
        num_txn = randint_gauss(*NUM_TXN_PER_USER)
        for _ in range(num_txn):
            category = random.choice(PRODUCT_CATEGORIES)
            tstamp = fake.past_datetime(start_date='-9y')
            amount = random.randint(*AMOUNT_RANGE)
            transactions.append((i, user_id, tstamp, category, amount))
        users.append((
            i,
            user_id,
            profile['name'],
            profile['sex'],
            profile['address'],
            profile['mail'],
            birthdate,
            num_txn,
        ))
        if i > 0 and i % flush_every == 0:
            _upload_users(users, write_mode)
            users = []
            _upload_transactions(transactions, write_mode)
            transactions = []
            # Only the first periodic upload may replace existing table contents.
            write_mode = 'append'
    # Final flush of the partial batch; the lists are intentionally not
    # cleared so they can be returned to the caller.
    if users:
        _upload_users(users, write_mode)
    if transactions:
        _upload_transactions(transactions, write_mode)
    return users, transactions
import os
# Treasure Data connection settings are exported as environment variables
# before the client is constructed; presumably pytd.Client reads them since no
# credentials are passed to it explicitly -- confirm against the pytd docs.
# NOTE(review): td_apikey, td_endpoint and td_presto_endpoint are not defined
# anywhere in the visible file -- presumably provided by the surrounding
# session/notebook; define them before running this as a standalone script.
os.environ["TD_API_KEY"] = td_apikey
os.environ["TD_API_SERVER"] = td_endpoint
os.environ["TD_PRESTO_API"] = td_presto_endpoint
import pytd
# Module-level client used by generate_fake_data() for every table upload.
client = pytd.Client(database='myui', retry_post_requests=True)
# --- Generation parameters read by generate_fake_data() ---
# Number of users to generate.
NUM_USERS = 200_000_000 # with NUM_TXN_PER_USER below, about 100 transactions are generated per user on average
#NUM_USERS = 10_000 # 100x transactions are generated on average
# (start, end) passed to randint_gauss(): start is inclusive, end is exclusive.
NUM_TXN_PER_USER = (1, 200)
# (low, high) passed to random.randint(), which includes both endpoints.
AMOUNT_RANGE = (1, 10000)
# Pool from which each transaction's category is picked with random.choice().
PRODUCT_CATEGORIES = [
"Books",
"Movies",
"Music",
"Games",
"Electronics",
"Computers",
"Home",
"Garden",
"Tools",
"Grocery",
"Health",
"Beauty",
"Toys",
"Kids",
"Baby",
"Clothing",
"Shoes",
"Jewelery",
"Sports",
"Outdoors",
"Automotive",
"Industrial"
]
# Run the generation and upload immediately when this file is executed.
generate_fake_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment