Skip to content

Instantly share code, notes, and snippets.

@jgoerner
Created March 21, 2020 21:56
Show Gist options
  • Save jgoerner/0a86480b6c7c265a752c12f0e13ace42 to your computer and use it in GitHub Desktop.
Save jgoerner/0a86480b6c7c265a752c12f0e13ace42 to your computer and use it in GitHub Desktop.
Corona Hackathon Mock Data Generation
### IMPORTS ###
from datetime import datetime
from random import random, randint
import uuid
from faker import Faker
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from tqdm import tqdm
### CONFIG ###
# Connection URL template; replace the <...> placeholders with real values.
# The scheme must be "postgresql://": SQLAlchemy 1.4+ dropped the legacy
# "postgres://" alias and raises NoSuchModuleError when it is used.
DB_URL = "postgresql://<USR>:<PWD>@<HOST>:5432/<DB>"
# Engine shared by the fill_* functions for all reads and writes.
con = create_engine(DB_URL)
# Faker instance using the German locale.
f = Faker("de_DE")
# Age brackets that generated employees are sampled from.
AGE_GROUPS = ["18-20", "21-30", "31-40", "41-50", "50+"]
### FAKER METHODS ###
def fill_employee(n=100):
    """Create ``n`` fake employees and store them in the ``employee`` table.

    Any existing ``employee`` table is replaced. Relies on the module-level
    ``con`` (database engine), ``f`` (Faker instance) and ``AGE_GROUPS``.

    Args:
        n: Number of employee rows to generate.
    """
    # Sampling weights, aligned position-by-position with AGE_GROUPS.
    age_weights = [0.2, 0.2, 0.3, 0.2, 0.1]

    def _fake_employee():
        # Build a single employee record; every call draws fresh random values.
        return {
            "id": str(uuid.uuid4()),
            "age_group": np.random.choice(AGE_GROUPS, p=age_weights),
            "zip_code": f.postcode(),
            "student": random() > 0.5,
            "german_speaking": np.random.rand() > 0.5,
            "drivers_license": np.random.rand() > 0.5,
            # randint's upper bound is exclusive, so values fall in 10..39.
            "max_h_per_week": np.random.randint(10, 40),
            "available_from": f.date_time_this_month(
                before_now=False, after_now=True, tzinfo=None
            ),
            "registration": datetime.now(),
        }

    employees = pd.DataFrame([_fake_employee() for _ in tqdm(range(n))])
    employees.to_sql("employee", con=con, if_exists="replace", index=False)
def fill_employer(n=100):
    """Create ``n`` fake employers and store them in the ``employer`` table.

    Any existing ``employer`` table is replaced. Relies on the module-level
    ``con`` (database engine) and ``f`` (Faker instance).

    Args:
        n: Number of employer rows to generate.
    """

    def _fake_employer():
        # Build a single employer record from Faker providers.
        return {
            "id": str(uuid.uuid4()),
            "name": f.company(),
            "domain": f.bs(),
            "website": f.uri(),
        }

    employers = pd.DataFrame([_fake_employer() for _ in tqdm(range(n))])
    employers.to_sql("employer", con=con, if_exists="replace", index=False)
def fill_job(n=500):
    """Create ``n`` fake job postings and store them in the ``job`` table.

    Each job is assigned to a random employer whose id is read from the
    existing ``employer`` table, so that table must be populated first.
    Any existing ``job`` table is replaced. Relies on the module-level
    ``con`` (database engine) and ``f`` (Faker instance).

    Args:
        n: Number of job rows to generate.

    Returns:
        Whatever ``DataFrame.to_sql`` returns for the write.
    """
    # Load the ids that generated jobs can reference.
    employers = pd.read_sql_table("employer", con=con)
    known_employer_ids = employers["id"].values

    def _fake_job():
        # Build a single job record; every call draws fresh random values.
        return {
            "id": str(uuid.uuid4()),
            "employer_id": np.random.choice(known_employer_ids),
            "title": f.job(),
            "description": f.catch_phrase(),
            "zip_code": f.postcode(),
            # randint's upper bound is exclusive: 50..99 and 0..49 respectively.
            "initial_demand_qty": np.random.randint(50, 100),
            "current_demand_qty": np.random.randint(0, 50),
            "salary_per_h": np.random.randint(12, 20),
            "start_date": f.date_time_this_month(
                before_now=False, after_now=True, tzinfo=None
            ),
            "hours_per_week": np.random.randint(10, 40),
            "german_required": np.random.rand() > 0.5,
            "drivers_license_required": np.random.rand() > 0.5,
            "timestamp_posting": datetime.now(),
            "welcome_info": f.sentence(
                nb_words=6, variable_nb_words=True, ext_word_list=None
            ),
        }

    jobs = pd.DataFrame([_fake_job() for _ in tqdm(range(n))])
    return jobs.to_sql("job", con=con, if_exists="replace", index=False)
### EXECUTION ###
# Order matters: fill_job reads ids from the "employer" table, so it has to
# run after fill_employer has written that table.
for populate in (fill_employee, fill_employer, fill_job):
    populate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment