Last active
November 29, 2022 11:50
-
-
Save minrk/ea0c2102ffe3f09165f8970e647b9976 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Run with: | |
export PGHOST=127.0.0.1 | |
export MYSQL_HOST=127.0.0.1 | |
export PGUSER=user | |
export PGPASSWORD=pass | |
for DB in postgres mysql; do | |
export DB | |
bash ci/docker-db.sh | |
bash ci/init-db.sh | |
done | |
for branch in main metricssss; do | |
git switch $branch | |
for DB in sqlite postgres mysql; do | |
export DB | |
if [ $DB != sqlite ]; then | |
bash ci/docker-db.sh | |
bash ci/init-db.sh | |
fi | |
python profile_index.py -f ${DB}-${branch}.pkl | |
done | |
done | |
""" | |
import os | |
import tempfile | |
import time | |
from datetime import datetime, timedelta | |
from functools import partial | |
from urllib.parse import quote | |
import click | |
import numpy as np | |
import pandas as pd | |
from jupyterhub import orm | |
# Synthetic last-activity distribution for generated users:
#   30% chance: uniformly within the last 2 days
#   20% chance: uniformly within the last 2 weeks
#   50% chance: uniformly within the last 2 months
# That should give return counts of roughly:
#   30-day window: 75% of users
#   7-day window:  46%
#   1-day window:  32%
HOURS = 3600  # seconds per hour
DAYS = 24 * HOURS
WEEKS = 7 * DAYS
MONTHS = 30 * DAYS

# maximum "seconds ago" window -> probability of drawing a user from it
ago_weights = {
    2 * DAYS: 0.3,
    2 * WEEKS: 0.2,
    2 * MONTHS: 0.5,
}

# monotonically increasing suffix for generated usernames (test-0, test-1, ...)
user_counter = 0

# fixed seed so the generated user population is reproducible across runs
np.random.seed(1)
def populate_db(db, user_count):
    """Top the database up to *user_count* users with synthetic last_activity.

    Each new user's last_activity is `now - offset`, where the offset is drawn
    by first picking a window from ``ago_weights`` and then choosing a uniform
    point within it. Commits once at the end.

    Fix: removed dead local ``seconds = offset`` that was assigned but never used.
    """
    global user_counter
    have_users = db.query(orm.User).count()
    # rely on dict insertion order (Python 3.7+) so keys and weights align
    offset_choices = list(ago_weights.keys())
    weights = list(ago_weights.values())
    new_users = user_count - have_users
    # window choice per user, scaled by U[0,1) -> uniform within the window
    offsets = np.random.choice(
        offset_choices, size=new_users, p=weights
    ) * np.random.random(size=new_users)
    now = datetime.utcnow()
    for offset in offsets:
        user = orm.User(name=f"test-{user_counter}")
        user_counter += 1
        user.last_activity = now - timedelta(seconds=int(offset))
        db.add(user)
    db.commit()
def run_query(db):
    """Count users whose last_activity falls within 24h / 7d / 30d windows.

    Returns a dict mapping the window label ("24h", "7d", "30d") to the
    number of matching users.
    """
    now = datetime.utcnow()
    windows = {
        "24h": timedelta(hours=24),
        "7d": timedelta(days=7),
        "30d": timedelta(days=30),
    }
    return {
        period: db.query(orm.User)
        .filter(orm.User.last_activity >= now - delta)
        .count()
        for period, delta in windows.items()
    }
def time_one(f, min_samples=3, max_time=1, max_samples=20):
    """Repeatedly time calls to *f*, yielding each returned sample dict.

    Each sample (a dict returned by *f*) gets a "time" key with the call's
    duration in seconds. Collects at least *min_samples* samples, keeps going
    while less than *max_time* seconds have elapsed overall, and never exceeds
    *max_samples*.

    Fix: the original condition `samples < min_samples and ...` stopped after
    exactly min_samples iterations, so max_samples was unreachable and
    min_samples effectively acted as a maximum.
    """
    start = toc = time.perf_counter()
    samples = 0
    while samples < max_samples and (samples < min_samples or toc - start < max_time):
        tic = time.perf_counter()
        sample = f()
        toc = time.perf_counter()
        sample["time"] = toc - tic
        yield sample
        samples += 1
def collect_data(max_n=100_000, max_runtime=60, scale=1.5):
    """Profile run_query() at growing user counts against the configured DB.

    Reads the backend from the ``DB`` env var (sqlite/postgres/mysql), wipes
    existing users, then repeatedly populates the database and times the
    activity queries, multiplying the user count by *scale* each round until
    *max_n* users or *max_runtime* seconds is reached.

    Returns a list of sample dicts (query counts, "time", "users", "db").
    A KeyboardInterrupt stops early and returns what was collected so far.

    Raises ValueError for an unrecognized DB value (previously this fell
    through and crashed with NameError on db_url).
    """
    records = []
    start = time.perf_counter()
    total_users = 100
    db_type = os.environ.get("DB", "sqlite")
    try:
        with tempfile.TemporaryDirectory() as td:
            # db and env initialized via jupyterhub ci/docker-db.sh
            if db_type == "sqlite":
                db_url = f"sqlite:///{td}/test.sqlite"
            elif db_type == "postgres":
                db_url = f"postgresql://{os.environ['PGUSER']}:{quote(os.environ['PGPASSWORD'])}@{os.environ['PGHOST']}:5432/jupyterhub"
            elif db_type == "mysql":
                db_url = f"mysql+mysqlconnector://root@{os.environ['MYSQL_HOST']}:3306/jupyterhub"
            else:
                # fail fast instead of NameError on db_url below
                raise ValueError(f"unsupported DB: {db_type!r}")
            db = orm.new_session_factory(db_url)()
            # delete all users to start
            have_users = db.query(orm.User).count()
            if have_users:
                print(f"Deleting {have_users} users")
                sql = orm.User.__table__.delete()
                db.execute(sql)
                have_users = db.query(orm.User).count()
                assert have_users == 0
            f = partial(run_query, db)
            while time.perf_counter() - start < max_runtime and total_users < max_n:
                populate_db(db, total_users)
                time.sleep(1)
                for sample in time_one(f):
                    sample["users"] = total_users
                    sample["db"] = db_type
                    print(f"{sample}")
                    records.append(sample)
                total_users = int(scale * total_users)
    except KeyboardInterrupt:
        return records
    return records
@click.command()
@click.option(
    "--fname", "-f", default="samples.pkl", help="filename to save dataframe to"
)
@click.option("--max-n", "-n", default=100_000, help="Max number of users")
@click.option("--max-runtime", "-t", default=60, help="Max time to run")
@click.option(
    "--scale",
    "-s",
    default=1.5,
    help="Scale number of users between samples (1.5 = increase by 50% each run)",
)
def main(fname, max_n, max_runtime, scale):
    """CLI entry point: run the benchmark and pickle the samples to *fname*."""
    db = os.environ.get("DB", "sqlite")
    print(f"{fname=}, {max_n=}, {max_runtime=}, {scale=}, {db=}")
    records = collect_data(max_n=max_n, max_runtime=max_runtime, scale=scale)
    df = pd.DataFrame.from_dict(records)
    df.to_pickle(fname)
    # Fix: pandas has read_pickle (not from_pickle); also close the backtick.
    print(f"Wrote {fname}, read with `pandas.read_pickle('{fname}')`")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment