"""
Run with:
export PGHOST=127.0.0.1
export MYSQL_HOST=127.0.0.1
export PGUSER=user
export PGPASSWORD=pass
for DB in postgres mysql; do
export DB
bash ci/docker-db.sh
bash ci/init-db.sh
done
for branch in main metricssss; do
git switch $branch
for DB in sqlite postgres mysql; do
export DB
if [ $DB != sqlite ]; then
bash ci/docker-db.sh
bash ci/init-db.sh
fi
python profile_index.py -f ${DB}-${branch}.pkl
done
done
"""
import os
import tempfile
import time
from datetime import datetime, timedelta
from functools import partial
from urllib.parse import quote

import click
import numpy as np
import pandas as pd

from jupyterhub import orm

# user distribution:
# 30% chance uniformly in the last 2 days
# 20% chance uniformly in the last 2 weeks
# 50% chance uniformly in the last 2 months
# that should give return counts:
# 30-day: 75% of users
# 7-day: ~46%
# 1-day: ~17%
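# (derivation: each offset below is uniform on [0, window], so
# P(active within T) = sum_i weight_i * min(1, T / window_i);
# e.g. 1-day: 0.3 * (1/2) + 0.2 * (1/14) + 0.5 * (1/60) ≈ 0.17)
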
# durations in seconds
HOURS = 3600
DAYS = 24 * HOURS
WEEKS = 7 * DAYS
MONTHS = 30 * DAYS

ago_weights = {
    2 * DAYS: 0.3,
    2 * WEEKS: 0.2,
    2 * MONTHS: 0.5,
}

user_counter = 0
# fixed seed so the user age distribution is reproducible across runs
np.random.seed(1)


def populate_db(db, user_count):
    """Create users until the db holds `user_count`, with randomized last_activity."""
    global user_counter
    have_users = db.query(orm.User).count()
    # rely on dict ordering
    offset_choices = list(ago_weights.keys())
    weights = list(ago_weights.values())
    new_users = user_count - have_users
    # pick an age window per user, then a uniform offset within that window
    offsets = np.random.choice(
        offset_choices, size=new_users, p=weights
    ) * np.random.random(size=new_users)
    now = datetime.utcnow()
    for offset in offsets:
        user = orm.User(name=f"test-{user_counter}")
        user_counter += 1
        user.last_activity = now - timedelta(seconds=int(offset))
        db.add(user)
    db.commit()


def run_query(db):
    now = datetime.utcnow()
    cutoffs = {
        "24h": now - timedelta(hours=24),
        "7d": now - timedelta(days=7),
        "30d": now - timedelta(days=30),
    }
    sample = {}
    for period, cutoff in cutoffs.items():
        sample[period] = db.query(orm.User).filter(orm.User.last_activity >= cutoff).count()
    return sample
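
# Note: each count in run_query corresponds conceptually to SQL like
#   SELECT count(*) FROM users WHERE users.last_activity >= :cutoff
# (SQLAlchemy's Query.count() actually wraps the select in a subquery),
# which is the shape of query being profiled here.
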
def time_one(f, min_samples=3, max_time=1, max_samples=20):
    """Call `f` repeatedly, yielding each sample with its wall-clock time attached.

    Collects at least `min_samples` samples, then continues until `max_time`
    seconds have elapsed or `max_samples` samples have been taken.
    """
    start = toc = time.perf_counter()
    samples = 0
    while samples < min_samples or (samples < max_samples and toc - start < max_time):
        tic = time.perf_counter()
        sample = f()
        toc = time.perf_counter()
        sample["time"] = toc - tic
        yield sample
        samples += 1
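
# Each yielded sample is the run_query dict plus its timing, e.g.
# {"24h": 120, "7d": 180, "30d": 300, "time": 0.004}
# (the counts here are illustrative, not measured).
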
def collect_data(max_n=100_000, max_runtime=60, scale=1.5):
    records = []
    start = time.perf_counter()
    total_users = 100
    db_type = os.environ.get("DB", "sqlite")
    try:
        with tempfile.TemporaryDirectory() as td:
            # db and env initialized via jupyterhub ci/docker-db.sh
            if db_type == "sqlite":
                db_url = f"sqlite:///{td}/test.sqlite"
            elif db_type == "postgres":
                db_url = f"postgresql://{os.environ['PGUSER']}:{quote(os.environ['PGPASSWORD'])}@{os.environ['PGHOST']}:5432/jupyterhub"
            elif db_type == "mysql":
                db_url = f"mysql+mysqlconnector://root@{os.environ['MYSQL_HOST']}:3306/jupyterhub"
            else:
                raise ValueError(f"Unrecognized DB: {db_type!r}")
            db = orm.new_session_factory(db_url)()
            # delete all users to start
            have_users = db.query(orm.User).count()
            if have_users:
                print(f"Deleting {have_users} users")
                db.execute(orm.User.__table__.delete())
                db.commit()
            have_users = db.query(orm.User).count()
            assert have_users == 0
            f = partial(run_query, db)
            while time.perf_counter() - start < max_runtime and total_users < max_n:
                populate_db(db, total_users)
                time.sleep(1)
                for sample in time_one(f):
                    sample["users"] = total_users
                    sample["db"] = db_type
                    print(f"{sample}")
                    records.append(sample)
                # grow the user count geometrically between rounds
                total_users = int(scale * total_users)
    except KeyboardInterrupt:
        pass
    return records


@click.command()
@click.option(
    "--fname", "-f", default="samples.pkl", help="filename to save dataframe to"
)
@click.option("--max-n", "-n", default=100_000, help="Max number of users")
@click.option("--max-runtime", "-t", default=60, help="Max time to run (seconds)")
@click.option(
    "--scale",
    "-s",
    default=1.5,
    help="Scale number of users between samples (1.5 = increase by 50% each run)",
)
def main(fname, max_n, max_runtime, scale):
    db = os.environ.get("DB", "sqlite")
    print(f"{fname=}, {max_n=}, {max_runtime=}, {scale=}, {db=}")
    records = collect_data(max_n=max_n, max_runtime=max_runtime, scale=scale)
    df = pd.DataFrame.from_dict(records)
    df.to_pickle(fname)
    print(f"Wrote {fname}, read with `pandas.read_pickle('{fname}')`")


if __name__ == "__main__":
    main()
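
# A sketch of loading the results for comparison (assumes the pickle names
# produced by the run loop in the module docstring):
#
#     import pandas as pd
#     frames = []
#     for branch in ("main", "metricssss"):
#         for db in ("sqlite", "postgres", "mysql"):
#             frames.append(pd.read_pickle(f"{db}-{branch}.pkl").assign(branch=branch))
#     df = pd.concat(frames)
#     print(df.groupby(["branch", "db", "users"])["time"].mean())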