Last active
November 29, 2022 11:50
-
-
Save minrk/ea0c2102ffe3f09165f8970e647b9976 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Run with: | |
export PGHOST=127.0.0.1 | |
export MYSQL_HOST=127.0.0.1 | |
export PGUSER=user | |
export PGPASSWORD=pass | |
for DB in postgres mysql; do | |
export DB | |
bash ci/docker-db.sh | |
bash ci/init-db.sh | |
done | |
for branch in main metricssss; do | |
git switch $branch | |
for DB in sqlite postgres mysql; do | |
export DB | |
if [ $DB != sqlite ]; then | |
bash ci/docker-db.sh | |
bash ci/init-db.sh | |
fi | |
python profile_index.py -f ${DB}-${branch}.pkl | |
done | |
done | |
""" | |
import os | |
import tempfile | |
import time | |
from datetime import datetime, timedelta | |
from functools import partial | |
from urllib.parse import quote | |
import click | |
import numpy as np | |
import pandas as pd | |
from jupyterhub import orm | |
# Synthetic last-activity distribution for generated users:
#   30% chance: uniformly within the last 2 days
#   20% chance: uniformly within the last 2 weeks
#   50% chance: uniformly within the last 2 months
# That should give return counts of roughly:
#   30-day window: 75% of users
#   7-day window:  46%
#   1-day window:  32%
HOURS = 3600  # seconds per hour
DAYS = 24 * HOURS
WEEKS = 7 * DAYS
MONTHS = 30 * DAYS

# maximum "seconds ago" window -> probability of drawing a user from it
ago_weights = {
    2 * DAYS: 0.3,
    2 * WEEKS: 0.2,
    2 * MONTHS: 0.5,
}

# monotonically increasing suffix for generated usernames (test-0, test-1, ...)
user_counter = 0

# fixed seed so the generated user population is reproducible across runs
np.random.seed(1)
def populate_db(db, user_count):
    """Top the database up to *user_count* users with synthetic last_activity.

    Each new user's last_activity is `now - offset`, where the offset is drawn
    by first picking a window from ``ago_weights`` and then choosing a uniform
    point within it. Commits once at the end.

    Fix: removed dead local ``seconds = offset`` that was assigned but never used.
    """
    global user_counter
    have_users = db.query(orm.User).count()
    # rely on dict insertion order (Python 3.7+) so keys and weights align
    offset_choices = list(ago_weights.keys())
    weights = list(ago_weights.values())
    new_users = user_count - have_users
    # window choice per user, scaled by U[0,1) -> uniform within the window
    offsets = np.random.choice(
        offset_choices, size=new_users, p=weights
    ) * np.random.random(size=new_users)
    now = datetime.utcnow()
    for offset in offsets:
        user = orm.User(name=f"test-{user_counter}")
        user_counter += 1
        user.last_activity = now - timedelta(seconds=int(offset))
        db.add(user)
    db.commit()
def run_query(db):
    """Count users whose last_activity falls within 24h / 7d / 30d windows.

    Returns a dict mapping the window label ("24h", "7d", "30d") to the
    number of matching users.
    """
    now = datetime.utcnow()
    windows = {
        "24h": timedelta(hours=24),
        "7d": timedelta(days=7),
        "30d": timedelta(days=30),
    }
    return {
        period: db.query(orm.User)
        .filter(orm.User.last_activity >= now - delta)
        .count()
        for period, delta in windows.items()
    }
def time_one(f, min_samples=3, max_time=1, max_samples=20):
    """Repeatedly time calls to *f*, yielding each returned sample dict.

    Each sample (a dict returned by *f*) gets a "time" key with the call's
    duration in seconds. Collects at least *min_samples* samples, keeps going
    while less than *max_time* seconds have elapsed overall, and never exceeds
    *max_samples*.

    Fix: the original condition `samples < min_samples and ...` stopped after
    exactly min_samples iterations, so max_samples was unreachable and
    min_samples effectively acted as a maximum.
    """
    start = toc = time.perf_counter()
    samples = 0
    while samples < max_samples and (samples < min_samples or toc - start < max_time):
        tic = time.perf_counter()
        sample = f()
        toc = time.perf_counter()
        sample["time"] = toc - tic
        yield sample
        samples += 1
def collect_data(max_n=100_000, max_runtime=60, scale=1.5):
    """Profile run_query() at growing user counts against the configured DB.

    Reads the backend from the ``DB`` env var (sqlite/postgres/mysql), wipes
    existing users, then repeatedly populates the database and times the
    activity queries, multiplying the user count by *scale* each round until
    *max_n* users or *max_runtime* seconds is reached.

    Returns a list of sample dicts (query counts, "time", "users", "db").
    A KeyboardInterrupt stops early and returns what was collected so far.

    Raises ValueError for an unrecognized DB value (previously this fell
    through and crashed with NameError on db_url).
    """
    records = []
    start = time.perf_counter()
    total_users = 100
    db_type = os.environ.get("DB", "sqlite")
    try:
        with tempfile.TemporaryDirectory() as td:
            # db and env initialized via jupyterhub ci/docker-db.sh
            if db_type == "sqlite":
                db_url = f"sqlite:///{td}/test.sqlite"
            elif db_type == "postgres":
                db_url = f"postgresql://{os.environ['PGUSER']}:{quote(os.environ['PGPASSWORD'])}@{os.environ['PGHOST']}:5432/jupyterhub"
            elif db_type == "mysql":
                db_url = f"mysql+mysqlconnector://root@{os.environ['MYSQL_HOST']}:3306/jupyterhub"
            else:
                # fail fast instead of NameError on db_url below
                raise ValueError(f"unsupported DB: {db_type!r}")
            db = orm.new_session_factory(db_url)()
            # delete all users to start
            have_users = db.query(orm.User).count()
            if have_users:
                print(f"Deleting {have_users} users")
                sql = orm.User.__table__.delete()
                db.execute(sql)
                have_users = db.query(orm.User).count()
                assert have_users == 0
            f = partial(run_query, db)
            while time.perf_counter() - start < max_runtime and total_users < max_n:
                populate_db(db, total_users)
                time.sleep(1)
                for sample in time_one(f):
                    sample["users"] = total_users
                    sample["db"] = db_type
                    print(f"{sample}")
                    records.append(sample)
                total_users = int(scale * total_users)
    except KeyboardInterrupt:
        return records
    return records
@click.command()
@click.option(
    "--fname", "-f", default="samples.pkl", help="filename to save dataframe to"
)
@click.option("--max-n", "-n", default=100_000, help="Max number of users")
@click.option("--max-runtime", "-t", default=60, help="Max time to run")
@click.option(
    "--scale",
    "-s",
    default=1.5,
    help="Scale number of users between samples (1.5 = increase by 50% each run)",
)
def main(fname, max_n, max_runtime, scale):
    """CLI entry point: run the benchmark and pickle the samples to *fname*."""
    db = os.environ.get("DB", "sqlite")
    print(f"{fname=}, {max_n=}, {max_runtime=}, {scale=}, {db=}")
    records = collect_data(max_n=max_n, max_runtime=max_runtime, scale=scale)
    df = pd.DataFrame.from_dict(records)
    df.to_pickle(fname)
    # Fix: pandas has read_pickle (not from_pickle); also close the backtick.
    print(f"Wrote {fname}, read with `pandas.read_pickle('{fname}')`")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment