Join Strategies
# A common pattern when using OpenSAFELY for time series analysis is to extract one
# cohort for slow-to-extract variables that we don't expect to change over time, and
# multiple cohorts (e.g. by week or by month) for fast-to-extract variables that we
# expect to change over time. Each "fast" cohort is then joined to the "slow" cohort for
# analysis.
#
# In this gist, we compare the memory profiles of two join strategies found in the
# OpenSAFELY documentation: a map strategy and a merge strategy. We find that on a
# dataset with an order of magnitude difference between the population size and the
# sample size, the map strategy uses roughly 2.9 times more memory than the merge
# strategy.
#
# https://www.opensafely.org/
# https://docs.opensafely.org/
import sys

import pandas
from memory_profiler import profile
from numpy import random

rng = random.default_rng(seed=1)


def get_all_patients(n=1_000_000):
    """Gets a set of all patients from a slow cohort-extractor extract."""
    return pandas.DataFrame(
        {
            "ethnicity": rng.integers(1, 5, size=n, endpoint=True),
        },
        index=pandas.RangeIndex(n, name="patient_id"),
    ).reset_index()


def get_some_patients(n=100_000):
    """Gets a subset of some patients from a fast cohort-extractor extract."""
    return pandas.DataFrame(
        {
            "age": rng.integers(100, size=n),
            "sex": rng.choice(["F", "M"], size=n),
        },
        index=pandas.RangeIndex(n, name="patient_id"),
    ).reset_index()


@profile
def with_map(all_patients, some_patients):
    """Joins with the map strategy."""
    mapping = dict(zip(all_patients["patient_id"], all_patients["ethnicity"]))
    return some_patients["patient_id"].map(mapping)


@profile
def with_merge(all_patients, some_patients):
    """Joins with the merge strategy."""
    return some_patients.merge(all_patients, how="left", on="patient_id")


if __name__ == "__main__":
    try:
        strategy = sys.argv[1]
    except IndexError:
        strategy = None
    if strategy not in ["with_map", "with_merge"]:
        print("Please supply a valid strategy: either 'with_map' or 'with_merge'")
        sys.exit(1)

    all_patients = get_all_patients()
    print(f"There are {len(all_patients):,} patients in the population.")
    some_patients = get_some_patients()
    print(f"There are {len(some_patients):,} patients in the sample.")

    if strategy == "with_map":
        with_map(all_patients, some_patients)
    else:
        with_merge(all_patients, some_patients)
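To reproduce the profiles, the script is run once per strategy, with the strategy name as the only command-line argument; the @profile decorator from memory_profiler then prints a line-by-line memory report for the chosen function. A minimal sketch of driving both runs from Python, assuming the script above is saved as join_strategies.py (the filename is an assumption):

import subprocess
import sys

# Run each strategy in its own process so the two profiles don't share memory state.
for strategy in ["with_map", "with_merge"]:
    subprocess.run([sys.executable, "join_strategies.py", strategy], check=True)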
Profiles
(104.5 + 46.1) / 58.7 = 2.6
The functions with_map and with_merge return None in 587f376 (profiles above). Does this affect the profiles? In 8edffed (profiles below), these functions return the result of either the map or the merge operation.

(104.5 + 46.2) / 75.1 = 2.0
Returning the results of the operations does affect the profiles. Nevertheless, the map strategy still uses twice as much memory as the merge strategy.
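As a rough check of the arithmetic quoted above, assuming the first two figures in each calculation are the increments memory_profiler reports for building the mapping and applying Series.map in with_map, and the third is the increment for the merge call in with_merge (the attribution of each figure to a specific line is an assumption):

# 587f376: the functions return None
map_increments_587f376 = 104.5 + 46.1   # MiB: dict construction plus Series.map
merge_increment_587f376 = 58.7          # MiB: the single merge call
print(round(map_increments_587f376 / merge_increment_587f376, 1))  # 2.6

# 8edffed: the functions return the result of the map or the merge
map_increments_8edffed = 104.5 + 46.2
merge_increment_8edffed = 75.1
print(round(map_increments_8edffed / merge_increment_8edffed, 1))  # 2.0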