mesejo/defaultdict_performance.py

## defaultdict_performance.py
from collections import defaultdict
from itertools import groupby, cycle
from operator import itemgetter as ig

import random
import perfplot
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("dark")


def setup(unique):
    def fun(length):
        choices = cycle(range(unique))
        result = [
            (i, random.randint(0, 10), random.randint(0, 10_000))
            for _, i in zip(range(length), choices)
        ]
        random.shuffle(result)
        return result

    return fun


def defaultdict_max_approach(lst):
    d = defaultdict(lambda: (0, 0, float("-inf")))
    for e in lst:
        first, _, last = e
        if d[first][2] < last:
            d[first] = e
    return [*d.values()]


def dict_max_approach(lst):
    # https://stackoverflow.com/a/69025193/4001592
    d = {}
    for tpl in lst:
        first, *_, last = tpl
        if first not in d or last > d[first][-1]:
            d[first] = tpl

    return [*d.values()]


def groupby_max_approach(lst):
    # https://stackoverflow.com/a/69025193/4001592
    return [max(g, key=ig(-1)) for _, g in groupby(sorted(lst), key=ig(0))]


def equality(a, b):
    da = {k: v for k, _, v in a}
    db = {k: v for k, _, v in b}
    return da == db


if __name__ == "__main__":

    res = []
    for unique in [500, 1000, 5000, 10_000]:
        b = perfplot.bench(
            setup=setup(unique),
            kernels=[
                defaultdict_max_approach,
                dict_max_approach,
                groupby_max_approach,
            ],
            labels=["defaultdict", "dict", "groupby"],
            n_range=[k for k in range(100_000, 1000_001, 100_000)],
            xlabel="len(lst)",
            equality_check=equality,
        )

        df = pd.DataFrame(
            data=b.timings_s.T,
            index=pd.MultiIndex.from_product(
                [[str(unique)], b.n_range], names=["unique", "range"]
            ),
            columns=[*b.labels],
        )
        res.append(df)

    data = pd.concat(res).reset_index()
    data = data.melt(
        id_vars=["range", "unique"], value_vars=["defaultdict", "dict", "groupby"]
    )

    mapping = {
        "unique": "Unique Keys [count]",
        "variable": "Algorithm",
        "range": "len(lst) [Ks]",
        "value": "Runtime [s]",
    }
    data = data.rename(columns=mapping)
    data["len(lst) [Ks]"] = data["len(lst) [Ks]"] // 1000

    g = sns.FacetGrid(data, sharey=False, col="Unique Keys [count]", col_wrap=2, hue="Algorithm", legend_out=False)
    g.map(sns.lineplot, "len(lst) [Ks]", "Runtime [s]")
    g.tight_layout()
    g.add_legend()
    plt.show()
	from collections import defaultdict
	from itertools import groupby, cycle
	from operator import itemgetter as ig

	import random
	import perfplot
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt

	sns.set_style("dark")


	def setup(unique):
	def fun(length):
	choices = cycle(range(unique))
	result = [
	(i, random.randint(0, 10), random.randint(0, 10_000))
	for _, i in zip(range(length), choices)
	]
	random.shuffle(result)
	return result

	return fun


	def defaultdict_max_approach(lst):
	d = defaultdict(lambda: (0, 0, float("-inf")))
	for e in lst:
	first, _, last = e
	if d[first][2] < last:
	d[first] = e
	return [*d.values()]


	def dict_max_approach(lst):
	# https://stackoverflow.com/a/69025193/4001592
	d = {}
	for tpl in lst:
	first, *_, last = tpl
	if first not in d or last > d[first][-1]:
	d[first] = tpl

	return [*d.values()]


	def groupby_max_approach(lst):
	# https://stackoverflow.com/a/69025193/4001592
	return [max(g, key=ig(-1)) for _, g in groupby(sorted(lst), key=ig(0))]


	def equality(a, b):
	da = {k: v for k, _, v in a}
	db = {k: v for k, _, v in b}
	return da == db


	if __name__ == "__main__":

	res = []
	for unique in [500, 1000, 5000, 10_000]:
	b = perfplot.bench(
	setup=setup(unique),
	kernels=[
	defaultdict_max_approach,
	dict_max_approach,
	groupby_max_approach,
	],
	labels=["defaultdict", "dict", "groupby"],
	n_range=[k for k in range(100_000, 1000_001, 100_000)],
	xlabel="len(lst)",
	equality_check=equality,
	)

	df = pd.DataFrame(
	data=b.timings_s.T,
	index=pd.MultiIndex.from_product(
	[[str(unique)], b.n_range], names=["unique", "range"]
	),
	columns=[*b.labels],
	)
	res.append(df)

	data = pd.concat(res).reset_index()
	data = data.melt(
	id_vars=["range", "unique"], value_vars=["defaultdict", "dict", "groupby"]
	)

	mapping = {
	"unique": "Unique Keys [count]",
	"variable": "Algorithm",
	"range": "len(lst) [Ks]",
	"value": "Runtime [s]",
	}
	data = data.rename(columns=mapping)
	data["len(lst) [Ks]"] = data["len(lst) [Ks]"] // 1000

	g = sns.FacetGrid(data, sharey=False, col="Unique Keys [count]", col_wrap=2, hue="Algorithm", legend_out=False)
	g.map(sns.lineplot, "len(lst) [Ks]", "Runtime [s]")
	g.tight_layout()
	g.add_legend()
	plt.show()