braaannigan/queries.py

## queries.py
import pandas as pd
import polars as pl
import dask.dataframe as dd

# You can get the data from here:
# s3://coiled-datasets/h2o/G1_1e8_1e2_0_0/csv/G1_1e8_1e2_0_0.csv
fl = "data/G1_1e8_1e2_0_0.csv"
# Pandas and Dask examples based on this blog post
# https://coiled.io/blog/speed-up-pandas-query-10x-with-dask/


def pandasGroupby(engine="c"):
    # Group by with pandas using pyarrow and specified dtypes
    better_dtypes = {
        "id1": "string[pyarrow]",
        "id2": "string[pyarrow]",
        "id3": "string[pyarrow]",
        "id4": "int64",
        "id5": "int64",
        "id6": "int64",
        "v1": "int64",
        "v2": "int64",
        "v3": "float64",
    }

    df = pd.read_csv(fl, engine=engine, dtype=better_dtypes,usecols=['id1','v1'])
    return df.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"})


def polarsGroupby(lazy=False):
    # Groupby with polars
    # Supports both lazy evaluation (with query optimisation) and eager evaluation
    if lazy:
        df = pl.scan_csv(fl)
    else:
        df = pl.read_csv(fl)
    gb = df.groupby("id1").agg([pl.sum("v1")]).sort("id1")
    # If lazy then evaluate the query
    if isinstance(gb, pl.LazyFrame):
        gb = gb.collect()
    return gb


def daskGroupby():
    # Groupby with Dask
    better_dtypes = {
        "id1": "string[pyarrow]",
        "id2": "string[pyarrow]",
        "id3": "string[pyarrow]",
        "id4": "int64",
        "id5": "int64",
        "id6": "int64",
        "v1": "int64",
        "v2": "int64",
        "v3": "float64",
    }
    ddf = dd.read_csv(fl, dtype=better_dtypes)
    return ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()


# Check the outputs are the same
pan = pandasGroupby()
# Convert polars output to pandas and so some minor transformations before comparing
pol = polarsGroupby().to_pandas().set_index("id1")
pol.index = pol.index.astype("string[pyarrow]")

das = daskGroupby()
# Test that they provide the same output
pd.testing.assert_frame_equal(pan, das)
pd.testing.assert_frame_equal(pan, pol)
	import pandas as pd
	import polars as pl
	import dask.dataframe as dd

	# You can get the data from here:
	# s3://coiled-datasets/h2o/G1_1e8_1e2_0_0/csv/G1_1e8_1e2_0_0.csv
	fl = "data/G1_1e8_1e2_0_0.csv"
	# Pandas and Dask examples based on this blog post
	# https://coiled.io/blog/speed-up-pandas-query-10x-with-dask/


	def pandasGroupby(engine="c"):
	# Group by with pandas using pyarrow and specified dtypes
	better_dtypes = {
	"id1": "string[pyarrow]",
	"id2": "string[pyarrow]",
	"id3": "string[pyarrow]",
	"id4": "int64",
	"id5": "int64",
	"id6": "int64",
	"v1": "int64",
	"v2": "int64",
	"v3": "float64",
	}

	df = pd.read_csv(fl, engine=engine, dtype=better_dtypes,usecols=['id1','v1'])
	return df.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"})


	def polarsGroupby(lazy=False):
	# Groupby with polars
	# Supports both lazy evaluation (with query optimisation) and eager evaluation
	if lazy:
	df = pl.scan_csv(fl)
	else:
	df = pl.read_csv(fl)
	gb = df.groupby("id1").agg([pl.sum("v1")]).sort("id1")
	# If lazy then evaluate the query
	if isinstance(gb, pl.LazyFrame):
	gb = gb.collect()
	return gb


	def daskGroupby():
	# Groupby with Dask
	better_dtypes = {
	"id1": "string[pyarrow]",
	"id2": "string[pyarrow]",
	"id3": "string[pyarrow]",
	"id4": "int64",
	"id5": "int64",
	"id6": "int64",
	"v1": "int64",
	"v2": "int64",
	"v3": "float64",
	}
	ddf = dd.read_csv(fl, dtype=better_dtypes)
	return ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()


	# Check the outputs are the same
	pan = pandasGroupby()
	# Convert polars output to pandas and so some minor transformations before comparing
	pol = polarsGroupby().to_pandas().set_index("id1")
	pol.index = pol.index.astype("string[pyarrow]")

	das = daskGroupby()
	# Test that they provide the same output
	pd.testing.assert_frame_equal(pan, das)
	pd.testing.assert_frame_equal(pan, pol)