Skip to content

Instantly share code, notes, and snippets.

@braaannigan
Last active April 19, 2022 14:45
Show Gist options
  • Save braaannigan/2955d81cb27fb0f1a5ff4f990ef6d551 to your computer and use it in GitHub Desktop.
Save braaannigan/2955d81cb27fb0f1a5ff4f990ef6d551 to your computer and use it in GitHub Desktop.
Query optimisation with Polars in python
import pandas as pd
import polars as pl
import dask.dataframe as dd
# You can get the data from here:
# s3://coiled-datasets/h2o/G1_1e8_1e2_0_0/csv/G1_1e8_1e2_0_0.csv
fl = "data/G1_1e8_1e2_0_0.csv"
# Pandas and Dask examples based on this blog post
# https://coiled.io/blog/speed-up-pandas-query-10x-with-dask/
def pandasGroupby(engine="c"):
# Group by with pandas using pyarrow and specified dtypes
better_dtypes = {
"id1": "string[pyarrow]",
"id2": "string[pyarrow]",
"id3": "string[pyarrow]",
"id4": "int64",
"id5": "int64",
"id6": "int64",
"v1": "int64",
"v2": "int64",
"v3": "float64",
}
df = pd.read_csv(fl, engine=engine, dtype=better_dtypes,usecols=['id1','v1'])
return df.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"})
def polarsGroupby(lazy=False):
# Groupby with polars
# Supports both lazy evaluation (with query optimisation) and eager evaluation
if lazy:
df = pl.scan_csv(fl)
else:
df = pl.read_csv(fl)
gb = df.groupby("id1").agg([pl.sum("v1")]).sort("id1")
# If lazy then evaluate the query
if isinstance(gb, pl.LazyFrame):
gb = gb.collect()
return gb
def daskGroupby():
# Groupby with Dask
better_dtypes = {
"id1": "string[pyarrow]",
"id2": "string[pyarrow]",
"id3": "string[pyarrow]",
"id4": "int64",
"id5": "int64",
"id6": "int64",
"v1": "int64",
"v2": "int64",
"v3": "float64",
}
ddf = dd.read_csv(fl, dtype=better_dtypes)
return ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()
# Check the outputs are the same
pan = pandasGroupby()
# Convert polars output to pandas and so some minor transformations before comparing
pol = polarsGroupby().to_pandas().set_index("id1")
pol.index = pol.index.astype("string[pyarrow]")
das = daskGroupby()
# Test that they provide the same output
pd.testing.assert_frame_equal(pan, das)
pd.testing.assert_frame_equal(pan, pol)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment