# Gist by @jaychia, created March 29, 2024.
# Benchmarks the same filter query over the RedPajama StackExchange sample with
# Daft, Polars, pandas, and DuckDB across JSONL, Parquet, and Delta Lake on S3.
import os
import time

import boto3
import daft
import duckdb
import pandas as pd
import polars as pl
from deltalake import DeltaTable
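
# Forward credentials from the ambient boto3 session into the environment so
# that Daft, pyarrow/pandas, and deltalake all pick them up.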
creds = boto3.session.Session().get_credentials()
os.environ["AWS_ACCESS_KEY_ID"] = creds.access_key
os.environ["AWS_SECRET_ACCESS_KEY"] = creds.secret_key
if creds.token:  # only present for temporary (STS) credentials
    os.environ["AWS_SESSION_TOKEN"] = creds.token
os.environ["AWS_REGION"] = "us-west-2"

# Daft: route all S3 reads through a shared IOConfig. SSL verification is
# disabled to match the original environment; drop verify_ssl=False if your
# endpoint has valid certificates.
io_config = daft.io.IOConfig(s3=daft.io.S3Config(verify_ssl=False))
daft.set_planning_config(default_io_config=io_config)

# deltalake: the equivalent knob for its object-store client.
deltalake_storage_config = {"allow_invalid_certificates": "true"}
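
# The same RedPajama StackExchange sample stored in three formats (the Delta
# Lake copy is Z-ordered, per its path).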
JSONL_PATH = "s3://daft-public-datasets/red-pajamas/stackexchange-sample-jsonl/stackexchange.jsonl"
PARQUET_PATH = "s3://daft-public-datasets/red-pajamas/stackexchange-sample-parquet-2/"
DELTA_PATH = "s3://daft-public-datasets/red-pajamas/stackexchange-sample-deltalake-zorder/"

# DuckDB reads S3 through the httpfs extension; recent builds autoload it, but
# loading it explicitly is harmless.
con = duckdb.connect(database=":memory:", read_only=False)
con.execute(f"""
INSTALL httpfs;
LOAD httpfs;
SET s3_region='us-west-2';
SET s3_access_key_id='{creds.access_key}';
SET s3_secret_access_key='{creds.secret_key}';
""")
if creds.token:
    # Temporary credentials also need the session token.
    con.execute(f"SET s3_session_token='{creds.token}';")
def run_daft_query(get_df):
    df = get_df()
    df = df.where((df["language"] == "en") & (df["question_score"] > 5))
    return df.to_pandas()


def run_daft_query_json(get_df):
    # The JSONL files keep these fields nested under a `meta` struct.
    df = get_df()
    df = df.where(
        (df["meta"].struct.get("language") == "en")
        & (df["meta"].struct.get("question_score").cast(daft.DataType.int32()) > 5)
    )
    return df.to_pandas()


def run_polars_query(get_df):
    df = get_df()
    df = df.filter((pl.col("language") == "en") & (pl.col("question_score") > 5))
    return df.collect().to_pandas()


def run_polars_query_json(get_df):
    df = get_df()
    df = df.filter(
        (pl.col("meta").struct.field("language") == "en")
        & (pl.col("meta").struct.field("question_score").cast(pl.Int32) > 5)
    )
    return df.to_pandas()


def run_pandas_query(get_df):
    df = get_df()
    df = df[(df["language"] == "en") & (df["question_score"] > 5)]
    return df


def run_pandas_query_json(get_df):
    df = get_df()
    df = df[
        (df["meta"].apply(lambda row: row["language"]) == "en")
        & (df["meta"].apply(lambda row: int(row["question_score"])) > 5)
    ]
    return df
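
# Readers, one per engine/format pair. Daft and the Polars scan_* readers are
# lazy; pandas (and the Polars JSONL workaround) materialize eagerly.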
def get_daft_jsonl():
    return daft.read_json(JSONL_PATH)


def get_daft_parquet():
    return daft.read_parquet(PARQUET_PATH)


def get_daft_delta():
    return daft.read_delta_lake(DELTA_PATH)


def get_pandas_jsonl():
    return pd.read_json(JSONL_PATH, lines=True)


def get_pandas_parquet():
    return pd.read_parquet(PARQUET_PATH)


def get_pandas_delta():
    return DeltaTable(DELTA_PATH, storage_options=deltalake_storage_config).to_pandas()


def get_pl_jsonl():
    # scan_ndjson doesn't support S3 paths, so use pyarrow to download it first.
    import pyarrow.json as pajs
    from pyarrow.fs import S3FileSystem

    pafs = S3FileSystem()
    # str.strip() removes a *set of characters*, not a prefix, so drop the
    # scheme explicitly instead (removeprefix needs Python 3.9+).
    with pafs.open_input_file(JSONL_PATH.removeprefix("s3://")) as f:
        tbl = pajs.read_json(f)
    return pl.from_arrow(tbl)


def get_pl_parquet():
    return pl.scan_parquet(PARQUET_PATH + "*")


def get_pl_delta():
    return pl.scan_delta(DELTA_PATH, storage_options=deltalake_storage_config)


def run_duckdb_query_parquet():
    return con.execute(f"""select * from parquet_scan('{PARQUET_PATH}*') where "language" = 'en' and "question_score" > 5;""").df()


def run_duckdb_query_json():
    return con.execute(f"""select * from read_json('{JSONL_PATH}') where meta.language = 'en' and cast(meta.question_score as integer) > 5;""").df()


def run_duckdb_query_delta():
    # DuckDB doesn't support reading Delta Lake directly, so use the deltalake SDK.
    # This GitHub discussion (https://github.com/duckdb/duckdb/discussions/4463#discussioncomment-3448122)
    # recommends loading it as a PyArrow dataset, which DuckDB then scans via
    # replacement scanning of the local variable `pa_ds`.
    dt = DeltaTable(DELTA_PATH, storage_options=deltalake_storage_config)
    pa_ds = dt.to_pyarrow_dataset()
    return con.execute("select * from pa_ds where language = 'en' and question_score > 5;").df()


queries = {
    "daft-jsonl": lambda: run_daft_query_json(get_daft_jsonl),
    "daft-parquet": lambda: run_daft_query(get_daft_parquet),
    "daft-delta": lambda: run_daft_query(get_daft_delta),
    "polars-jsonl": lambda: run_polars_query_json(get_pl_jsonl),
    "polars-parquet": lambda: run_polars_query(get_pl_parquet),
    "polars-delta": lambda: run_polars_query(get_pl_delta),
    "pandas-jsonl": lambda: run_pandas_query_json(get_pandas_jsonl),
    "pandas-parquet": lambda: run_pandas_query(get_pandas_parquet),
    "pandas-delta": lambda: run_pandas_query(get_pandas_delta),
    "duckdb-jsonl": run_duckdb_query_json,
    "duckdb-parquet": run_duckdb_query_parquet,
    "duckdb-delta": run_duckdb_query_delta,
}
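
# Benchmark harness: run each query TRIES times and report the best
# wall-clock time, or the last error if every attempt failed.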
TRIES = 3
for name in queries:
    err = None
    results = []
    for _ in range(TRIES):
        start = time.time()
        try:
            queries[name]()
            results.append(time.time() - start)
        except Exception as e:
            err = e
    if len(results) == 0:
        print(f"{name}: error encountered: {err}")
    else:
        print(f"{name}: {min(results):.2f}s")