Created
March 29, 2024 20:06
-
-
Save jaychia/039bd724e351aac449adad75ba791cd7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
import daft | |
import polars as pl | |
import time | |
import os | |
import pandas as pd | |
import duckdb | |
from deltalake import DeltaTable | |
# --- AWS / engine configuration (module-level side effects) -----------------
creds = boto3.session.Session().get_credentials()

# Export the resolved credentials as the standard AWS environment variables so
# engines that read them (polars, deltalake, pyarrow, duckdb httpfs) pick them up.
os.environ["AWS_ACCESS_KEY_ID"] = creds.access_key
os.environ["AWS_SECRET_ACCESS_KEY"] = creds.secret_key
# Only temporary (STS) credentials carry a session token; for long-lived keys
# `creds.token` is None and `os.environ[...] = None` would raise TypeError.
if creds.token is not None:
    os.environ["AWS_SESSION_TOKEN"] = creds.token
os.environ["AWS_REGION"] = "us-west-2"

# Daft: route all reads through an IOConfig that skips SSL verification.
io_config = daft.io.IOConfig(s3=daft.io.S3Config(verify_ssl=False))
daft.set_planning_config(default_io_config=io_config)

# deltalake SDK equivalent of verify_ssl=False.
deltalake_storage_config = {"allow_invalid_certificates": "true"}

# One sample of the Red Pajamas stackexchange dataset in three formats.
JSONL_PATH = "s3://daft-public-datasets/red-pajamas/stackexchange-sample-jsonl/stackexchange.jsonl"
PARQUET_PATH = "s3://daft-public-datasets/red-pajamas/stackexchange-sample-parquet-2/"
DELTA_PATH = "s3://daft-public-datasets/red-pajamas/stackexchange-sample-deltalake-zorder/"

# Shared in-memory DuckDB connection, configured with the same credentials.
con = duckdb.connect(database=':memory:', read_only=False)
con.execute(f"""
SET s3_region='us-west-2';
SET s3_access_key_id='{creds.access_key}';
SET s3_secret_access_key='{creds.secret_key}';
""")
def run_daft_query(get_df):
    """Build a Daft dataframe via *get_df*, keep high-scoring English
    questions, and materialize the result as a pandas DataFrame."""
    frame = get_df()
    is_english = frame["language"] == "en"
    is_high_score = frame["question_score"] > 5
    filtered = frame.where(is_english & is_high_score)
    return filtered.to_pandas()
def run_daft_query_json(get_df):
    """Like run_daft_query, but for JSONL where the fields live inside a
    `meta` struct column and the score must first be cast to int32."""
    frame = get_df()
    language = frame["meta"].struct.get("language")
    score = frame["meta"].struct.get("question_score").cast(daft.DataType.int32())
    filtered = frame.where((language == "en") & (score > 5))
    return filtered.to_pandas()
def run_polars_query(get_df):
    """Filter a lazy Polars frame to high-scoring English questions, then
    collect and convert to pandas."""
    frame = get_df()
    predicate = (pl.col("language") == "en") & (pl.col("question_score") > 5)
    result = frame.filter(predicate).collect()
    return result.to_pandas()
def run_polars_query_json(get_df):
    """Like run_polars_query, but against an eager frame whose fields live
    inside a `meta` struct column (no `.collect()` step)."""
    frame = get_df()
    language_ok = pl.col("meta").struct.field("language") == "en"
    score_ok = pl.col("meta").struct.field("question_score").cast(pl.Int32) > 5
    return frame.filter(language_ok & score_ok).to_pandas()
def run_pandas_query(get_df):
    """Boolean-mask a pandas DataFrame down to high-scoring English
    questions and return the filtered frame."""
    frame = get_df()
    mask = (frame["language"] == "en") & (frame["question_score"] > 5)
    return frame[mask]
def run_pandas_query_json(get_df):
    """Like run_pandas_query, but the fields are nested in a `meta` dict
    column, so each is extracted row-by-row with .apply()."""
    frame = get_df()
    languages = frame["meta"].apply(lambda row: row["language"])
    scores = frame["meta"].apply(lambda row: int(row["question_score"]))
    return frame[(languages == "en") & (scores > 5)]
def get_daft_jsonl():
    """Lazily read the Stack Exchange JSONL sample with Daft."""
    frame = daft.read_json(JSONL_PATH)
    return frame
def get_daft_parquet():
    """Lazily read the Stack Exchange parquet sample with Daft."""
    frame = daft.read_parquet(PARQUET_PATH)
    return frame
def get_daft_delta():
    """Lazily read the Stack Exchange Delta Lake sample with Daft."""
    frame = daft.read_delta_lake(DELTA_PATH)
    return frame
def get_pandas_jsonl():
    """Eagerly read the JSONL sample into pandas (line-delimited JSON)."""
    frame = pd.read_json(JSONL_PATH, lines=True)
    return frame
def get_pandas_parquet():
    """Eagerly read the parquet sample into pandas."""
    frame = pd.read_parquet(PARQUET_PATH)
    return frame
def get_pandas_delta():
    """Eagerly load the Delta Lake sample into pandas via the deltalake SDK."""
    table = DeltaTable(DELTA_PATH, storage_options=deltalake_storage_config)
    return table.to_pandas()
def get_pl_jsonl():
    """Load the JSONL sample into an eager Polars DataFrame.

    polars' scan_ndjson doesn't support S3 paths, so the file is read
    through pyarrow's S3 filesystem first and converted via Arrow.
    """
    import pyarrow.json as pajs
    from pyarrow.fs import S3FileSystem

    # BUG FIX: the original used JSONL_PATH.strip("s3://"), but str.strip
    # treats its argument as a *set of characters* and strips any of
    # {'s','3',':','/'} from both ends — it only worked by accident because
    # the path happens not to start/end with those characters after the
    # scheme. Slice the scheme prefix off explicitly instead.
    bucket_and_key = JSONL_PATH[len("s3://"):]
    pafs = S3FileSystem()
    with pafs.open_input_file(bucket_and_key) as f:
        tbl = pajs.read_json(f)
    return pl.from_arrow(tbl)
def get_pl_parquet():
    """Lazily scan every parquet file under the sample prefix with Polars."""
    glob_path = f"{PARQUET_PATH}*"
    return pl.scan_parquet(glob_path)
def get_pl_delta():
    """Lazily scan the Delta Lake sample table with Polars."""
    return pl.scan_delta(
        DELTA_PATH,
        storage_options=deltalake_storage_config,
    )
def run_duckdb_query_parquet():
    """Run the benchmark filter over the parquet files entirely in DuckDB."""
    sql = (
        f"""select * from parquet_scan('{PARQUET_PATH}*') """
        """where "language" == 'en' and "question_score" > 5;"""
    )
    return con.execute(sql).df()
def run_duckdb_query_json():
    """Run the benchmark filter over the JSONL file entirely in DuckDB,
    reading the nested `meta` struct fields directly in SQL."""
    sql = (
        f"""select * from read_json('{JSONL_PATH}') """
        """where meta.language == 'en' and cast(meta.question_score AS integer) > 5;"""
    )
    return con.execute(sql).df()
def run_duckdb_query_delta():
    """Run the benchmark filter over the Delta table via DuckDB.

    DuckDB doesn't support reading Delta Lake directly; per this GitHub
    discussion (https://github.com/duckdb/duckdb/discussions/4463#discussioncomment-3448122)
    the table is exposed as a PyArrow dataset, which DuckDB's replacement
    scan resolves from the local variable name `pa_ds` — do not rename it.
    """
    table = DeltaTable(DELTA_PATH, storage_options=deltalake_storage_config)
    pa_ds = table.to_pyarrow_dataset()
    sql = """select * from pa_ds where language == 'en' and question_score > 5;"""
    return con.execute(sql).df()
# Benchmark matrix: (label, query runner, frame getter) for each
# engine x storage-format combination that shares the runner/getter shape.
_CASES = [
    ("daft-jsonl", run_daft_query_json, get_daft_jsonl),
    ("daft-parquet", run_daft_query, get_daft_parquet),
    ("daft-delta", run_daft_query, get_daft_delta),
    ("polars-jsonl", run_polars_query_json, get_pl_jsonl),
    ("polars-parquet", run_polars_query, get_pl_parquet),
    ("polars-delta", run_polars_query, get_pl_delta),
    ("pandas-jsonl", run_pandas_query_json, get_pandas_jsonl),
    ("pandas-parquet", run_pandas_query, get_pandas_parquet),
    ("pandas-delta", run_pandas_query, get_pandas_delta),
]

# Default-bind runner/getter so each lambda captures its own pair
# (late-binding closures would otherwise all use the last row).
queries = {
    label: (lambda runner=runner, getter=getter: runner(getter))
    for label, runner, getter in _CASES
}
# The DuckDB runners embed their own data access, so they go in directly.
queries["duckdb-jsonl"] = run_duckdb_query_json
queries["duckdb-parquet"] = run_duckdb_query_parquet
queries["duckdb-delta"] = run_duckdb_query_delta
TRIES = 3  # best-of-N runs per query, to smooth out network variance

for name, query in queries.items():
    err = None
    timings = []
    for _ in range(TRIES):
        # perf_counter is the right clock for durations (monotonic,
        # high resolution) — time.time can jump with wall-clock changes.
        start = time.perf_counter()
        try:
            query()
            timings.append(time.perf_counter() - start)
        except Exception as e:
            # Keep benchmarking the remaining tries/queries; remember the
            # last failure so it can be reported if every try failed.
            err = e
    if timings:
        # Report the fastest successful run.
        print(f"{name}: {min(timings)}s")
    else:
        # Include the query name so failures are attributable (the
        # original printed only the exception text).
        print(f"{name}: Error encountered: {err}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment