Skip to content

Instantly share code, notes, and snippets.

@alexkyllo
Created September 11, 2022 07:00
Show Gist options
  • Save alexkyllo/16276a0dcdec7e6ddff6d39821ec2730 to your computer and use it in GitHub Desktop.
Save alexkyllo/16276a0dcdec7e6ddff6d39821ec2730 to your computer and use it in GitHub Desktop.
Use PyPika to generate a SQL query on DuckDB over parquet files in cloud storage.
"""Use PyPika to generate a SQL query on DuckDB over parquet files in cloud storage."""
import adlfs
import duckdb
import polars as pl
from polars import col
from pyarrow import dataset as ds
from pypika import Query, Table
from pypika import analytics as an
# Create a filesystem representing an Azure Blob Storage account
fs = adlfs.AzureBlobFileSystem(
account_name="azureopendatastorage", sas_token="", container_name="mlsamples"
)
# Register it as a pyarrow dataset
dt = ds.dataset("mlsamples/diabetes", filesystem=fs)
# Register the pyarrow dataset in DuckDB as a view
conn = duckdb.connect(":memory:")
conn.register("diabetes", dt)
# Create a PyPika table expression
tbl = Table("diabetes")
# Build a query on the table and display the generated SQL
y_zscore_sex = (tbl.Y - an.Avg(tbl.Y).over(tbl.SEX)) / an.StdDev(tbl.Y).over(tbl.SEX)
q = Query.from_(tbl).select(tbl.star, y_zscore_sex.as_("Y_ZSCORE_SEX"))
q
# Execute the SQL on DuckDB and show the result as a Pandas DF
conn.execute(str(q)).fetch_df()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment