Created
September 11, 2022 07:00
-
-
Save alexkyllo/16276a0dcdec7e6ddff6d39821ec2730 to your computer and use it in GitHub Desktop.
Use PyPika to generate a SQL query on DuckDB over parquet files in cloud storage.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Use PyPika to generate a SQL query on DuckDB over parquet files in cloud storage.""" | |
import adlfs | |
import duckdb | |
import polars as pl | |
from polars import col | |
from pyarrow import dataset as ds | |
from pypika import Query, Table | |
from pypika import analytics as an | |
# Create a filesystem representing an Azure Blob Storage account | |
fs = adlfs.AzureBlobFileSystem( | |
account_name="azureopendatastorage", sas_token="", container_name="mlsamples" | |
) | |
# Register it as a pyarrow dataset | |
dt = ds.dataset("mlsamples/diabetes", filesystem=fs) | |
# Register the pyarrow dataset in DuckDB as a view | |
conn = duckdb.connect(":memory:") | |
conn.register("diabetes", dt) | |
# Create a PyPika table expression | |
tbl = Table("diabetes") | |
# Build a query on the table and display the generated SQL | |
y_zscore_sex = (tbl.Y - an.Avg(tbl.Y).over(tbl.SEX)) / an.StdDev(tbl.Y).over(tbl.SEX) | |
q = Query.from_(tbl).select(tbl.star, y_zscore_sex.as_("Y_ZSCORE_SEX")) | |
q | |
# Execute the SQL on DuckDB and show the result as a Pandas DF | |
conn.execute(str(q)).fetch_df() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment