Skip to content

Instantly share code, notes, and snippets.

@mattppal
Created February 14, 2024 14:49
Show Gist options
  • Save mattppal/5f58a9f85af0efb0566568550fa7e9ec to your computer and use it in GitHub Desktop.
Save mattppal/5f58a9f85af0efb0566568550fa7e9ec to your computer and use it in GitHub Desktop.
Partitioned Parquet Demo - FDIC Failed Banks 🏦
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "96362929",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"import duckdb\n",
"import polars as pl\n",
"\n",
"import pyarrow.parquet as pq\n",
"import pyarrow as pa\n",
"import pyarrow.dataset as ds"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "53ea934d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Bank Name</th>\n",
" <th>City</th>\n",
" <th>State</th>\n",
" <th>Cert</th>\n",
" <th>Acquiring Institution</th>\n",
" <th>Closing Date</th>\n",
" <th>Fund</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Citizens Bank</td>\n",
" <td>Sac City</td>\n",
" <td>IA</td>\n",
" <td>8758</td>\n",
" <td>Iowa Trust &amp; Savings Bank</td>\n",
" <td>3-Nov-23</td>\n",
" <td>10545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Heartland Tri-State Bank</td>\n",
" <td>Elkhart</td>\n",
" <td>KS</td>\n",
" <td>25851</td>\n",
" <td>Dream First Bank, N.A.</td>\n",
" <td>28-Jul-23</td>\n",
" <td>10544</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>First Republic Bank</td>\n",
" <td>San Francisco</td>\n",
" <td>CA</td>\n",
" <td>59017</td>\n",
" <td>JPMorgan Chase Bank, N.A.</td>\n",
" <td>1-May-23</td>\n",
" <td>10543</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Signature Bank</td>\n",
" <td>New York</td>\n",
" <td>NY</td>\n",
" <td>57053</td>\n",
" <td>Flagstar Bank, N.A.</td>\n",
" <td>12-Mar-23</td>\n",
" <td>10540</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Silicon Valley Bank</td>\n",
" <td>Santa Clara</td>\n",
" <td>CA</td>\n",
" <td>24735</td>\n",
" <td>First–Citizens Bank &amp; Trust Company</td>\n",
" <td>10-Mar-23</td>\n",
" <td>10539</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Bank Name City State Cert \\\n",
"0 Citizens Bank Sac City IA 8758 \n",
"1 Heartland Tri-State Bank Elkhart KS 25851 \n",
"2 First Republic Bank San Francisco CA 59017 \n",
"3 Signature Bank New York NY 57053 \n",
"4 Silicon Valley Bank Santa Clara CA 24735 \n",
"\n",
" Acquiring Institution Closing Date Fund \n",
"0 Iowa Trust & Savings Bank 3-Nov-23 10545 \n",
"1 Dream First Bank, N.A. 28-Jul-23 10544 \n",
"2 JPMorgan Chase Bank, N.A. 1-May-23 10543 \n",
"3 Flagstar Bank, N.A. 12-Mar-23 10540 \n",
"4 First–Citizens Bank & Trust Company 10-Mar-23 10539 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = 'https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/banklist.csv'\n",
"\n",
"bank_df = pd.read_csv(url, encoding=\"windows-1251\")\n",
"\n",
"bank_df.columns = [c.strip() for c in bank_df.columns]\n",
"\n",
"bank_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4ca3e2f6",
"metadata": {},
"outputs": [],
"source": [
"conn = duckdb.connect()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e931a36e-8b86-49c5-a3b4-ff6e046f78f8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10.7 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"conn.sql(\n",
" \"\"\"\n",
" COPY bank_df TO 'duckdb_bank_data' (FORMAT PARQUET, PARTITION_BY (State), OVERWRITE_OR_IGNORE 1);\n",
" \"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "9c9a883d-6a07-43bf-9fa3-2cb0d103d8fc",
"metadata": {},
"outputs": [],
"source": [
"# conn.sql(\"SELECT * FROM read_parquet('duckdb_bank_data/**/*.parquet')\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "38132e85",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f32208d5",
"metadata": {},
"outputs": [],
"source": [
"pl_bank_df = pl.from_pandas(bank_df)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "92095f7b-1064-4c21-8cd3-0782c582c348",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"23.1 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"pl_bank_df.write_parquet(\n",
" \"polars_bank_data\",\n",
" use_pyarrow=True,\n",
" pyarrow_options={\"partition_cols\": [\"State\"]},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "37ae5977",
"metadata": {},
"outputs": [],
"source": [
"# pl.scan_parquet(\"polars_bank_data/**/*.parquet\").collect()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "011e3f6c-7d1c-4dda-813d-070dcecc7195",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"27.3 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"bank_df.to_parquet(\n",
" \"pandas_bank_data\",\n",
" partition_cols=[\"State\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "feea7512-7285-4930-9cbf-6112c12f1b16",
"metadata": {},
"outputs": [],
"source": [
"# conn.sql(\"SELECT * FROM read_parquet('pandas_bank_data/**/*.parquet')\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4b402e7a-fccf-44b0-aec4-ae1755314326",
"metadata": {},
"outputs": [],
"source": [
"bank_table = pa.Table.from_pandas(bank_df)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "538ce8d5-73e6-41c7-9ffc-492e8ad07b05",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"25.8 ms ± 1.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"pq.write_to_dataset(bank_table, root_path='pyarrow_bank_data',\n",
" partition_cols=['State'])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "53a9e664-a8cf-49b1-9cb1-f5686e592484",
"metadata": {},
"outputs": [],
"source": [
"table = pq.read_table('pyarrow_bank_data')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py3-default",
"language": "python",
"name": "pyenv_py3-default"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment