-
-
Save mattppal/5f58a9f85af0efb0566568550fa7e9ec to your computer and use it in GitHub Desktop.
Partitioned Parquet Demo - FDIC Failed Banks 🏦
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "96362929", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"import duckdb\n", | |
"import polars as pl\n", | |
"\n", | |
"import pyarrow.parquet as pq\n", | |
"import pyarrow as pa\n", | |
"import pyarrow.dataset as ds" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "53ea934d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Bank Name</th>\n", | |
" <th>City</th>\n", | |
" <th>State</th>\n", | |
" <th>Cert</th>\n", | |
" <th>Acquiring Institution</th>\n", | |
" <th>Closing Date</th>\n", | |
" <th>Fund</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Citizens Bank</td>\n", | |
" <td>Sac City</td>\n", | |
" <td>IA</td>\n", | |
" <td>8758</td>\n", | |
" <td>Iowa Trust & Savings Bank</td>\n", | |
" <td>3-Nov-23</td>\n", | |
" <td>10545</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Heartland Tri-State Bank</td>\n", | |
" <td>Elkhart</td>\n", | |
" <td>KS</td>\n", | |
" <td>25851</td>\n", | |
" <td>Dream First Bank, N.A.</td>\n", | |
" <td>28-Jul-23</td>\n", | |
" <td>10544</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>First Republic Bank</td>\n", | |
" <td>San Francisco</td>\n", | |
" <td>CA</td>\n", | |
" <td>59017</td>\n", | |
" <td>JPMorgan Chase Bank, N.A.</td>\n", | |
" <td>1-May-23</td>\n", | |
" <td>10543</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Signature Bank</td>\n", | |
" <td>New York</td>\n", | |
" <td>NY</td>\n", | |
" <td>57053</td>\n", | |
" <td>Flagstar Bank, N.A.</td>\n", | |
" <td>12-Mar-23</td>\n", | |
" <td>10540</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Silicon Valley Bank</td>\n", | |
" <td>Santa Clara</td>\n", | |
" <td>CA</td>\n", | |
" <td>24735</td>\n", | |
" <td>First–Citizens Bank & Trust Company</td>\n", | |
" <td>10-Mar-23</td>\n", | |
" <td>10539</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Bank Name City State Cert \\\n", | |
"0 Citizens Bank Sac City IA 8758 \n", | |
"1 Heartland Tri-State Bank Elkhart KS 25851 \n", | |
"2 First Republic Bank San Francisco CA 59017 \n", | |
"3 Signature Bank New York NY 57053 \n", | |
"4 Silicon Valley Bank Santa Clara CA 24735 \n", | |
"\n", | |
" Acquiring Institution Closing Date Fund \n", | |
"0 Iowa Trust & Savings Bank 3-Nov-23 10545 \n", | |
"1 Dream First Bank, N.A. 28-Jul-23 10544 \n", | |
"2 JPMorgan Chase Bank, N.A. 1-May-23 10543 \n", | |
"3 Flagstar Bank, N.A. 12-Mar-23 10540 \n", | |
"4 First–Citizens Bank & Trust Company 10-Mar-23 10539 " | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"url = 'https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/banklist.csv'\n", | |
"\n", | |
"bank_df = pd.read_csv(url, encoding=\"windows-1251\")\n", | |
"\n", | |
"bank_df.columns = [c.strip() for c in bank_df.columns]\n", | |
"\n", | |
"bank_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "4ca3e2f6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"conn = duckdb.connect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "e931a36e-8b86-49c5-a3b4-ff6e046f78f8", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10.7 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"conn.sql(\n", | |
" \"\"\"\n", | |
" COPY bank_df TO 'duckdb_bank_data' (FORMAT PARQUET, PARTITION_BY (State), OVERWRITE_OR_IGNORE 1);\n", | |
" \"\"\"\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "9c9a883d-6a07-43bf-9fa3-2cb0d103d8fc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# conn.sql(\"SELECT * FROM read_parquet('duckdb_bank_data/**/*.parquet')\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "38132e85", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"conn.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "f32208d5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pl_bank_df = pl.from_pandas(bank_df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "92095f7b-1064-4c21-8cd3-0782c582c348", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"23.1 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"pl_bank_df.write_parquet(\n", | |
" \"polars_bank_data\",\n", | |
" use_pyarrow=True,\n", | |
" pyarrow_options={\"partition_cols\": [\"State\"]},\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "37ae5977", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# pl.scan_parquet(\"polars_bank_data/**/*.parquet\").collect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "011e3f6c-7d1c-4dda-813d-070dcecc7195", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"27.3 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"bank_df.to_parquet(\n", | |
" \"pandas_bank_data\",\n", | |
" partition_cols=[\"State\"],\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "feea7512-7285-4930-9cbf-6112c12f1b16", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# conn.sql(\"SELECT * FROM read_parquet('pandas_bank_data/**/*.parquet')\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "4b402e7a-fccf-44b0-aec4-ae1755314326", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"bank_table = pa.Table.from_pandas(bank_df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "538ce8d5-73e6-41c7-9ffc-492e8ad07b05", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"25.8 ms ± 1.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"pq.write_to_dataset(bank_table, root_path='pyarrow_bank_data',\n", | |
" partition_cols=['State'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "53a9e664-a8cf-49b1-9cb1-f5686e592484", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"table = pq.read_table('pyarrow_bank_data')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "py3-default", | |
"language": "python", | |
"name": "pyenv_py3-default" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment