Skip to content

Instantly share code, notes, and snippets.

@mattppal
Created February 14, 2024 14:49
Show Gist options
  • Save mattppal/1e1eef95bf7cb29d1342566e5d2969dd to your computer and use it in GitHub Desktop.
Save mattppal/1e1eef95bf7cb29d1342566e5d2969dd to your computer and use it in GitHub Desktop.
Partitioned Parquet Demo - NYC Taxi Dataset 🚕
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "96362929",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"import duckdb\n",
"import polars as pl\n",
"\n",
"import pyarrow.parquet as pq\n",
"import pyarrow as pa\n",
"import pyarrow.dataset as ds\n",
"\n",
"import glob\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "53ea934d",
"metadata": {},
"outputs": [],
"source": [
"# https://mavenanalytics.io/data-playground?order=-fields.numberOfRecords\n",
"\n",
"dtypes = {\n",
" 'VendorID': pd.Int64Dtype(),\n",
" 'passenger_count': pd.Int64Dtype(),\n",
" 'trip_distance': float,\n",
" 'RatecodeID': pd.Int64Dtype(),\n",
" 'store_and_fwd_flag': str,\n",
" 'PULocationID': pd.Int64Dtype(),\n",
" 'DOLocationID': pd.Int64Dtype(),\n",
" 'payment_type': pd.Int64Dtype(),\n",
" 'fare_amount': float,\n",
" 'extra': float,\n",
" 'mta_tax': float,\n",
" 'tip_amount': float,\n",
" 'tolls_amount': float,\n",
" 'improvement_surcharge': float,\n",
" 'total_amount': float,\n",
" 'congestion_surcharge': float \n",
" }\n",
"\n",
"parse_dates = ['lpep_pickup_datetime', 'lpep_dropoff_datetime']\n",
"\n",
"all_files = glob.glob(os.path.join(\"./NYC_Taxi_Trips/taxi_trips\", \"*.csv\"))\n",
"\n",
"# taxi_df = pd.concat((pd.read_csv(f, dtype=dtypes, low_memory=False, parse_dates=parse_dates) for f in all_files), ignore_index=True)\n",
"\n",
"taxi_df = pd.read_csv(\"/NYC_Taxi_Trips/taxi_trips/2019_taxi_trips.csv\", dtype=dtypes, low_memory=False, parse_dates=parse_dates)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7071a7a4-db74-48db-8026-fa51843c60f5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6044050\n"
]
}
],
"source": [
"print(len(taxi_df))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "30dccbd6-6635-4fb0-b8b9-422007763ec3",
"metadata": {},
"outputs": [],
"source": [
"taxi_df['lpep_pickup_date'] = taxi_df['lpep_pickup_datetime'].dt.date"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4ca3e2f6",
"metadata": {},
"outputs": [],
"source": [
"conn = duckdb.connect()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e931a36e-8b86-49c5-a3b4-ff6e046f78f8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.71 s ± 50.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"conn.sql(\n",
" \"\"\"\n",
" COPY taxi_df TO 'duckdb_taxi_data' (FORMAT PARQUET, PARTITION_BY (lpep_pickup_date), OVERWRITE_OR_IGNORE 1);\n",
" \"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "38132e85",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"conn.close()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f32208d5",
"metadata": {},
"outputs": [],
"source": [
"pl_taxi_df = pl.from_pandas(taxi_df)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "92095f7b-1064-4c21-8cd3-0782c582c348",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10.6 s ± 247 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"pl_taxi_df.write_parquet(\n",
" \"polars_taxi_data\",\n",
" use_pyarrow=True,\n",
" pyarrow_options={\"partition_cols\": [\"lpep_pickup_date\"]},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2c63472d-9416-47e4-89ee-18412c984916",
"metadata": {},
"outputs": [],
"source": [
"del pl_taxi_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "011e3f6c-7d1c-4dda-813d-070dcecc7195",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"11.3 s ± 878 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"taxi_df.to_parquet(\n",
" \"pandas_taxi_data\",\n",
" partition_cols=[\"lpep_pickup_date\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "4b402e7a-fccf-44b0-aec4-ae1755314326",
"metadata": {},
"outputs": [],
"source": [
"taxi_table = pa.Table.from_pandas(taxi_df)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "538ce8d5-73e6-41c7-9ffc-492e8ad07b05",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"11.2 s ± 1.14 s per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"pq.write_to_dataset(taxi_table, root_path='pyarrow_taxi_data',\n",
" partition_cols=['lpep_pickup_date'])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "8ef94379-da00-41d7-99ff-0be0eed4b75a",
"metadata": {},
"outputs": [],
"source": [
"del taxi_table"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py3-default",
"language": "python",
"name": "pyenv_py3-default"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment