Last active
June 2, 2024 13:48
-
-
Save bobhaffner/cbba7e2ddb20c50de852b6db85e8f5d5 to your computer and use it in GitHub Desktop.
Pandas io performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Testing some Pandas IO options with a fairly large dataset (~12 million rows) \n", | |
"## Using the May 2016 csv file from the [NYC TLC Open Data Portal](http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:11:43.785531Z", | |
"start_time": "2017-07-21T18:11:43.780915Z" | |
}, | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import dask.dataframe as dd\n", | |
"import fastparquet" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:11:58.120182Z", | |
"start_time": "2017-07-21T18:11:57.996739Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"total 3629936\r\n", | |
"-rw-r--r-- 1 bob staff 1.7G Aug 11 2016 yellow_tripdata_2016-05.csv\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! ls -lh data/" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Using Pandas' [read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:22:13.753919Z", | |
"start_time": "2017-07-21T18:21:34.371620Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 36.3 s, sys: 2.92 s, total: 39.2 s\n", | |
"Wall time: 39.4 s\n", | |
"11,836,853 rows\n" | |
] | |
} | |
], | |
"source": [ | |
"%time pd_frame = pd.read_csv('data/yellow_tripdata_2016-05.csv')\n", | |
"print ('{:,} rows'.format(len(pd_frame)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Using dask's [read_csv](http://dask.pydata.org/en/latest/dataframe-api.html#dask.dataframe.read_csv)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:24:51.597370Z", | |
"start_time": "2017-07-21T18:24:35.114436Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 54 s, sys: 7 s, total: 1min\n", | |
"Wall time: 16.5 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time dd_to_pd_frame = dd.read_csv('data/yellow_tripdata_2016-05.csv').compute()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Using [fastparquet's](https://fastparquet.readthedocs.io/en/latest/index.html) [writing](https://fastparquet.readthedocs.io/en/latest/quickstart.html#writing) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:25:58.520399Z", | |
"start_time": "2017-07-21T18:25:33.031776Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 23.7 s, sys: 1.62 s, total: 25.3 s\n", | |
"Wall time: 25.5 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time fastparquet.write('data/yellow_tripdata_2016-05.parq', pd_frame, compression='SNAPPY')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:26:14.226555Z", | |
"start_time": "2017-07-21T18:26:14.055281Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"total 4604176\r\n", | |
"-rw-r--r-- 1 bob staff 1.7G Aug 11 2016 yellow_tripdata_2016-05.csv\r\n", | |
"-rw-r--r-- 1 bob staff 476M Jul 21 13:25 yellow_tripdata_2016-05.parq\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! ls -lhtr data/" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Using fastparquet's [reading](https://fastparquet.readthedocs.io/en/latest/quickstart.html#reading)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:28:00.550479Z", | |
"start_time": "2017-07-21T18:27:44.707624Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 12 s, sys: 2.82 s, total: 14.8 s\n", | |
"Wall time: 15.8 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time parq_to_pd = fastparquet.ParquetFile('data/yellow_tripdata_2016-05.parq').to_pandas()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Using Pandas [to_hdf](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_hdf.html)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:27:35.266000Z", | |
"start_time": "2017-07-21T18:27:24.154811Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 6.72 s, sys: 3.18 s, total: 9.89 s\n", | |
"Wall time: 11.1 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time pd_frame.to_hdf('data/yellow_tripdata_2016-05.h5','tripdata')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:28:00.722204Z", | |
"start_time": "2017-07-21T18:28:00.552419Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"total 8214616\r\n", | |
"-rw-r--r-- 1 bob staff 1.7G Aug 11 2016 yellow_tripdata_2016-05.csv\r\n", | |
"-rw-r--r-- 1 bob staff 476M Jul 21 13:25 yellow_tripdata_2016-05.parq\r\n", | |
"-rw-r--r-- 1 bob staff 1.7G Jul 21 13:27 yellow_tripdata_2016-05.h5\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! ls -lhtr data/" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Using Pandas [read_hdf](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_hdf.html)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-07-21T18:28:26.718348Z", | |
"start_time": "2017-07-21T18:28:19.800973Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2.89 s, sys: 2.77 s, total: 5.66 s\n", | |
"Wall time: 6.91 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time hdf_to_pd_frame = pd.read_hdf('data/yellow_tripdata_2016-05.h5','tripdata')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment