bobhaffner/pandas_io_performance.ipynb

## pandas_io_performance.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Testing some Pandas IO options with a fairly large dataset (~12 million rows) \n",
    "## Using the May 2016 csv file from the [NYC TLC Open Data Portal](http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:11:43.785531Z",
     "start_time": "2017-07-21T18:11:43.780915Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import dask.dataframe as dd\n",
    "import fastparquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:11:58.120182Z",
     "start_time": "2017-07-21T18:11:57.996739Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 3629936\r\n",
      "-rw-r--r--  1 bob  staff   1.7G Aug 11  2016 yellow_tripdata_2016-05.csv\r\n"
     ]
    }
   ],
   "source": [
    "! ls -lh data/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using Pandas' [read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:22:13.753919Z",
     "start_time": "2017-07-21T18:21:34.371620Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 36.3 s, sys: 2.92 s, total: 39.2 s\n",
      "Wall time: 39.4 s\n",
      "11,836,853 rows\n"
     ]
    }
   ],
   "source": [
    "%time pd_frame = pd.read_csv('data/yellow_tripdata_2016-05.csv')\n",
    "print ('{:,} rows'.format(len(pd_frame)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using dask's [read_csv](http://dask.pydata.org/en/latest/dataframe-api.html#dask.dataframe.read_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:24:51.597370Z",
     "start_time": "2017-07-21T18:24:35.114436Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 54 s, sys: 7 s, total: 1min\n",
      "Wall time: 16.5 s\n"
     ]
    }
   ],
   "source": [
    "%time dd_to_pd_frame = dd.read_csv('data/yellow_tripdata_2016-05.csv').compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using [fastparquet's](https://fastparquet.readthedocs.io/en/latest/index.html) [writing](https://fastparquet.readthedocs.io/en/latest/quickstart.html#writing) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:25:58.520399Z",
     "start_time": "2017-07-21T18:25:33.031776Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 23.7 s, sys: 1.62 s, total: 25.3 s\n",
      "Wall time: 25.5 s\n"
     ]
    }
   ],
   "source": [
    "%time fastparquet.write('data/yellow_tripdata_2016-05.parq', pd_frame, compression='SNAPPY')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:26:14.226555Z",
     "start_time": "2017-07-21T18:26:14.055281Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 4604176\r\n",
      "-rw-r--r--  1 bob  staff   1.7G Aug 11  2016 yellow_tripdata_2016-05.csv\r\n",
      "-rw-r--r--  1 bob  staff   476M Jul 21 13:25 yellow_tripdata_2016-05.parq\r\n"
     ]
    }
   ],
   "source": [
    "! ls -lhtr data/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using fastparquet's [reading](https://fastparquet.readthedocs.io/en/latest/quickstart.html#reading)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:28:00.550479Z",
     "start_time": "2017-07-21T18:27:44.707624Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12 s, sys: 2.82 s, total: 14.8 s\n",
      "Wall time: 15.8 s\n"
     ]
    }
   ],
   "source": [
    "%time parq_to_pd = fastparquet.ParquetFile('data/yellow_tripdata_2016-05.parq').to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using Pandas [to_hdf](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_hdf.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:27:35.266000Z",
     "start_time": "2017-07-21T18:27:24.154811Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 6.72 s, sys: 3.18 s, total: 9.89 s\n",
      "Wall time: 11.1 s\n"
     ]
    }
   ],
   "source": [
    "%time pd_frame.to_hdf('data/yellow_tripdata_2016-05.h5','tripdata')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:28:00.722204Z",
     "start_time": "2017-07-21T18:28:00.552419Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 8214616\r\n",
      "-rw-r--r--  1 bob  staff   1.7G Aug 11  2016 yellow_tripdata_2016-05.csv\r\n",
      "-rw-r--r--  1 bob  staff   476M Jul 21 13:25 yellow_tripdata_2016-05.parq\r\n",
      "-rw-r--r--  1 bob  staff   1.7G Jul 21 13:27 yellow_tripdata_2016-05.h5\r\n"
     ]
    }
   ],
   "source": [
    "! ls -lhtr data/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using Pandas [read_hdf](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_hdf.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-21T18:28:26.718348Z",
     "start_time": "2017-07-21T18:28:19.800973Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.89 s, sys: 2.77 s, total: 5.66 s\n",
      "Wall time: 6.91 s\n"
     ]
    }
   ],
   "source": [
    "%time hdf_to_pd_frame = pd.read_hdf('data/yellow_tripdata_2016-05.h5','tripdata')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Testing some Pandas IO options with a fairly large dataset (~12 million rows) \n",
	"## Using the May 2016 csv file from the [NYC TLC Open Data Portal](http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:11:43.785531Z",
	"start_time": "2017-07-21T18:11:43.780915Z"
	},
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import dask.dataframe as dd\n",
	"import fastparquet"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:11:58.120182Z",
	"start_time": "2017-07-21T18:11:57.996739Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"total 3629936\r\n",
	"-rw-r--r-- 1 bob staff 1.7G Aug 11 2016 yellow_tripdata_2016-05.csv\r\n"
	]
	}
	],
	"source": [
	"! ls -lh data/"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Using Pandas' [read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:22:13.753919Z",
	"start_time": "2017-07-21T18:21:34.371620Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 36.3 s, sys: 2.92 s, total: 39.2 s\n",
	"Wall time: 39.4 s\n",
	"11,836,853 rows\n"
	]
	}
	],
	"source": [
	"%time pd_frame = pd.read_csv('data/yellow_tripdata_2016-05.csv')\n",
	"print ('{:,} rows'.format(len(pd_frame)))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Using dask's [read_csv](http://dask.pydata.org/en/latest/dataframe-api.html#dask.dataframe.read_csv)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:24:51.597370Z",
	"start_time": "2017-07-21T18:24:35.114436Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 54 s, sys: 7 s, total: 1min\n",
	"Wall time: 16.5 s\n"
	]
	}
	],
	"source": [
	"%time dd_to_pd_frame = dd.read_csv('data/yellow_tripdata_2016-05.csv').compute()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Using [fastparquet's](https://fastparquet.readthedocs.io/en/latest/index.html) [writing](https://fastparquet.readthedocs.io/en/latest/quickstart.html#writing) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:25:58.520399Z",
	"start_time": "2017-07-21T18:25:33.031776Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 23.7 s, sys: 1.62 s, total: 25.3 s\n",
	"Wall time: 25.5 s\n"
	]
	}
	],
	"source": [
	"%time fastparquet.write('data/yellow_tripdata_2016-05.parq', pd_frame, compression='SNAPPY')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:26:14.226555Z",
	"start_time": "2017-07-21T18:26:14.055281Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"total 4604176\r\n",
	"-rw-r--r-- 1 bob staff 1.7G Aug 11 2016 yellow_tripdata_2016-05.csv\r\n",
	"-rw-r--r-- 1 bob staff 476M Jul 21 13:25 yellow_tripdata_2016-05.parq\r\n"
	]
	}
	],
	"source": [
	"! ls -lhtr data/"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Using fastparquet's [reading](https://fastparquet.readthedocs.io/en/latest/quickstart.html#reading)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:28:00.550479Z",
	"start_time": "2017-07-21T18:27:44.707624Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 12 s, sys: 2.82 s, total: 14.8 s\n",
	"Wall time: 15.8 s\n"
	]
	}
	],
	"source": [
	"%time parq_to_pd = fastparquet.ParquetFile('data/yellow_tripdata_2016-05.parq').to_pandas()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Using Pandas [to_hdf](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_hdf.html)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:27:35.266000Z",
	"start_time": "2017-07-21T18:27:24.154811Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 6.72 s, sys: 3.18 s, total: 9.89 s\n",
	"Wall time: 11.1 s\n"
	]
	}
	],
	"source": [
	"%time pd_frame.to_hdf('data/yellow_tripdata_2016-05.h5','tripdata')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:28:00.722204Z",
	"start_time": "2017-07-21T18:28:00.552419Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"total 8214616\r\n",
	"-rw-r--r-- 1 bob staff 1.7G Aug 11 2016 yellow_tripdata_2016-05.csv\r\n",
	"-rw-r--r-- 1 bob staff 476M Jul 21 13:25 yellow_tripdata_2016-05.parq\r\n",
	"-rw-r--r-- 1 bob staff 1.7G Jul 21 13:27 yellow_tripdata_2016-05.h5\r\n"
	]
	}
	],
	"source": [
	"! ls -lhtr data/"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Using Pandas [read_hdf](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_hdf.html)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2017-07-21T18:28:26.718348Z",
	"start_time": "2017-07-21T18:28:19.800973Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 2.89 s, sys: 2.77 s, total: 5.66 s\n",
	"Wall time: 6.91 s\n"
	]
	}
	],
	"source": [
	"%time hdf_to_pd_frame = pd.read_hdf('data/yellow_tripdata_2016-05.h5','tripdata')"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}