-
-
Save wesm/0cb5531b1c2e346a0007 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import numpy as np" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Test data generation" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>group</th>\n", | |
| " <th>data0</th>\n", | |
| " <th>data1</th>\n", | |
| " <th>data2</th>\n", | |
| " <th>data3</th>\n", | |
| " <th>data4</th>\n", | |
| " <th>data5</th>\n", | |
| " <th>data6</th>\n", | |
| " <th>data7</th>\n", | |
| " <th>data8</th>\n", | |
| " <th>data9</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>0.703424</td>\n", | |
| " <td>-1.536075</td>\n", | |
| " <td>-0.446158</td>\n", | |
| " <td>0.455960</td>\n", | |
| " <td>-0.556396</td>\n", | |
| " <td>1.218725</td>\n", | |
| " <td>1.969005</td>\n", | |
| " <td>-0.042579</td>\n", | |
| " <td>0.605303</td>\n", | |
| " <td>2.927195</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-1.289577</td>\n", | |
| " <td>0.421690</td>\n", | |
| " <td>1.198543</td>\n", | |
| " <td>-0.910678</td>\n", | |
| " <td>-1.103254</td>\n", | |
| " <td>-1.009867</td>\n", | |
| " <td>-0.444909</td>\n", | |
| " <td>1.100392</td>\n", | |
| " <td>0.234861</td>\n", | |
| " <td>-1.405328</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-0.703413</td>\n", | |
| " <td>0.191029</td>\n", | |
| " <td>-0.475114</td>\n", | |
| " <td>-0.882956</td>\n", | |
| " <td>0.985129</td>\n", | |
| " <td>1.133779</td>\n", | |
| " <td>-0.925346</td>\n", | |
| " <td>-0.412362</td>\n", | |
| " <td>0.003123</td>\n", | |
| " <td>1.309303</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>0.658299</td>\n", | |
| " <td>-1.649641</td>\n", | |
| " <td>1.261421</td>\n", | |
| " <td>-0.167431</td>\n", | |
| " <td>-0.070937</td>\n", | |
| " <td>-0.280642</td>\n", | |
| " <td>1.333661</td>\n", | |
| " <td>1.174445</td>\n", | |
| " <td>0.686412</td>\n", | |
| " <td>-1.308651</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-0.594049</td>\n", | |
| " <td>1.683654</td>\n", | |
| " <td>-0.935114</td>\n", | |
| " <td>-1.750663</td>\n", | |
| " <td>0.216827</td>\n", | |
| " <td>0.284704</td>\n", | |
| " <td>-0.647471</td>\n", | |
| " <td>0.698593</td>\n", | |
| " <td>1.379473</td>\n", | |
| " <td>-0.286060</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " group data0 data1 data2 data3 data4 data5 \\\n", | |
| "0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n", | |
| "0 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n", | |
| "0 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n", | |
| "0 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n", | |
| "0 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n", | |
| "\n", | |
| " data6 data7 data8 data9 \n", | |
| "0 1.969005 -0.042579 0.605303 2.927195 \n", | |
| "0 -0.444909 1.100392 0.234861 -1.405328 \n", | |
| "0 -0.925346 -0.412362 0.003123 1.309303 \n", | |
| "0 1.333661 1.174445 0.686412 -1.308651 \n", | |
| "0 -0.647471 0.698593 1.379473 -0.286060 " | |
| ] | |
| }, | |
| "execution_count": 42, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# 1mm rows for now\n", | |
| "nrows = 1000000\n", | |
| "\n", | |
| "ndata_columns = 10\n", | |
| "\n", | |
| "# Generate some string categories\n", | |
| "ncategories = 10\n", | |
| "\n", | |
| "categories = [pd.util.testing.rands(10) for _ in range(ncategories)]\n", | |
| "cats = pd.Series(categories).repeat(nrows / ncategories)\n", | |
| "\n", | |
| "data = {\n", | |
| " 'group': cats\n", | |
| "}\n", | |
| "column_names = ['group']\n", | |
| "\n", | |
| "# Generate some double data\n", | |
| "for i in range(ndata_columns):\n", | |
| " name = 'data{}'.format(i)\n", | |
| " data[name] = np.random.randn(nrows)\n", | |
| " column_names.append(name)\n", | |
| "\n", | |
| "df = pd.DataFrame(data, columns=column_names)\n", | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 47, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 204 ms, sys: 140 ms, total: 344 ms\n", | |
| "Wall time: 348 ms\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>data0</th>\n", | |
| " <th>data1</th>\n", | |
| " <th>data2</th>\n", | |
| " <th>data3</th>\n", | |
| " <th>data4</th>\n", | |
| " <th>data5</th>\n", | |
| " <th>data6</th>\n", | |
| " <th>data7</th>\n", | |
| " <th>data8</th>\n", | |
| " <th>data9</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>group</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>4CsrjKcKbo</th>\n", | |
| " <td>11.831979</td>\n", | |
| " <td>136.187324</td>\n", | |
| " <td>-295.388315</td>\n", | |
| " <td>142.704850</td>\n", | |
| " <td>-41.881516</td>\n", | |
| " <td>-29.665339</td>\n", | |
| " <td>-207.635804</td>\n", | |
| " <td>-155.094945</td>\n", | |
| " <td>105.283159</td>\n", | |
| " <td>767.466670</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>73VX4pdH4Y</th>\n", | |
| " <td>175.005549</td>\n", | |
| " <td>547.486156</td>\n", | |
| " <td>199.117899</td>\n", | |
| " <td>548.247042</td>\n", | |
| " <td>225.798938</td>\n", | |
| " <td>-452.535554</td>\n", | |
| " <td>-500.471072</td>\n", | |
| " <td>424.562159</td>\n", | |
| " <td>132.303795</td>\n", | |
| " <td>566.041430</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>BmLu4cT6uF</th>\n", | |
| " <td>172.712552</td>\n", | |
| " <td>169.204947</td>\n", | |
| " <td>569.918196</td>\n", | |
| " <td>-260.546673</td>\n", | |
| " <td>510.197316</td>\n", | |
| " <td>-60.677790</td>\n", | |
| " <td>-207.590026</td>\n", | |
| " <td>-133.377280</td>\n", | |
| " <td>185.689427</td>\n", | |
| " <td>876.356624</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>HSvLJoc15E</th>\n", | |
| " <td>-511.972646</td>\n", | |
| " <td>-56.881369</td>\n", | |
| " <td>-251.202024</td>\n", | |
| " <td>251.213307</td>\n", | |
| " <td>94.269355</td>\n", | |
| " <td>47.596050</td>\n", | |
| " <td>-60.388253</td>\n", | |
| " <td>-400.746563</td>\n", | |
| " <td>238.092326</td>\n", | |
| " <td>-209.154316</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>J5Yd8irBnb</th>\n", | |
| " <td>-5.573153</td>\n", | |
| " <td>-453.530304</td>\n", | |
| " <td>513.811162</td>\n", | |
| " <td>286.185040</td>\n", | |
| " <td>855.478154</td>\n", | |
| " <td>-178.012073</td>\n", | |
| " <td>539.009204</td>\n", | |
| " <td>16.243304</td>\n", | |
| " <td>14.952687</td>\n", | |
| " <td>40.837467</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>ZmDUBhNUOy</th>\n", | |
| " <td>-246.333214</td>\n", | |
| " <td>215.914220</td>\n", | |
| " <td>-280.593283</td>\n", | |
| " <td>-90.840335</td>\n", | |
| " <td>27.458542</td>\n", | |
| " <td>-152.737684</td>\n", | |
| " <td>454.597280</td>\n", | |
| " <td>77.648862</td>\n", | |
| " <td>99.061342</td>\n", | |
| " <td>147.739728</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>lK4xrUXWN0</th>\n", | |
| " <td>-244.632800</td>\n", | |
| " <td>-356.878053</td>\n", | |
| " <td>-433.656816</td>\n", | |
| " <td>208.128759</td>\n", | |
| " <td>414.632423</td>\n", | |
| " <td>-411.897845</td>\n", | |
| " <td>449.079543</td>\n", | |
| " <td>482.190346</td>\n", | |
| " <td>-411.455465</td>\n", | |
| " <td>245.294481</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>pBI24n2F3Q</th>\n", | |
| " <td>637.710927</td>\n", | |
| " <td>-18.689120</td>\n", | |
| " <td>-480.028704</td>\n", | |
| " <td>22.972283</td>\n", | |
| " <td>441.623681</td>\n", | |
| " <td>-696.007650</td>\n", | |
| " <td>-104.017479</td>\n", | |
| " <td>8.333586</td>\n", | |
| " <td>-138.162617</td>\n", | |
| " <td>-283.277042</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>rBbN5Q4Jyl</th>\n", | |
| " <td>-409.202466</td>\n", | |
| " <td>50.410141</td>\n", | |
| " <td>128.908498</td>\n", | |
| " <td>-71.238216</td>\n", | |
| " <td>137.567974</td>\n", | |
| " <td>-173.356820</td>\n", | |
| " <td>141.111654</td>\n", | |
| " <td>115.265187</td>\n", | |
| " <td>-481.667415</td>\n", | |
| " <td>-264.309207</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>vzHLsD6Yft</th>\n", | |
| " <td>-150.564745</td>\n", | |
| " <td>-123.397999</td>\n", | |
| " <td>-52.350541</td>\n", | |
| " <td>-487.816878</td>\n", | |
| " <td>-363.032239</td>\n", | |
| " <td>46.186671</td>\n", | |
| " <td>-592.353868</td>\n", | |
| " <td>437.938846</td>\n", | |
| " <td>-34.131335</td>\n", | |
| " <td>-199.617239</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " data0 data1 data2 data3 data4 \\\n", | |
| "group \n", | |
| "4CsrjKcKbo 11.831979 136.187324 -295.388315 142.704850 -41.881516 \n", | |
| "73VX4pdH4Y 175.005549 547.486156 199.117899 548.247042 225.798938 \n", | |
| "BmLu4cT6uF 172.712552 169.204947 569.918196 -260.546673 510.197316 \n", | |
| "HSvLJoc15E -511.972646 -56.881369 -251.202024 251.213307 94.269355 \n", | |
| "J5Yd8irBnb -5.573153 -453.530304 513.811162 286.185040 855.478154 \n", | |
| "ZmDUBhNUOy -246.333214 215.914220 -280.593283 -90.840335 27.458542 \n", | |
| "lK4xrUXWN0 -244.632800 -356.878053 -433.656816 208.128759 414.632423 \n", | |
| "pBI24n2F3Q 637.710927 -18.689120 -480.028704 22.972283 441.623681 \n", | |
| "rBbN5Q4Jyl -409.202466 50.410141 128.908498 -71.238216 137.567974 \n", | |
| "vzHLsD6Yft -150.564745 -123.397999 -52.350541 -487.816878 -363.032239 \n", | |
| "\n", | |
| " data5 data6 data7 data8 data9 \n", | |
| "group \n", | |
| "4CsrjKcKbo -29.665339 -207.635804 -155.094945 105.283159 767.466670 \n", | |
| "73VX4pdH4Y -452.535554 -500.471072 424.562159 132.303795 566.041430 \n", | |
| "BmLu4cT6uF -60.677790 -207.590026 -133.377280 185.689427 876.356624 \n", | |
| "HSvLJoc15E 47.596050 -60.388253 -400.746563 238.092326 -209.154316 \n", | |
| "J5Yd8irBnb -178.012073 539.009204 16.243304 14.952687 40.837467 \n", | |
| "ZmDUBhNUOy -152.737684 454.597280 77.648862 99.061342 147.739728 \n", | |
| "lK4xrUXWN0 -411.897845 449.079543 482.190346 -411.455465 245.294481 \n", | |
| "pBI24n2F3Q -696.007650 -104.017479 8.333586 -138.162617 -283.277042 \n", | |
| "rBbN5Q4Jyl -173.356820 141.111654 115.265187 -481.667415 -264.309207 \n", | |
| "vzHLsD6Yft 46.186671 -592.353868 437.938846 -34.131335 -199.617239 " | |
| ] | |
| }, | |
| "execution_count": 47, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Aggregate 1mm row table with 10 groups\n", | |
| "%time df.groupby('group').sum()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "219.23547503591956" | |
| ] | |
| }, | |
| "execution_count": 51, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "speed = 80000000 / 0.348 / 1024**2\n", | |
| "speed" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "**On my little Lenovo X230, pandas aggregates this data at approximately 220 MB/second, single-threaded.**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Speed to read this data out of CSV and HDF5" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 52, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Save without header for easier ingest to Parquet later\n", | |
| "df.to_csv('test_dataset.csv', index=False, header=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 53, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "total 280616\r\n", | |
| "-rw-rw-r-- 1 wesm wesm 24710 Feb 21 12:53 Explorations.ipynb\r\n", | |
| "-rw-rw-r-- 1 wesm wesm 207312465 Feb 21 12:53 test_dataset.csv\r\n", | |
| "-rw-rw-r-- 1 wesm wesm 80007192 Feb 21 12:27 testing.hdf5\r\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!ls -l" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "pd.read_csv?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 2.24 s, sys: 180 ms, total: 2.42 s\n", | |
| "Wall time: 2.43 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%time df = pd.read_csv('test_dataset.csv', header=None)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CSV parse performance: 85.1851851852 MB/second\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "speed = 207 / 2.43\n", | |
| "print(\"CSV parse performance: {0} MB/second\".format(speed))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "However, the resident in-memory size of the data is only about 90MB. So let's keep this in mind." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CSV parse performance (to-in-memory data): ~37.04 MB/second\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "speed = 90 / 2.43\n", | |
| "print(\"CSV parse performance (to-in-memory data): ~%.2f MB/second\" % speed)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "hdf_store = pd.HDFStore('testing.hdf5')\n", | |
| "hdf_store['df'] = df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 60 ms, sys: 56 ms, total: 116 ms\n", | |
| "Wall time: 117 ms\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%time df3 = hdf_store['df']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 61, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>group</th>\n", | |
| " <th>data0</th>\n", | |
| " <th>data1</th>\n", | |
| " <th>data2</th>\n", | |
| " <th>data3</th>\n", | |
| " <th>data4</th>\n", | |
| " <th>data5</th>\n", | |
| " <th>data6</th>\n", | |
| " <th>data7</th>\n", | |
| " <th>data8</th>\n", | |
| " <th>data9</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>0.703424</td>\n", | |
| " <td>-1.536075</td>\n", | |
| " <td>-0.446158</td>\n", | |
| " <td>0.455960</td>\n", | |
| " <td>-0.556396</td>\n", | |
| " <td>1.218725</td>\n", | |
| " <td>1.969005</td>\n", | |
| " <td>-0.042579</td>\n", | |
| " <td>0.605303</td>\n", | |
| " <td>2.927195</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-1.289577</td>\n", | |
| " <td>0.421690</td>\n", | |
| " <td>1.198543</td>\n", | |
| " <td>-0.910678</td>\n", | |
| " <td>-1.103254</td>\n", | |
| " <td>-1.009867</td>\n", | |
| " <td>-0.444909</td>\n", | |
| " <td>1.100392</td>\n", | |
| " <td>0.234861</td>\n", | |
| " <td>-1.405328</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-0.703413</td>\n", | |
| " <td>0.191029</td>\n", | |
| " <td>-0.475114</td>\n", | |
| " <td>-0.882956</td>\n", | |
| " <td>0.985129</td>\n", | |
| " <td>1.133779</td>\n", | |
| " <td>-0.925346</td>\n", | |
| " <td>-0.412362</td>\n", | |
| " <td>0.003123</td>\n", | |
| " <td>1.309303</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>0.658299</td>\n", | |
| " <td>-1.649641</td>\n", | |
| " <td>1.261421</td>\n", | |
| " <td>-0.167431</td>\n", | |
| " <td>-0.070937</td>\n", | |
| " <td>-0.280642</td>\n", | |
| " <td>1.333661</td>\n", | |
| " <td>1.174445</td>\n", | |
| " <td>0.686412</td>\n", | |
| " <td>-1.308651</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-0.594049</td>\n", | |
| " <td>1.683654</td>\n", | |
| " <td>-0.935114</td>\n", | |
| " <td>-1.750663</td>\n", | |
| " <td>0.216827</td>\n", | |
| " <td>0.284704</td>\n", | |
| " <td>-0.647471</td>\n", | |
| " <td>0.698593</td>\n", | |
| " <td>1.379473</td>\n", | |
| " <td>-0.286060</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " group data0 data1 data2 data3 data4 data5 \\\n", | |
| "0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n", | |
| "0 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n", | |
| "0 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n", | |
| "0 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n", | |
| "0 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n", | |
| "\n", | |
| " data6 data7 data8 data9 \n", | |
| "0 1.969005 -0.042579 0.605303 2.927195 \n", | |
| "0 -0.444909 1.100392 0.234861 -1.405328 \n", | |
| "0 -0.925346 -0.412362 0.003123 1.309303 \n", | |
| "0 1.333661 1.174445 0.686412 -1.308651 \n", | |
| "0 -0.647471 0.698593 1.379473 -0.286060 " | |
| ] | |
| }, | |
| "execution_count": 61, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df3.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 62, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CSV parse performance (to-in-memory data): ~737.70 MB/second\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "speed = 90 / 0.122\n", | |
| "print(\"CSV parse performance (to-in-memory data): ~%.2f MB/second\" % speed)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Yep, **pandas HDF5 read performance is ridiculous, almost 20 times faster**. Binary columnar data storage makes an extraordinary difference." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## HiveServer2 (Hive, Impala) to pandas" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Impala and HDFS are running locally on my laptop, so there is no network bandwidth issue here. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import ibis\n", | |
| "ibis.options.verbose = True\n", | |
| "data_dir = '/hs2-perf/csv-test'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "SHOW DATABASES LIKE '__ibis_tmp'\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "hdfs = ibis.hdfs_connect(port=5070)\n", | |
| "con = ibis.impala.connect(hdfs_client=hdfs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 81, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "hdfs.mkdir(data_dir)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 83, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'/hs2-perf/csv-test/test_dataset.csv'" | |
| ] | |
| }, | |
| "execution_count": 83, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "hdfs.put('/hs2-perf/csv-test/', 'test_dataset.csv')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 84, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['test_dataset.csv']" | |
| ] | |
| }, | |
| "execution_count": 84, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "hdfs.ls('/hs2-perf/csv-test')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "ibis.Schema { \n", | |
| " group string\n", | |
| " data0 double\n", | |
| " data1 double\n", | |
| " data2 double\n", | |
| " data3 double\n", | |
| " data4 double\n", | |
| " data5 double\n", | |
| " data6 double\n", | |
| " data7 double\n", | |
| " data8 double\n", | |
| " data9 double\n", | |
| "}" | |
| ] | |
| }, | |
| "execution_count": 29, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from ibis.impala.pandas_interop import pandas_to_ibis_schema\n", | |
| "df.columns = ['group'] + ['data{}'.format(i) for i in range(10)]\n", | |
| "schema = pandas_to_ibis_schema(df)\n", | |
| "schema" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#con.create_database('hs2_perf_test')\n", | |
| "db = con.database('hs2_perf_test')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# csv_table = con.delimited_file(data_dir, schema)\n", | |
| "\n", | |
| "# Working around IMPALA-2069\n", | |
| "query = \"\"\"\n", | |
| "CREATE EXTERNAL TABLE hs2_perf_test.csv_table\n", | |
| "(`group` varchar(10),\n", | |
| " `data0` double,\n", | |
| " `data1` double,\n", | |
| " `data2` double,\n", | |
| " `data3` double,\n", | |
| " `data4` double,\n", | |
| " `data5` double,\n", | |
| " `data6` double,\n", | |
| " `data7` double,\n", | |
| " `data8` double,\n", | |
| " `data9` double)\n", | |
| "ROW FORMAT DELIMITED\n", | |
| "FIELDS TERMINATED BY ','\n", | |
| "LOCATION '/hs2-perf/csv-test'\n", | |
| "\"\"\"\n", | |
| "import impala\n", | |
| "impyla_con = impala.dbapi.connect()\n", | |
| "cur = impyla_con.cursor()\n", | |
| "cur.execute(query)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "DESCRIBE hs2_perf_test.`csv_table`\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "csv_table = db.csv_table" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "SELECT *\n", | |
| "FROM hs2_perf_test.`csv_table`\n", | |
| "LIMIT 5\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>group</th>\n", | |
| " <th>data0</th>\n", | |
| " <th>data1</th>\n", | |
| " <th>data2</th>\n", | |
| " <th>data3</th>\n", | |
| " <th>data4</th>\n", | |
| " <th>data5</th>\n", | |
| " <th>data6</th>\n", | |
| " <th>data7</th>\n", | |
| " <th>data8</th>\n", | |
| " <th>data9</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>0.703424</td>\n", | |
| " <td>-1.536075</td>\n", | |
| " <td>-0.446158</td>\n", | |
| " <td>0.455960</td>\n", | |
| " <td>-0.556396</td>\n", | |
| " <td>1.218725</td>\n", | |
| " <td>1.969005</td>\n", | |
| " <td>-0.042579</td>\n", | |
| " <td>0.605303</td>\n", | |
| " <td>2.927195</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-1.289577</td>\n", | |
| " <td>0.421690</td>\n", | |
| " <td>1.198543</td>\n", | |
| " <td>-0.910678</td>\n", | |
| " <td>-1.103254</td>\n", | |
| " <td>-1.009867</td>\n", | |
| " <td>-0.444909</td>\n", | |
| " <td>1.100392</td>\n", | |
| " <td>0.234861</td>\n", | |
| " <td>-1.405328</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-0.703413</td>\n", | |
| " <td>0.191029</td>\n", | |
| " <td>-0.475114</td>\n", | |
| " <td>-0.882956</td>\n", | |
| " <td>0.985129</td>\n", | |
| " <td>1.133779</td>\n", | |
| " <td>-0.925346</td>\n", | |
| " <td>-0.412362</td>\n", | |
| " <td>0.003123</td>\n", | |
| " <td>1.309303</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>0.658299</td>\n", | |
| " <td>-1.649641</td>\n", | |
| " <td>1.261421</td>\n", | |
| " <td>-0.167431</td>\n", | |
| " <td>-0.070937</td>\n", | |
| " <td>-0.280642</td>\n", | |
| " <td>1.333661</td>\n", | |
| " <td>1.174445</td>\n", | |
| " <td>0.686412</td>\n", | |
| " <td>-1.308651</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-0.594049</td>\n", | |
| " <td>1.683654</td>\n", | |
| " <td>-0.935114</td>\n", | |
| " <td>-1.750663</td>\n", | |
| " <td>0.216827</td>\n", | |
| " <td>0.284704</td>\n", | |
| " <td>-0.647471</td>\n", | |
| " <td>0.698593</td>\n", | |
| " <td>1.379473</td>\n", | |
| " <td>-0.286060</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " group data0 data1 data2 data3 data4 data5 \\\n", | |
| "0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n", | |
| "1 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n", | |
| "2 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n", | |
| "3 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n", | |
| "4 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n", | |
| "\n", | |
| " data6 data7 data8 data9 \n", | |
| "0 1.969005 -0.042579 0.605303 2.927195 \n", | |
| "1 -0.444909 1.100392 0.234861 -1.405328 \n", | |
| "2 -0.925346 -0.412362 0.003123 1.309303 \n", | |
| "3 1.333661 1.174445 0.686412 -1.308651 \n", | |
| "4 -0.647471 0.698593 1.379473 -0.286060 " | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "csv_table[:5].execute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "DESCRIBE hs2_perf_test.`csv_as_parquet`\n", | |
| "DROP TABLE hs2_perf_test.`csv_as_parquet`\n", | |
| "CREATE TABLE hs2_perf_test.`csv_as_parquet`\n", | |
| "STORED AS PARQUET\n", | |
| "AS\n", | |
| "SELECT *\n", | |
| "FROM hs2_perf_test.`csv_table`\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "db.csv_as_parquet.drop()\n", | |
| "db.create_table('csv_as_parquet', csv_table, format='parquet')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "DESCRIBE hs2_perf_test.`csv_as_parquet`\n", | |
| "SELECT *\n", | |
| "FROM hs2_perf_test.`csv_as_parquet`\n", | |
| "LIMIT 5\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>group</th>\n", | |
| " <th>data0</th>\n", | |
| " <th>data1</th>\n", | |
| " <th>data2</th>\n", | |
| " <th>data3</th>\n", | |
| " <th>data4</th>\n", | |
| " <th>data5</th>\n", | |
| " <th>data6</th>\n", | |
| " <th>data7</th>\n", | |
| " <th>data8</th>\n", | |
| " <th>data9</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>0.703424</td>\n", | |
| " <td>-1.536075</td>\n", | |
| " <td>-0.446158</td>\n", | |
| " <td>0.455960</td>\n", | |
| " <td>-0.556396</td>\n", | |
| " <td>1.218725</td>\n", | |
| " <td>1.969005</td>\n", | |
| " <td>-0.042579</td>\n", | |
| " <td>0.605303</td>\n", | |
| " <td>2.927195</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-1.289577</td>\n", | |
| " <td>0.421690</td>\n", | |
| " <td>1.198543</td>\n", | |
| " <td>-0.910678</td>\n", | |
| " <td>-1.103254</td>\n", | |
| " <td>-1.009867</td>\n", | |
| " <td>-0.444909</td>\n", | |
| " <td>1.100392</td>\n", | |
| " <td>0.234861</td>\n", | |
| " <td>-1.405328</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-0.703413</td>\n", | |
| " <td>0.191029</td>\n", | |
| " <td>-0.475114</td>\n", | |
| " <td>-0.882956</td>\n", | |
| " <td>0.985129</td>\n", | |
| " <td>1.133779</td>\n", | |
| " <td>-0.925346</td>\n", | |
| " <td>-0.412362</td>\n", | |
| " <td>0.003123</td>\n", | |
| " <td>1.309303</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>0.658299</td>\n", | |
| " <td>-1.649641</td>\n", | |
| " <td>1.261421</td>\n", | |
| " <td>-0.167431</td>\n", | |
| " <td>-0.070937</td>\n", | |
| " <td>-0.280642</td>\n", | |
| " <td>1.333661</td>\n", | |
| " <td>1.174445</td>\n", | |
| " <td>0.686412</td>\n", | |
| " <td>-1.308651</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>pBI24n2F3Q</td>\n", | |
| " <td>-0.594049</td>\n", | |
| " <td>1.683654</td>\n", | |
| " <td>-0.935114</td>\n", | |
| " <td>-1.750663</td>\n", | |
| " <td>0.216827</td>\n", | |
| " <td>0.284704</td>\n", | |
| " <td>-0.647471</td>\n", | |
| " <td>0.698593</td>\n", | |
| " <td>1.379473</td>\n", | |
| " <td>-0.286060</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " group data0 data1 data2 data3 data4 data5 \\\n", | |
| "0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n", | |
| "1 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n", | |
| "2 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n", | |
| "3 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n", | |
| "4 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n", | |
| "\n", | |
| " data6 data7 data8 data9 \n", | |
| "0 1.969005 -0.042579 0.605303 2.927195 \n", | |
| "1 -0.444909 1.100392 0.234861 -1.405328 \n", | |
| "2 -0.925346 -0.412362 0.003123 1.309303 \n", | |
| "3 1.333661 1.174445 0.686412 -1.308651 \n", | |
| "4 -0.647471 0.698593 1.379473 -0.286060 " | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "parquet_table = db.csv_as_parquet\n", | |
| "parquet_table[:5].execute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "SELECT *\n", | |
| "FROM hs2_perf_test.`csv_as_parquet`\n", | |
| "CPU times: user 11.5 s, sys: 248 ms, total: 11.7 s\n", | |
| "Wall time: 15.3 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%time df4 = parquet_table.execute(limit=None)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Wait...what? Let's go lower level to the raw HS2 connection." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import impala" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "hs2_con = impala.dbapi.connect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 10.7 s, sys: 44 ms, total: 10.8 s\n", | |
| "Wall time: 14.1 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "def get_data(query):\n", | |
| " cur = hs2_con.cursor()\n", | |
| " cur.execute(query)\n", | |
| " return cur.fetchcolumnar()\n", | |
| "\n", | |
| "query = \"SELECT * FROM hs2_perf_test.csv_as_parquet\"\n", | |
| "%time tuples = get_data(query)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "I can explain the reasons why this is so slow, but another time. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Spark DataFrames to pandas" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Let's get the Parquet files" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "DESCRIBE hs2_perf_test.`csv_as_parquet`\n", | |
| "DESCRIBE FORMATTED hs2_perf_test.`csv_as_parquet`\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'hdfs://localhost:20500/test-warehouse/hs2_perf_test.db/csv_as_parquet'" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "meta = db.csv_as_parquet.describe_formatted()\n", | |
| "meta.location" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[u'564d173750e684bc-eb3958cd3946d1a7_1811129494_data.0.parq',\n", | |
| " u'564d173750e684bc-eb3958cd3946d1a8_548402996_data.0.parq',\n", | |
| " u'_impala_insert_staging']" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "hdfs.ls('/test-warehouse/hs2_perf_test.db/csv_as_parquet')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "path = hdfs.get('/test-warehouse/hs2_perf_test.db/csv_as_parquet', 'csv_parquet')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Welcome to\n", | |
| " ____ __\n", | |
| " / __/__ ___ _____/ /__\n", | |
| " _\\ \\/ _ \\/ _ `/ __/ '_/\n", | |
| " /__ / .__/\\_,_/_/ /_/\\_\\ version 1.6.0\n", | |
| " /_/\n", | |
| "\n", | |
| "Using Python version 2.7.11 (default, Dec 6 2015 18:08:32)\n", | |
| "SparkContext available as sc, SQLContext available as sqlContext.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import os\n", | |
| "import sys\n", | |
| "\n", | |
| "spark_home = os.environ.get('SPARK_HOME', None)\n", | |
| "if not spark_home:\n", | |
| " raise ValueError('SPARK_HOME environment variable is not set')\n", | |
| "\n", | |
| "sys.path.insert(0, os.path.join(spark_home, 'python'))\n", | |
| "sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.9-src.zip'))\n", | |
| "execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df = sqlContext.read.format('parquet').load('csv_parquet/')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('group', 'string'),\n", | |
| " ('data0', 'double'),\n", | |
| " ('data1', 'double'),\n", | |
| " ('data2', 'double'),\n", | |
| " ('data3', 'double'),\n", | |
| " ('data4', 'double'),\n", | |
| " ('data5', 'double'),\n", | |
| " ('data6', 'double'),\n", | |
| " ('data7', 'double'),\n", | |
| " ('data8', 'double'),\n", | |
| " ('data9', 'double')]" | |
| ] | |
| }, | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df.dtypes" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 8.76 s, sys: 516 ms, total: 9.28 s\n", | |
| "Wall time: 13.6 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%time df5 = df.toPandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.11" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thanks