-
-
Save wesm/0cb5531b1c2e346a0007 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Test data generation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>group</th>\n", | |
" <th>data0</th>\n", | |
" <th>data1</th>\n", | |
" <th>data2</th>\n", | |
" <th>data3</th>\n", | |
" <th>data4</th>\n", | |
" <th>data5</th>\n", | |
" <th>data6</th>\n", | |
" <th>data7</th>\n", | |
" <th>data8</th>\n", | |
" <th>data9</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>0.703424</td>\n", | |
" <td>-1.536075</td>\n", | |
" <td>-0.446158</td>\n", | |
" <td>0.455960</td>\n", | |
" <td>-0.556396</td>\n", | |
" <td>1.218725</td>\n", | |
" <td>1.969005</td>\n", | |
" <td>-0.042579</td>\n", | |
" <td>0.605303</td>\n", | |
" <td>2.927195</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-1.289577</td>\n", | |
" <td>0.421690</td>\n", | |
" <td>1.198543</td>\n", | |
" <td>-0.910678</td>\n", | |
" <td>-1.103254</td>\n", | |
" <td>-1.009867</td>\n", | |
" <td>-0.444909</td>\n", | |
" <td>1.100392</td>\n", | |
" <td>0.234861</td>\n", | |
" <td>-1.405328</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-0.703413</td>\n", | |
" <td>0.191029</td>\n", | |
" <td>-0.475114</td>\n", | |
" <td>-0.882956</td>\n", | |
" <td>0.985129</td>\n", | |
" <td>1.133779</td>\n", | |
" <td>-0.925346</td>\n", | |
" <td>-0.412362</td>\n", | |
" <td>0.003123</td>\n", | |
" <td>1.309303</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>0.658299</td>\n", | |
" <td>-1.649641</td>\n", | |
" <td>1.261421</td>\n", | |
" <td>-0.167431</td>\n", | |
" <td>-0.070937</td>\n", | |
" <td>-0.280642</td>\n", | |
" <td>1.333661</td>\n", | |
" <td>1.174445</td>\n", | |
" <td>0.686412</td>\n", | |
" <td>-1.308651</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-0.594049</td>\n", | |
" <td>1.683654</td>\n", | |
" <td>-0.935114</td>\n", | |
" <td>-1.750663</td>\n", | |
" <td>0.216827</td>\n", | |
" <td>0.284704</td>\n", | |
" <td>-0.647471</td>\n", | |
" <td>0.698593</td>\n", | |
" <td>1.379473</td>\n", | |
" <td>-0.286060</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" group data0 data1 data2 data3 data4 data5 \\\n", | |
"0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n", | |
"0 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n", | |
"0 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n", | |
"0 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n", | |
"0 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n", | |
"\n", | |
" data6 data7 data8 data9 \n", | |
"0 1.969005 -0.042579 0.605303 2.927195 \n", | |
"0 -0.444909 1.100392 0.234861 -1.405328 \n", | |
"0 -0.925346 -0.412362 0.003123 1.309303 \n", | |
"0 1.333661 1.174445 0.686412 -1.308651 \n", | |
"0 -0.647471 0.698593 1.379473 -0.286060 " | |
] | |
}, | |
"execution_count": 42, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 1mm rows for now\n", | |
"nrows = 1000000\n", | |
"\n", | |
"ndata_columns = 10\n", | |
"\n", | |
"# Generate some string categories\n", | |
"ncategories = 10\n", | |
"\n", | |
"categories = [pd.util.testing.rands(10) for _ in range(ncategories)]\n", | |
"cats = pd.Series(categories).repeat(nrows / ncategories)\n", | |
"\n", | |
"data = {\n", | |
" 'group': cats\n", | |
"}\n", | |
"column_names = ['group']\n", | |
"\n", | |
"# Generate some double data\n", | |
"for i in range(ndata_columns):\n", | |
" name = 'data{}'.format(i)\n", | |
" data[name] = np.random.randn(nrows)\n", | |
" column_names.append(name)\n", | |
"\n", | |
"df = pd.DataFrame(data, columns=column_names)\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 204 ms, sys: 140 ms, total: 344 ms\n", | |
"Wall time: 348 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>data0</th>\n", | |
" <th>data1</th>\n", | |
" <th>data2</th>\n", | |
" <th>data3</th>\n", | |
" <th>data4</th>\n", | |
" <th>data5</th>\n", | |
" <th>data6</th>\n", | |
" <th>data7</th>\n", | |
" <th>data8</th>\n", | |
" <th>data9</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>group</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>4CsrjKcKbo</th>\n", | |
" <td>11.831979</td>\n", | |
" <td>136.187324</td>\n", | |
" <td>-295.388315</td>\n", | |
" <td>142.704850</td>\n", | |
" <td>-41.881516</td>\n", | |
" <td>-29.665339</td>\n", | |
" <td>-207.635804</td>\n", | |
" <td>-155.094945</td>\n", | |
" <td>105.283159</td>\n", | |
" <td>767.466670</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>73VX4pdH4Y</th>\n", | |
" <td>175.005549</td>\n", | |
" <td>547.486156</td>\n", | |
" <td>199.117899</td>\n", | |
" <td>548.247042</td>\n", | |
" <td>225.798938</td>\n", | |
" <td>-452.535554</td>\n", | |
" <td>-500.471072</td>\n", | |
" <td>424.562159</td>\n", | |
" <td>132.303795</td>\n", | |
" <td>566.041430</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>BmLu4cT6uF</th>\n", | |
" <td>172.712552</td>\n", | |
" <td>169.204947</td>\n", | |
" <td>569.918196</td>\n", | |
" <td>-260.546673</td>\n", | |
" <td>510.197316</td>\n", | |
" <td>-60.677790</td>\n", | |
" <td>-207.590026</td>\n", | |
" <td>-133.377280</td>\n", | |
" <td>185.689427</td>\n", | |
" <td>876.356624</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>HSvLJoc15E</th>\n", | |
" <td>-511.972646</td>\n", | |
" <td>-56.881369</td>\n", | |
" <td>-251.202024</td>\n", | |
" <td>251.213307</td>\n", | |
" <td>94.269355</td>\n", | |
" <td>47.596050</td>\n", | |
" <td>-60.388253</td>\n", | |
" <td>-400.746563</td>\n", | |
" <td>238.092326</td>\n", | |
" <td>-209.154316</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>J5Yd8irBnb</th>\n", | |
" <td>-5.573153</td>\n", | |
" <td>-453.530304</td>\n", | |
" <td>513.811162</td>\n", | |
" <td>286.185040</td>\n", | |
" <td>855.478154</td>\n", | |
" <td>-178.012073</td>\n", | |
" <td>539.009204</td>\n", | |
" <td>16.243304</td>\n", | |
" <td>14.952687</td>\n", | |
" <td>40.837467</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ZmDUBhNUOy</th>\n", | |
" <td>-246.333214</td>\n", | |
" <td>215.914220</td>\n", | |
" <td>-280.593283</td>\n", | |
" <td>-90.840335</td>\n", | |
" <td>27.458542</td>\n", | |
" <td>-152.737684</td>\n", | |
" <td>454.597280</td>\n", | |
" <td>77.648862</td>\n", | |
" <td>99.061342</td>\n", | |
" <td>147.739728</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>lK4xrUXWN0</th>\n", | |
" <td>-244.632800</td>\n", | |
" <td>-356.878053</td>\n", | |
" <td>-433.656816</td>\n", | |
" <td>208.128759</td>\n", | |
" <td>414.632423</td>\n", | |
" <td>-411.897845</td>\n", | |
" <td>449.079543</td>\n", | |
" <td>482.190346</td>\n", | |
" <td>-411.455465</td>\n", | |
" <td>245.294481</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>pBI24n2F3Q</th>\n", | |
" <td>637.710927</td>\n", | |
" <td>-18.689120</td>\n", | |
" <td>-480.028704</td>\n", | |
" <td>22.972283</td>\n", | |
" <td>441.623681</td>\n", | |
" <td>-696.007650</td>\n", | |
" <td>-104.017479</td>\n", | |
" <td>8.333586</td>\n", | |
" <td>-138.162617</td>\n", | |
" <td>-283.277042</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>rBbN5Q4Jyl</th>\n", | |
" <td>-409.202466</td>\n", | |
" <td>50.410141</td>\n", | |
" <td>128.908498</td>\n", | |
" <td>-71.238216</td>\n", | |
" <td>137.567974</td>\n", | |
" <td>-173.356820</td>\n", | |
" <td>141.111654</td>\n", | |
" <td>115.265187</td>\n", | |
" <td>-481.667415</td>\n", | |
" <td>-264.309207</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>vzHLsD6Yft</th>\n", | |
" <td>-150.564745</td>\n", | |
" <td>-123.397999</td>\n", | |
" <td>-52.350541</td>\n", | |
" <td>-487.816878</td>\n", | |
" <td>-363.032239</td>\n", | |
" <td>46.186671</td>\n", | |
" <td>-592.353868</td>\n", | |
" <td>437.938846</td>\n", | |
" <td>-34.131335</td>\n", | |
" <td>-199.617239</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" data0 data1 data2 data3 data4 \\\n", | |
"group \n", | |
"4CsrjKcKbo 11.831979 136.187324 -295.388315 142.704850 -41.881516 \n", | |
"73VX4pdH4Y 175.005549 547.486156 199.117899 548.247042 225.798938 \n", | |
"BmLu4cT6uF 172.712552 169.204947 569.918196 -260.546673 510.197316 \n", | |
"HSvLJoc15E -511.972646 -56.881369 -251.202024 251.213307 94.269355 \n", | |
"J5Yd8irBnb -5.573153 -453.530304 513.811162 286.185040 855.478154 \n", | |
"ZmDUBhNUOy -246.333214 215.914220 -280.593283 -90.840335 27.458542 \n", | |
"lK4xrUXWN0 -244.632800 -356.878053 -433.656816 208.128759 414.632423 \n", | |
"pBI24n2F3Q 637.710927 -18.689120 -480.028704 22.972283 441.623681 \n", | |
"rBbN5Q4Jyl -409.202466 50.410141 128.908498 -71.238216 137.567974 \n", | |
"vzHLsD6Yft -150.564745 -123.397999 -52.350541 -487.816878 -363.032239 \n", | |
"\n", | |
" data5 data6 data7 data8 data9 \n", | |
"group \n", | |
"4CsrjKcKbo -29.665339 -207.635804 -155.094945 105.283159 767.466670 \n", | |
"73VX4pdH4Y -452.535554 -500.471072 424.562159 132.303795 566.041430 \n", | |
"BmLu4cT6uF -60.677790 -207.590026 -133.377280 185.689427 876.356624 \n", | |
"HSvLJoc15E 47.596050 -60.388253 -400.746563 238.092326 -209.154316 \n", | |
"J5Yd8irBnb -178.012073 539.009204 16.243304 14.952687 40.837467 \n", | |
"ZmDUBhNUOy -152.737684 454.597280 77.648862 99.061342 147.739728 \n", | |
"lK4xrUXWN0 -411.897845 449.079543 482.190346 -411.455465 245.294481 \n", | |
"pBI24n2F3Q -696.007650 -104.017479 8.333586 -138.162617 -283.277042 \n", | |
"rBbN5Q4Jyl -173.356820 141.111654 115.265187 -481.667415 -264.309207 \n", | |
"vzHLsD6Yft 46.186671 -592.353868 437.938846 -34.131335 -199.617239 " | |
] | |
}, | |
"execution_count": 47, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Aggregate 1mm row table with 10 groups\n", | |
"%time df.groupby('group').sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"219.23547503591956" | |
] | |
}, | |
"execution_count": 51, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"speed = 80000000 / 0.348 / 1024**2\n", | |
"speed" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**On my little Lenovo X230, pandas aggregates this data at approximately 220 MB/second, single-threaded.**" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Speed to read this data out of CSV and HDF5" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Save without header for easier ingest to Parquet later\n", | |
"df.to_csv('test_dataset.csv', index=False, header=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"total 280616\r\n", | |
"-rw-rw-r-- 1 wesm wesm 24710 Feb 21 12:53 Explorations.ipynb\r\n", | |
"-rw-rw-r-- 1 wesm wesm 207312465 Feb 21 12:53 test_dataset.csv\r\n", | |
"-rw-rw-r-- 1 wesm wesm 80007192 Feb 21 12:27 testing.hdf5\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls -l" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"pd.read_csv?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2.24 s, sys: 180 ms, total: 2.42 s\n", | |
"Wall time: 2.43 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time df = pd.read_csv('test_dataset.csv', header=None)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CSV parse performance: 85.1851851852 MB/second\n" | |
] | |
} | |
], | |
"source": [ | |
"speed = 207 / 2.43\n", | |
"print(\"CSV parse performance: {0} MB/second\".format(speed))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"However, the resident in-memory size of the data is only about 90MB. So let's keep this in mind." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CSV parse performance (to-in-memory data): ~37.04 MB/second\n" | |
] | |
} | |
], | |
"source": [ | |
"speed = 90 / 2.43\n", | |
"print(\"CSV parse performance (to-in-memory data): ~%.2f MB/second\" % speed)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"hdf_store = pd.HDFStore('testing.hdf5')\n", | |
"hdf_store['df'] = df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 60 ms, sys: 56 ms, total: 116 ms\n", | |
"Wall time: 117 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%time df3 = hdf_store['df']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>group</th>\n", | |
" <th>data0</th>\n", | |
" <th>data1</th>\n", | |
" <th>data2</th>\n", | |
" <th>data3</th>\n", | |
" <th>data4</th>\n", | |
" <th>data5</th>\n", | |
" <th>data6</th>\n", | |
" <th>data7</th>\n", | |
" <th>data8</th>\n", | |
" <th>data9</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>0.703424</td>\n", | |
" <td>-1.536075</td>\n", | |
" <td>-0.446158</td>\n", | |
" <td>0.455960</td>\n", | |
" <td>-0.556396</td>\n", | |
" <td>1.218725</td>\n", | |
" <td>1.969005</td>\n", | |
" <td>-0.042579</td>\n", | |
" <td>0.605303</td>\n", | |
" <td>2.927195</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-1.289577</td>\n", | |
" <td>0.421690</td>\n", | |
" <td>1.198543</td>\n", | |
" <td>-0.910678</td>\n", | |
" <td>-1.103254</td>\n", | |
" <td>-1.009867</td>\n", | |
" <td>-0.444909</td>\n", | |
" <td>1.100392</td>\n", | |
" <td>0.234861</td>\n", | |
" <td>-1.405328</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-0.703413</td>\n", | |
" <td>0.191029</td>\n", | |
" <td>-0.475114</td>\n", | |
" <td>-0.882956</td>\n", | |
" <td>0.985129</td>\n", | |
" <td>1.133779</td>\n", | |
" <td>-0.925346</td>\n", | |
" <td>-0.412362</td>\n", | |
" <td>0.003123</td>\n", | |
" <td>1.309303</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>0.658299</td>\n", | |
" <td>-1.649641</td>\n", | |
" <td>1.261421</td>\n", | |
" <td>-0.167431</td>\n", | |
" <td>-0.070937</td>\n", | |
" <td>-0.280642</td>\n", | |
" <td>1.333661</td>\n", | |
" <td>1.174445</td>\n", | |
" <td>0.686412</td>\n", | |
" <td>-1.308651</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-0.594049</td>\n", | |
" <td>1.683654</td>\n", | |
" <td>-0.935114</td>\n", | |
" <td>-1.750663</td>\n", | |
" <td>0.216827</td>\n", | |
" <td>0.284704</td>\n", | |
" <td>-0.647471</td>\n", | |
" <td>0.698593</td>\n", | |
" <td>1.379473</td>\n", | |
" <td>-0.286060</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" group data0 data1 data2 data3 data4 data5 \\\n", | |
"0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n", | |
"0 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n", | |
"0 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n", | |
"0 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n", | |
"0 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n", | |
"\n", | |
" data6 data7 data8 data9 \n", | |
"0 1.969005 -0.042579 0.605303 2.927195 \n", | |
"0 -0.444909 1.100392 0.234861 -1.405328 \n", | |
"0 -0.925346 -0.412362 0.003123 1.309303 \n", | |
"0 1.333661 1.174445 0.686412 -1.308651 \n", | |
"0 -0.647471 0.698593 1.379473 -0.286060 " | |
] | |
}, | |
"execution_count": 61, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CSV parse performance (to-in-memory data): ~737.70 MB/second\n" | |
] | |
} | |
], | |
"source": [ | |
"speed = 90 / 0.122\n", | |
"print(\"CSV parse performance (to-in-memory data): ~%.2f MB/second\" % speed)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Yep, **pandas HDF5 read performance is ridiculous, almost 20 times faster**. Binary columnar data storage makes an extraordinary difference." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## HiveServer2 (Hive, Impala) to pandas" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Impala and HDFS are running locally on my laptop, so there is no network bandwidth issue here. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import ibis\n", | |
"ibis.options.verbose = True\n", | |
"data_dir = '/hs2-perf/csv-test'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"SHOW DATABASES LIKE '__ibis_tmp'\n" | |
] | |
} | |
], | |
"source": [ | |
"hdfs = ibis.hdfs_connect(port=5070)\n", | |
"con = ibis.impala.connect(hdfs_client=hdfs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 81, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"hdfs.mkdir(data_dir)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 83, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'/hs2-perf/csv-test/test_dataset.csv'" | |
] | |
}, | |
"execution_count": 83, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hdfs.put('/hs2-perf/csv-test/', 'test_dataset.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 84, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['test_dataset.csv']" | |
] | |
}, | |
"execution_count": 84, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hdfs.ls('/hs2-perf/csv-test')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"ibis.Schema { \n", | |
" group string\n", | |
" data0 double\n", | |
" data1 double\n", | |
" data2 double\n", | |
" data3 double\n", | |
" data4 double\n", | |
" data5 double\n", | |
" data6 double\n", | |
" data7 double\n", | |
" data8 double\n", | |
" data9 double\n", | |
"}" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from ibis.impala.pandas_interop import pandas_to_ibis_schema\n", | |
"df.columns = ['group'] + ['data{}'.format(i) for i in range(10)]\n", | |
"schema = pandas_to_ibis_schema(df)\n", | |
"schema" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#con.create_database('hs2_perf_test')\n", | |
"db = con.database('hs2_perf_test')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# csv_table = con.delimited_file(data_dir, schema)\n", | |
"\n", | |
"# Working around IMPALA-2069\n", | |
"query = \"\"\"\n", | |
"CREATE EXTERNAL TABLE hs2_perf_test.csv_table\n", | |
"(`group` varchar(10),\n", | |
" `data0` double,\n", | |
" `data1` double,\n", | |
" `data2` double,\n", | |
" `data3` double,\n", | |
" `data4` double,\n", | |
" `data5` double,\n", | |
" `data6` double,\n", | |
" `data7` double,\n", | |
" `data8` double,\n", | |
" `data9` double)\n", | |
"ROW FORMAT DELIMITED\n", | |
"FIELDS TERMINATED BY ','\n", | |
"LOCATION '/hs2-perf/csv-test'\n", | |
"\"\"\"\n", | |
"import impala\n", | |
"impyla_con = impala.dbapi.connect()\n", | |
"cur = impyla_con.cursor()\n", | |
"cur.execute(query)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"DESCRIBE hs2_perf_test.`csv_table`\n" | |
] | |
} | |
], | |
"source": [ | |
"csv_table = db.csv_table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"SELECT *\n", | |
"FROM hs2_perf_test.`csv_table`\n", | |
"LIMIT 5\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>group</th>\n", | |
" <th>data0</th>\n", | |
" <th>data1</th>\n", | |
" <th>data2</th>\n", | |
" <th>data3</th>\n", | |
" <th>data4</th>\n", | |
" <th>data5</th>\n", | |
" <th>data6</th>\n", | |
" <th>data7</th>\n", | |
" <th>data8</th>\n", | |
" <th>data9</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>0.703424</td>\n", | |
" <td>-1.536075</td>\n", | |
" <td>-0.446158</td>\n", | |
" <td>0.455960</td>\n", | |
" <td>-0.556396</td>\n", | |
" <td>1.218725</td>\n", | |
" <td>1.969005</td>\n", | |
" <td>-0.042579</td>\n", | |
" <td>0.605303</td>\n", | |
" <td>2.927195</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-1.289577</td>\n", | |
" <td>0.421690</td>\n", | |
" <td>1.198543</td>\n", | |
" <td>-0.910678</td>\n", | |
" <td>-1.103254</td>\n", | |
" <td>-1.009867</td>\n", | |
" <td>-0.444909</td>\n", | |
" <td>1.100392</td>\n", | |
" <td>0.234861</td>\n", | |
" <td>-1.405328</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-0.703413</td>\n", | |
" <td>0.191029</td>\n", | |
" <td>-0.475114</td>\n", | |
" <td>-0.882956</td>\n", | |
" <td>0.985129</td>\n", | |
" <td>1.133779</td>\n", | |
" <td>-0.925346</td>\n", | |
" <td>-0.412362</td>\n", | |
" <td>0.003123</td>\n", | |
" <td>1.309303</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>0.658299</td>\n", | |
" <td>-1.649641</td>\n", | |
" <td>1.261421</td>\n", | |
" <td>-0.167431</td>\n", | |
" <td>-0.070937</td>\n", | |
" <td>-0.280642</td>\n", | |
" <td>1.333661</td>\n", | |
" <td>1.174445</td>\n", | |
" <td>0.686412</td>\n", | |
" <td>-1.308651</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-0.594049</td>\n", | |
" <td>1.683654</td>\n", | |
" <td>-0.935114</td>\n", | |
" <td>-1.750663</td>\n", | |
" <td>0.216827</td>\n", | |
" <td>0.284704</td>\n", | |
" <td>-0.647471</td>\n", | |
" <td>0.698593</td>\n", | |
" <td>1.379473</td>\n", | |
" <td>-0.286060</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" group data0 data1 data2 data3 data4 data5 \\\n", | |
"0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n", | |
"1 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n", | |
"2 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n", | |
"3 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n", | |
"4 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n", | |
"\n", | |
" data6 data7 data8 data9 \n", | |
"0 1.969005 -0.042579 0.605303 2.927195 \n", | |
"1 -0.444909 1.100392 0.234861 -1.405328 \n", | |
"2 -0.925346 -0.412362 0.003123 1.309303 \n", | |
"3 1.333661 1.174445 0.686412 -1.308651 \n", | |
"4 -0.647471 0.698593 1.379473 -0.286060 " | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"csv_table[:5].execute()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"DESCRIBE hs2_perf_test.`csv_as_parquet`\n", | |
"DROP TABLE hs2_perf_test.`csv_as_parquet`\n", | |
"CREATE TABLE hs2_perf_test.`csv_as_parquet`\n", | |
"STORED AS PARQUET\n", | |
"AS\n", | |
"SELECT *\n", | |
"FROM hs2_perf_test.`csv_table`\n" | |
] | |
} | |
], | |
"source": [ | |
"db.csv_as_parquet.drop()\n", | |
"db.create_table('csv_as_parquet', csv_table, format='parquet')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"DESCRIBE hs2_perf_test.`csv_as_parquet`\n", | |
"SELECT *\n", | |
"FROM hs2_perf_test.`csv_as_parquet`\n", | |
"LIMIT 5\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>group</th>\n", | |
" <th>data0</th>\n", | |
" <th>data1</th>\n", | |
" <th>data2</th>\n", | |
" <th>data3</th>\n", | |
" <th>data4</th>\n", | |
" <th>data5</th>\n", | |
" <th>data6</th>\n", | |
" <th>data7</th>\n", | |
" <th>data8</th>\n", | |
" <th>data9</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>0.703424</td>\n", | |
" <td>-1.536075</td>\n", | |
" <td>-0.446158</td>\n", | |
" <td>0.455960</td>\n", | |
" <td>-0.556396</td>\n", | |
" <td>1.218725</td>\n", | |
" <td>1.969005</td>\n", | |
" <td>-0.042579</td>\n", | |
" <td>0.605303</td>\n", | |
" <td>2.927195</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-1.289577</td>\n", | |
" <td>0.421690</td>\n", | |
" <td>1.198543</td>\n", | |
" <td>-0.910678</td>\n", | |
" <td>-1.103254</td>\n", | |
" <td>-1.009867</td>\n", | |
" <td>-0.444909</td>\n", | |
" <td>1.100392</td>\n", | |
" <td>0.234861</td>\n", | |
" <td>-1.405328</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-0.703413</td>\n", | |
" <td>0.191029</td>\n", | |
" <td>-0.475114</td>\n", | |
" <td>-0.882956</td>\n", | |
" <td>0.985129</td>\n", | |
" <td>1.133779</td>\n", | |
" <td>-0.925346</td>\n", | |
" <td>-0.412362</td>\n", | |
" <td>0.003123</td>\n", | |
" <td>1.309303</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>0.658299</td>\n", | |
" <td>-1.649641</td>\n", | |
" <td>1.261421</td>\n", | |
" <td>-0.167431</td>\n", | |
" <td>-0.070937</td>\n", | |
" <td>-0.280642</td>\n", | |
" <td>1.333661</td>\n", | |
" <td>1.174445</td>\n", | |
" <td>0.686412</td>\n", | |
" <td>-1.308651</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>pBI24n2F3Q</td>\n", | |
" <td>-0.594049</td>\n", | |
" <td>1.683654</td>\n", | |
" <td>-0.935114</td>\n", | |
" <td>-1.750663</td>\n", | |
" <td>0.216827</td>\n", | |
" <td>0.284704</td>\n", | |
" <td>-0.647471</td>\n", | |
" <td>0.698593</td>\n", | |
" <td>1.379473</td>\n", | |
" <td>-0.286060</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" group data0 data1 data2 data3 data4 data5 \\\n", | |
"0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n", | |
"1 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n", | |
"2 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n", | |
"3 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n", | |
"4 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n", | |
"\n", | |
" data6 data7 data8 data9 \n", | |
"0 1.969005 -0.042579 0.605303 2.927195 \n", | |
"1 -0.444909 1.100392 0.234861 -1.405328 \n", | |
"2 -0.925346 -0.412362 0.003123 1.309303 \n", | |
"3 1.333661 1.174445 0.686412 -1.308651 \n", | |
"4 -0.647471 0.698593 1.379473 -0.286060 " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"parquet_table = db.csv_as_parquet\n", | |
"parquet_table[:5].execute()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"SELECT *\n", | |
"FROM hs2_perf_test.`csv_as_parquet`\n", | |
"CPU times: user 11.5 s, sys: 248 ms, total: 11.7 s\n", | |
"Wall time: 15.3 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time df4 = parquet_table.execute(limit=None)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Wait...what? Let's go lower level to the raw HS2 connection." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import impala" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"hs2_con = impala.dbapi.connect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 10.7 s, sys: 44 ms, total: 10.8 s\n", | |
"Wall time: 14.1 s\n" | |
] | |
} | |
], | |
"source": [ | |
"def get_data(query):\n", | |
" cur = hs2_con.cursor()\n", | |
" cur.execute(query)\n", | |
" return cur.fetchcolumnar()\n", | |
"\n", | |
"query = \"SELECT * FROM hs2_perf_test.csv_as_parquet\"\n", | |
"%time tuples = get_data(query)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"I can explain the reasons why this is so slow, but another time. " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Spark DataFrames to pandas" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let's get the Parquet files" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"DESCRIBE hs2_perf_test.`csv_as_parquet`\n", | |
"DESCRIBE FORMATTED hs2_perf_test.`csv_as_parquet`\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"'hdfs://localhost:20500/test-warehouse/hs2_perf_test.db/csv_as_parquet'" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"meta = db.csv_as_parquet.describe_formatted()\n", | |
"meta.location" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[u'564d173750e684bc-eb3958cd3946d1a7_1811129494_data.0.parq',\n", | |
" u'564d173750e684bc-eb3958cd3946d1a8_548402996_data.0.parq',\n", | |
" u'_impala_insert_staging']" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hdfs.ls('/test-warehouse/hs2_perf_test.db/csv_as_parquet')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"path = hdfs.get('/test-warehouse/hs2_perf_test.db/csv_as_parquet', 'csv_parquet')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Welcome to\n", | |
" ____ __\n", | |
" / __/__ ___ _____/ /__\n", | |
" _\\ \\/ _ \\/ _ `/ __/ '_/\n", | |
" /__ / .__/\\_,_/_/ /_/\\_\\ version 1.6.0\n", | |
" /_/\n", | |
"\n", | |
"Using Python version 2.7.11 (default, Dec 6 2015 18:08:32)\n", | |
"SparkContext available as sc, SQLContext available as sqlContext.\n" | |
] | |
} | |
], | |
"source": [ | |
"import os\n", | |
"import sys\n", | |
"\n", | |
"spark_home = os.environ.get('SPARK_HOME', None)\n", | |
"if not spark_home:\n", | |
" raise ValueError('SPARK_HOME environment variable is not set')\n", | |
"\n", | |
"sys.path.insert(0, os.path.join(spark_home, 'python'))\n", | |
"sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.9-src.zip'))\n", | |
"execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df = sqlContext.read.format('parquet').load('csv_parquet/')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('group', 'string'),\n", | |
" ('data0', 'double'),\n", | |
" ('data1', 'double'),\n", | |
" ('data2', 'double'),\n", | |
" ('data3', 'double'),\n", | |
" ('data4', 'double'),\n", | |
" ('data5', 'double'),\n", | |
" ('data6', 'double'),\n", | |
" ('data7', 'double'),\n", | |
" ('data8', 'double'),\n", | |
" ('data9', 'double')]" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 8.76 s, sys: 516 ms, total: 9.28 s\n", | |
"Wall time: 13.6 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time df5 = df.toPandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thanks