Skip to content

Instantly share code, notes, and snippets.

@wesm
Last active July 6, 2022 04:11
Show Gist options
  • Save wesm/0cb5531b1c2e346a0007 to your computer and use it in GitHub Desktop.
Save wesm/0cb5531b1c2e346a0007 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test data generation"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>group</th>\n",
" <th>data0</th>\n",
" <th>data1</th>\n",
" <th>data2</th>\n",
" <th>data3</th>\n",
" <th>data4</th>\n",
" <th>data5</th>\n",
" <th>data6</th>\n",
" <th>data7</th>\n",
" <th>data8</th>\n",
" <th>data9</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>0.703424</td>\n",
" <td>-1.536075</td>\n",
" <td>-0.446158</td>\n",
" <td>0.455960</td>\n",
" <td>-0.556396</td>\n",
" <td>1.218725</td>\n",
" <td>1.969005</td>\n",
" <td>-0.042579</td>\n",
" <td>0.605303</td>\n",
" <td>2.927195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-1.289577</td>\n",
" <td>0.421690</td>\n",
" <td>1.198543</td>\n",
" <td>-0.910678</td>\n",
" <td>-1.103254</td>\n",
" <td>-1.009867</td>\n",
" <td>-0.444909</td>\n",
" <td>1.100392</td>\n",
" <td>0.234861</td>\n",
" <td>-1.405328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-0.703413</td>\n",
" <td>0.191029</td>\n",
" <td>-0.475114</td>\n",
" <td>-0.882956</td>\n",
" <td>0.985129</td>\n",
" <td>1.133779</td>\n",
" <td>-0.925346</td>\n",
" <td>-0.412362</td>\n",
" <td>0.003123</td>\n",
" <td>1.309303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>0.658299</td>\n",
" <td>-1.649641</td>\n",
" <td>1.261421</td>\n",
" <td>-0.167431</td>\n",
" <td>-0.070937</td>\n",
" <td>-0.280642</td>\n",
" <td>1.333661</td>\n",
" <td>1.174445</td>\n",
" <td>0.686412</td>\n",
" <td>-1.308651</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-0.594049</td>\n",
" <td>1.683654</td>\n",
" <td>-0.935114</td>\n",
" <td>-1.750663</td>\n",
" <td>0.216827</td>\n",
" <td>0.284704</td>\n",
" <td>-0.647471</td>\n",
" <td>0.698593</td>\n",
" <td>1.379473</td>\n",
" <td>-0.286060</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" group data0 data1 data2 data3 data4 data5 \\\n",
"0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n",
"0 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n",
"0 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n",
"0 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n",
"0 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n",
"\n",
" data6 data7 data8 data9 \n",
"0 1.969005 -0.042579 0.605303 2.927195 \n",
"0 -0.444909 1.100392 0.234861 -1.405328 \n",
"0 -0.925346 -0.412362 0.003123 1.309303 \n",
"0 1.333661 1.174445 0.686412 -1.308651 \n",
"0 -0.647471 0.698593 1.379473 -0.286060 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 1mm rows for now\n",
"nrows = 1000000\n",
"\n",
"ndata_columns = 10\n",
"\n",
"# Generate some string categories\n",
"ncategories = 10\n",
"\n",
"categories = [pd.util.testing.rands(10) for _ in range(ncategories)]\n",
"cats = pd.Series(categories).repeat(nrows / ncategories)\n",
"\n",
"data = {\n",
" 'group': cats\n",
"}\n",
"column_names = ['group']\n",
"\n",
"# Generate some double data\n",
"for i in range(ndata_columns):\n",
" name = 'data{}'.format(i)\n",
" data[name] = np.random.randn(nrows)\n",
" column_names.append(name)\n",
"\n",
"df = pd.DataFrame(data, columns=column_names)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 204 ms, sys: 140 ms, total: 344 ms\n",
"Wall time: 348 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>data0</th>\n",
" <th>data1</th>\n",
" <th>data2</th>\n",
" <th>data3</th>\n",
" <th>data4</th>\n",
" <th>data5</th>\n",
" <th>data6</th>\n",
" <th>data7</th>\n",
" <th>data8</th>\n",
" <th>data9</th>\n",
" </tr>\n",
" <tr>\n",
" <th>group</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4CsrjKcKbo</th>\n",
" <td>11.831979</td>\n",
" <td>136.187324</td>\n",
" <td>-295.388315</td>\n",
" <td>142.704850</td>\n",
" <td>-41.881516</td>\n",
" <td>-29.665339</td>\n",
" <td>-207.635804</td>\n",
" <td>-155.094945</td>\n",
" <td>105.283159</td>\n",
" <td>767.466670</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73VX4pdH4Y</th>\n",
" <td>175.005549</td>\n",
" <td>547.486156</td>\n",
" <td>199.117899</td>\n",
" <td>548.247042</td>\n",
" <td>225.798938</td>\n",
" <td>-452.535554</td>\n",
" <td>-500.471072</td>\n",
" <td>424.562159</td>\n",
" <td>132.303795</td>\n",
" <td>566.041430</td>\n",
" </tr>\n",
" <tr>\n",
" <th>BmLu4cT6uF</th>\n",
" <td>172.712552</td>\n",
" <td>169.204947</td>\n",
" <td>569.918196</td>\n",
" <td>-260.546673</td>\n",
" <td>510.197316</td>\n",
" <td>-60.677790</td>\n",
" <td>-207.590026</td>\n",
" <td>-133.377280</td>\n",
" <td>185.689427</td>\n",
" <td>876.356624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HSvLJoc15E</th>\n",
" <td>-511.972646</td>\n",
" <td>-56.881369</td>\n",
" <td>-251.202024</td>\n",
" <td>251.213307</td>\n",
" <td>94.269355</td>\n",
" <td>47.596050</td>\n",
" <td>-60.388253</td>\n",
" <td>-400.746563</td>\n",
" <td>238.092326</td>\n",
" <td>-209.154316</td>\n",
" </tr>\n",
" <tr>\n",
" <th>J5Yd8irBnb</th>\n",
" <td>-5.573153</td>\n",
" <td>-453.530304</td>\n",
" <td>513.811162</td>\n",
" <td>286.185040</td>\n",
" <td>855.478154</td>\n",
" <td>-178.012073</td>\n",
" <td>539.009204</td>\n",
" <td>16.243304</td>\n",
" <td>14.952687</td>\n",
" <td>40.837467</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ZmDUBhNUOy</th>\n",
" <td>-246.333214</td>\n",
" <td>215.914220</td>\n",
" <td>-280.593283</td>\n",
" <td>-90.840335</td>\n",
" <td>27.458542</td>\n",
" <td>-152.737684</td>\n",
" <td>454.597280</td>\n",
" <td>77.648862</td>\n",
" <td>99.061342</td>\n",
" <td>147.739728</td>\n",
" </tr>\n",
" <tr>\n",
" <th>lK4xrUXWN0</th>\n",
" <td>-244.632800</td>\n",
" <td>-356.878053</td>\n",
" <td>-433.656816</td>\n",
" <td>208.128759</td>\n",
" <td>414.632423</td>\n",
" <td>-411.897845</td>\n",
" <td>449.079543</td>\n",
" <td>482.190346</td>\n",
" <td>-411.455465</td>\n",
" <td>245.294481</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pBI24n2F3Q</th>\n",
" <td>637.710927</td>\n",
" <td>-18.689120</td>\n",
" <td>-480.028704</td>\n",
" <td>22.972283</td>\n",
" <td>441.623681</td>\n",
" <td>-696.007650</td>\n",
" <td>-104.017479</td>\n",
" <td>8.333586</td>\n",
" <td>-138.162617</td>\n",
" <td>-283.277042</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rBbN5Q4Jyl</th>\n",
" <td>-409.202466</td>\n",
" <td>50.410141</td>\n",
" <td>128.908498</td>\n",
" <td>-71.238216</td>\n",
" <td>137.567974</td>\n",
" <td>-173.356820</td>\n",
" <td>141.111654</td>\n",
" <td>115.265187</td>\n",
" <td>-481.667415</td>\n",
" <td>-264.309207</td>\n",
" </tr>\n",
" <tr>\n",
" <th>vzHLsD6Yft</th>\n",
" <td>-150.564745</td>\n",
" <td>-123.397999</td>\n",
" <td>-52.350541</td>\n",
" <td>-487.816878</td>\n",
" <td>-363.032239</td>\n",
" <td>46.186671</td>\n",
" <td>-592.353868</td>\n",
" <td>437.938846</td>\n",
" <td>-34.131335</td>\n",
" <td>-199.617239</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" data0 data1 data2 data3 data4 \\\n",
"group \n",
"4CsrjKcKbo 11.831979 136.187324 -295.388315 142.704850 -41.881516 \n",
"73VX4pdH4Y 175.005549 547.486156 199.117899 548.247042 225.798938 \n",
"BmLu4cT6uF 172.712552 169.204947 569.918196 -260.546673 510.197316 \n",
"HSvLJoc15E -511.972646 -56.881369 -251.202024 251.213307 94.269355 \n",
"J5Yd8irBnb -5.573153 -453.530304 513.811162 286.185040 855.478154 \n",
"ZmDUBhNUOy -246.333214 215.914220 -280.593283 -90.840335 27.458542 \n",
"lK4xrUXWN0 -244.632800 -356.878053 -433.656816 208.128759 414.632423 \n",
"pBI24n2F3Q 637.710927 -18.689120 -480.028704 22.972283 441.623681 \n",
"rBbN5Q4Jyl -409.202466 50.410141 128.908498 -71.238216 137.567974 \n",
"vzHLsD6Yft -150.564745 -123.397999 -52.350541 -487.816878 -363.032239 \n",
"\n",
" data5 data6 data7 data8 data9 \n",
"group \n",
"4CsrjKcKbo -29.665339 -207.635804 -155.094945 105.283159 767.466670 \n",
"73VX4pdH4Y -452.535554 -500.471072 424.562159 132.303795 566.041430 \n",
"BmLu4cT6uF -60.677790 -207.590026 -133.377280 185.689427 876.356624 \n",
"HSvLJoc15E 47.596050 -60.388253 -400.746563 238.092326 -209.154316 \n",
"J5Yd8irBnb -178.012073 539.009204 16.243304 14.952687 40.837467 \n",
"ZmDUBhNUOy -152.737684 454.597280 77.648862 99.061342 147.739728 \n",
"lK4xrUXWN0 -411.897845 449.079543 482.190346 -411.455465 245.294481 \n",
"pBI24n2F3Q -696.007650 -104.017479 8.333586 -138.162617 -283.277042 \n",
"rBbN5Q4Jyl -173.356820 141.111654 115.265187 -481.667415 -264.309207 \n",
"vzHLsD6Yft 46.186671 -592.353868 437.938846 -34.131335 -199.617239 "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Aggregate 1mm row table with 10 groups\n",
"%time df.groupby('group').sum()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"219.23547503591956"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"speed = 80000000 / 0.348 / 1024**2\n",
"speed"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**On my little Lenovo X230, pandas aggregates this data at approximately 220 MB/second, single-threaded.**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Speed to read this data out of CSV and HDF5"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Save without header for easier ingest to Parquet later\n",
"df.to_csv('test_dataset.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 280616\r\n",
"-rw-rw-r-- 1 wesm wesm 24710 Feb 21 12:53 Explorations.ipynb\r\n",
"-rw-rw-r-- 1 wesm wesm 207312465 Feb 21 12:53 test_dataset.csv\r\n",
"-rw-rw-r-- 1 wesm wesm 80007192 Feb 21 12:27 testing.hdf5\r\n"
]
}
],
"source": [
"!ls -l"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pd.read_csv?"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.24 s, sys: 180 ms, total: 2.42 s\n",
"Wall time: 2.43 s\n"
]
}
],
"source": [
"%time df = pd.read_csv('test_dataset.csv', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CSV parse performance: 85.1851851852 MB/second\n"
]
}
],
"source": [
"speed = 207 / 2.43\n",
"print(\"CSV parse performance: {0} MB/second\".format(speed))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"However, the resident in-memory size of the data is only about 90MB. So let's keep this in mind."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CSV parse performance (to-in-memory data): ~37.04 MB/second\n"
]
}
],
"source": [
"speed = 90 / 2.43\n",
"print(\"CSV parse performance (to-in-memory data): ~%.2f MB/second\" % speed)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"hdf_store = pd.HDFStore('testing.hdf5')\n",
"hdf_store['df'] = df"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 60 ms, sys: 56 ms, total: 116 ms\n",
"Wall time: 117 ms\n"
]
}
],
"source": [
"%time df3 = hdf_store['df']"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>group</th>\n",
" <th>data0</th>\n",
" <th>data1</th>\n",
" <th>data2</th>\n",
" <th>data3</th>\n",
" <th>data4</th>\n",
" <th>data5</th>\n",
" <th>data6</th>\n",
" <th>data7</th>\n",
" <th>data8</th>\n",
" <th>data9</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>0.703424</td>\n",
" <td>-1.536075</td>\n",
" <td>-0.446158</td>\n",
" <td>0.455960</td>\n",
" <td>-0.556396</td>\n",
" <td>1.218725</td>\n",
" <td>1.969005</td>\n",
" <td>-0.042579</td>\n",
" <td>0.605303</td>\n",
" <td>2.927195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-1.289577</td>\n",
" <td>0.421690</td>\n",
" <td>1.198543</td>\n",
" <td>-0.910678</td>\n",
" <td>-1.103254</td>\n",
" <td>-1.009867</td>\n",
" <td>-0.444909</td>\n",
" <td>1.100392</td>\n",
" <td>0.234861</td>\n",
" <td>-1.405328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-0.703413</td>\n",
" <td>0.191029</td>\n",
" <td>-0.475114</td>\n",
" <td>-0.882956</td>\n",
" <td>0.985129</td>\n",
" <td>1.133779</td>\n",
" <td>-0.925346</td>\n",
" <td>-0.412362</td>\n",
" <td>0.003123</td>\n",
" <td>1.309303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>0.658299</td>\n",
" <td>-1.649641</td>\n",
" <td>1.261421</td>\n",
" <td>-0.167431</td>\n",
" <td>-0.070937</td>\n",
" <td>-0.280642</td>\n",
" <td>1.333661</td>\n",
" <td>1.174445</td>\n",
" <td>0.686412</td>\n",
" <td>-1.308651</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-0.594049</td>\n",
" <td>1.683654</td>\n",
" <td>-0.935114</td>\n",
" <td>-1.750663</td>\n",
" <td>0.216827</td>\n",
" <td>0.284704</td>\n",
" <td>-0.647471</td>\n",
" <td>0.698593</td>\n",
" <td>1.379473</td>\n",
" <td>-0.286060</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" group data0 data1 data2 data3 data4 data5 \\\n",
"0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n",
"0 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n",
"0 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n",
"0 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n",
"0 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n",
"\n",
" data6 data7 data8 data9 \n",
"0 1.969005 -0.042579 0.605303 2.927195 \n",
"0 -0.444909 1.100392 0.234861 -1.405328 \n",
"0 -0.925346 -0.412362 0.003123 1.309303 \n",
"0 1.333661 1.174445 0.686412 -1.308651 \n",
"0 -0.647471 0.698593 1.379473 -0.286060 "
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3.head()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CSV parse performance (to-in-memory data): ~737.70 MB/second\n"
]
}
],
"source": [
"speed = 90 / 0.122\n",
"print(\"CSV parse performance (to-in-memory data): ~%.2f MB/second\" % speed)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Yep, **pandas HDF5 read performance is ridiculous, almost 20 times faster**. Binary columnar data storage makes an extraordinary difference."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## HiveServer2 (Hive, Impala) to pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Impala and HDFS are running locally on my laptop, so there is no network bandwidth issue here. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import ibis\n",
"ibis.options.verbose = True\n",
"data_dir = '/hs2-perf/csv-test'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SHOW DATABASES LIKE '__ibis_tmp'\n"
]
}
],
"source": [
"hdfs = ibis.hdfs_connect(port=5070)\n",
"con = ibis.impala.connect(hdfs_client=hdfs)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"hdfs.mkdir(data_dir)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'/hs2-perf/csv-test/test_dataset.csv'"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hdfs.put('/hs2-perf/csv-test/', 'test_dataset.csv')"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['test_dataset.csv']"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hdfs.ls('/hs2-perf/csv-test')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"ibis.Schema { \n",
" group string\n",
" data0 double\n",
" data1 double\n",
" data2 double\n",
" data3 double\n",
" data4 double\n",
" data5 double\n",
" data6 double\n",
" data7 double\n",
" data8 double\n",
" data9 double\n",
"}"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from ibis.impala.pandas_interop import pandas_to_ibis_schema\n",
"df.columns = ['group'] + ['data{}'.format(i) for i in range(10)]\n",
"schema = pandas_to_ibis_schema(df)\n",
"schema"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#con.create_database('hs2_perf_test')\n",
"db = con.database('hs2_perf_test')"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# csv_table = con.delimited_file(data_dir, schema)\n",
"\n",
"# Working around IMPALA-2069\n",
"query = \"\"\"\n",
"CREATE EXTERNAL TABLE hs2_perf_test.csv_table\n",
"(`group` varchar(10),\n",
" `data0` double,\n",
" `data1` double,\n",
" `data2` double,\n",
" `data3` double,\n",
" `data4` double,\n",
" `data5` double,\n",
" `data6` double,\n",
" `data7` double,\n",
" `data8` double,\n",
" `data9` double)\n",
"ROW FORMAT DELIMITED\n",
"FIELDS TERMINATED BY ','\n",
"LOCATION '/hs2-perf/csv-test'\n",
"\"\"\"\n",
"import impala\n",
"impyla_con = impala.dbapi.connect()\n",
"cur = impyla_con.cursor()\n",
"cur.execute(query)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DESCRIBE hs2_perf_test.`csv_table`\n"
]
}
],
"source": [
"csv_table = db.csv_table"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SELECT *\n",
"FROM hs2_perf_test.`csv_table`\n",
"LIMIT 5\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>group</th>\n",
" <th>data0</th>\n",
" <th>data1</th>\n",
" <th>data2</th>\n",
" <th>data3</th>\n",
" <th>data4</th>\n",
" <th>data5</th>\n",
" <th>data6</th>\n",
" <th>data7</th>\n",
" <th>data8</th>\n",
" <th>data9</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>0.703424</td>\n",
" <td>-1.536075</td>\n",
" <td>-0.446158</td>\n",
" <td>0.455960</td>\n",
" <td>-0.556396</td>\n",
" <td>1.218725</td>\n",
" <td>1.969005</td>\n",
" <td>-0.042579</td>\n",
" <td>0.605303</td>\n",
" <td>2.927195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-1.289577</td>\n",
" <td>0.421690</td>\n",
" <td>1.198543</td>\n",
" <td>-0.910678</td>\n",
" <td>-1.103254</td>\n",
" <td>-1.009867</td>\n",
" <td>-0.444909</td>\n",
" <td>1.100392</td>\n",
" <td>0.234861</td>\n",
" <td>-1.405328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-0.703413</td>\n",
" <td>0.191029</td>\n",
" <td>-0.475114</td>\n",
" <td>-0.882956</td>\n",
" <td>0.985129</td>\n",
" <td>1.133779</td>\n",
" <td>-0.925346</td>\n",
" <td>-0.412362</td>\n",
" <td>0.003123</td>\n",
" <td>1.309303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>0.658299</td>\n",
" <td>-1.649641</td>\n",
" <td>1.261421</td>\n",
" <td>-0.167431</td>\n",
" <td>-0.070937</td>\n",
" <td>-0.280642</td>\n",
" <td>1.333661</td>\n",
" <td>1.174445</td>\n",
" <td>0.686412</td>\n",
" <td>-1.308651</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-0.594049</td>\n",
" <td>1.683654</td>\n",
" <td>-0.935114</td>\n",
" <td>-1.750663</td>\n",
" <td>0.216827</td>\n",
" <td>0.284704</td>\n",
" <td>-0.647471</td>\n",
" <td>0.698593</td>\n",
" <td>1.379473</td>\n",
" <td>-0.286060</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" group data0 data1 data2 data3 data4 data5 \\\n",
"0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n",
"1 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n",
"2 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n",
"3 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n",
"4 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n",
"\n",
" data6 data7 data8 data9 \n",
"0 1.969005 -0.042579 0.605303 2.927195 \n",
"1 -0.444909 1.100392 0.234861 -1.405328 \n",
"2 -0.925346 -0.412362 0.003123 1.309303 \n",
"3 1.333661 1.174445 0.686412 -1.308651 \n",
"4 -0.647471 0.698593 1.379473 -0.286060 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csv_table[:5].execute()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DESCRIBE hs2_perf_test.`csv_as_parquet`\n",
"DROP TABLE hs2_perf_test.`csv_as_parquet`\n",
"CREATE TABLE hs2_perf_test.`csv_as_parquet`\n",
"STORED AS PARQUET\n",
"AS\n",
"SELECT *\n",
"FROM hs2_perf_test.`csv_table`\n"
]
}
],
"source": [
"db.csv_as_parquet.drop()\n",
"db.create_table('csv_as_parquet', csv_table, format='parquet')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DESCRIBE hs2_perf_test.`csv_as_parquet`\n",
"SELECT *\n",
"FROM hs2_perf_test.`csv_as_parquet`\n",
"LIMIT 5\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>group</th>\n",
" <th>data0</th>\n",
" <th>data1</th>\n",
" <th>data2</th>\n",
" <th>data3</th>\n",
" <th>data4</th>\n",
" <th>data5</th>\n",
" <th>data6</th>\n",
" <th>data7</th>\n",
" <th>data8</th>\n",
" <th>data9</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>0.703424</td>\n",
" <td>-1.536075</td>\n",
" <td>-0.446158</td>\n",
" <td>0.455960</td>\n",
" <td>-0.556396</td>\n",
" <td>1.218725</td>\n",
" <td>1.969005</td>\n",
" <td>-0.042579</td>\n",
" <td>0.605303</td>\n",
" <td>2.927195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-1.289577</td>\n",
" <td>0.421690</td>\n",
" <td>1.198543</td>\n",
" <td>-0.910678</td>\n",
" <td>-1.103254</td>\n",
" <td>-1.009867</td>\n",
" <td>-0.444909</td>\n",
" <td>1.100392</td>\n",
" <td>0.234861</td>\n",
" <td>-1.405328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-0.703413</td>\n",
" <td>0.191029</td>\n",
" <td>-0.475114</td>\n",
" <td>-0.882956</td>\n",
" <td>0.985129</td>\n",
" <td>1.133779</td>\n",
" <td>-0.925346</td>\n",
" <td>-0.412362</td>\n",
" <td>0.003123</td>\n",
" <td>1.309303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>0.658299</td>\n",
" <td>-1.649641</td>\n",
" <td>1.261421</td>\n",
" <td>-0.167431</td>\n",
" <td>-0.070937</td>\n",
" <td>-0.280642</td>\n",
" <td>1.333661</td>\n",
" <td>1.174445</td>\n",
" <td>0.686412</td>\n",
" <td>-1.308651</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>pBI24n2F3Q</td>\n",
" <td>-0.594049</td>\n",
" <td>1.683654</td>\n",
" <td>-0.935114</td>\n",
" <td>-1.750663</td>\n",
" <td>0.216827</td>\n",
" <td>0.284704</td>\n",
" <td>-0.647471</td>\n",
" <td>0.698593</td>\n",
" <td>1.379473</td>\n",
" <td>-0.286060</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" group data0 data1 data2 data3 data4 data5 \\\n",
"0 pBI24n2F3Q 0.703424 -1.536075 -0.446158 0.455960 -0.556396 1.218725 \n",
"1 pBI24n2F3Q -1.289577 0.421690 1.198543 -0.910678 -1.103254 -1.009867 \n",
"2 pBI24n2F3Q -0.703413 0.191029 -0.475114 -0.882956 0.985129 1.133779 \n",
"3 pBI24n2F3Q 0.658299 -1.649641 1.261421 -0.167431 -0.070937 -0.280642 \n",
"4 pBI24n2F3Q -0.594049 1.683654 -0.935114 -1.750663 0.216827 0.284704 \n",
"\n",
" data6 data7 data8 data9 \n",
"0 1.969005 -0.042579 0.605303 2.927195 \n",
"1 -0.444909 1.100392 0.234861 -1.405328 \n",
"2 -0.925346 -0.412362 0.003123 1.309303 \n",
"3 1.333661 1.174445 0.686412 -1.308651 \n",
"4 -0.647471 0.698593 1.379473 -0.286060 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"parquet_table = db.csv_as_parquet\n",
"parquet_table[:5].execute()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SELECT *\n",
"FROM hs2_perf_test.`csv_as_parquet`\n",
"CPU times: user 11.5 s, sys: 248 ms, total: 11.7 s\n",
"Wall time: 15.3 s\n"
]
}
],
"source": [
"%time df4 = parquet_table.execute(limit=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Wait...what? Let's go lower level to the raw HS2 connection."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import impala"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"hs2_con = impala.dbapi.connect()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 10.7 s, sys: 44 ms, total: 10.8 s\n",
"Wall time: 14.1 s\n"
]
}
],
"source": [
"def get_data(query):\n",
" cur = hs2_con.cursor()\n",
" cur.execute(query)\n",
" return cur.fetchcolumnar()\n",
"\n",
"query = \"SELECT * FROM hs2_perf_test.csv_as_parquet\"\n",
"%time tuples = get_data(query)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I can explain the reasons why this is so slow, but another time. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Spark DataFrames to pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's get the Parquet files"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DESCRIBE hs2_perf_test.`csv_as_parquet`\n",
"DESCRIBE FORMATTED hs2_perf_test.`csv_as_parquet`\n"
]
},
{
"data": {
"text/plain": [
"'hdfs://localhost:20500/test-warehouse/hs2_perf_test.db/csv_as_parquet'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"meta = db.csv_as_parquet.describe_formatted()\n",
"meta.location"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[u'564d173750e684bc-eb3958cd3946d1a7_1811129494_data.0.parq',\n",
" u'564d173750e684bc-eb3958cd3946d1a8_548402996_data.0.parq',\n",
" u'_impala_insert_staging']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hdfs.ls('/test-warehouse/hs2_perf_test.db/csv_as_parquet')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"path = hdfs.get('/test-warehouse/hs2_perf_test.db/csv_as_parquet', 'csv_parquet')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Welcome to\n",
" ____ __\n",
" / __/__ ___ _____/ /__\n",
" _\\ \\/ _ \\/ _ `/ __/ '_/\n",
" /__ / .__/\\_,_/_/ /_/\\_\\ version 1.6.0\n",
" /_/\n",
"\n",
"Using Python version 2.7.11 (default, Dec 6 2015 18:08:32)\n",
"SparkContext available as sc, SQLContext available as sqlContext.\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"\n",
"spark_home = os.environ.get('SPARK_HOME', None)\n",
"if not spark_home:\n",
" raise ValueError('SPARK_HOME environment variable is not set')\n",
"\n",
"sys.path.insert(0, os.path.join(spark_home, 'python'))\n",
"sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.9-src.zip'))\n",
"execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = sqlContext.read.format('parquet').load('csv_parquet/')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('group', 'string'),\n",
" ('data0', 'double'),\n",
" ('data1', 'double'),\n",
" ('data2', 'double'),\n",
" ('data3', 'double'),\n",
" ('data4', 'double'),\n",
" ('data5', 'double'),\n",
" ('data6', 'double'),\n",
" ('data7', 'double'),\n",
" ('data8', 'double'),\n",
" ('data9', 'double')]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8.76 s, sys: 516 ms, total: 9.28 s\n",
"Wall time: 13.6 s\n"
]
}
],
"source": [
"%time df5 = df.toPandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@OsvaldoRino
Copy link

thanks

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment