Skip to content

Instantly share code, notes, and snippets.

@roaramburu
Created January 14, 2020 22:27
Show Gist options
  • Save roaramburu/3931f565a5dd0a09692512b0a5c37e1a to your computer and use it in GitHub Desktop.
Save roaramburu/3931f565a5dd0a09692512b0a5c37e1a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Imports and BlazingContext"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BlazingContext ready\n"
]
}
],
"source": [
"import os\n",
"\n",
"import cudf\n",
"from blazingsql import BlazingContext\n",
"bc = BlazingContext()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download Data and Create Table"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2020-01-14 22:15:30-- https://blazingsql-colab.s3.amazonaws.com/tpch_sf1/lineitem/0_0_0.parquet\n",
"Resolving blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)... 52.216.114.155\n",
"Connecting to blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)|52.216.114.155|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 176921950 (169M) [application/x-www-form-urlencoded]\n",
"Saving to: ‘0_0_0.parquet’\n",
"\n",
"0_0_0.parquet 100%[===================>] 168.73M 83.3MB/s in 2.0s \n",
"\n",
"2020-01-14 22:15:37 (83.3 MB/s) - ‘0_0_0.parquet’ saved [176921950/176921950]\n",
"\n"
]
}
],
"source": [
"# Fetch the TPC-H SF1 lineitem Parquet file into the working directory.\n",
"# -nc (--no-clobber) skips the download when the file is already present, so\n",
"# re-running the notebook does not re-fetch it or leave '0_0_0.parquet.1' copies.\n",
"!wget -nc https://blazingsql-colab.s3.amazonaws.com/tpch_sf1/lineitem/0_0_0.parquet"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<pyblazing.apiv2.context.BlazingTable at 0x7fb80ec856d8>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Register the downloaded Parquet file as the BlazingSQL table 'lineitem'.\n",
"# The absolute path is built from the working directory (where wget saved the\n",
"# file) instead of hard-coding one user's home directory.\n",
"bc.create_table('lineitem', os.path.join(os.getcwd(), '0_0_0.parquet'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Write/Read ORC"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.01 s, sys: 798 ms, total: 2.81 s\n",
"Wall time: 2.35 s\n"
]
}
],
"source": [
"%%time\n",
"#BSQL Write ORC\n",
"bc.sql('select * from lineitem').to_orc('lineitem.orc', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<pyblazing.apiv2.context.BlazingTable at 0x7fb80ec97898>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Register the ORC file as the BlazingSQL table 'lineitem_orc'.\n",
"# 'lineitem.orc' was written relative to the working directory, so the absolute\n",
"# path is built from it instead of hard-coding one user's home directory.\n",
"bc.create_table('lineitem_orc', os.path.join(os.getcwd(), 'lineitem.orc'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ORC Read Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 708 ms, sys: 327 ms, total: 1.04 s\n",
"Wall time: 1.01 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>l_orderkey</th>\n",
" <th>l_partkey</th>\n",
" <th>l_suppkey</th>\n",
" <th>l_linenumber</th>\n",
" <th>l_quantity</th>\n",
" <th>l_extendedprice</th>\n",
" <th>l_discount</th>\n",
" <th>l_tax</th>\n",
" <th>l_returnflag</th>\n",
" <th>l_linestatus</th>\n",
" <th>l_shipdate</th>\n",
" <th>l_commitdate</th>\n",
" <th>l_receiptdate</th>\n",
" <th>l_shipinstruct</th>\n",
" <th>l_shipmode</th>\n",
" <th>l_comment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>155190</td>\n",
" <td>7706</td>\n",
" <td>1</td>\n",
" <td>17.0</td>\n",
" <td>21168.230469</td>\n",
" <td>0.04</td>\n",
" <td>0.02</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-03-13</td>\n",
" <td>1996-02-12</td>\n",
" <td>1996-03-22</td>\n",
" <td>DELIVER IN PERSON</td>\n",
" <td>TRUCK</td>\n",
" <td>egular courts above the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>67310</td>\n",
" <td>7311</td>\n",
" <td>2</td>\n",
" <td>36.0</td>\n",
" <td>45983.160156</td>\n",
" <td>0.09</td>\n",
" <td>0.06</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-04-12</td>\n",
" <td>1996-02-28</td>\n",
" <td>1996-04-20</td>\n",
" <td>TAKE BACK RETURN</td>\n",
" <td>MAIL</td>\n",
" <td>ly final dependencies: slyly bold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>63700</td>\n",
" <td>3701</td>\n",
" <td>3</td>\n",
" <td>8.0</td>\n",
" <td>13309.599609</td>\n",
" <td>0.10</td>\n",
" <td>0.02</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-01-29</td>\n",
" <td>1996-03-05</td>\n",
" <td>1996-01-31</td>\n",
" <td>TAKE BACK RETURN</td>\n",
" <td>REG AIR</td>\n",
" <td>riously. regular, express dep</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2132</td>\n",
" <td>4633</td>\n",
" <td>4</td>\n",
" <td>28.0</td>\n",
" <td>28955.640625</td>\n",
" <td>0.09</td>\n",
" <td>0.06</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-04-21</td>\n",
" <td>1996-03-30</td>\n",
" <td>1996-05-16</td>\n",
" <td>NONE</td>\n",
" <td>AIR</td>\n",
" <td>lites. fluffily even de</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>24027</td>\n",
" <td>1534</td>\n",
" <td>5</td>\n",
" <td>24.0</td>\n",
" <td>22824.480469</td>\n",
" <td>0.10</td>\n",
" <td>0.04</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-03-30</td>\n",
" <td>1996-03-14</td>\n",
" <td>1996-04-01</td>\n",
" <td>NONE</td>\n",
" <td>FOB</td>\n",
" <td>pending foxes. slyly re</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" l_orderkey l_partkey l_suppkey l_linenumber l_quantity \\\n",
"0 1 155190 7706 1 17.0 \n",
"1 1 67310 7311 2 36.0 \n",
"2 1 63700 3701 3 8.0 \n",
"3 1 2132 4633 4 28.0 \n",
"4 1 24027 1534 5 24.0 \n",
"\n",
" l_extendedprice l_discount l_tax l_returnflag l_linestatus l_shipdate \\\n",
"0 21168.230469 0.04 0.02 N O 1996-03-13 \n",
"1 45983.160156 0.09 0.06 N O 1996-04-12 \n",
"2 13309.599609 0.10 0.02 N O 1996-01-29 \n",
"3 28955.640625 0.09 0.06 N O 1996-04-21 \n",
"4 22824.480469 0.10 0.04 N O 1996-03-30 \n",
"\n",
" l_commitdate l_receiptdate l_shipinstruct l_shipmode \\\n",
"0 1996-02-12 1996-03-22 DELIVER IN PERSON TRUCK \n",
"1 1996-02-28 1996-04-20 TAKE BACK RETURN MAIL \n",
"2 1996-03-05 1996-01-31 TAKE BACK RETURN REG AIR \n",
"3 1996-03-30 1996-05-16 NONE AIR \n",
"4 1996-03-14 1996-04-01 NONE FOB \n",
"\n",
" l_comment \n",
"0 egular courts above the \n",
"1 ly final dependencies: slyly bold \n",
"2 riously. regular, express dep \n",
"3 lites. fluffily even de \n",
"4 pending foxes. slyly re "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# BSQL Read ORC: time a full scan of the ORC-backed table 'lineitem_orc'.\n",
"# ('lineitem' is backed by the Parquet file, so querying it would not\n",
"# measure ORC read performance.)\n",
"data = bc.sql('select * from lineitem_orc')\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 181 ms, sys: 35.1 ms, total: 217 ms\n",
"Wall time: 215 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>l_orderkey</th>\n",
" <th>l_partkey</th>\n",
" <th>l_suppkey</th>\n",
" <th>l_linenumber</th>\n",
" <th>l_quantity</th>\n",
" <th>l_extendedprice</th>\n",
" <th>l_discount</th>\n",
" <th>l_tax</th>\n",
" <th>l_returnflag</th>\n",
" <th>l_linestatus</th>\n",
" <th>l_shipdate</th>\n",
" <th>l_commitdate</th>\n",
" <th>l_receiptdate</th>\n",
" <th>l_shipinstruct</th>\n",
" <th>l_shipmode</th>\n",
" <th>l_comment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>155190</td>\n",
" <td>7706</td>\n",
" <td>1</td>\n",
" <td>17.0</td>\n",
" <td>21168.230469</td>\n",
" <td>0.04</td>\n",
" <td>0.02</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-03-13</td>\n",
" <td>1996-02-12</td>\n",
" <td>1996-03-22</td>\n",
" <td>DELIVER IN PERSON</td>\n",
" <td>TRUCK</td>\n",
" <td>egular courts above the</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>67310</td>\n",
" <td>7311</td>\n",
" <td>2</td>\n",
" <td>36.0</td>\n",
" <td>45983.160156</td>\n",
" <td>0.09</td>\n",
" <td>0.06</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-04-12</td>\n",
" <td>1996-02-28</td>\n",
" <td>1996-04-20</td>\n",
" <td>TAKE BACK RETURN</td>\n",
" <td>MAIL</td>\n",
" <td>ly final dependencies: slyly bold</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>63700</td>\n",
" <td>3701</td>\n",
" <td>3</td>\n",
" <td>8.0</td>\n",
" <td>13309.599609</td>\n",
" <td>0.10</td>\n",
" <td>0.02</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-01-29</td>\n",
" <td>1996-03-05</td>\n",
" <td>1996-01-31</td>\n",
" <td>TAKE BACK RETURN</td>\n",
" <td>REG AIR</td>\n",
" <td>riously. regular, express dep</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2132</td>\n",
" <td>4633</td>\n",
" <td>4</td>\n",
" <td>28.0</td>\n",
" <td>28955.640625</td>\n",
" <td>0.09</td>\n",
" <td>0.06</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-04-21</td>\n",
" <td>1996-03-30</td>\n",
" <td>1996-05-16</td>\n",
" <td>NONE</td>\n",
" <td>AIR</td>\n",
" <td>lites. fluffily even de</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>24027</td>\n",
" <td>1534</td>\n",
" <td>5</td>\n",
" <td>24.0</td>\n",
" <td>22824.480469</td>\n",
" <td>0.10</td>\n",
" <td>0.04</td>\n",
" <td>N</td>\n",
" <td>O</td>\n",
" <td>1996-03-30</td>\n",
" <td>1996-03-14</td>\n",
" <td>1996-04-01</td>\n",
" <td>NONE</td>\n",
" <td>FOB</td>\n",
" <td>pending foxes. slyly re</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" l_orderkey l_partkey l_suppkey l_linenumber l_quantity \\\n",
"0 1 155190 7706 1 17.0 \n",
"1 1 67310 7311 2 36.0 \n",
"2 1 63700 3701 3 8.0 \n",
"3 1 2132 4633 4 28.0 \n",
"4 1 24027 1534 5 24.0 \n",
"\n",
" l_extendedprice l_discount l_tax l_returnflag l_linestatus l_shipdate \\\n",
"0 21168.230469 0.04 0.02 N O 1996-03-13 \n",
"1 45983.160156 0.09 0.06 N O 1996-04-12 \n",
"2 13309.599609 0.10 0.02 N O 1996-01-29 \n",
"3 28955.640625 0.09 0.06 N O 1996-04-21 \n",
"4 22824.480469 0.10 0.04 N O 1996-03-30 \n",
"\n",
" l_commitdate l_receiptdate l_shipinstruct l_shipmode \\\n",
"0 1996-02-12 1996-03-22 DELIVER IN PERSON TRUCK \n",
"1 1996-02-28 1996-04-20 TAKE BACK RETURN MAIL \n",
"2 1996-03-05 1996-01-31 TAKE BACK RETURN REG AIR \n",
"3 1996-03-30 1996-05-16 NONE AIR \n",
"4 1996-03-14 1996-04-01 NONE FOB \n",
"\n",
" l_comment \n",
"0 egular courts above the \n",
"1 ly final dependencies: slyly bold \n",
"2 riously. regular, express dep \n",
"3 lites. fluffily even de \n",
"4 pending foxes. slyly re "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"#cuDF Read ORC\n",
"data = cudf.read_orc('lineitem.orc')\n",
"data.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment