Skip to content

Instantly share code, notes, and snippets.

@roaramburu
Created October 22, 2019 16:33
Show Gist options
  • Save roaramburu/3e265b01821eba608f85b6902f32f626 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BlazingSQL TPC-H Benchmark Notebook\n",
"\n",
"In this notebook we will setup a distributed BlazingSQL cluster with Dask running cuDF on every node as well. We will query data residing on Google Cloud Storage (GCS) in Apache Parquet files.\n",
"\n",
"First we will import necessary packages and launch the distributed services."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports: BlazingContext drives BlazingSQL; Client connects to the Dask scheduler.\n",
"from blazingsql import BlazingContext\n",
"from dask.distributed import Client\n",
"import time\n",
"\n",
"\n",
"# Attach to the Dask scheduler on this host, then bind BlazingSQL to that cluster.\n",
"# NOTE(review): run_engine=False presumably assumes engine processes were started\n",
"# out-of-band on the workers -- confirm against the BlazingSQL deployment docs.\n",
"client = Client('127.0.0.1:8786')\n",
"bc = BlazingContext(logs_destination=\"test_logs\", dask_client=client, run_engine=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Tables\n",
"\n",
"Here we are registering a Google Cloud Storage bucket, and then creating tables on the Parquet files inside said bucket."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<pyblazing.apiv2.sql.Table at 0x7fe64eff1a50>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Register the GCS bucket with BlazingSQL, then build the base path to the SF100 TPC-H data.\n",
"bc.gcs('gcs_100gb',project_id='blazingsql-enduser', bucket_name='bsql')\n",
"dir_data_fs = \"gcs://gcs_100gb\" + \"/\" + \"tpch_sf100\"\n",
"\n",
"# Single-file tables are created directly from their one parquet part.\n",
"bc.create_table('nation',dir_data_fs+'/nation/0_0_0.parquet')\n",
"bc.create_table('region',dir_data_fs+'/region/0_0_0.parquet')\n",
"\n",
"# Multi-part tables: each is split across n_parts files named 0_0_<i>.parquet.\n",
"# One loop replaces the five copy-pasted list comprehensions that were here before.\n",
"multi_part_tables = {'orders': 16, 'lineitem': 72, 'customer': 4, 'part': 2, 'partsupp': 6}\n",
"for table_name, n_parts in multi_part_tables.items():\n",
"    files = [dir_data_fs + '/' + table_name + '/0_0_' + str(num) + '.parquet' for num in range(0, n_parts)]\n",
"    bc.create_table(table_name, files)\n",
"\n",
"bc.create_table('supplier',dir_data_fs+'/supplier/0_0_0.parquet')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run Queries\n",
"\n",
"### Q3"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TEST_03\n",
" l_orderkey revenue o_orderdate o_shippriority\n",
"0 249739810 492136.87500 1995-02-28 0\n",
"1 165214338 485645.53125 1995-03-09 0\n",
"2 461984355 468002.50000 1995-03-13 0\n",
"3 100810368 467932.78125 1995-02-28 0\n",
"4 74015523 467786.81250 1995-03-14 0\n",
"query time\n",
"6.554334878921509\n"
]
}
],
"source": [
"# TPC-H Q3 (shipping priority): revenue of not-yet-shipped orders for the BUILDING segment.\n",
"t0 = time.time()\n",
"queryId = 'TEST_03'\n",
"print(queryId)\n",
"query = \"\"\" select \n",
" l.l_orderkey, sum(l.l_extendedprice * (1 - l.l_discount)) as revenue, o.o_orderdate, o.o_shippriority\n",
" from \n",
" orders as o\n",
" INNER JOIN lineitem as l ON l.l_orderkey = o.o_orderkey\n",
" INNER JOIN customer as c ON c.c_custkey = o.o_custkey\n",
" where\n",
" c.c_mktsegment = 'BUILDING'\n",
" and o.o_orderdate < date '1995-03-15' \n",
" and l.l_shipdate > date '1995-03-15'\n",
" group by\n",
" l.l_orderkey, o.o_orderdate, o.o_shippriority\n",
" order by\n",
" revenue desc, o.o_orderdate\"\"\"\n",
"# NOTE(review): .get() presumably materializes the distributed result -- confirm API.\n",
"result = bc.sql(query).get()\n",
"# t2 is captured before printing so the reported time covers only the query itself.\n",
"t2 = time.time()\n",
"print(result.head())\n",
"print(\"query time\")\n",
"print(t2 - t0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q4"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TEST_04\n",
" o_orderpriority order_count\n",
"0 1-URGENT 5225078\n",
"query time\n",
"7.418049335479736\n"
]
}
],
"source": [
"# TPC-H Q4 (order priority checking): orders per priority that have at least one late lineitem.\n",
"t0 = time.time()\n",
"queryId = 'TEST_04'\n",
"print(queryId)\n",
"query = \"\"\" select\n",
" o_orderpriority, count(*) as order_count\n",
" from\n",
" orders\n",
" where\n",
" o_orderdate >= date '1993-07-01'\n",
" and o_orderdate < date '1994-10-01'\n",
" and exists (select\n",
" *\n",
" from\n",
" lineitem\n",
" where\n",
" l_orderkey = o_orderkey\n",
" and l_commitdate < l_receiptdate)\n",
" group by\n",
" o_orderpriority \n",
" order by \n",
" o_orderpriority\"\"\"\n",
"result = bc.sql(query).get()\n",
"# Bug fix: t2 is now taken before printing (matching the other query cells), and\n",
"# head() takes a single row count -- head(2,2) passes an invalid second argument.\n",
"t2 = time.time()\n",
"print(result.head(2))\n",
"print(\"query time\")\n",
"print(t2 - t0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q5"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TEST_05\n",
"query time\n",
"12.486339330673218\n",
" n_name revenue\n",
"0 JAPAN 3.832385e+09\n"
]
}
],
"source": [
"# TPC-H Q5 (local supplier volume): 1994 revenue by nation within ASIA, where the\n",
"# supplier and the customer are in the same nation.\n",
"t0 = time.time()\n",
"queryId = 'TEST_05'\n",
"print(queryId)\n",
"query = \"\"\" select\n",
" n_name, sum(l_extendedprice * (1 - l_discount)) as revenue\n",
" from orders\n",
" inner join lineitem on l_orderkey = o_orderkey\n",
" inner join customer on o_custkey = c_custkey\n",
" inner join supplier on l_suppkey = s_suppkey and c_nationkey = s_nationkey\n",
" inner join nation on n_nationkey = s_nationkey\n",
" inner join region on n_regionkey = r_regionkey\n",
" where\n",
" r_name = 'ASIA' \n",
" and o_orderdate >= date '1994-01-01'\n",
" and o_orderdate < date '1995-01-01' \n",
" group by\n",
" n_name\n",
" order by\n",
" revenue desc\"\"\"\n",
"result = bc.sql(query).get()\n",
"# Timing is captured before any printing so it covers only the query execution.\n",
"t2 = time.time()\n",
"print(\"query time\")\n",
"print(t2 - t0)\n",
"print(result.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q6"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TEST_06\n",
"query time\n",
"2.7351667881011963\n",
" revenue\n",
"0 7.534758e+09\n"
]
}
],
"source": [
"# TPC-H Q6 (forecasting revenue change): revenue that would be gained by dropping\n",
"# small discounts on low-quantity lineitems shipped in 1994.\n",
"t0 = time.time()\n",
"queryId = 'TEST_06'\n",
"print(queryId)\n",
"query = \"\"\" select\n",
" sum(l_extendedprice*l_discount) as revenue\n",
" from\n",
" lineitem\n",
" where\n",
" l_shipdate >= date '1994-01-01' \n",
" and l_shipdate < date '1995-01-01'\n",
" and l_discount between 0.05 and 0.07 \n",
" and l_quantity < 24\"\"\"\n",
"result = bc.sql(query).get()\n",
"\n",
"# Timing is captured before printing so it covers only the query execution.\n",
"t2 = time.time()\n",
"print(\"query time\")\n",
"print(t2 - t0)\n",
"print(result.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q8"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TPC-H Q8 (national market share): BRAZIL's share of AMERICA revenue for\n",
"# ECONOMY ANODIZED STEEL parts, per order year 1995-1996.\n",
"t0 = time.time()\n",
"queryId = 'TEST_08'\n",
"print(queryId)\n",
"query = \"\"\" select \n",
" o_year, sum(case when nationl = 'BRAZIL' then volume else 0 end) / sum(volume) as mkt_share\n",
" from (\n",
" select \n",
" extract(year from o.o_orderdate) as o_year, l.l_extendedprice * (1 - l.l_discount) as volume, n2.n_name as nationl\n",
" from \n",
" part as p\n",
" INNER JOIN lineitem as l ON p.p_partkey = l.l_partkey\n",
" INNER JOIN supplier as s ON s.s_suppkey = l.l_suppkey\n",
" INNER JOIN orders as o ON o.o_orderkey = l.l_orderkey \n",
" INNER JOIN customer as c ON c.c_custkey = o.o_custkey\n",
" INNER JOIN nation as n1 ON n1.n_nationkey = c.c_nationkey \n",
" INNER JOIN region as r ON r.r_regionkey = n1.n_regionkey\n",
" INNER JOIN nation as n2 ON n2.n_nationkey = s.s_nationkey\n",
" where \n",
" r.r_name = 'AMERICA' \n",
" and o.o_orderdate >= date '1995-01-01' and o.o_orderdate <= date '1996-12-31'\n",
" and p.p_type = 'ECONOMY ANODIZED STEEL' ) as all_nations\n",
" group by\n",
" o_year\n",
" order by\n",
" o_year\"\"\"\n",
"result = bc.sql(query).get()\n",
"# Timing is captured before printing so it covers only the query execution.\n",
"t2 = time.time()\n",
"print(\"query time\")\n",
"print(t2 - t0)\n",
"print(result.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q9"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TPC-H Q9 (product type profit): profit by nation and year for parts whose name\n",
"# matches '%green%'.\n",
"t0 = time.time()\n",
"queryId = 'TEST_09'\n",
"print(queryId)\n",
"query = \"\"\" select\n",
" nationl, o_year, sum(amount) as sum_profit\n",
" from\n",
" ( select n_name as nationl, extract(year from o_orderdate) as o_year,\n",
" l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount\n",
" from lineitem\n",
" INNER JOIN orders ON o_orderkey = l_orderkey\n",
" INNER JOIN partsupp ON ps_suppkey = l_suppkey and ps_partkey = l_partkey\n",
" INNER JOIN part ON p_partkey = l_partkey\n",
" INNER JOIN supplier ON s_suppkey = l_suppkey\n",
" INNER JOIN nation ON n_nationkey = s_nationkey \n",
" where\n",
" p_name like '%green%' ) as profit\n",
" group by \n",
" nationl, o_year \n",
" order by \n",
" nationl, o_year desc\"\"\"\n",
"result = bc.sql(query).get()\n",
"# Bug fix: t2 was never assigned in this cell, so the printed time used a stale t2\n",
"# left over from an earlier cell (or raised NameError on a fresh kernel).\n",
"t2 = time.time()\n",
"print(\"query time\")\n",
"print(t2 - t0)\n",
"print(result.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Q13"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TPC-H Q13 (customer distribution): number of customers per order count,\n",
"# including customers with zero qualifying orders.\n",
"t0 = time.time()\n",
"queryId = 'TEST_13'\n",
"print(queryId)\n",
"# Bug fix: the comment filter belongs in the LEFT OUTER JOIN's ON clause (per the\n",
"# TPC-H Q13 definition); filtering in WHERE discarded customers with no orders.\n",
"query = \"\"\" select\n",
" c_count, count(*) as custdist\n",
" from\n",
" (select\n",
" c.c_custkey, count(o.o_orderkey)\n",
" from\n",
" customer as c\n",
" LEFT OUTER JOIN orders as o ON c.c_custkey = o.o_custkey\n",
" and o.o_comment not like '%special%requests%'\n",
" group by\n",
" c.c_custkey) as c_orders (c_custkey, c_count)\n",
" group by\n",
" c_count\n",
" order by\n",
" custdist desc, c_count desc\"\"\"\n",
"result = bc.sql(query).get()\n",
"# Timing added for consistency with the other benchmark cells.\n",
"t2 = time.time()\n",
"print(\"query time\")\n",
"print(t2 - t0)\n",
"print(result.head())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment