Skip to content

Instantly share code, notes, and snippets.

@FavioVazquez
Created July 19, 2019 17:57
Show Gist options
  • Save FavioVazquez/576e91e2f26b55e7f52ecb2acd2deaa7 to your computer and use it in GitHub Desktop.
Save FavioVazquez/576e91e2f26b55e7f52ecb2acd2deaa7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import optimus and pandas"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"from optimus import Optimus\n",
"from pyspark.sql.functions import *\n",
"import pandas as pd\n",
"import numpy as np\n",
"op = Optimus()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Read the data"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'\n",
" \n",
"chipo_pd = pd.read_csv(url, sep = '\\t')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"order_id int64\n",
"quantity int64\n",
"item_name object\n",
"choice_description object\n",
"item_price object\n",
"dtype: object"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chipo_pd.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### We need to to this to be able to read the data from Spark"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"chipo_pd[['item_name', 'choice_description', 'item_price']] = chipo_pd[['item_name', 'choice_description', 'item_price']].astype(str)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Transform data from Pandas to Optimus (Spark)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"chipo = op.spark.createDataFrame(chipo_pd)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# See the first 10 entries"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 10 of 4622 rows / 5 columns</div>\n",
"<div class=\"info_items\">4 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">order_id</div>\n",
" <div class=\"data_type\">1 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">quantity</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">item_name</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">choice_description</div>\n",
" <div class=\"data_type\">4 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">item_price</div>\n",
" <div class=\"data_type\">5 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Chips⸱and⸱Fresh⸱Tomato⸱Salsa'>Chips⸱and⸱Fresh⸱Tomato⸱Salsa</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='nan'>nan</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$2.39⸱'>$2.39⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Izze'>Izze</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='[Clementine]'>[Clementine]</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$3.39⸱'>$3.39⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Nantucket⸱Nectar'>Nantucket⸱Nectar</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='[Apple]'>[Apple]</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$3.39⸱'>$3.39⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Chips⸱and⸱Tomatillo-Green⸱Chili⸱Salsa'>Chips⸱and⸱Tomatillo-Green⸱Chili⸱Salsa</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='nan'>nan</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$2.39⸱'>$2.39⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='2'>2</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='2'>2</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Chicken⸱Bowl'>Chicken⸱Bowl</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='[Tomatillo-Red⸱Chili⸱Salsa⸱(Hot),⸱[Black⸱Beans,⸱Rice,⸱Cheese,⸱Sour⸱Cream]]'>[Tomatillo-Red⸱Chili⸱Salsa⸱(Hot),⸱[Black⸱Beans,⸱Rice,⸱Cheese,⸱Sour⸱Cream]]</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$16.98⸱'>$16.98⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='3'>3</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Chicken⸱Bowl'>Chicken⸱Bowl</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='[Fresh⸱Tomato⸱Salsa⸱(Mild),⸱[Rice,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]'>[Fresh⸱Tomato⸱Salsa⸱(Mild),⸱[Rice,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$10.98⸱'>$10.98⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='3'>3</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Side⸱of⸱Chips'>Side⸱of⸱Chips</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='nan'>nan</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$1.69⸱'>$1.69⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='4'>4</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Steak⸱Burrito'>Steak⸱Burrito</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='[Tomatillo⸱Red⸱Chili⸱Salsa,⸱[Fajita⸱Vegetables,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]'>[Tomatillo⸱Red⸱Chili⸱Salsa,⸱[Fajita⸱Vegetables,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Guacamole,⸱Lettuce]]</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$11.75⸱'>$11.75⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='4'>4</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Steak⸱Soft⸱Tacos'>Steak⸱Soft⸱Tacos</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='[Tomatillo⸱Green⸱Chili⸱Salsa,⸱[Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]'>[Tomatillo⸱Green⸱Chili⸱Salsa,⸱[Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$9.25⸱'>$9.25⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='5'>5</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Steak⸱Burrito'>Steak⸱Burrito</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='[Fresh⸱Tomato⸱Salsa,⸱[Rice,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]'>[Fresh⸱Tomato⸱Salsa,⸱[Rice,⸱Black⸱Beans,⸱Pinto⸱Beans,⸱Cheese,⸱Sour⸱Cream,⸱Lettuce]]</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$9.25⸱'>$9.25⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 10 of 4622 rows / 5 columns</div>\n",
"<div class=\"info_items\">4 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"chipo.table(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# What is the number of observations and columns in the dataset?"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4622"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chipo.count()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'cols_count': 5,\n",
" 'rows_count': 4622,\n",
" 'missing_count': '0.0%',\n",
" 'size': '-1 Bytes'}"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"op.profiler.dataset_info(chipo)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Print the name of all the columns"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chipo.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**NOTE: Spark dataframes are not indexed.**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Which was the most-ordered item and how many items were ordered?"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 50 rows / 2 columns</div>\n",
"<div class=\"info_items\">41 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">item_name</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">quantity</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='Chicken⸱Bowl'>Chicken⸱Bowl</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='761'>761</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 50 rows / 2 columns</div>\n",
"<div class=\"info_items\">41 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Here we are renaming the column sum(quantity) to quantity with Optimus function rename inside of cols\n",
"(chipo.groupby(\"item_name\")\n",
" .sum(\"quantity\")\n",
" .cols.rename(\"sum(quantity)\", \"quantity\")\n",
" .sort(desc(\"quantity\"))\n",
" .table(1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# What was the most ordered item in the choice_description column"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 1044 rows / 2 columns</div>\n",
"<div class=\"info_items\">21 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">choice_description</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">quantity</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='nan'>nan</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1382'>1382</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 1044 rows / 2 columns</div>\n",
"<div class=\"info_items\">21 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"(chipo.groupby(\"choice_description\")\n",
" .sum(\"quantity\")\n",
" .cols.rename(\"sum(quantity)\", \"quantity\")\n",
" .sort(desc(\"quantity\"))\n",
" .table(1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here we have a problem, is showing that nan was the most order item from `choice_desccription`. Let's solve that:"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 1043 rows / 2 columns</div>\n",
"<div class=\"info_items\">20 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">choice_description</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">quantity</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='[Diet⸱Coke]'>[Diet⸱Coke]</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='159'>159</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 1043 rows / 2 columns</div>\n",
"<div class=\"info_items\">20 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# First we are transforming \"nan\" strings to real nulls, and then droping them\n",
"(chipo.cols.replace(\"choice_description\",\"nan\")\n",
" .dropna()\n",
" .groupby(\"choice_description\")\n",
" .sum(\"quantity\")\n",
" .cols.rename(\"sum(quantity)\", \"quantity\")\n",
" .sort(desc(\"quantity\"))\n",
" .table(1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How many items were orderd in total?"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4972"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chipo.cols.sum(\"quantity\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Turn the item price into a float"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('order_id', 'bigint'),\n",
" ('quantity', 'bigint'),\n",
" ('item_name', 'string'),\n",
" ('choice_description', 'string'),\n",
" ('item_price', 'string')]"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chipo.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 4622 rows / 5 columns</div>\n",
"<div class=\"info_items\">4 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">order_id</div>\n",
" <div class=\"data_type\">1 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">quantity</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">item_name</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">choice_description</div>\n",
" <div class=\"data_type\">4 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">item_price</div>\n",
" <div class=\"data_type\">5 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Chips⸱and⸱Fresh⸱Tomato⸱Salsa'>Chips⸱and⸱Fresh⸱Tomato⸱Salsa</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='nan'>nan</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='$2.39⸱'>$2.39⸱</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 4622 rows / 5 columns</div>\n",
"<div class=\"info_items\">4 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"## Let's see the format of the price\n",
"chipo.table(1)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"# Use substr (like in SQL) to get from the first numer to the end and then cast it\n",
"chipo = chipo.withColumn(\"item_price\", chipo.item_price.substr(2,10).cast(\"float\"))"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>\n",
" .data_type {\n",
" font-size: 0.8em;\n",
" font-weight: normal;\n",
" }\n",
"\n",
" .column_name {\n",
" font-size: 1.2em;\n",
" }\n",
"\n",
" .info_items {\n",
" margin: 10px 0;\n",
" font-size: 0.8em;\n",
" }\n",
"\n",
" .optimus_table td {\n",
" padding: 2px;\n",
" border-left: 1px solid #cccccc;\n",
" border-right: 1px solid #cccccc;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(even) {\n",
" background-color: #f2f2f2 !important;\n",
" }\n",
"\n",
" .optimus_table tr:nth-child(odd) {\n",
" background-color: #ffffff !important;\n",
" }\n",
"\n",
" .optimus_table thead {\n",
" border-bottom: 1px solid black;\n",
" }\n",
" .optimus_table{\n",
" font-size: 12px;\n",
" }\n",
"\n",
" .optimus_table tbody{\n",
" font-family: monospace;\n",
" border-bottom: 1px solid #cccccc;\n",
" }\n",
"\n",
"\n",
"</style>\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 2 of 4622 rows / 5 columns</div>\n",
"<div class=\"info_items\">4 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">order_id</div>\n",
" <div class=\"data_type\">1 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">quantity</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">item_name</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">choice_description</div>\n",
" <div class=\"data_type\">4 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">item_price</div>\n",
" <div class=\"data_type\">5 (float)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Chips⸱and⸱Fresh⸱Tomato⸱Salsa'>Chips⸱and⸱Fresh⸱Tomato⸱Salsa</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='nan'>nan</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='2.390000104904175'>2.390000104904175</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='1'>1</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='Izze'>Izze</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='[Clementine]'>[Clementine]</div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div title='3.390000104904175'>3.390000104904175</div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"<div class=\"info_items\">Viewing 2 of 4622 rows / 5 columns</div>\n",
"<div class=\"info_items\">4 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Let's see our data now\n",
"chipo.table(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How much was the revenue for the period in the dataset?"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Revenue was: $39237.02\n"
]
}
],
"source": [
"# The function mul takes two or more columns and multiples them\n",
"# The function sum will sum the values in a specific column\n",
"revenue = (chipo.cols.mul(columns=[\"quantity\", \"item_price\"])\n",
" .cols.sum(\"mul\"))\n",
"print('Revenue was: $' + str(np.round(revenue,2)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How many orders were made in the period?"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1834"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chipo.select(\"order_id\").distinct().count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# What is the average revenue amount per order?"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"21.39423"
]
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(chipo.cols.mul(columns=[\"quantity\", \"item_price\"])\n",
" .cols.rename(\"mul\", \"revenue\")\n",
" .groupby(\"order_id\").sum(\"revenue\")\n",
" .cols.rename(\"sum(revenue)\", \"revenue\")\n",
" .cols.mean(\"revenue\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How many different items are sold?"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"50"
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chipo.select(\"item_name\").distinct().count()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment