Skip to content

Instantly share code, notes, and snippets.

@FavioVazquez
Created July 19, 2019 19:29
Show Gist options
  • Save FavioVazquez/6b1b4e5077c8a818cf301500d896150e to your computer and use it in GitHub Desktop.
Save FavioVazquez/6b1b4e5077c8a818cf301500d896150e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# US - Baby Names\n",
"\n",
"## Introduction:\n",
"\n",
"We are going to use a subset of [US Baby Names from](https://www.kaggle.com/kaggle/us-baby-names) Kaggle.\n",
"\n",
"In the file it will be names from 2004 until 2014"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"# Import the necessary libraries\n",
"from optimus import Optimus\n",
"from pyspark.sql.functions import *\n",
"import pandas as pd\n",
"import numpy as np\n",
"op = Optimus()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import the dataset and assing baby_names"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"baby_names_pd = pd.read_csv('https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/06_Stats/US_Baby_Names/US_Baby_Names_right.csv')"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"baby_names = op.spark.createDataFrame(baby_names_pd)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<h1></h1>\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 5 of 5 rows / 8 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">summary</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Unnamed: 0</div>\n",
" <div class=\"data_type\">2 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Id</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Name</div>\n",
" <div class=\"data_type\">4 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Year</div>\n",
" <div class=\"data_type\">5 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Gender</div>\n",
" <div class=\"data_type\">6 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">State</div>\n",
" <div class=\"data_type\">7 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Count</div>\n",
" <div class=\"data_type\">8 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='count'>count\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='1016395'>1016395\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='1016395'>1016395\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='1016395'>1016395\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='1016395'>1016395\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='1016395'>1016395\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='1016395'>1016395\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='1016395'>1016395\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='mean'>mean\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2830990.4619178567'>2830990.4619178567\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2830991.4619178567'>2830991.4619178567\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Infinity'>Infinity\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2009.0531899507573'>2009.0531899507573\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\"none \"\n",
" title='None'>None\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\"none \"\n",
" title='None'>None\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='34.85012421351935'>34.85012421351935\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='stddev'>stddev\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='1652475.6514804524'>1652475.6514804524\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='1652475.6514804524'>1652475.6514804524\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='NaN'>NaN\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='3.1382928281815494'>3.1382928281815494\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\"none \"\n",
" title='None'>None\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\"none \"\n",
" title='None'>None\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='97.3973464861767'>97.3973464861767\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='min'>min\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11349'>11349\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11350'>11350\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Aaban'>Aaban\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='5'>5\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='max'>max\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='5647425'>5647425\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='5647426'>5647426\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Zyriah'>Zyriah\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2014'>2014\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='M'>M\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='WY'>WY\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='4167'>4167\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 5 of 5 rows / 8 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"baby_names.describe().table()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# See the first 10 entries"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<h1></h1>\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 10 of 1.0 million rows / 7 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Unnamed: 0</div>\n",
" <div class=\"data_type\">1 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Id</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Name</div>\n",
" <div class=\"data_type\">3 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Year</div>\n",
" <div class=\"data_type\">4 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Gender</div>\n",
" <div class=\"data_type\">5 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">State</div>\n",
" <div class=\"data_type\">6 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Count</div>\n",
" <div class=\"data_type\">7 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11349'>11349\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11350'>11350\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Emma'>Emma\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='62'>62\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11350'>11350\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11351'>11351\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Madison'>Madison\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='48'>48\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11351'>11351\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11352'>11352\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Hannah'>Hannah\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='46'>46\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11352'>11352\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11353'>11353\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Grace'>Grace\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='44'>44\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11353'>11353\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11354'>11354\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Emily'>Emily\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='41'>41\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11354'>11354\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11355'>11355\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Abigail'>Abigail\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='37'>37\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11355'>11355\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11356'>11356\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Olivia'>Olivia\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='33'>33\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11356'>11356\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11357'>11357\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Isabella'>Isabella\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='30'>30\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11357'>11357\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11358'>11358\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Alyssa'>Alyssa\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='29'>29\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11358'>11358\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11359'>11359\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Sophia'>Sophia\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2004'>2004\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='AK'>AK\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='28'>28\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 10 of 1.0 million rows / 7 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"baby_names.table(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Delete the column 'Unnamed: 0' and 'Id'"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"baby_names = baby_names.drop(\"Unnamed: 0\", \"Id\")"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-59-e3c19713dd04>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbaby_names\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/.local/lib/python3.6/site-packages/optimus/helpers/decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.6/site-packages/optimus/dataframe/extension.py\u001b[0m in \u001b[0;36mtable\u001b[0;34m(self, limit, columns)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdtypes\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 334\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 335\u001b[0m \u001b[0mfinal_columns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.6/site-packages/optimus/helpers/decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.6/site-packages/optimus/dataframe/extension.py\u001b[0m in \u001b[0;36mtable_html\u001b[0;34m(self, limit, columns)\u001b[0m\n\u001b[1;32m 313\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 314\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 315\u001b[0;31m \"\"\"\n\u001b[0m\u001b[1;32m 316\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.6/site-packages/pyspark/sql/dataframe.py\u001b[0m in \u001b[0;36mcount\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 521\u001b[0m \"\"\"\n\u001b[0;32m--> 522\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 523\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mignore_unicode_prefix\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.6/site-packages/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1253\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEND_COMMAND_PART\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1255\u001b[0;31m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[1;32m 1257\u001b[0m answer, self.gateway_client, self.target_id, self.name)\n",
"\u001b[0;32m~/.local/lib/python3.6/site-packages/py4j/java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[0;34m(self, command, retry, binary)\u001b[0m\n\u001b[1;32m 983\u001b[0m \u001b[0mconnection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 984\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 985\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 986\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbinary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 987\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_create_connection_guard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.6/site-packages/py4j/java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[0;34m(self, command)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msmart_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Answer received: {0}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0manswer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mproto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRETURN_MESSAGE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.6/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 586\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 587\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"baby_names.table(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Are there more male or female names in the dataset?"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<h1></h1>\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 2 of 2 rows / 2 columns</div>\n",
"<div class=\"info_items\">2 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Gender</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">count</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" not nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='M'>M\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='457549'>457549\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='F'>F\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='558846'>558846\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 2 of 2 rows / 2 columns</div>\n",
"<div class=\"info_items\">2 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"baby_names.groupby(\"Gender\").count().orderBy(\"count\").table()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Group the dataset by name and assign to names"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"baby_names = baby_names.drop(\"Year\")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"names = baby_names.groupby(\"Name\").sum().cols.rename(\"sum(Count)\",\"count\")"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<h1></h1>\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 5 of 17632 rows / 2 columns</div>\n",
"<div class=\"info_items\">200 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Name</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">count</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Kiana'>Kiana\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='5965'>5965\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Alayna'>Alayna\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='14171'>14171\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Ember'>Ember\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='3181'>3181\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Tyler'>Tyler\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='129989'>129989\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Maddox'>Maddox\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='20716'>20716\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 5 of 17632 rows / 2 columns</div>\n",
"<div class=\"info_items\">200 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"names.table(5)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Columns: 2 Rows: 17632\n"
]
}
],
"source": [
"print(f'Columns: {op.profiler.dataset_info(names)[\"cols_count\"]}', \n",
" f'Rows: {op.profiler.dataset_info(names)[\"rows_count\"]}')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<h1></h1>\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 10 of 17632 rows / 2 columns</div>\n",
"<div class=\"info_items\">141 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Name</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">count</div>\n",
" <div class=\"data_type\">2 (bigint)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Jacob'>Jacob\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='242874'>242874\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Emma'>Emma\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='214852'>214852\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Michael'>Michael\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='214405'>214405\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Ethan'>Ethan\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='209277'>209277\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Isabella'>Isabella\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='204798'>204798\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='William'>William\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='197894'>197894\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Joshua'>Joshua\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='191551'>191551\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Sophia'>Sophia\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='191446'>191446\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Daniel'>Daniel\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='191440'>191440\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Emily'>Emily\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='190318'>190318\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 10 of 17632 rows / 2 columns</div>\n",
"<div class=\"info_items\">141 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"names.sort(desc(\"count\")).table(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How many different names exist in the dataset?"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"17632"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"names.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# What is the name with most occurrences?"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<h1></h1>\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 17632 rows / 1 columns</div>\n",
"<div class=\"info_items\">141 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">Name</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='Jacob'>Jacob\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 1 of 17632 rows / 1 columns</div>\n",
"<div class=\"info_items\">141 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"names.sort(desc(\"count\")).select(\"Name\").table(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How many different names have the least occurrences?"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2578"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_oc = names.cols.min(\"count\") # This will give you the min ocurrence for names\n",
"names.where(col(\"count\") == min_oc).count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# What is the median name occurrence?"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"49.0"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"median_oc = names.approxQuantile(\"count\", [0.5], relativeError=0)[0] # This will give you the median ocurrence for names\n",
"median_oc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# What is the standard deviation of names?"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"11006.06947"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"names.cols.std(\"count\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get a summary with the mean, min, max, std and quartiles."
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<h1></h1>\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 5 of 5 rows / 2 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n",
"\n",
"<table class=\"optimus_table\">\n",
" <thead>\n",
" <tr>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">summary</div>\n",
" <div class=\"data_type\">1 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" <th>\n",
" <div class=\"column_name\">count</div>\n",
" <div class=\"data_type\">2 (string)</div>\n",
" <div class=\"data_type\">\n",
" \n",
" nullable\n",
" \n",
" </div>\n",
" </th>\n",
" \n",
" </tr>\n",
"\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='count'>count\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='17632'>17632\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='mean'>mean\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='2008.932168784029'>2008.932168784029\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='stddev'>stddev\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='11006.069467890566'>11006.069467890566\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='min'>min\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='5'>5\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" <tr>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='max'>max\n",
" </div>\n",
" </td>\n",
" \n",
" <td>\n",
" <div class=\" \"\n",
" title='242874'>242874\n",
" </div>\n",
" </td>\n",
" \n",
" </tr>\n",
" \n",
" </tbody>\n",
"</table>\n",
"\n",
"\n",
"<div class=\"info_items\">Viewing 5 of 5 rows / 2 columns</div>\n",
"<div class=\"info_items\">1 partition(s)</div>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"names.select(\"count\").describe().table()"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[11.0, 49.0, 337.0]"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"names.approxQuantile(\"count\",[0.25,0.5,0.75], relativeError=0)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment