Skip to content

Instantly share code, notes, and snippets.

@JnBrymn-EB
Last active November 27, 2017 04:08
Show Gist options
  • Save JnBrymn-EB/89b472bfafa6cf75eeaf45883da98eef to your computer and use it in GitHub Desktop.
Save JnBrymn-EB/89b472bfafa6cf75eeaf45883da98eef to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Nerds Project\n",
"Corporación Favorita Grocery Sales Forecasting\n",
"\n",
"https://www.kaggle.com/c/favorita-grocery-sales-forecasting"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"plt.rcParams['axes.labelsize'] = 14\n",
"plt.rcParams['xtick.labelsize'] = 12\n",
"plt.rcParams['ytick.labelsize'] = 12\n",
"\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ROOT_DIR = '/Users/johnb/Personal/data_science/kaggle/corporación_favorita/data'\n",
"HOLIDAYS_EVENTS = os.path.join(ROOT_DIR, 'holidays_events.csv')\n",
"TEST = os.path.join(ROOT_DIR, 'test.csv')\n",
"ITEMS = os.path.join(ROOT_DIR, 'items.csv')\n",
"SAMPLE_SUBMISSION = os.path.join(ROOT_DIR, 'sample_submission.csv')\n",
"TRAIN = os.path.join(ROOT_DIR, 'train.csv')\n",
"OIL = os.path.join(ROOT_DIR, 'oil.csv')\n",
"STORES = os.path.join(ROOT_DIR, 'stores.csv')\n",
"TRANSACTIONS = os.path.join(ROOT_DIR, 'transactions.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/johnb/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
}
],
"source": [
"# this takes a long time to read\n",
"train = pd.read_csv(TRAIN)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"test = pd.read_csv(TEST)\n",
"sample_submission = pd.read_csv(SAMPLE_SUBMISSION)\n",
"holidays_events = pd.read_csv(HOLIDAYS_EVENTS)\n",
"items = pd.read_csv(ITEMS)\n",
"oil = pd.read_csv(OIL)\n",
"stores = pd.read_csv(STORES)\n",
"transactions = pd.read_csv(TRANSACTIONS)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Look at data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>store_nbr</th>\n",
" <th>item_nbr</th>\n",
" <th>unit_sales</th>\n",
" <th>onpromotion</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2013-01-01</td>\n",
" <td>25</td>\n",
" <td>103665</td>\n",
" <td>7.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2013-01-01</td>\n",
" <td>25</td>\n",
" <td>105574</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2013-01-01</td>\n",
" <td>25</td>\n",
" <td>105575</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>2013-01-01</td>\n",
" <td>25</td>\n",
" <td>108079</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2013-01-01</td>\n",
" <td>25</td>\n",
" <td>108701</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id date store_nbr item_nbr unit_sales onpromotion\n",
"0 0 2013-01-01 25 103665 7.0 NaN\n",
"1 1 2013-01-01 25 105574 1.0 NaN\n",
"2 2 2013-01-01 25 105575 2.0 NaN\n",
"3 3 2013-01-01 25 108079 1.0 NaN\n",
"4 4 2013-01-01 25 108701 1.0 NaN"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>store_nbr</th>\n",
" <th>item_nbr</th>\n",
" <th>unit_sales</th>\n",
" <th>onpromotion</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1.254970e+08</td>\n",
" <td>125497040</td>\n",
" <td>1.254970e+08</td>\n",
" <td>1.254970e+08</td>\n",
" <td>1.254970e+08</td>\n",
" <td>103839389</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>1684</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>2017-07-01</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>118194</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>96028767</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>6.274852e+07</td>\n",
" <td>NaN</td>\n",
" <td>2.746458e+01</td>\n",
" <td>9.727692e+05</td>\n",
" <td>8.554865e+00</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3.622788e+07</td>\n",
" <td>NaN</td>\n",
" <td>1.633051e+01</td>\n",
" <td>5.205336e+05</td>\n",
" <td>2.360515e+01</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000e+00</td>\n",
" <td>NaN</td>\n",
" <td>1.000000e+00</td>\n",
" <td>9.699500e+04</td>\n",
" <td>-1.537200e+04</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>3.137426e+07</td>\n",
" <td>NaN</td>\n",
" <td>1.200000e+01</td>\n",
" <td>5.223830e+05</td>\n",
" <td>2.000000e+00</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>6.274852e+07</td>\n",
" <td>NaN</td>\n",
" <td>2.800000e+01</td>\n",
" <td>9.595000e+05</td>\n",
" <td>4.000000e+00</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>9.412278e+07</td>\n",
" <td>NaN</td>\n",
" <td>4.300000e+01</td>\n",
" <td>1.354380e+06</td>\n",
" <td>9.000000e+00</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.254970e+08</td>\n",
" <td>NaN</td>\n",
" <td>5.400000e+01</td>\n",
" <td>2.127114e+06</td>\n",
" <td>8.944000e+04</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id date store_nbr item_nbr unit_sales \\\n",
"count 1.254970e+08 125497040 1.254970e+08 1.254970e+08 1.254970e+08 \n",
"unique NaN 1684 NaN NaN NaN \n",
"top NaN 2017-07-01 NaN NaN NaN \n",
"freq NaN 118194 NaN NaN NaN \n",
"mean 6.274852e+07 NaN 2.746458e+01 9.727692e+05 8.554865e+00 \n",
"std 3.622788e+07 NaN 1.633051e+01 5.205336e+05 2.360515e+01 \n",
"min 0.000000e+00 NaN 1.000000e+00 9.699500e+04 -1.537200e+04 \n",
"25% 3.137426e+07 NaN 1.200000e+01 5.223830e+05 2.000000e+00 \n",
"50% 6.274852e+07 NaN 2.800000e+01 9.595000e+05 4.000000e+00 \n",
"75% 9.412278e+07 NaN 4.300000e+01 1.354380e+06 9.000000e+00 \n",
"max 1.254970e+08 NaN 5.400000e+01 2.127114e+06 8.944000e+04 \n",
"\n",
" onpromotion \n",
"count 103839389 \n",
"unique 2 \n",
"top False \n",
"freq 96028767 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 96028767\n",
"True 7810622\n",
"Name: onpromotion, dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train['onpromotion'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>store_nbr</th>\n",
" <th>item_nbr</th>\n",
" <th>onpromotion</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>125497040</td>\n",
" <td>2017-08-16</td>\n",
" <td>1</td>\n",
" <td>96995</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>125497041</td>\n",
" <td>2017-08-16</td>\n",
" <td>1</td>\n",
" <td>99197</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>125497042</td>\n",
" <td>2017-08-16</td>\n",
" <td>1</td>\n",
" <td>103501</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>125497043</td>\n",
" <td>2017-08-16</td>\n",
" <td>1</td>\n",
" <td>103520</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>125497044</td>\n",
" <td>2017-08-16</td>\n",
" <td>1</td>\n",
" <td>103665</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id date store_nbr item_nbr onpromotion\n",
"0 125497040 2017-08-16 1 96995 False\n",
"1 125497041 2017-08-16 1 99197 False\n",
"2 125497042 2017-08-16 1 103501 False\n",
"3 125497043 2017-08-16 1 103520 False\n",
"4 125497044 2017-08-16 1 103665 False"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>store_nbr</th>\n",
" <th>item_nbr</th>\n",
" <th>onpromotion</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>3.370464e+06</td>\n",
" <td>3370464</td>\n",
" <td>3.370464e+06</td>\n",
" <td>3.370464e+06</td>\n",
" <td>3370464</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>16</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>2017-08-28</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>210654</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3171867</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.271823e+08</td>\n",
" <td>NaN</td>\n",
" <td>2.750000e+01</td>\n",
" <td>1.244798e+06</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>9.729693e+05</td>\n",
" <td>NaN</td>\n",
" <td>1.558579e+01</td>\n",
" <td>5.898362e+05</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.254970e+08</td>\n",
" <td>NaN</td>\n",
" <td>1.000000e+00</td>\n",
" <td>9.699500e+04</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1.263397e+08</td>\n",
" <td>NaN</td>\n",
" <td>1.400000e+01</td>\n",
" <td>8.053210e+05</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.271823e+08</td>\n",
" <td>NaN</td>\n",
" <td>2.750000e+01</td>\n",
" <td>1.294665e+06</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.280249e+08</td>\n",
" <td>NaN</td>\n",
" <td>4.100000e+01</td>\n",
" <td>1.730015e+06</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.288675e+08</td>\n",
" <td>NaN</td>\n",
" <td>5.400000e+01</td>\n",
" <td>2.134244e+06</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id date store_nbr item_nbr onpromotion\n",
"count 3.370464e+06 3370464 3.370464e+06 3.370464e+06 3370464\n",
"unique NaN 16 NaN NaN 2\n",
"top NaN 2017-08-28 NaN NaN False\n",
"freq NaN 210654 NaN NaN 3171867\n",
"mean 1.271823e+08 NaN 2.750000e+01 1.244798e+06 NaN\n",
"std 9.729693e+05 NaN 1.558579e+01 5.898362e+05 NaN\n",
"min 1.254970e+08 NaN 1.000000e+00 9.699500e+04 NaN\n",
"25% 1.263397e+08 NaN 1.400000e+01 8.053210e+05 NaN\n",
"50% 1.271823e+08 NaN 2.750000e+01 1.294665e+06 NaN\n",
"75% 1.280249e+08 NaN 4.100000e+01 1.730015e+06 NaN\n",
"max 1.288675e+08 NaN 5.400000e+01 2.134244e+06 NaN"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 3171867\n",
"True 198597\n",
"Name: onpromotion, dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test['onpromotion'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>unit_sales</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>125497040</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>125497041</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>125497042</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>125497043</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>125497044</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id unit_sales\n",
"0 125497040 0\n",
"1 125497041 0\n",
"2 125497042 0\n",
"3 125497043 0\n",
"4 125497044 0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_submission.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>type</th>\n",
" <th>locale</th>\n",
" <th>locale_name</th>\n",
" <th>description</th>\n",
" <th>transferred</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2012-03-02</td>\n",
" <td>Holiday</td>\n",
" <td>Local</td>\n",
" <td>Manta</td>\n",
" <td>Fundacion de Manta</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2012-04-01</td>\n",
" <td>Holiday</td>\n",
" <td>Regional</td>\n",
" <td>Cotopaxi</td>\n",
" <td>Provincializacion de Cotopaxi</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2012-04-12</td>\n",
" <td>Holiday</td>\n",
" <td>Local</td>\n",
" <td>Cuenca</td>\n",
" <td>Fundacion de Cuenca</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2012-04-14</td>\n",
" <td>Holiday</td>\n",
" <td>Local</td>\n",
" <td>Libertad</td>\n",
" <td>Cantonizacion de Libertad</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2012-04-21</td>\n",
" <td>Holiday</td>\n",
" <td>Local</td>\n",
" <td>Riobamba</td>\n",
" <td>Cantonizacion de Riobamba</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date type locale locale_name description \\\n",
"0 2012-03-02 Holiday Local Manta Fundacion de Manta \n",
"1 2012-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi \n",
"2 2012-04-12 Holiday Local Cuenca Fundacion de Cuenca \n",
"3 2012-04-14 Holiday Local Libertad Cantonizacion de Libertad \n",
"4 2012-04-21 Holiday Local Riobamba Cantonizacion de Riobamba \n",
"\n",
" transferred \n",
"0 False \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"holidays_events.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>type</th>\n",
" <th>locale</th>\n",
" <th>locale_name</th>\n",
" <th>description</th>\n",
" <th>transferred</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>350</td>\n",
" <td>350</td>\n",
" <td>350</td>\n",
" <td>350</td>\n",
" <td>350</td>\n",
" <td>350</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>312</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>24</td>\n",
" <td>103</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>2014-06-25</td>\n",
" <td>Holiday</td>\n",
" <td>National</td>\n",
" <td>Ecuador</td>\n",
" <td>Carnaval</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>4</td>\n",
" <td>221</td>\n",
" <td>174</td>\n",
" <td>174</td>\n",
" <td>10</td>\n",
" <td>338</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date type locale locale_name description transferred\n",
"count 350 350 350 350 350 350\n",
"unique 312 6 3 24 103 2\n",
"top 2014-06-25 Holiday National Ecuador Carnaval False\n",
"freq 4 221 174 174 10 338"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"holidays_events.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>item_nbr</th>\n",
" <th>family</th>\n",
" <th>class</th>\n",
" <th>perishable</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>96995</td>\n",
" <td>GROCERY I</td>\n",
" <td>1093</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>99197</td>\n",
" <td>GROCERY I</td>\n",
" <td>1067</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>103501</td>\n",
" <td>CLEANING</td>\n",
" <td>3008</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>103520</td>\n",
" <td>GROCERY I</td>\n",
" <td>1028</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>103665</td>\n",
" <td>BREAD/BAKERY</td>\n",
" <td>2712</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" item_nbr family class perishable\n",
"0 96995 GROCERY I 1093 0\n",
"1 99197 GROCERY I 1067 0\n",
"2 103501 CLEANING 3008 0\n",
"3 103520 GROCERY I 1028 0\n",
"4 103665 BREAD/BAKERY 2712 1"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"items.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>dcoilwtico</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2013-01-01</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2013-01-02</td>\n",
" <td>93.14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2013-01-03</td>\n",
" <td>92.97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2013-01-04</td>\n",
" <td>93.12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2013-01-07</td>\n",
" <td>93.20</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date dcoilwtico\n",
"0 2013-01-01 NaN\n",
"1 2013-01-02 93.14\n",
"2 2013-01-03 92.97\n",
"3 2013-01-04 93.12\n",
"4 2013-01-07 93.20"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oil.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>store_nbr</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>type</th>\n",
" <th>cluster</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Quito</td>\n",
" <td>Pichincha</td>\n",
" <td>D</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Quito</td>\n",
" <td>Pichincha</td>\n",
" <td>D</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Quito</td>\n",
" <td>Pichincha</td>\n",
" <td>D</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Quito</td>\n",
" <td>Pichincha</td>\n",
" <td>D</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Santo Domingo</td>\n",
" <td>Santo Domingo de los Tsachilas</td>\n",
" <td>D</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" store_nbr city state type cluster\n",
"0 1 Quito Pichincha D 13\n",
"1 2 Quito Pichincha D 13\n",
"2 3 Quito Pichincha D 8\n",
"3 4 Quito Pichincha D 9\n",
"4 5 Santo Domingo Santo Domingo de los Tsachilas D 4"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stores.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>store_nbr</th>\n",
" <th>transactions</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2013-01-01</td>\n",
" <td>25</td>\n",
" <td>770</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2013-01-02</td>\n",
" <td>1</td>\n",
" <td>2111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2013-01-02</td>\n",
" <td>2</td>\n",
" <td>2358</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2013-01-02</td>\n",
" <td>3</td>\n",
" <td>3487</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2013-01-02</td>\n",
" <td>4</td>\n",
" <td>1922</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date store_nbr transactions\n",
"0 2013-01-01 25 770\n",
"1 2013-01-02 1 2111\n",
"2 2013-01-02 2 2358\n",
"3 2013-01-02 3 3487\n",
"4 2013-01-02 4 1922"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transactions.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Joining Stuff\n",
"\n",
"* train is the main table possible join rows `store_nbr`, `item_nbr`, and `date`\n",
"* stores can be joined by `store_nbr`\n",
"* transactions can be joined by `store_nbr`\n",
"* items can be joined in by `item_nbr`\n",
"* oil can be joined in by `date`\n",
"--------\n",
"* holiday can be joined in by `date` and possibly `locale_name` but it might be complicated - maybe we can augment this dataset with mapzen WhosOnFirst."
]
},
{
"cell_type": "code",
"execution_count": 286,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12550"
]
},
"execution_count": 286,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get a sub set of the data so that it's easier to play with `merge`\n",
"train_small = train.iloc[range(0,125497040, 10000)]\n",
"len(train_small)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## merge easy things"
]
},
{
"cell_type": "code",
"execution_count": 289,
"metadata": {},
"outputs": [],
"source": [
"features = train_small \\\n",
" .merge(stores, on='store_nbr', how='left') \\\n",
" .merge(transactions, on=['store_nbr','date'], how='left') \\\n",
" .merge(items, on='item_nbr', how='left') \\\n",
" .merge(oil, on='date', how='left')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## merge holidays\n",
"Goal - to add 3 fields to each row which represent if the {store, date} is associated with a local, regional, or national holiday"
]
},
{
"cell_type": "code",
"execution_count": 290,
"metadata": {},
"outputs": [],
"source": [
"# These holidays matter because they exclude holidays \"transferred\" to another day and \n",
"# \"Work Days\" which aren't holidays at all\n",
"holidays_events_that_matter = holidays_events[\n",
" (holidays_events['type'] != 'Work Day') \n",
" | (holidays_events['transferred'] != True)\n",
"]\n",
"# drop redundant holidays (occasionally there are multiple local holidays on the same day)\n",
"holidays_events_that_matter.drop_duplicates(keep='first', subset=['date','locale'], inplace=True)\n",
"\n",
"# create dummy fields in the tables we're going to join\n",
"features['s'] = features.apply(lambda r: 's_{}'.format(r['state']), axis=1)\n",
"features['c'] = features.apply(lambda r: 'c_{}'.format(r['city']), axis=1)\n",
"def make_location_column(row):\n",
" if row['locale'] == \"Regional\":\n",
" return 's_{}'.format(row['locale_name'])\n",
" elif row['locale'] == \"Local\":\n",
" return 'c_{}'.format(row['locale_name'])\n",
"holidays_events_that_matter['place'] = holidays_events_that_matter.apply(make_location_column, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 295,
"metadata": {},
"outputs": [],
"source": [
"# make local, regional, and national holiday fields\n",
"temp = features.merge(holidays_events_that_matter, left_on=['c', 'date'], right_on=['place', 'date'], how='left')\n",
"features['local_holiday'] = temp['locale'] == 'Local'\n",
"\n",
"temp = features.merge(holidays_events_that_matter, left_on=['s', 'date'], right_on=['place', 'date'], how='left')\n",
"features['regional_holiday'] = temp['locale'] == 'Regional'\n",
"\n",
"temp = features.merge(\n",
" holidays_events_that_matter[holidays_events_that_matter['locale'] == 'National'],\n",
" on='date', how='left')\n",
"features['national_holiday'] = temp['locale'] == 'National'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# todo\n",
"* figure out how to impute missing data in each set before joining them\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment