Last active
November 27, 2017 04:08
-
-
Save JnBrymn-EB/89b472bfafa6cf75eeaf45883da98eef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Data Nerds Project\n", | |
"Corporación Favorita Grocery Sales Forecasting\n", | |
"\n", | |
"https://www.kaggle.com/c/favorita-grocery-sales-forecasting" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"%matplotlib inline\n", | |
"import matplotlib\n", | |
"import matplotlib.pyplot as plt\n", | |
"plt.rcParams['axes.labelsize'] = 14\n", | |
"plt.rcParams['xtick.labelsize'] = 12\n", | |
"plt.rcParams['ytick.labelsize'] = 12\n", | |
"\n", | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"ROOT_DIR = '/Users/johnb/Personal/data_science/kaggle/corporación_favorita/data'\n", | |
"HOLIDAYS_EVENTS = os.path.join(ROOT_DIR, 'holidays_events.csv')\n", | |
"TEST = os.path.join(ROOT_DIR, 'test.csv')\n", | |
"ITEMS = os.path.join(ROOT_DIR, 'items.csv')\n", | |
"SAMPLE_SUBMISSION = os.path.join(ROOT_DIR, 'sample_submission.csv')\n", | |
"TRAIN = os.path.join(ROOT_DIR, 'train.csv')\n", | |
"OIL = os.path.join(ROOT_DIR, 'oil.csv')\n", | |
"STORES = os.path.join(ROOT_DIR, 'stores.csv')\n", | |
"TRANSACTIONS = os.path.join(ROOT_DIR, 'transactions.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/johnb/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n", | |
" interactivity=interactivity, compiler=compiler, result=result)\n" | |
] | |
} | |
], | |
"source": [ | |
"# this takes a long time to read\n", | |
"train = pd.read_csv(TRAIN)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test = pd.read_csv(TEST)\n", | |
"sample_submission = pd.read_csv(SAMPLE_SUBMISSION)\n", | |
"holidays_events = pd.read_csv(HOLIDAYS_EVENTS)\n", | |
"items = pd.read_csv(ITEMS)\n", | |
"oil = pd.read_csv(OIL)\n", | |
"stores = pd.read_csv(STORES)\n", | |
"transactions = pd.read_csv(TRANSACTIONS)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Look at data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>date</th>\n", | |
" <th>store_nbr</th>\n", | |
" <th>item_nbr</th>\n", | |
" <th>unit_sales</th>\n", | |
" <th>onpromotion</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>2013-01-01</td>\n", | |
" <td>25</td>\n", | |
" <td>103665</td>\n", | |
" <td>7.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2013-01-01</td>\n", | |
" <td>25</td>\n", | |
" <td>105574</td>\n", | |
" <td>1.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>2013-01-01</td>\n", | |
" <td>25</td>\n", | |
" <td>105575</td>\n", | |
" <td>2.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>2013-01-01</td>\n", | |
" <td>25</td>\n", | |
" <td>108079</td>\n", | |
" <td>1.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>2013-01-01</td>\n", | |
" <td>25</td>\n", | |
" <td>108701</td>\n", | |
" <td>1.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id date store_nbr item_nbr unit_sales onpromotion\n", | |
"0 0 2013-01-01 25 103665 7.0 NaN\n", | |
"1 1 2013-01-01 25 105574 1.0 NaN\n", | |
"2 2 2013-01-01 25 105575 2.0 NaN\n", | |
"3 3 2013-01-01 25 108079 1.0 NaN\n", | |
"4 4 2013-01-01 25 108701 1.0 NaN" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>date</th>\n", | |
" <th>store_nbr</th>\n", | |
" <th>item_nbr</th>\n", | |
" <th>unit_sales</th>\n", | |
" <th>onpromotion</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>1.254970e+08</td>\n", | |
" <td>125497040</td>\n", | |
" <td>1.254970e+08</td>\n", | |
" <td>1.254970e+08</td>\n", | |
" <td>1.254970e+08</td>\n", | |
" <td>103839389</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>NaN</td>\n", | |
" <td>1684</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>NaN</td>\n", | |
" <td>2017-07-01</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>NaN</td>\n", | |
" <td>118194</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>96028767</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>6.274852e+07</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2.746458e+01</td>\n", | |
" <td>9.727692e+05</td>\n", | |
" <td>8.554865e+00</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>3.622788e+07</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.633051e+01</td>\n", | |
" <td>5.205336e+05</td>\n", | |
" <td>2.360515e+01</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>0.000000e+00</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.000000e+00</td>\n", | |
" <td>9.699500e+04</td>\n", | |
" <td>-1.537200e+04</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>3.137426e+07</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.200000e+01</td>\n", | |
" <td>5.223830e+05</td>\n", | |
" <td>2.000000e+00</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>6.274852e+07</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2.800000e+01</td>\n", | |
" <td>9.595000e+05</td>\n", | |
" <td>4.000000e+00</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>9.412278e+07</td>\n", | |
" <td>NaN</td>\n", | |
" <td>4.300000e+01</td>\n", | |
" <td>1.354380e+06</td>\n", | |
" <td>9.000000e+00</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>1.254970e+08</td>\n", | |
" <td>NaN</td>\n", | |
" <td>5.400000e+01</td>\n", | |
" <td>2.127114e+06</td>\n", | |
" <td>8.944000e+04</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id date store_nbr item_nbr unit_sales \\\n", | |
"count 1.254970e+08 125497040 1.254970e+08 1.254970e+08 1.254970e+08 \n", | |
"unique NaN 1684 NaN NaN NaN \n", | |
"top NaN 2017-07-01 NaN NaN NaN \n", | |
"freq NaN 118194 NaN NaN NaN \n", | |
"mean 6.274852e+07 NaN 2.746458e+01 9.727692e+05 8.554865e+00 \n", | |
"std 3.622788e+07 NaN 1.633051e+01 5.205336e+05 2.360515e+01 \n", | |
"min 0.000000e+00 NaN 1.000000e+00 9.699500e+04 -1.537200e+04 \n", | |
"25% 3.137426e+07 NaN 1.200000e+01 5.223830e+05 2.000000e+00 \n", | |
"50% 6.274852e+07 NaN 2.800000e+01 9.595000e+05 4.000000e+00 \n", | |
"75% 9.412278e+07 NaN 4.300000e+01 1.354380e+06 9.000000e+00 \n", | |
"max 1.254970e+08 NaN 5.400000e+01 2.127114e+06 8.944000e+04 \n", | |
"\n", | |
" onpromotion \n", | |
"count 103839389 \n", | |
"unique 2 \n", | |
"top False \n", | |
"freq 96028767 \n", | |
"mean NaN \n", | |
"std NaN \n", | |
"min NaN \n", | |
"25% NaN \n", | |
"50% NaN \n", | |
"75% NaN \n", | |
"max NaN " | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train.describe(include='all')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"False 96028767\n", | |
"True 7810622\n", | |
"Name: onpromotion, dtype: int64" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train['onpromotion'].value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>date</th>\n", | |
" <th>store_nbr</th>\n", | |
" <th>item_nbr</th>\n", | |
" <th>onpromotion</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>125497040</td>\n", | |
" <td>2017-08-16</td>\n", | |
" <td>1</td>\n", | |
" <td>96995</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>125497041</td>\n", | |
" <td>2017-08-16</td>\n", | |
" <td>1</td>\n", | |
" <td>99197</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>125497042</td>\n", | |
" <td>2017-08-16</td>\n", | |
" <td>1</td>\n", | |
" <td>103501</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>125497043</td>\n", | |
" <td>2017-08-16</td>\n", | |
" <td>1</td>\n", | |
" <td>103520</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>125497044</td>\n", | |
" <td>2017-08-16</td>\n", | |
" <td>1</td>\n", | |
" <td>103665</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id date store_nbr item_nbr onpromotion\n", | |
"0 125497040 2017-08-16 1 96995 False\n", | |
"1 125497041 2017-08-16 1 99197 False\n", | |
"2 125497042 2017-08-16 1 103501 False\n", | |
"3 125497043 2017-08-16 1 103520 False\n", | |
"4 125497044 2017-08-16 1 103665 False" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>date</th>\n", | |
" <th>store_nbr</th>\n", | |
" <th>item_nbr</th>\n", | |
" <th>onpromotion</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>3.370464e+06</td>\n", | |
" <td>3370464</td>\n", | |
" <td>3.370464e+06</td>\n", | |
" <td>3.370464e+06</td>\n", | |
" <td>3370464</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>NaN</td>\n", | |
" <td>16</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>NaN</td>\n", | |
" <td>2017-08-28</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>NaN</td>\n", | |
" <td>210654</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>3171867</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>1.271823e+08</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2.750000e+01</td>\n", | |
" <td>1.244798e+06</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>9.729693e+05</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.558579e+01</td>\n", | |
" <td>5.898362e+05</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>1.254970e+08</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.000000e+00</td>\n", | |
" <td>9.699500e+04</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>1.263397e+08</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.400000e+01</td>\n", | |
" <td>8.053210e+05</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>1.271823e+08</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2.750000e+01</td>\n", | |
" <td>1.294665e+06</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>1.280249e+08</td>\n", | |
" <td>NaN</td>\n", | |
" <td>4.100000e+01</td>\n", | |
" <td>1.730015e+06</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>1.288675e+08</td>\n", | |
" <td>NaN</td>\n", | |
" <td>5.400000e+01</td>\n", | |
" <td>2.134244e+06</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id date store_nbr item_nbr onpromotion\n", | |
"count 3.370464e+06 3370464 3.370464e+06 3.370464e+06 3370464\n", | |
"unique NaN 16 NaN NaN 2\n", | |
"top NaN 2017-08-28 NaN NaN False\n", | |
"freq NaN 210654 NaN NaN 3171867\n", | |
"mean 1.271823e+08 NaN 2.750000e+01 1.244798e+06 NaN\n", | |
"std 9.729693e+05 NaN 1.558579e+01 5.898362e+05 NaN\n", | |
"min 1.254970e+08 NaN 1.000000e+00 9.699500e+04 NaN\n", | |
"25% 1.263397e+08 NaN 1.400000e+01 8.053210e+05 NaN\n", | |
"50% 1.271823e+08 NaN 2.750000e+01 1.294665e+06 NaN\n", | |
"75% 1.280249e+08 NaN 4.100000e+01 1.730015e+06 NaN\n", | |
"max 1.288675e+08 NaN 5.400000e+01 2.134244e+06 NaN" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test.describe(include='all')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"False 3171867\n", | |
"True 198597\n", | |
"Name: onpromotion, dtype: int64" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test['onpromotion'].value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>unit_sales</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>125497040</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>125497041</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>125497042</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>125497043</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>125497044</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id unit_sales\n", | |
"0 125497040 0\n", | |
"1 125497041 0\n", | |
"2 125497042 0\n", | |
"3 125497043 0\n", | |
"4 125497044 0" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sample_submission.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>date</th>\n", | |
" <th>type</th>\n", | |
" <th>locale</th>\n", | |
" <th>locale_name</th>\n", | |
" <th>description</th>\n", | |
" <th>transferred</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2012-03-02</td>\n", | |
" <td>Holiday</td>\n", | |
" <td>Local</td>\n", | |
" <td>Manta</td>\n", | |
" <td>Fundacion de Manta</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2012-04-01</td>\n", | |
" <td>Holiday</td>\n", | |
" <td>Regional</td>\n", | |
" <td>Cotopaxi</td>\n", | |
" <td>Provincializacion de Cotopaxi</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2012-04-12</td>\n", | |
" <td>Holiday</td>\n", | |
" <td>Local</td>\n", | |
" <td>Cuenca</td>\n", | |
" <td>Fundacion de Cuenca</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2012-04-14</td>\n", | |
" <td>Holiday</td>\n", | |
" <td>Local</td>\n", | |
" <td>Libertad</td>\n", | |
" <td>Cantonizacion de Libertad</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2012-04-21</td>\n", | |
" <td>Holiday</td>\n", | |
" <td>Local</td>\n", | |
" <td>Riobamba</td>\n", | |
" <td>Cantonizacion de Riobamba</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" date type locale locale_name description \\\n", | |
"0 2012-03-02 Holiday Local Manta Fundacion de Manta \n", | |
"1 2012-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi \n", | |
"2 2012-04-12 Holiday Local Cuenca Fundacion de Cuenca \n", | |
"3 2012-04-14 Holiday Local Libertad Cantonizacion de Libertad \n", | |
"4 2012-04-21 Holiday Local Riobamba Cantonizacion de Riobamba \n", | |
"\n", | |
" transferred \n", | |
"0 False \n", | |
"1 False \n", | |
"2 False \n", | |
"3 False \n", | |
"4 False " | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"holidays_events.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>date</th>\n", | |
" <th>type</th>\n", | |
" <th>locale</th>\n", | |
" <th>locale_name</th>\n", | |
" <th>description</th>\n", | |
" <th>transferred</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>350</td>\n", | |
" <td>350</td>\n", | |
" <td>350</td>\n", | |
" <td>350</td>\n", | |
" <td>350</td>\n", | |
" <td>350</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>312</td>\n", | |
" <td>6</td>\n", | |
" <td>3</td>\n", | |
" <td>24</td>\n", | |
" <td>103</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>2014-06-25</td>\n", | |
" <td>Holiday</td>\n", | |
" <td>National</td>\n", | |
" <td>Ecuador</td>\n", | |
" <td>Carnaval</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>4</td>\n", | |
" <td>221</td>\n", | |
" <td>174</td>\n", | |
" <td>174</td>\n", | |
" <td>10</td>\n", | |
" <td>338</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" date type locale locale_name description transferred\n", | |
"count 350 350 350 350 350 350\n", | |
"unique 312 6 3 24 103 2\n", | |
"top 2014-06-25 Holiday National Ecuador Carnaval False\n", | |
"freq 4 221 174 174 10 338" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"holidays_events.describe(include='all')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>item_nbr</th>\n", | |
" <th>family</th>\n", | |
" <th>class</th>\n", | |
" <th>perishable</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>96995</td>\n", | |
" <td>GROCERY I</td>\n", | |
" <td>1093</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>99197</td>\n", | |
" <td>GROCERY I</td>\n", | |
" <td>1067</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>103501</td>\n", | |
" <td>CLEANING</td>\n", | |
" <td>3008</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>103520</td>\n", | |
" <td>GROCERY I</td>\n", | |
" <td>1028</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>103665</td>\n", | |
" <td>BREAD/BAKERY</td>\n", | |
" <td>2712</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" item_nbr family class perishable\n", | |
"0 96995 GROCERY I 1093 0\n", | |
"1 99197 GROCERY I 1067 0\n", | |
"2 103501 CLEANING 3008 0\n", | |
"3 103520 GROCERY I 1028 0\n", | |
"4 103665 BREAD/BAKERY 2712 1" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"items.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>date</th>\n", | |
" <th>dcoilwtico</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2013-01-01</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2013-01-02</td>\n", | |
" <td>93.14</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2013-01-03</td>\n", | |
" <td>92.97</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2013-01-04</td>\n", | |
" <td>93.12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2013-01-07</td>\n", | |
" <td>93.20</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" date dcoilwtico\n", | |
"0 2013-01-01 NaN\n", | |
"1 2013-01-02 93.14\n", | |
"2 2013-01-03 92.97\n", | |
"3 2013-01-04 93.12\n", | |
"4 2013-01-07 93.20" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"oil.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>store_nbr</th>\n", | |
" <th>city</th>\n", | |
" <th>state</th>\n", | |
" <th>type</th>\n", | |
" <th>cluster</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>Quito</td>\n", | |
" <td>Pichincha</td>\n", | |
" <td>D</td>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>Quito</td>\n", | |
" <td>Pichincha</td>\n", | |
" <td>D</td>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>Quito</td>\n", | |
" <td>Pichincha</td>\n", | |
" <td>D</td>\n", | |
" <td>8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>Quito</td>\n", | |
" <td>Pichincha</td>\n", | |
" <td>D</td>\n", | |
" <td>9</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>Santo Domingo</td>\n", | |
" <td>Santo Domingo de los Tsachilas</td>\n", | |
" <td>D</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" store_nbr city state type cluster\n", | |
"0 1 Quito Pichincha D 13\n", | |
"1 2 Quito Pichincha D 13\n", | |
"2 3 Quito Pichincha D 8\n", | |
"3 4 Quito Pichincha D 9\n", | |
"4 5 Santo Domingo Santo Domingo de los Tsachilas D 4" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"stores.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>date</th>\n", | |
" <th>store_nbr</th>\n", | |
" <th>transactions</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2013-01-01</td>\n", | |
" <td>25</td>\n", | |
" <td>770</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2013-01-02</td>\n", | |
" <td>1</td>\n", | |
" <td>2111</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2013-01-02</td>\n", | |
" <td>2</td>\n", | |
" <td>2358</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2013-01-02</td>\n", | |
" <td>3</td>\n", | |
" <td>3487</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2013-01-02</td>\n", | |
" <td>4</td>\n", | |
" <td>1922</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" date store_nbr transactions\n", | |
"0 2013-01-01 25 770\n", | |
"1 2013-01-02 1 2111\n", | |
"2 2013-01-02 2 2358\n", | |
"3 2013-01-02 3 3487\n", | |
"4 2013-01-02 4 1922" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"transactions.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Joining Stuff\n", | |
"\n", | |
"* train is the main table possible join rows `store_nbr`, `item_nbr`, and `date`\n", | |
"* stores can be joined by `store_nbr`\n", | |
"* transactions can be joined by `store_nbr`\n", | |
"* items can be joined in by `item_nbr`\n", | |
"* oil can be joined in by `date`\n", | |
"--------\n", | |
"* holiday can be joined in by `date` and possibly `locale_name` but it might be complicated - maybe we can augment this dataset with mapzen WhosOnFirst." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 286, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"12550" | |
] | |
}, | |
"execution_count": 286, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# get a sub set of the data so that it's easier to play with `merge`\n", | |
"train_small = train.iloc[range(0,125497040, 10000)]\n", | |
"len(train_small)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## merge easy things" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 289, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"features = train_small \\\n", | |
" .merge(stores, on='store_nbr', how='left') \\\n", | |
" .merge(transactions, on=['store_nbr','date'], how='left') \\\n", | |
" .merge(items, on='item_nbr', how='left') \\\n", | |
" .merge(oil, on='date', how='left')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## merge holidays\n", | |
"Goal - to add 3 fields to each row which represent if the {store, date} is associated with a local, regional, or national holiday" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 290, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# These holidays matter because they exclude holidays \"transferred\" to another day and \n", | |
"# \"Work Days\" which aren't holidays at all\n", | |
"holidays_events_that_matter = holidays_events[\n", | |
" (holidays_events['type'] != 'Work Day') \n", | |
" | (holidays_events['transferred'] != True)\n", | |
"]\n", | |
"# drop redundant holidays (occasionally there are multiple local holidays on the same day)\n", | |
"holidays_events_that_matter.drop_duplicates(keep='first', subset=['date','locale'], inplace=True)\n", | |
"\n", | |
"# create dummy fields in the tables we're going to join\n", | |
"features['s'] = features.apply(lambda r: 's_{}'.format(r['state']), axis=1)\n", | |
"features['c'] = features.apply(lambda r: 'c_{}'.format(r['city']), axis=1)\n", | |
"def make_location_column(row):\n", | |
" if row['locale'] == \"Regional\":\n", | |
" return 's_{}'.format(row['locale_name'])\n", | |
" elif row['locale'] == \"Local\":\n", | |
" return 'c_{}'.format(row['locale_name'])\n", | |
"holidays_events_that_matter['place'] = holidays_events_that_matter.apply(make_location_column, axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 295, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# make local, regional, and national holiday fields\n", | |
"temp = features.merge(holidays_events_that_matter, left_on=['c', 'date'], right_on=['place', 'date'], how='left')\n", | |
"features['local_holiday'] = temp['locale'] == 'Local'\n", | |
"\n", | |
"temp = features.merge(holidays_events_that_matter, left_on=['s', 'date'], right_on=['place', 'date'], how='left')\n", | |
"features['regional_holiday'] = temp['locale'] == 'Regional'\n", | |
"\n", | |
"temp = features.merge(\n", | |
" holidays_events_that_matter[holidays_events_that_matter['locale'] == 'National'],\n", | |
" on='date', how='left')\n", | |
"features['national_holiday'] = temp['locale'] == 'National'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# todo\n", | |
"* figure out how to impute missing data in each set before joining them\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment