Skip to content

Instantly share code, notes, and snippets.

@artun3e
Last active May 2, 2021 08:23
Show Gist options
  • Save artun3e/76f3bb21a48cfe1031b1e4b8e5f065fb to your computer and use it in GitHub Desktop.
Save artun3e/76f3bb21a48cfe1031b1e4b8e5f065fb to your computer and use it in GitHub Desktop.
WALMART DATA ANALYSIS
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CS 210 GROUP PROJECT\n"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [],
"source": [
"# Standard library\n",
"import datetime\n",
"import json\n",
"import math\n",
"import pprint\n",
"from collections import OrderedDict\n",
"\n",
"# Third-party\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas\n",
"import pandas as pd\n",
"import reverse_geocoder as rg\n",
"import seaborn as sns  # statistical plotting library built on top of matplotlib\n"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Date</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>MarkDown1</th>\n",
" <th>MarkDown2</th>\n",
" <th>MarkDown3</th>\n",
" <th>MarkDown4</th>\n",
" <th>MarkDown5</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" <th>IsHoliday</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Date Temperature Fuel_Price MarkDown1 MarkDown2 \\\n",
"0 1 2010-02-05 42.31 2.572 NaN NaN \n",
"1 1 2010-02-12 38.51 2.548 NaN NaN \n",
"2 1 2010-02-19 39.93 2.514 NaN NaN \n",
"3 1 2010-02-26 46.63 2.561 NaN NaN \n",
"4 1 2010-03-05 46.50 2.625 NaN NaN \n",
"\n",
" MarkDown3 MarkDown4 MarkDown5 CPI Unemployment IsHoliday \n",
"0 NaN NaN NaN 211.096358 8.106 False \n",
"1 NaN NaN NaN 211.242170 8.106 True \n",
"2 NaN NaN NaN 211.289143 8.106 False \n",
"3 NaN NaN NaN 211.319643 8.106 False \n",
"4 NaN NaN NaN 211.350143 8.106 False "
]
},
"execution_count": 181,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Folder holding the Walmart CSV files; edit this single constant to point at your local copy.\n",
"DATA_DIR = \"/Users/artun/Downloads/walmart\"\n",
"\n",
"# features.csv: per-store, per-date features; train.csv: per-store, per-department weekly sales.\n",
"walmart_features = pd.read_csv(DATA_DIR + \"/features.csv\")\n",
"walmart_train = pd.read_csv(DATA_DIR + \"/train.csv\")\n",
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Dept</th>\n",
" <th>Date</th>\n",
" <th>Weekly_Sales</th>\n",
" <th>IsHoliday</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>24924.50</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>46039.49</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>41595.55</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>19403.54</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>21827.90</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Dept Date Weekly_Sales IsHoliday\n",
"0 1 1 2010-02-05 24924.50 False\n",
"1 1 1 2010-02-12 46039.49 True\n",
"2 1 1 2010-02-19 41595.55 False\n",
"3 1 1 2010-02-26 19403.54 False\n",
"4 1 1 2010-03-05 21827.90 False"
]
},
"execution_count": 182,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>MarkDown1</th>\n",
" <th>MarkDown2</th>\n",
" <th>MarkDown3</th>\n",
" <th>MarkDown4</th>\n",
" <th>MarkDown5</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>8190.000000</td>\n",
" <td>8190.000000</td>\n",
" <td>8190.000000</td>\n",
" <td>4032.000000</td>\n",
" <td>2921.000000</td>\n",
" <td>3613.000000</td>\n",
" <td>3464.000000</td>\n",
" <td>4050.000000</td>\n",
" <td>7605.000000</td>\n",
" <td>7605.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>23.000000</td>\n",
" <td>59.356198</td>\n",
" <td>3.405992</td>\n",
" <td>7032.371786</td>\n",
" <td>3384.176594</td>\n",
" <td>1760.100180</td>\n",
" <td>3292.935886</td>\n",
" <td>4132.216422</td>\n",
" <td>172.460809</td>\n",
" <td>7.826821</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>12.987966</td>\n",
" <td>18.678607</td>\n",
" <td>0.431337</td>\n",
" <td>9262.747448</td>\n",
" <td>8793.583016</td>\n",
" <td>11276.462208</td>\n",
" <td>6792.329861</td>\n",
" <td>13086.690278</td>\n",
" <td>39.738346</td>\n",
" <td>1.877259</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>-7.290000</td>\n",
" <td>2.472000</td>\n",
" <td>-2781.450000</td>\n",
" <td>-265.760000</td>\n",
" <td>-179.260000</td>\n",
" <td>0.220000</td>\n",
" <td>-185.170000</td>\n",
" <td>126.064000</td>\n",
" <td>3.684000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>12.000000</td>\n",
" <td>45.902500</td>\n",
" <td>3.041000</td>\n",
" <td>1577.532500</td>\n",
" <td>68.880000</td>\n",
" <td>6.600000</td>\n",
" <td>304.687500</td>\n",
" <td>1440.827500</td>\n",
" <td>132.364839</td>\n",
" <td>6.634000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>23.000000</td>\n",
" <td>60.710000</td>\n",
" <td>3.513000</td>\n",
" <td>4743.580000</td>\n",
" <td>364.570000</td>\n",
" <td>36.260000</td>\n",
" <td>1176.425000</td>\n",
" <td>2727.135000</td>\n",
" <td>182.764003</td>\n",
" <td>7.806000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>34.000000</td>\n",
" <td>73.880000</td>\n",
" <td>3.743000</td>\n",
" <td>8923.310000</td>\n",
" <td>2153.350000</td>\n",
" <td>163.150000</td>\n",
" <td>3310.007500</td>\n",
" <td>4832.555000</td>\n",
" <td>213.932412</td>\n",
" <td>8.567000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>45.000000</td>\n",
" <td>101.950000</td>\n",
" <td>4.468000</td>\n",
" <td>103184.980000</td>\n",
" <td>104519.540000</td>\n",
" <td>149483.310000</td>\n",
" <td>67474.850000</td>\n",
" <td>771448.100000</td>\n",
" <td>228.976456</td>\n",
" <td>14.313000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Temperature Fuel_Price MarkDown1 MarkDown2 \\\n",
"count 8190.000000 8190.000000 8190.000000 4032.000000 2921.000000 \n",
"mean 23.000000 59.356198 3.405992 7032.371786 3384.176594 \n",
"std 12.987966 18.678607 0.431337 9262.747448 8793.583016 \n",
"min 1.000000 -7.290000 2.472000 -2781.450000 -265.760000 \n",
"25% 12.000000 45.902500 3.041000 1577.532500 68.880000 \n",
"50% 23.000000 60.710000 3.513000 4743.580000 364.570000 \n",
"75% 34.000000 73.880000 3.743000 8923.310000 2153.350000 \n",
"max 45.000000 101.950000 4.468000 103184.980000 104519.540000 \n",
"\n",
" MarkDown3 MarkDown4 MarkDown5 CPI Unemployment \n",
"count 3613.000000 3464.000000 4050.000000 7605.000000 7605.000000 \n",
"mean 1760.100180 3292.935886 4132.216422 172.460809 7.826821 \n",
"std 11276.462208 6792.329861 13086.690278 39.738346 1.877259 \n",
"min -179.260000 0.220000 -185.170000 126.064000 3.684000 \n",
"25% 6.600000 304.687500 1440.827500 132.364839 6.634000 \n",
"50% 36.260000 1176.425000 2727.135000 182.764003 7.806000 \n",
"75% 163.150000 3310.007500 4832.555000 213.932412 8.567000 \n",
"max 149483.310000 67474.850000 771448.100000 228.976456 14.313000 "
]
},
"execution_count": 183,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features.describe()"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Dept</th>\n",
" <th>Weekly_Sales</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>421570.000000</td>\n",
" <td>421570.000000</td>\n",
" <td>421570.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>22.200546</td>\n",
" <td>44.260317</td>\n",
" <td>15981.258123</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>12.785297</td>\n",
" <td>30.492054</td>\n",
" <td>22711.183519</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>-4988.940000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>11.000000</td>\n",
" <td>18.000000</td>\n",
" <td>2079.650000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>22.000000</td>\n",
" <td>37.000000</td>\n",
" <td>7612.030000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>33.000000</td>\n",
" <td>74.000000</td>\n",
" <td>20205.852500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>45.000000</td>\n",
" <td>99.000000</td>\n",
" <td>693099.360000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Dept Weekly_Sales\n",
"count 421570.000000 421570.000000 421570.000000\n",
"mean 22.200546 44.260317 15981.258123\n",
"std 12.785297 30.492054 22711.183519\n",
"min 1.000000 1.000000 -4988.940000\n",
"25% 11.000000 18.000000 2079.650000\n",
"50% 22.000000 37.000000 7612.030000\n",
"75% 33.000000 74.000000 20205.852500\n",
"max 45.000000 99.000000 693099.360000"
]
},
"execution_count": 184,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_train.describe()"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Store int64\n",
"Date object\n",
"Temperature float64\n",
"Fuel_Price float64\n",
"MarkDown1 float64\n",
"MarkDown2 float64\n",
"MarkDown3 float64\n",
"MarkDown4 float64\n",
"MarkDown5 float64\n",
"CPI float64\n",
"Unemployment float64\n",
"IsHoliday bool\n",
"dtype: object"
]
},
"execution_count": 185,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Store int64\n",
"Dept int64\n",
"Date object\n",
"Weekly_Sales float64\n",
"IsHoliday bool\n",
"dtype: object"
]
},
"execution_count": 186,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_train.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How total sales change from year to year"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Dept</th>\n",
" <th>Date</th>\n",
" <th>Weekly_Sales</th>\n",
" <th>IsHoliday</th>\n",
" <th>Year</th>\n",
" <th>Month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>24924.50</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>46039.49</td>\n",
" <td>True</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>41595.55</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>19403.54</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>21827.90</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Dept Date Weekly_Sales IsHoliday Year Month\n",
"0 1 1 2010-02-05 24924.50 False 2010 2\n",
"1 1 1 2010-02-12 46039.49 True 2010 2\n",
"2 1 1 2010-02-19 41595.55 False 2010 2\n",
"3 1 1 2010-02-26 19403.54 False 2010 2\n",
"4 1 1 2010-03-05 21827.90 False 2010 3"
]
},
"execution_count": 187,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Convert the 'Date' strings to datetimes, then derive one calendar column per assignment.\n",
"walmart_train['Date'] = pd.to_datetime(walmart_train['Date'])\n",
"walmart_train['Year'] = walmart_train['Date'].dt.year\n",
"walmart_train['Month'] = walmart_train['Date'].dt.month\n",
"walmart_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x1a1a79ec88>"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY8AAAEJCAYAAABsc6siAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADsJJREFUeJzt3X+MZWV9x/H3xwVtDD/8sQjbBV1tiLBELbixVNLGlppQTcQWSTEtXRoamqqFTeEPqsZak6a0IaRptTE0EFdD/VEhdVuxlG5pKAklDAiFzQbZ2lZXFlclXWg1yOK3f9yzOh1nd+53nTt35/J+JZM595znPvc7T87OZ5/7nHsmVYUkSR3Pm3YBkqTVx/CQJLUZHpKkNsNDktRmeEiS2gwPSVKb4SFJajM8JElthockqe2oaRcwKWvXrq0NGzZMuwxJWjXuu+++b1bVCeO0ndnw2LBhA3Nzc9MuQ5JWjST/NW5b37aSJLUZHpKkNsNDktRmeEiS2gwPSVKb4SFJajM8JElthockqc3wkCS1GR6SpDbDQ5LUZnhIktoMD0lSm+EhSWozPCRJbYaHJKltZv8Y1I5v7eA1W18z7TIkHQEe2vzQtEuYOc48JElthockqc3wkCS1GR6SpDbDQ5LUZnhIktoMD0lSm+EhSWozPCRJbYaHJKnN8JAktRkekqQ2w0OS1GZ4SJLaJhYeSU5JckeSnUl2JLli2P+SJLcneXT4/uJh/2lJ7k7ydJKrlupHkjQ9k5x57AeurKrTgbOBdyfZCFwNbK+qU4Htw2OAJ4DLgWvH7EeSNCUTC4+q2lNV9w/bTwE7gfXA+cDWodlW4O1Dm71VdS/wzJj9SJKmZEXWPJJsAM4E7gFOrKo9MAoG4GWH2Y8kaUomHh5JjgFuBrZU1ZOT7CfJZUnmksw9+9Szh/tSkqQlTDQ8khzN6Bf+TVV1y7D760nWDcfXAXsPs58fUlXXV9Wmqtq05tg1P/oPIEla1CSvtgpwA7Czqq6bd2gbsHnY3gx87jD7kSRNyVET7Psc4GLgoSQPDPveC1wDfCbJpcBXgAsBkpwEzAHHAd9LsgXYCLx2sX6q6tYJ1i5JOoSJhUdV3QXkIIfPXaT948DJi7Q9VD+SpCnwE+aSpDbDQ5LUZnhIktoMD0lSm+EhSWozPCRJbYaHJKnN8JAktRkekqQ2w0OS1GZ4SJLaDA9JUpvhIUlqm+Qt2afqjJeewdzmuWmXIUkzyZmHJKnN8JAktRkekqQ2w0OS1GZ4SJLaDA9JUpvhIUlqMzwkSW2GhySpzfCQJLUZHpKkNsNDktRmeEiS2gwPSVKb4SFJajM8JElthockqc3wkCS1GR6SpDbDQ5LUZnhIktoMD0lSm+EhSWozPCRJbYaHJKnN8JAktRkekqQ2w0OS1GZ4SJLaDA9JUpvhIUlqMzwkSW2GhySpzfCQJLUZHpKktqOmXcDEPPZF+ODx065Ckg7ug/umXcFhc+YhSWozPCRJbYaHJKnN8JAktRkekqQ2w0OS1GZ4SJLaDhkeSdYkuWClipEkrQ6HDI+qehbYskK1SJJWiXHetrotyZYk65Icd+Br4pVJko5Y49ye5LeG71fO21fAy5e/HEnSarBkeFTVKStRiCRp9RjrxohJTgM2Aj92YF9V/dWkipIkHdmWXPNI8n7geuCjwC8Cfwq8Y4znnZLkjiQ7k+xIcsWw/yVJbk/y6PD9xcP+05LcneTpJFct6Ou8JI8k2ZXk6sP4OSVJy2icBfNfAX4O2FNVFwOvY7wZy37gyqo6HTgbeHeSjcDVwPaqOhXYPjwGeAK4HLh2fidJ1gAfYRRcG4F3Dv1IkqZknPD4znDJ7v4kxwKPA69a6klVtaeq7h+2nwJ2AuuB84GtQ7OtwNuHNnur6l7gmQVdvQHYVVVfrqrvAp8a+pAkTck4M4gvJnkRcCMwBzwJ3N95kSQbgDOBe4ATq2oPjA
ImycuWePp64KvzHu8Gfqrz+pKk5TXO1VYHLtX9SJLbgOMOzCjGkeQY4GZgS1U9maRb42JPqIO81mXAZQAvP779OpKkMY11b6skFyV5X1XtAr6R5PVjPu9oRsFxU1XdMuz+epJ1w/F1wN4lutkNzL9c+GTgscUaVtX1VbWpqjad8ELDQ5ImZZyrrT7MaMH814Zd/8voyqulnhfgBmBnVV0379A2YPOwvRn43BJd3QucmuSVSZ4PXDT0IUmaknHWPN5YVWcl+SJAVT0x/BJfyjnAxcBDSR4Y9r0XuAb4TJJLga8AFwIkOYnRmspxwPeSbAE2Dm91vQe4DVgD3FhVO8b/ESVJy22c8HgmyfMY1hmSvBT43lJPqqq7WHy9AuDcRdo/zugtqcX6uhW4dYxaJUkrYJw1j48wWrc4IckfAHcBfzzRqiRJR7SDzjyS3Aq8q6o+nuQ+4BcYzSQurKqHV6pASdKR51BvW30M+IckW4E/cZ1BknTAQcOjqj6T5PPAB4C5JJ9g3lrHgiuoJEnPIUstmD/D6NLcFwDHMsZCuSRp9h1qzeM84DpGn6k4q6q+vWJVSZKOaIeaebyP0eK4ax2SpP/nUGseP7OShUiSVo+x7m0lSdJ8hockqc3wkCS1GR6SpDbDQ5LUNs5ddVenHz8TPjg37SokaSY585AktRkekqQ2w0OS1GZ4SJLaDA9JUpvhIUlqMzwkSW2GhySpzfCQJLUZHpKkNsNDktRmeEiS2gwPSVKb4SFJajM8JElthockqc3wkCS1GR6SpDbDQ5LUZnhIktoMD0lSm+EhSWozPCRJbYaHJKnN8JAktRkekqQ2w0OS1GZ4SJLaDA9JUpvhIUlqMzwkSW2GhySpzfCQJLUZHpKkNsNDktR21LQLmJSHvraPDVd/ftplSNKK+c9r3rpir+XMQ5LUZnhIktoMD0lSm+EhSWozPCRJbYaHJKnN8JAktRkekqQ2w0OS1GZ4SJLaDA9JUpvhIUlqMzwkSW2GhySpbWLhkeSUJHck2ZlkR5Irhv0vSXJ7kkeH7y8e9p+W5O4kTye5akFfNybZm+ThSdUrSRrfJGce+4Erq+p04Gzg3Uk2AlcD26vqVGD78BjgCeBy4NpF+voYcN4Ea5UkNUwsPKpqT1XdP2w/BewE1gPnA1uHZluBtw9t9lbVvcAzi/R1J6NwkSQdAVZkzSPJBuBM4B7gxKraA6OAAV62EjVIkpbPxMMjyTHAzcCWqnpywq91WZK5JHPPfnvfJF9Kkp7TJhoeSY5mFBw3VdUtw+6vJ1k3HF8H7F2u16uq66tqU1VtWvPC45erW0nSApO82irADcDOqrpu3qFtwOZhezPwuUnVIEmajEnOPM4BLgZ+PskDw9dbgGuANyd5FHjz8JgkJyXZDfwu8P4ku5McNxz7JHA38Oph/6UTrFuStISjJtVxVd0F5CCHz12k/ePAyQfp653LWJok6UfkJ8wlSW2GhySpzfCQJLUZHpKkNsNDktRmeEiS2gwPSVKb4SFJajM8JElthockqc3wkCS1GR6SpDbDQ5LUNrG76k7ba9Yfz9w1b512GZI0k5x5SJLaDA9JUpvhIUlqMzwkSW2GhySpzfCQJLUZHpKkNsNDktRmeEiS2gwPSVKb4SFJajM8JElthockqc3wkCS1GR6SpDbDQ5LUZnhIktpSVdOuYSKSPAU8Mu06pmwt8M1pFzFljoFjAI4BjDcGr6iqE8bpbGb/DC3wSFVtmnYR05RkzjFwDBwDxwCWfwx820qS1GZ4SJLaZjk8rp92AUcAx8AxAMcAHANY5jGY2QVzSdLkzPLMQ5I0Ias6PJKcl+SRJLuSXL3I8Rck+fRw/J4kG1a+yskaYwwuSfKNJA8MX785jTonKcmNSfYmefggx5Pkz4Yx+rckZ610jZM2xhi8Kcm+eefBB1a6xklLckqSO5LsTLIjyRWLtJnpc2HMMViec6GqVuUXsAb4d+BVwPOBB4GNC9q8C/josH0R8O
lp1z2FMbgE+PC0a53wOPwscBbw8EGOvwX4AhDgbOCeadc8hTF4E/B3065zwmOwDjhr2D4W+NIi/x5m+lwYcwyW5VxYzTOPNwC7qurLVfVd4FPA+QvanA9sHbY/C5ybJCtY46SNMwYzr6ruBJ44RJPzgY/XyL8CL0qybmWqWxljjMHMq6o9VXX/sP0UsBNYv6DZTJ8LY47BsljN4bEe+Oq8x7v54UH6fpuq2g/sA166ItWtjHHGAOCCYYr+2SSnrExpR5Rxx2nW/XSSB5N8IckZ0y5mkoa3qM8E7llw6DlzLhxiDGAZzoXVHB6LzSAWXjo2TpvVbJyf72+BDVX1WuAf+cFM7Llk1s+DcdzP6NYTrwP+HPibKdczMUmOAW4GtlTVkwsPL/KUmTsXlhiDZTkXVnN47Abm/y/6ZOCxg7VJchRwPLM1tV9yDKrqW1X19PDwL4HXr1BtR5JxzpWZVlVPVtX/DNu3AkcnWTvlspZdkqMZ/dK8qapuWaTJzJ8LS43Bcp0Lqzk87gVOTfLKJM9ntCC+bUGbbcDmYfsdwD/VsGI0I5YcgwXv576N0XugzzXbgF8frrQ5G9hXVXumXdRKSnLSgfW+JG9g9G//W9OtankNP98NwM6quu4gzWb6XBhnDJbrXFi1N0asqv1J3gPcxuiqoxurakeSDwFzVbWN0SB+IskuRjOOi6ZX8fIbcwwuT/I2YD+jMbhkagVPSJJPMrqCZG2S3cDvA0cDVNVHgVsZXWWzC/g28BvTqXRyxhiDdwC/nWQ/8B3gohn7jxTAOcDFwENJHhj2vRd4OTxnzoVxxmBZzgU/YS5JalvNb1tJkqbE8JAktRkekqQ2w0OS1GZ4SNIMWOrmmAvaviLJ9uHOE/+c5OTu6xkekjQbPgacN2bbaxnd4+u1wIeAP+q+mOEhSTNgsZtjJvmJJH+f5L4k/5LktOHQRmD7sH0Hh3FDVcNDkmbX9cDvVNXrgauAvxj2PwhcMGz/EnBsktZNY1ftJ8wlSQc33BzxjcBfz/tLFC8Yvl8FfDjJJcCdwNcY3YVibIaHJM2m5wH/XVU/ufBAVT0G/DJ8P2QuqKp93c4lSTNmuBX7fyS5EL7/J3hfN2yvTXLg9//vATd2+zc8JGkGDDfHvBt4dZLdSS4FfhW4NMmDwA5+sDD+JuCRJF8CTgT+sP163hhRktTlzEOS1GZ4SJLaDA9JUpvhIUlqMzwkSW2GhySpzfCQJLUZHpKktv8DUHwGmafRPKYAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Total sales per calendar year, largest first, drawn as a horizontal bar chart.\n",
"# Weekly_Sales is selected *before* aggregating so the sum is not also attempted on the\n",
"# datetime 'Date' column (pandas 2.x raises a TypeError there instead of dropping it).\n",
"Year_sales = (\n",
"    walmart_train.groupby(\"Year\")[\"Weekly_Sales\"]\n",
"    .sum()\n",
"    .sort_values(ascending=False)\n",
")\n",
"Year_sales.plot(kind=\"barh\", title=\"Total weekly sales by year\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How total sales differ between holiday and non-holiday weeks"
]
},
{
"cell_type": "code",
"execution_count": 189,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x1a1a86e198>"
]
},
"execution_count": 189,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY8AAAEJCAYAAABsc6siAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADl9JREFUeJzt3XuMZnV9x/H3h11EioCXpXa5rrQUb+Xmhli2IaZqo4FqKbVggFqFbpogtaXEQi8pmjQllFChROsWsFC1pBVI7CWoRSi0RWGX+0UEEepyKVrlskBLWL79Y86yA2x3nt/snDnPPPt+JZPnnOc58zyfk83OZ87vnPObVBWSJLXYZugAkqSFx/KQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSM8tDktRs8dAB+rJkyZJatmzZ0DEkaUFZs2bND6pql5m2m9jyWLZsGatXrx46hiQtKEkeGGU7h60kSc0sD0lSM8tDktTM8pAkNbM8JEnNLA9JUjPLQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSM8tDktTM8pAkNbM8JEnNLA9JUjPLQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSs8VDB+jNQzfB6Tv3/zmnP97/Z0jSmPHIQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSM8tDktTM8pAkNbM8JEnNLA9JUjPLQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSM8tDktTM8pAkNVs8Hx+S5HXAld3qTwDrge936wdX1bPzkUOSNDfmpTyq6r+BAwCSnA6sq6qzpm+TJECq6vn5yCRJmr1Bh62S/FSS25P8JXAjsEeSx6a9fnSS87vl1ye5LMnqJNcneftQuSVpazcO5zzeDFxQVQcCD25mu3OBM6tqOfCrwPnzEU6S9HLzMmw1g+9U1Q0jbPcuYN+p0S0AXpNk+6p6ZsMTSVYCKwH23DkvfwdJ0pwYh/J4atry88D0n/qvnLYcZji5XlWrgFUAy3ddVHMZUpK00TgMW72gO1n+oyT7JNkGOGLay/8CnLhhJckB851PkjRlrMqj83vAFUxd2rt22vMnAiuS3JrkTuA3hggnSZq6NHboDL1YvuuiWr3yVf1/0OmP9/8ZkjRPkqzpLkzarHE88pAkjTnLQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSM8tDktTM8pAkNbM8JEnNLA9JUjPLQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSM8tDktRs8dABerPrgXD66qFTSNJE8shDktTM8pAkNbM8JEnNLA9JUjPLQ5LUbKTySHJ4EotGkgSMfuRxNHBPkjOTvKnPQJKk8TdSeVTVscCBwHeAzyW5LsnKJDv2mk6SNJZGHoqqqieAS4FLgKXAEcCNSU7qKZskaUyNes7jF5NcDnwd2BY4uKreC+wPnNJjPknSGBp1epIPAH9eVddMf7Kqnk7ykbmPJUkaZyOVR1X92mZeu3Lu4kiSFoJRh63enuSGJOuSPJtkfZIn+g4nSRpPo54wPw/4IHAPsD1wAvAXfYWSJI23kadkr6p7kyyqqvVMXa77Hz3mkiSNsVHL4+kkrwBuTnIm8DCwQ3+xJEnjbNRhq+OARcBHgaeAPYAj+wolSRpvo15t9UC3+Azwif7iSJIWgs2WR5LbgPr/Xq+q/eY8kSRp7M105HF493hi9/g33eMxwNO9JJIkjb3NlseG4aokK6pqxbSXTk3y78An+wwnSRpPo54w3yHJz21YSXIIXm0lSVutUS/VPR64MMnO3fpjgHNaSdJWatSrrdYA+yfZCUhVPd5vLEnSOJvpaqtjq+rzSU5+yfMAVNXZPWaTJI2pmY48NpzX8C8GSpJeMNPVVp
/tHr0xUJL0gpmGrc7d3OtV9VtzG0eStBDMNGy1Zl5SSJIWlJmGrS6avp5kx6mna12vqSRJY23UvyT41iQ3AbcDdyZZk+Qt/UaTJI2rUe8wXwWcXFV7VdWewO8Cf9VfLEnSOBt5epKqumrDSlVdjdOTSNJWa9TpSe5L8kdsnFX3WOC7/USSJI27UY88PgLsAlwGXN4tf7ivUJKk8Tbq3FY/ArynQ5IEzHyT4D+w+b8k+L45TyRJGnszHXmc1T2GqaurTug3jiRpIZjpJsF/3bCcZN30dUnS1mvUE+awmeErSdLWZaZzHq+dtrooyWuYGsICoKp+2FcwSdL4GmVixGJjYdw47bUC9u4jlCRpvM10zuMN8xVEkrRwjDox4ookO3TLxyY5O8me/UaTJI2rUU+YfwZ4Osn+wMeBB9g4VYkkaSszank8V1UFvB84p6rOwb9rLklbrVEnRnwyyWlMTYh4aJJFwLb9xZIkjbNRjzyOAv4XOL6qHgF2A/6st1SSpLE26sSIjwBnT1v/T+DivkJJksbbTDcJPsmm7ywPU3/LfKdeUkmSxtpM93l4UlyS9DItc1tJkgRYHpKkWRj1Ut0F57YHH2fZqf80dAxJmlf3n3HYvHyORx6SpGaWhySpmeUhSWpmeUiSmlkekqRmlockqZnlIUlqZnlIkppZHpKkZpaHJKmZ5SFJamZ5SJKaWR6SpGaWhySpmeUhSWpmeUiSmlkekqRmlockqZnlIUlqZnlIkppZHpKkZpaHJKmZ5SFJatZbeSRZn+TmaV/LNrPtsiS395VFkjS3Fvf43s9U1QE9vr8kaSDzOmzVHWFcm+TG7uuQTWzzliTXd0crtybZp3v+2GnPfzbJovnMLknaqM/y2H7akNXl3XOPAu+uqoOAo4BzN/F9vwmc0x21LAfWJnlTt/2K7vn1wDEv/cYkK5OsTrJ6/dOP97FPkiTmf9hqW+C8JBsK4Kc38X3XAX+QZHfgsqq6J8k7gbcBNyQB2J6pInqRqloFrALYbuk+NWd7Ikl6kT7LY1N+B/gvYH+mjnr+56UbVNUXk3wTOAz4SpITgAAXVdVp8xlWkrRp832p7s7Aw1X1PHAc8LLzFkn2Bu6rqnOBLwP7AVcCv5Lkx7ttXptkr/mLLUmabr7L49PAh5J8g6khq6c2sc1RwO1JbgbeCFxcVXcCfwh8NcmtwNeApfOUWZL0EqmazFMD2y3dp5Z+6FNDx5CkeXX/GYdt0fcnWVNVy2fazjvMJUnNLA9JUjPLQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSM8tDktTM8pAkNbM8JEnNLA9JUjPLQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSs8VDB+jLz+y2M6vPOGzoGJI0kTzykCQ1szwkSc0sD0lSM8tDktTM8pAkNbM8JEnNLA9JUjPLQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc0sD0lSM8tDktTM8pAkNbM8JEnNLA9JUjPLQ5LUzPKQJDWzPCRJzSwPSVIzy0OS1MzykCQ1szwkSc1SVUNn6EWSJ4G7h84xR5YAPxg6xByapP1xX8aT+zJ7e1XVLjNttHg+kgzk7qpaPnSIuZBk9aTsC0zW/rgv48l96Z/DVpKkZpaHJKnZJJfHqqEDzKFJ2heYrP1xX8aT+9KziT1hLknqzyQfeUiSejKR5ZHkPUnuTnJvklOHzjNbSS5M8miS24fOsqWS7JHkqiR3JbkjyceGzjRbSV6Z5Pokt3T78omhM22pJIuS3JTkH4fOsqWS3J/ktiQ3J1k9dJ4tkeTVSb6U5Fvd/52fHTrTBhM3bJVkEfBt4N3AWuAG4INVdeegwWYhyaHAOuDiqnrr0Hm2RJKlwNKqujHJjsAa4JcW6L9LgB2qal2SbYF/Az5WVd8YONqsJTkZWA7sVFWHD51nSyS5H1heVQv+Po8kFwHXVtX5SV4B/F
hVPTZ0LpjMI4+DgXur6r6qeha4BHj/wJlmpaquAX44dI65UFUPV9WN3fKTwF3AbsOmmp2asq5b3bb7WrC/hSXZHTgMOH/oLNooyU7AocAFAFX17LgUB0xmeewGfG/a+loW6A+pSZVkGXAg8M1hk8xeN8xzM/Ao8LWqWrD7AnwK+Djw/NBB5kgBX02yJsnKocNsgb2B7wOf64YUz0+yw9ChNpjE8sgmnluwvxVOmiSvAi4Ffruqnhg6z2xV1fqqOgDYHTg4yYIcVkxyOPBoVa0ZOsscWlFVBwHvBU7shn8XosXAQcBnqupA4ClgbM7hTmJ5rAX2mLa+O/DQQFk0TXd+4FLgC1V12dB55kI3jHA18J6Bo8zWCuB93XmCS4CfT/L5YSNtmap6qHt8FLicqaHshWgtsHbaUe2XmCqTsTCJ5XEDsE+SN3QnmI4Gvjxwpq1ed5L5AuCuqjp76DxbIskuSV7dLW8PvAv41rCpZqeqTquq3atqGVP/V75eVccOHGvWkuzQXZBBN8TzC8CCvFqxqh4Bvpdk3+6pdwJjc4HJxE2MWFXPJfko8BVgEXBhVd0xcKxZSfK3wDuAJUnWAn9cVRcMm2rWVgDHAbd15woAfr+q/nnATLO1FLiou7JvG+DvqmrBX+I6IV4PXD71uwqLgS9W1RXDRtoiJwFf6H4Rvg/48MB5XjBxl+pKkvo3icNWkqSeWR6SpGaWhySpmeUhSWpmeUjSBGiZSDXJXkmuTHJrkqu7KWqaWB6SNBn+mtFvVj2LqQlX9wM+Cfxp64dZHpI0ATY1kWqSn0xyRTfP17VJ3ti99Gbgym75KmYxeazlIUmTaxVwUlW9DTgF+HT3/C3Akd3yEcCOSV7X8sYTd4e5JOmFSUgPAf6+u+MeYLvu8RTgvCS/DlwDPAg81/L+lockTaZtgMe62Z9fpJs88pfhhZI5sqoeb31zSdKE6f7kwXeTfACmJidNsn+3vCTJhp//pwEXtr6/5SFJE6CbSPU6YN8ka5McDxwDHJ/kFuAONp4Yfwdwd5JvMzWZ5J80f54TI0qSWnnkIUlqZnlIkppZHpKkZpaHJKmZ5SFJamZ5SJKaWR6SpGaWhySp2f8B1w0p5lHLitsAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Total sales in holiday weeks vs. non-holiday weeks, largest first.\n",
"# Weekly_Sales is selected *before* aggregating so the sum is not also attempted on the\n",
"# datetime 'Date' column (pandas 2.x raises a TypeError there instead of dropping it).\n",
"cols2plot = (\n",
"    walmart_train.groupby(\"IsHoliday\")[\"Weekly_Sales\"]\n",
"    .sum()\n",
"    .sort_values(ascending=False)\n",
")\n",
"cols2plot.plot(kind=\"barh\", title=\"Total weekly sales: holiday vs. non-holiday weeks\")"
]
},
{
"cell_type": "code",
"execution_count": 190,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# A scatter plot shows whether two paired variables are related.\n",
"# Here: fuel price against the consumer price index (CPI).\n",
"\n",
"fuel_price = walmart_features['Fuel_Price']\n",
"consumer_price_index = walmart_features['CPI']\n",
"\n",
"colors = (0, 0, 0)  # one RGB colour (black) shared by every point\n",
"area = np.pi * 3  # marker size passed to `s`\n",
"\n",
"# Pass the colour through `color=` rather than `c=`: `c` may treat a bare 3-tuple as\n",
"# per-point values to colour-map, which is what matplotlib warned about.\n",
"plt.scatter(fuel_price, consumer_price_index, s=area, color=colors, alpha=0.5)\n",
"plt.title('Fuel Price-CPI')\n",
"plt.xlabel('Fuel Price')\n",
"plt.ylabel('CPI')\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"From this scatter plot we can conclude that fuel price and CPI are not strongly correlated."
]
},
{
"cell_type": "code",
"execution_count": 191,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Check whether there is a relationship between the consumer price index and unemployment.\n",
"CPI = walmart_features['CPI']\n",
"unemployment = walmart_features['Unemployment']\n",
"\n",
"colors = (0, 0, 0)  # one RGB colour (black) shared by every point\n",
"area = np.pi * 3  # marker size passed to `s`\n",
"\n",
"# Pass the colour through `color=` rather than `c=`: `c` may treat a bare 3-tuple as\n",
"# per-point values to colour-map, which is what matplotlib warned about.\n",
"plt.scatter(CPI, unemployment, s=area, color=colors, alpha=0.5)\n",
"plt.title('CPI-Unemployment')\n",
"plt.xlabel('Consumer price index')\n",
"plt.ylabel('Unemployment')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It can be concluded that the Consumer Price Index and unemployment at Walmart stores (in various states) are not correlated."
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Check whether there is a relationship between fuel price and unemployment.\n",
"fuel_price = walmart_features['Fuel_Price']\n",
"unemployment = walmart_features['Unemployment']\n",
"\n",
"colors = (0, 0, 0)  # one RGB colour (black) shared by every point\n",
"area = np.pi * 3  # marker size passed to `s`\n",
"\n",
"# Pass the colour through `color=` rather than `c=`: `c` may treat a bare 3-tuple as\n",
"# per-point values to colour-map, which is what matplotlib warned about.\n",
"plt.scatter(fuel_price, unemployment, s=area, color=colors, alpha=0.5)\n",
"plt.title('Fuel Price-Unemployment')\n",
"plt.xlabel('Fuel Price')\n",
"plt.ylabel('Unemployment')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The scatter plot shows that fuel price and unemployment at Walmart stores (in various states) are not strongly correlated."
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Dept</th>\n",
" <th>Date</th>\n",
" <th>Weekly_Sales</th>\n",
" <th>IsHoliday</th>\n",
" <th>Year</th>\n",
" <th>Month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>24924.50</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>46039.49</td>\n",
" <td>True</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>41595.55</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>19403.54</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>21827.90</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Dept Date Weekly_Sales IsHoliday Year Month\n",
"0 1 1 2010-02-05 24924.50 False 2010 2\n",
"1 1 1 2010-02-12 46039.49 True 2010 2\n",
"2 1 1 2010-02-19 41595.55 False 2010 2\n",
"3 1 1 2010-02-26 19403.54 False 2010 2\n",
"4 1 1 2010-03-05 21827.90 False 2010 3"
]
},
"execution_count": 193,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### We want each store's total weekly sales for every week, so we aggregate away the department dimension of the walmart_train dataset."
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2010-02-05 00:00:00\n",
"2012-10-26 00:00:00\n"
]
}
],
"source": [
"# Collapse the department dimension: one row per (Date, Store) holding the summed Weekly_Sales.\n",
"mindate = walmart_train['Date'].min()\n",
"maxdate = walmart_train['Date'].max()\n",
"print(mindate)\n",
"print(maxdate)\n",
"\n",
"# Sorted unique dates and stores; unique_date is reused by later cells.\n",
"unique_date = np.unique(walmart_train['Date'])\n",
"unique_store = np.unique(walmart_train['Store'])\n",
"\n",
"copdata = walmart_train.groupby(['Date', 'Store']).sum()\n",
"# 'Date' and 'Store' are index levels after the groupby, so only Weekly_Sales survives this filter.\n",
"newdata = copdata.filter(items=['Store', 'Date', 'Weekly_Sales'])\n",
"df = newdata\n",
"\n",
"# Read each row's date from its own index entry instead of writing it position by position\n",
"# through chained indexing, which silently assumed exactly 45 stores for every date.\n",
"df['Date'] = df.index.get_level_values('Date')\n",
"df['CPI'] = 0.0  # placeholder, filled from walmart_features in a later cell"
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {},
"outputs": [],
"source": [
"# Dates of walmart_features that fall inside the training window (up to 2012-10-26).\n",
"x = datetime.datetime(2012, 10, 26)\n",
"dates = pd.to_datetime(walmart_features['Date'])\n",
"copdates = dates.to_frame()\n",
"Dates = copdates[copdates['Date'] <= x]\n",
"sync = walmart_features.filter(items=['Date', 'CPI'])\n",
"\n",
"# Expose the store id as an ordinary column, read straight from the (Date, Store) index.\n",
"# A 45 x 45 positional loop would label only the first 2025 rows and leave every other\n",
"# row at a placeholder 0, which skews any statistic computed on 'Store'.\n",
"df['Store'] = df.index.get_level_values('Store')"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Weekly_Sales</th>\n",
" <th>Date</th>\n",
" <th>CPI</th>\n",
" <th>Store</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date</th>\n",
" <th>Store</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">2010-02-05</th>\n",
" <th>1</th>\n",
" <td>1643690.90</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2136989.46</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>461622.22</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2135143.87</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>317173.10</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Weekly_Sales Date CPI Store\n",
"Date Store \n",
"2010-02-05 1 1643690.90 2010-02-05 0.0 1\n",
" 2 2136989.46 2010-02-05 0.0 2\n",
" 3 461622.22 2010-02-05 0.0 3\n",
" 4 2135143.87 2010-02-05 0.0 4\n",
" 5 317173.10 2010-02-05 0.0 5"
]
},
"execution_count": 196,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Date</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>MarkDown1</th>\n",
" <th>MarkDown2</th>\n",
" <th>MarkDown3</th>\n",
" <th>MarkDown4</th>\n",
" <th>MarkDown5</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" <th>IsHoliday</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Date Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 \\\n",
"0 1 2010-02-05 42.31 2.572 NaN NaN NaN \n",
"1 1 2010-02-12 38.51 2.548 NaN NaN NaN \n",
"2 1 2010-02-19 39.93 2.514 NaN NaN NaN \n",
"3 1 2010-02-26 46.63 2.561 NaN NaN NaN \n",
"4 1 2010-03-05 46.50 2.625 NaN NaN NaN \n",
"\n",
" MarkDown4 MarkDown5 CPI Unemployment IsHoliday \n",
"0 NaN NaN 211.096358 8.106 False \n",
"1 NaN NaN 211.242170 8.106 True \n",
"2 NaN NaN 211.289143 8.106 False \n",
"3 NaN NaN 211.319643 8.106 False \n",
"4 NaN NaN 211.350143 8.106 False "
]
},
"execution_count": 197,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Goal: find the CPI for every (date, store) pair in df, i.e. 143 * 45 values.\n",
"# Rows of walmart_features dated after maxdate are dropped first, so that the remaining\n",
"# CPI values can be taken consecutively.\n",
"\n",
"walmart_features['Date'] = pd.to_datetime(walmart_features['Date'])\n",
"within_training_window = walmart_features['Date'] <= maxdate\n",
"walmart_features = walmart_features[within_training_window]\n",
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>Weekly_Sales</th>\n",
" <th>Date</th>\n",
" <th>CPI</th>\n",
" <th>Store</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date</th>\n",
" <th>Store</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">2010-02-05</th>\n",
" <th>1</th>\n",
" <td>1643690.90</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2136989.46</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>461622.22</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2135143.87</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>317173.10</td>\n",
" <td>2010-02-05</td>\n",
" <td>0.0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Weekly_Sales Date CPI Store\n",
"Date Store \n",
"2010-02-05 1 1643690.90 2010-02-05 0.0 1\n",
" 2 2136989.46 2010-02-05 0.0 2\n",
" 3 461622.22 2010-02-05 0.0 3\n",
" 4 2135143.87 2010-02-05 0.0 4\n",
" 5 317173.10 2010-02-05 0.0 5"
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {},
"outputs": [],
"source": [
"# Copy the CPI of every (Date, Store) pair from walmart_features into df.\n",
"walmart_features = walmart_features.reset_index(drop=True)\n",
"\n",
"# Match rows on their (Date, Store) key rather than on row position: a positional copy has\n",
"# to assume exactly 143 consecutive rows per store and 45 stores per date, and writing via\n",
"# chained indexing (df['CPI'][...] = ...) is not guaranteed by pandas to modify df.\n",
"cpi_by_date_store = (\n",
"    walmart_features\n",
"    .assign(Date=pd.to_datetime(walmart_features['Date']))\n",
"    .set_index(['Date', 'Store'])['CPI']\n",
")\n",
"df['CPI'] = cpi_by_date_store.reindex(df.index).values\n",
"\n",
"# Fail loudly if some (Date, Store) pair had no CPI entry to copy.\n",
"assert df['CPI'].notna().all(), 'missing CPI for some (Date, Store) rows'"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Weekly_Sales</th>\n",
" <th>CPI</th>\n",
" <th>Store</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>6.435000e+03</td>\n",
" <td>6435.000000</td>\n",
" <td>6435.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.046965e+06</td>\n",
" <td>171.578394</td>\n",
" <td>7.237762</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>5.643666e+05</td>\n",
" <td>39.356712</td>\n",
" <td>12.930052</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>2.099863e+05</td>\n",
" <td>126.064000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>5.533501e+05</td>\n",
" <td>131.735000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>9.607460e+05</td>\n",
" <td>182.616521</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.420159e+06</td>\n",
" <td>212.743293</td>\n",
" <td>10.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>3.818686e+06</td>\n",
" <td>227.232807</td>\n",
" <td>45.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Weekly_Sales CPI Store\n",
"count 6.435000e+03 6435.000000 6435.000000\n",
"mean 1.046965e+06 171.578394 7.237762\n",
"std 5.643666e+05 39.356712 12.930052\n",
"min 2.099863e+05 126.064000 0.000000\n",
"25% 5.533501e+05 131.735000 0.000000\n",
"50% 9.607460e+05 182.616521 0.000000\n",
"75% 1.420159e+06 212.743293 10.000000\n",
"max 3.818686e+06 227.232807 45.000000"
]
},
"execution_count": 200,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NOW WE ARE GOING TO PERFORM HYPOTHESIS TESTING"
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Split the rows of df at the mean CPI and compare the two Weekly_Sales distributions.\n",
"# The threshold is computed from df so it always matches the data being split.\n",
"cpi_mean = df['CPI'].mean()\n",
"\n",
"cop1_df = df[df['CPI'] < cpi_mean]\n",
"cop2_df = df[df['CPI'] >= cpi_mean]\n",
"\n",
"# Explicit labels so the legend states which side of the mean each curve represents.\n",
"ax = sns.kdeplot(cop1_df['Weekly_Sales'], label='CPI < mean')\n",
"sns.kdeplot(cop2_df['Weekly_Sales'], label='CPI >= mean', ax=ax)\n",
"ax.set_title('Weekly sales distribution by CPI group')\n",
"ax.set_xlabel('Weekly sales')\n",
"ax.legend()\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 202,
"metadata": {},
"outputs": [],
"source": [
"from scipy import stats"
]
},
{
"cell_type": "code",
"execution_count": 203,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.391726118107853e-07"
]
},
"execution_count": 203,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Welch's t-test (equal_var=False): two-sided test of the null hypothesis that the\n",
"# two independent samples have identical average (expected) values.\n",
"cop1_values = cop1_df['Weekly_Sales'].values\n",
"cop2_values = cop2_df['Weekly_Sales'].values\n",
"\n",
"t_statistic, p_value = stats.ttest_ind(cop1_values, cop2_values, equal_var=False)\n",
"p_value"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the p-value is below 0.05, we reject the null hypothesis that the below-mean-CPI and above-mean-CPI groups have the same average weekly sales."
]
},
{
"cell_type": "code",
"execution_count": 204,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Date</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>MarkDown1</th>\n",
" <th>MarkDown2</th>\n",
" <th>MarkDown3</th>\n",
" <th>MarkDown4</th>\n",
" <th>MarkDown5</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" <th>IsHoliday</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Date Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 \\\n",
"0 1 2010-02-05 42.31 2.572 NaN NaN NaN \n",
"1 1 2010-02-12 38.51 2.548 NaN NaN NaN \n",
"2 1 2010-02-19 39.93 2.514 NaN NaN NaN \n",
"3 1 2010-02-26 46.63 2.561 NaN NaN NaN \n",
"4 1 2010-03-05 46.50 2.625 NaN NaN NaN \n",
"\n",
" MarkDown4 MarkDown5 CPI Unemployment IsHoliday \n",
"0 NaN NaN 211.096358 8.106 False \n",
"1 NaN NaN 211.242170 8.106 True \n",
"2 NaN NaN 211.289143 8.106 False \n",
"3 NaN NaN 211.319643 8.106 False \n",
"4 NaN NaN 211.350143 8.106 False "
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Linear Regression"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With this experiment we want to see how CPI and fuel price affect unemployment."
]
},
{
"cell_type": "code",
"execution_count": 205,
"metadata": {},
"outputs": [],
"source": [
"# Feature matrix: one row per observation, columns CPI and Fuel_Price (in that order).\n",
"# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same ndarray.\n",
"my_matrix = walmart_features[['CPI', 'Fuel_Price']].values"
]
},
{
"cell_type": "code",
"execution_count": 206,
"metadata": {},
"outputs": [],
"source": [
"# Linear regression: fuel price and CPI are the two explanatory variables.\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"fuel_array = walmart_features['Fuel_Price']\n",
"CPI_array = walmart_features['CPI']\n",
"# Row-wise (fuel price, CPI) pairs.\n",
"lol = [pair for pair in zip(fuel_array, CPI_array)]"
]
},
{
"cell_type": "code",
"execution_count": 207,
"metadata": {},
"outputs": [],
"source": [
"# Linear regression looks for a relationship between one or more features\n",
"# (independent variables) and a continuous target variable (dependent variable).\n",
"# The target is shaped as an (n, 1) column vector. Series.as_matrix() was removed in\n",
"# pandas 1.0; .values returns the same ndarray.\n",
"unemployment = walmart_features['Unemployment'].values.reshape(-1, 1)"
]
},
{
"cell_type": "code",
"execution_count": 208,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAF9NJREFUeJzt3X+QVeddx/HPF1gQQg200JolTaGdNHU3adN0x6nWqeNANS0dE/NXMkBSjcN0Qa0OTqVDhpXaaGY0Vh1ZLNpIUnbSPyq1nUadhtUm/7TVpaYhu2mbagJlScsyAUwCsrB8/WPvhd3l/jj33PPjOee+XzM7sOeevfe7Z89+9rnPec7zmLsLAFB88/IuAACQDAIdAEqCQAeAkiDQAaAkCHQAKAkCHQBKgkAHgJIg0AGgJAh0ACiJBVm+2IoVK3z16tVZviQAFN6hQ4dOuvvKZvtlGuirV6/WyMhIli8JAIVnZkei7EeXCwCUBIEOACVBoANASRDoAFASBDoAlESmo1zisl121TYfYGEO5K/WuSlxfiIfwbfQ6/3C1NsOZKXROcj5iTwEH+gAgGgIdAAoCQIdAEqCQAeAkmga6Gb2iJmdMLPnajz2B2bmZrYinfKA4uLCKLIWpYW+T9Ltczea2VslfUjS0YRrioxfGISOcxRZahro7v60pFdqPPRZSZ+UxIBbAAhArD50M/s1SePu/t0I+242sxEzG5mYmIjzcgCACFoOdDNbImmHpJ1R9nf3ve7e5+59K1c2nZ8dABBTnBb6OyStkfRdM3tJ0vWSvmNmP5NkYQCA1rQc6O5+2N3f7O6r3X21pGOSbnP3HydeXQTrHluXx8sCkXFhFFmJMmzxcUnflHSTmR0zs/vTL+uKZpMcDb84nFElwGxMwIXQNJ1t0d3vafL46sSqAQqmGuq0whEC7hQFgJIofKCvXbM27xIAIAiFD/SD9x7MuwSA/nQEofCBDgCYRqADQEkQ6ABQEgQ6AJREIQK93gUnLkQhJJynyFvTG4tCwS8FioDzFHkqRAsdANAcgQ4AJUGgA0BJFKIPfd1j62rOqkh/JULDuYo8Bd9Cr/cLIjHDHcLCuYq8BR/ozHeOouBcRd6CD3QAQDQEOgCURPCBznznKArOVeQt+EA/eO/Bur8ojBxASDhXkTdzz+5E6+vr85GRkcxeDwDKwMwOuXtfs/2Cb6EDAKIh0AGgJAh0ACgJAh0ASoJAB4CSINABoCQKMduiVHtyo3map6mBqRyqAeqrNxEXY9GRtkK00Ov9glzSJc3fNT/jaoD6Gs2qyIyLSFvTQDezR8zshJk9N2Pbn5nZ98zsWTP7spktS7fM+i7pUl4vDQBBidJC3yfp9jnbnpR0s7u/W9IPJH0q4boAAC1qGuju/rSkV+Zs+7q7X6x8+i1J16dQGwCgBUn0of+mpH9J4HlimVeMywAAkLq20tDMdki6KGmowT6bzWzEzEYmJiZivU690QGMckFoGo1kYZQL0hZptkUzWy3pa+5+84xt90n6uKS17n42yosx2yIAtC7qbIuxxqGb2e2S/lDSL0UNcwBAuqIMW3xc0jcl3WRmx8zsfkl/I+kNkp40s2fM7G9TrhMA0ETTFrq731Nj8+dTqKUh7r5DkdQ6XzlXkbZCDBHh7jsUSb1zknMVaStEoAMAmiPQAaAkCHQAKAkCHQBKohCBzt13KJJ65yTnKtJWmAUu+GVAkXC+Ig+FaKEDAJoj0AGgJAh0ACgJAh0ASoJAB4CSKMwoFyY7QpFwviIPhWihM9kRioTzFXkpRKADAJoj0AGgJAh0ACgJAh0ASqIQo1z6+/q1Z2TPVdsZNYDQNLrwyfkaht7dvRo7OXbV9v6+fg2uH8yhouQE30Lf8sSWmmEuMWoAYSHMw1cvzCVpz8gebXliS8YVJSv4QP/coc/lXQLQVO/u3oaP2y6b9YF81Avzqr2H9mZUSTqCDvShw0
O65JfyLgNoqllQzEWoh2nKp/IuoS1B96HvGN6RdwlAU+seW5d3CaijUZdtLfNtforVpC/oFvqRM0fyLgFoaN1j6zT84nDeZaCGVsNckja/b3NK1WQj6EAHQkeYh6vV/vDF8xczygUAQtRKf3j30m6dfeBsitVkI+hAL3p/FtDIqodX5V1CqbWSH+PbxlOsJDtBXxSN8hd2/q75mhoo9pVpFFO7I1WOv3Y8oUpQy8J5C3Vu6lykfef+LBfMW6B9d+7Thls2pFFaaoJuoUdxSQxrBHC1qGFey8VLF7XpwCYNHR5KsKL0NQ10M3vEzE6Y2XMztr3RzJ40sxcq/y5Pt0wAyJbLCzd0OkoLfZ+k2+ds2y5p2N1vlDRc+RxAC7qXduddApo4euZo3iW0pGmgu/vTkl6Zs/kOSY9W/v+opDsTrkuS1GVdTfeZV/xeI3So468d56akFDX6gxk1N2649oakyslE3DR8i7u/LEmVf9+cXElXTO6cbLoPF0RRZMMvDhPqKRnfNl4z1OdpnqYGptSzoqfh15tMD659MK3yUpF689bMNpvZiJmNTExMpP1yQGaivIOMYvjF4cJdfCuKWiOJqo3A0a2jdUN9wbwF+sJdXyjcKJe4wxZ/YmbXufvLZnadpBP1dnT3vZL2SlJfX19Lc4gygRFCdsEv1H2sy7oaPj7XxgMbJalwARKyRot1V6czHt06mmVJqYvbQv+qpPsq/79P0leSKQcoh8mdky234Is2ogLhiTJs8XFJ35R0k5kdM7P7JT0k6UNm9oKkD1U+BzDD5M5J+YBr/137I+1ftBEVCE/TLhd3v6fOQ2sTrgUojblv9xfPX9z0Rpeijagom1UPr6rZ596zoqcwXTOM+QMycG7qXNMumKKNqCiTemEuTS9esuQzSzKuKJ6gAz2pUQRACC74BfX39dd8bP9d+7kgmqCFn15Y97FaQxmbzatzbupcIYaXBj05VyujBIAiGFw/WPg5t4ugUXbEnRStCHPfBx3oQNnVGlrX39dP6CMWAh3IkO0ydS/tbthKrC6bRqhnp9nPpCiC7kMHyihKcLS6fBpma/X6W71pAmZatmhZOyVlgkAHAtTK8mm4WqN5oOqF/fi28ct3kNZy+vzp4C+MEuhAgFh+sX0+4FeFd5d1RZr0r57QL4zShw4EaPP7NuddQu7qzcXSqBU9VzvhXURBt9AXGH9v0JmeeumpvEvIVaOJ+Zi0r76gA531QtGpxk6O5V0Cali7JuwZT4JuAl9yAh0oi1ot61rdJ3m3wH3Aa9awds1aHbz3YA4VRRd0oAMohyhzkzfaL2ut9NOHJOgulyhY6QVlZbvs8gcQReEDnUUB0AnKHOqNJtLK09DhIS3640Wz/rAyDj1lLAqATrH8oeV5l5CKOJPwpf0HbujwkDYe2KjJS7OHPYa+qHfhA51FAdApTp8/nXcJHaPRO/+Qby4qfKCzKADyUtQLZyHq3d2bdwmzFPWdP6NcAORu7ORYy90o1f2XLVqmU9tPJVrPDdfeoCNnjiT6nFkofAudi6LIS6sX83pW9AR/Y0pa0nw3c/r86bauL8y86Fn9aPTOP+SfYeEDvahvjVB8rV7MGzs5FnT/a5GdPn9aW57Y0vLX1XtXsPHARu2/a78Wzpv9Rzv0m4sK3+WypKsYi7cCSFfSC4NsuGVD4dZ5LXwL/dzFc3mXAGSiZ0VP3iUEr5WFQZr12Rdp/HlV4QOd+V6Ql1ZXxWlHz4oejW4dzez1iiqthUFCH39eVfhAZyEA5CXtubZ9wC9/lCHMs3iHkWYeFCHUCx/oLASAvKR9y/rMt/xluEt0dOto6qGe9tJ9oYd64S+KfuCGD+RdAjpUnFvW46oOzUt6vHXW5r7TiHMLf5d1NTz2c2dw7N3dm+j88iGPVCp8C/1j//SxvEsAMlHGW/9ndivVWgN0rlbXBE06zKtCbaUXPtAvXrqYdwkAEjK5c7JuqMdZ4DmtlZ9CbaW31eViZr8v6bckua
TDkn7D3f8vicIAdKYkLjbP7XbpFLFb6Ga2StLvSupz95slzZd0d1KFAZht2aJleZeAwLXb5bJA0mIzWyBpiaTj7ZfUGm62QKfofkN33iUEI2rrO6186F4a5s8idqC7+7ikP5d0VNLLks64+9eTKiyqMozPBaJIqz+4zBoNlaxeiI3j+GvHg7yTtJ0ul+WS7pC0RlK3pGvMbGON/Tab2YiZjUxMTMSvFABaUJ2sq9YfwqTv8g1lfHo7XS7rJL3o7hPufkHSAUm/MHcnd9/r7n3u3rdy5co2Xg4ArmjWut57aG/dce4X/MLlG8PmJTTYL4SRL+18J0clvd/MlpiZSVor6flkygIwF9eLWtPsrtHqzUmXVJ75oNrpQ/+2pC9J+o6mhyzOkxR9qrOExJkDGSii469mPuag0Dpxnqe23mu4+4C7v8vdb3b3Te5+PqnCoqrOgQyUXRnvFE1T1vM8hbCSUeHvFAXQuRr1oz/10lMNv7Z6YTSpPvR3vumdiTxPOwo/OReAcln32Lq6Fxi7l3ZrfNv4rG09K3pqjmQZOzlW9zHpyh2pUwNTsSYJmyvpFZPioIUOIBiNwlyaHv+96uFVs7Y1Gp/f6LGZId7f199ClfW1smJSGgh0AMGIMvTv+GvJXRyuhvrg+sFEQj3t+dibIdABdDTbZerd3avB9YNt3T1aNXR4KKHKWkegAxnqxBkA09ZofH7UsftjJ8curwzVbn/6pgObcgt1Ah0oCG4sqq3efC1xFtZOYmioy7VjeEfbzxMHo1yAjHUv7W65HzhOOJXZks8s0dkHzl7+PLRjc+TMEfXu7s28LlroQMbGt403nX61v69/1tJsoQVW3s5Nncu7hKbGTo6pd3dvpq9Z+EBPetY0IAvj28bbuikG0eW5MEjWUx4XPtCTWK4KCE2nzn2exkXjU9tPdcxqT/ShAwhG1DnFF89f3NLzntp+atbnSdwZGqLCt9ABlEfUOcXb7UPPavho1iOTCh/oyx9anncJQGz1fuEZothcu8u/+YCnOsXuskXLGOXSKqYURZHVGkPNEMXWtLP8W5pT7J4+f/ryH52s0IcO5Izwbt/wi8MaOjykDbdsaOnrqjMjpr2ugu2yTLp5Ct9CBwBJ2nhgY6xb7qtzuJQBgQ6gNPK65T4UhQ/0EJZ9AqLgQmf6jp45mncJdWXRl174QD9478G8SwAioa+8uXbv/Hb5VQtgdJLCBzqA8pjcOdl2qNda1aiZLU9saes1Q0GgAwjK5M7Jti9StjqbZdqjXLJCoAPoaHHHsIeIQAcywl3NrWk2xXAzUYcwRp1uIAlpd+0Q6EBMrfb1cldza8a3jbfVnx7iEMY9I3tSDXUCHYiJqZvTd8EvxP7aUIcw7j20N7XnJtCBNviAcy9EoJZ0LYm0X9Y/vymfSu25CXSgTQfvPSgfcO2/a3/epWCG1y+8Hql74+C9BzMN9TRneCTQgYRsuGWD+vv68y6jVNoN2qjdG9U/ylnM6ZLmDI9tBbqZLTOzL5nZ98zseTP7+aQKA4pm6PBQacYzh6Ld1nOc7o12R9c0U53hMQ3tttD/StK/uvu7JL1H0vPtl3RFWWZAQ2cIcVRFGcxsPbc6H06c7o3xbeOphvrCTy9M7bljB7qZ/bSkD0r6vCS5+6S7My4LHevImSN5l1B6tRYEaSRu98b4tvHULni3M3KnmXYWuHi7pAlJ/2Bm75F0SNIn3P31RCoDCqR3d2/eJXSM0a2jWv7Q8kjj+tvt3qg3+d+6x9ZlekNSVO10uSyQdJukPe7+XkmvS9o+dycz22xmI2Y2MjEx0cbLAeEaOzkWab+yrjaftVPbT2nZomUN97mm65rUXv8bL30jteduRzuBfkzSMXf/duXzL2k64Gdx973u3ufufStXrmzj5QDgilPbTzV8POqwxTjaGUve7mySjcQOdHf/saQfmdlNlU1rJUVrpgBAApoNnEjrrsy4Y8m7rCvVO4zbHeXyO5KGzOxZSbdK+pP2Sw
KKh9WIwpTWXZlxLrb6gKc+XURbge7uz1S6U97t7ne6e+P3QEBJsRpRfhq1ltO6K3Nw/WDLN5GxBB1QMtxbkbxGreU078ocXD+ot137ttSePw4CHUCh1Wotm0z9ff2p3pUpSQ+ufTDV52+VuWfXYujr6/ORkZGWvqbZ2xRaPAhF1LfUnLPl0kpXStyfvZkdcve+ZvvRQgcyxlj0zhV1FaW4CHQgIbS8O1Mr48o3HdiUaqgT6ECCCPXOM7lzMnKouzzVSdyCDnQW1UVZ0e1SLq2ML09zEregA51FdQEguqADHSiatC96IVwh3C1MoAMJGTo8pI0HNuZdBnIyunU09dWOmgk60JtNjwmEpNWLXfSjl8/4tnEtnr84t9cPOtCbTY8JhOTomaN5l4AAnH3gbG6N0aADHSiSG669Ie8SEIi8GqMEOpCQ0Ob1QL4atdLTWniDQAcSsuGWDdp/1/68y0AgXp18te5jaS28EXygc+cdimTDLRvkAz7roxFuniuvRotrpLXwRvCBDhRdo4UQTp8/TaiXVB4LbxDoQMqavb3mjuhyymPhjQWpPCvQwZZ8ZonOTZ3LuwzkrLq4xp6RPZe3mUwf7/t4agtvBL/AxfKHljdswdDHjpDEDfMsVtdBcZVmgQvejqJI4rbM94zsSW0oGzpH8IEOdIq0hrKhcxDoQCDSGsqGzkGgAwnKc2ImgEAHEnT2gbN1Qz2E+bJRbgQ6kLB6F0bHTo4R6kgVgQ5kaHTraMPHmSMd7SDQgcAQ6oiLQAcCRKgjjrYD3czmm9l/mdnXkigIABBPEi30T0h6PoHnAQC0oa1AN7PrJa2X9PfJlAOUH/MPIS3tttD/UtInJV1KoBagYzSaIx2IK3agm9lHJZ1w90NN9ttsZiNmNjIxMdHy63Qv7Y5bIhCswfWDDUOdVjziiD19rpn9qaRNki5K+ilJPy3pgLtvrPc1cabPlRpf8efER2g4X5G01KfPdfdPufv17r5a0t2S/q1RmAMA0sU4dAAoiUQC3d2/4e4fTeK5aqk3/wXzYiBEnK/ISyFa6KNbR6/6ZehZ0dN0XgwgD5yvyEthFonmlwFFwvmKPBSihQ4AaI5AB4CSINABoCQIdAAoCQIdAEoi9q3/sV7MbELSkRa+ZIWkkymVU0Qcjys4FrNxPGYr2/F4m7uvbLZTpoHeKjMbiTJ/QafgeFzBsZiN4zFbpx4PulwAoCQIdAAoidADfW/eBQSG43EFx2I2jsdsHXk8gu5DBwBEF3oLHQAQUZCBbma3m9n3zeyHZrY973ryYGYvmdlhM3vGzEYq295oZk+a2QuVf5fnXWdazOwRMzthZs/N2Fbz+7dpf105X541s9vyqzwddY7HH5nZeOUcecbMPjLjsU9Vjsf3zexX86k6HWb2VjP7dzN73sxGzewTle0de35UBRfoZjZf0m5JH5bUI+keM+vUiaR/2d1vnTH8arukYXe/UdJw5fOy2ifp9jnb6n3/H5Z0Y+Vjs6Q9GdWYpX26+nhI0mcr58it7v7PklT5fblbUm/lawYrv1dlcVHSNnf/WUnvl7S18j138vkhKcBAl/Rzkn7o7v/j7pOSvijpjpxrCsUdkh6t/P9RSXfmWEuq3P1pSa/M2Vzv+79D0mM+7VuSlpnZddlUmo06x6OeOyR90d3Pu/uLkn6o6d+rUnD3l939O5X/vyrpeUmr1MHnR1WIgb5K0o9mfH6ssq3TuKSvm9khM9tc2fYWd39Zmj6pJb05t+ryUe/77+Rz5rcr3QiPzOiC65jjYWarJb1X0rfF+RFkoNdaMr0Th+J8wN1v0/Tbxa1m9sG8CwpYp54zeyS9Q9Ktkl6W9HBle0ccDzNbKukfJf2eu/9vo11rbCvd8ZDCDPRjkt464/PrJR3PqZbcuPvxyr8nJH1Z02+Zf1J9q1j590R+Feai3vffkeeMu//E3afc/ZKkv9OVbpXSHw8z69J0mA+5+4HK5o4/P0IM9P+UdKOZrTGzhZq+uP
PVnGvKlJldY2ZvqP5f0q9Iek7Tx+G+ym73SfpKPhXmpt73/1VJ91ZGM7xf0pnqW+8ym9MP/OuaPkek6eNxt5ktMrM1mr4Y+B9Z15cWMzNJn5f0vLv/xYyHOD/cPbgPSR+R9ANJ/y1pR9715PD9v13Sdysfo9VjIOlNmr56/0Ll3zfmXWuKx+BxTXcjXNB0C+v+et+/pt9S766cL4cl9eVdf0bH4wuV7/dZTYfWdTP231E5Ht+X9OG860/4WPyiprtMnpX0TOXjI518flQ/uFMUAEoixC4XAEAMBDoAlASBDgAlQaADQEkQ6ABQEgQ6AJQEgQ4AJUGgA0BJ/D+7+beni5C3OgAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# One panel per feature: CPI and fuel price are on very different scales, so plotting both\n",
"# against one shared, unlabelled x-axis makes the two point clouds impossible to compare.\n",
"fig, panels = plt.subplots(1, 2, sharey=True, figsize=(10, 4))\n",
"for panel, column, feature_name in zip(panels, my_matrix.T, ['CPI', 'Fuel_Price']):\n",
"    panel.plot(column, unemployment, 'go')\n",
"    panel.set_xlabel(feature_name)\n",
"panels[0].set_ylabel('Unemployment')\n",
"fig.suptitle('Unemployment against each regression feature')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 209,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n",
" normalize=False)"
]
},
"execution_count": 209,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit unemployment on (CPI, Fuel_Price) using every row. fit() returns the estimator\n",
"# itself, so the cell still ends by displaying the fitted model.\n",
"regression_model = LinearRegression().fit(my_matrix, unemployment)\n",
"regression_model"
]
},
{
"cell_type": "code",
"execution_count": 210,
"metadata": {},
"outputs": [],
"source": [
"# Importing the necessary modules for Linear Regression\n",
"from sklearn import linear_model\n",
"lr = linear_model.LinearRegression()"
]
},
{
"cell_type": "code",
"execution_count": 211,
"metadata": {},
"outputs": [],
"source": [
"# Drop the redundant 'Date' column; errors='ignore' keeps the cell re-runnable once the\n",
"# column is already gone (a plain drop would raise KeyError on the second run).\n",
"df = df.drop(columns=['Date'], errors='ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now apply linear regression to the walmart_features dataset."
]
},
{
"cell_type": "code",
"execution_count": 212,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X, y = my_matrix, unemployment\n",
"\n",
"# Split X and y into train and test parts: 25% of the rows are held out for testing,\n",
"# and random_state pins the shuffle.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.25, random_state=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 213,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n",
" normalize=False)"
]
},
"execution_count": 213,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lr.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 214,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.01523083, -0.33850998]])"
]
},
"execution_count": 214,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We can reach each coefficient of our features with coef_\n",
"lr.coef_"
]
},
{
"cell_type": "code",
"execution_count": 215,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([11.75655358])"
]
},
"execution_count": 215,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# And we can reach intercept point (a14) with intercept_\n",
"lr.intercept_"
]
},
{
"cell_type": "code",
"execution_count": 216,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.10136350699753893"
]
},
"execution_count": 216,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lr.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 217,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[-0.01511768 -0.362929 ]]\n",
"[11.81195473]\n"
]
}
],
"source": [
"print(regression_model.coef_) \n",
"print(regression_model.intercept_)"
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {},
"outputs": [],
"source": [
"# Model predictions for every row: my_matrix @ coef_.T + intercept_, i.e. y = a * x + b.\n",
"# The coefficients belong to the features (my_matrix); multiplying them by the target\n",
"# (unemployment) would not give the model's fitted values.\n",
"line_pred = regression_model.predict(my_matrix)"
]
},
{
"cell_type": "code",
"execution_count": 219,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Actual vs. predicted unemployment. Both axes are in unemployment units, so the quality\n",
"# of the fit can be read directly; predictions are computed here from the features.\n",
"predicted = regression_model.predict(my_matrix)\n",
"fig, axes = plt.subplots()\n",
"axes.plot(unemployment, predicted, 'go', alpha=0.3)\n",
"# Dashed identity line: a perfect model would put every point on it.\n",
"limits = [unemployment.min(), unemployment.max()]\n",
"axes.plot(limits, limits, 'k--')\n",
"axes.set_xlabel('Actual unemployment')\n",
"axes.set_ylabel('Predicted unemployment')\n",
"axes.set_title('Linear regression: actual vs. predicted')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 220,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Date</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>MarkDown1</th>\n",
" <th>MarkDown2</th>\n",
" <th>MarkDown3</th>\n",
" <th>MarkDown4</th>\n",
" <th>MarkDown5</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" <th>IsHoliday</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Date Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 \\\n",
"0 1 2010-02-05 42.31 2.572 NaN NaN NaN \n",
"1 1 2010-02-12 38.51 2.548 NaN NaN NaN \n",
"2 1 2010-02-19 39.93 2.514 NaN NaN NaN \n",
"3 1 2010-02-26 46.63 2.561 NaN NaN NaN \n",
"4 1 2010-03-05 46.50 2.625 NaN NaN NaN \n",
"\n",
" MarkDown4 MarkDown5 CPI Unemployment IsHoliday \n",
"0 NaN NaN 211.096358 8.106 False \n",
"1 NaN NaN 211.242170 8.106 True \n",
"2 NaN NaN 211.289143 8.106 False \n",
"3 NaN NaN 211.319643 8.106 False \n",
"4 NaN NaN 211.350143 8.106 False "
]
},
"execution_count": 220,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 221,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Dept</th>\n",
" <th>Date</th>\n",
" <th>Weekly_Sales</th>\n",
" <th>IsHoliday</th>\n",
" <th>Year</th>\n",
" <th>Month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>24924.50</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>46039.49</td>\n",
" <td>True</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>41595.55</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>19403.54</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>21827.90</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Dept Date Weekly_Sales IsHoliday Year Month\n",
"0 1 1 2010-02-05 24924.50 False 2010 2\n",
"1 1 1 2010-02-12 46039.49 True 2010 2\n",
"2 1 1 2010-02-19 41595.55 False 2010 2\n",
"3 1 1 2010-02-26 19403.54 False 2010 2\n",
"4 1 1 2010-03-05 21827.90 False 2010 3"
]
},
"execution_count": 221,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Decision Tree Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### CPI attribute is used for the decision tree model. Classification is made with respect to mean of CPI values."
]
},
{
"cell_type": "code",
"execution_count": 222,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn import datasets\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 223,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Dept</th>\n",
" <th>Date</th>\n",
" <th>Weekly_Sales</th>\n",
" <th>IsHoliday</th>\n",
" <th>Year</th>\n",
" <th>Month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>24924.50</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>46039.49</td>\n",
" <td>True</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>41595.55</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>19403.54</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>21827.90</td>\n",
" <td>False</td>\n",
" <td>2010</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Dept Date Weekly_Sales IsHoliday Year Month\n",
"0 1 1 2010-02-05 24924.50 False 2010 2\n",
"1 1 1 2010-02-12 46039.49 True 2010 2\n",
"2 1 1 2010-02-19 41595.55 False 2010 2\n",
"3 1 1 2010-02-26 19403.54 False 2010 2\n",
"4 1 1 2010-03-05 21827.90 False 2010 3"
]
},
"execution_count": 223,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from os.path import join\n",
"walmart_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 224,
"metadata": {},
"outputs": [],
"source": [
"#Mapping discrete values to continuous values\n",
"walmart_features['IsHoliday'] = walmart_features['IsHoliday'].map({True : 1, False : 0})"
]
},
{
"cell_type": "code",
"execution_count": 225,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Date</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>MarkDown1</th>\n",
" <th>MarkDown2</th>\n",
" <th>MarkDown3</th>\n",
" <th>MarkDown4</th>\n",
" <th>MarkDown5</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" <th>IsHoliday</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Date Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 \\\n",
"0 1 2010-02-05 42.31 2.572 NaN NaN NaN \n",
"1 1 2010-02-12 38.51 2.548 NaN NaN NaN \n",
"2 1 2010-02-19 39.93 2.514 NaN NaN NaN \n",
"3 1 2010-02-26 46.63 2.561 NaN NaN NaN \n",
"4 1 2010-03-05 46.50 2.625 NaN NaN NaN \n",
"\n",
" MarkDown4 MarkDown5 CPI Unemployment IsHoliday \n",
"0 NaN NaN 211.096358 8.106 0 \n",
"1 NaN NaN 211.242170 8.106 1 \n",
"2 NaN NaN 211.289143 8.106 0 \n",
"3 NaN NaN 211.319643 8.106 0 \n",
"4 NaN NaN 211.350143 8.106 0 "
]
},
"execution_count": 225,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 226,
"metadata": {},
"outputs": [],
"source": [
"walmart_train['IsHoliday'] = walmart_train['IsHoliday'].map({True : 1, False : 0})"
]
},
{
"cell_type": "code",
"execution_count": 227,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Dept</th>\n",
" <th>Date</th>\n",
" <th>Weekly_Sales</th>\n",
" <th>IsHoliday</th>\n",
" <th>Year</th>\n",
" <th>Month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>24924.50</td>\n",
" <td>0</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>46039.49</td>\n",
" <td>1</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>41595.55</td>\n",
" <td>0</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>19403.54</td>\n",
" <td>0</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>21827.90</td>\n",
" <td>0</td>\n",
" <td>2010</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Dept Date Weekly_Sales IsHoliday Year Month\n",
"0 1 1 2010-02-05 24924.50 0 2010 2\n",
"1 1 1 2010-02-12 46039.49 1 2010 2\n",
"2 1 1 2010-02-19 41595.55 0 2010 2\n",
"3 1 1 2010-02-26 19403.54 0 2010 2\n",
"4 1 1 2010-03-05 21827.90 0 2010 3"
]
},
"execution_count": 227,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 228,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Date</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>MarkDown1</th>\n",
" <th>MarkDown2</th>\n",
" <th>MarkDown3</th>\n",
" <th>MarkDown4</th>\n",
" <th>MarkDown5</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" <th>IsHoliday</th>\n",
" <th>Year</th>\n",
" <th>Month</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2010-02-05</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2010-02-12</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>1</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2010-02-19</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2010-02-26</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" <td>2010</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2010-03-05</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" <td>2010</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Date Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 \\\n",
"0 1 2010-02-05 42.31 2.572 NaN NaN NaN \n",
"1 1 2010-02-12 38.51 2.548 NaN NaN NaN \n",
"2 1 2010-02-19 39.93 2.514 NaN NaN NaN \n",
"3 1 2010-02-26 46.63 2.561 NaN NaN NaN \n",
"4 1 2010-03-05 46.50 2.625 NaN NaN NaN \n",
"\n",
" MarkDown4 MarkDown5 CPI Unemployment IsHoliday Year Month \n",
"0 NaN NaN 211.096358 8.106 0 2010 2 \n",
"1 NaN NaN 211.242170 8.106 1 2010 2 \n",
"2 NaN NaN 211.289143 8.106 0 2010 2 \n",
"3 NaN NaN 211.319643 8.106 0 2010 2 \n",
"4 NaN NaN 211.350143 8.106 0 2010 3 "
]
},
"execution_count": 228,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features['Date'] = pd.to_datetime(walmart_features['Date'])\n",
"walmart_features['Year'], walmart_features['Month'] = walmart_features['Date'].dt.year, walmart_features['Date'].dt.month\n",
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 229,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Temperature Fuel_Price CPI Unemployment\n",
"0 1 42.31 2.572 211.096358 8.106\n",
"1 1 38.51 2.548 211.242170 8.106\n",
"2 1 39.93 2.514 211.289143 8.106\n",
"3 1 46.63 2.561 211.319643 8.106\n",
"4 1 46.50 2.625 211.350143 8.106"
]
},
"execution_count": 229,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features = walmart_features.drop('Date',axis = 1)\n",
"walmart_features = walmart_features.drop('Month',axis = 1)\n",
"walmart_features = walmart_features.drop('MarkDown1',axis = 1)\n",
"walmart_features = walmart_features.drop('MarkDown2',axis = 1)\n",
"walmart_features = walmart_features.drop('MarkDown3',axis = 1)\n",
"walmart_features = walmart_features.drop('MarkDown4',axis = 1)\n",
"walmart_features = walmart_features.drop('MarkDown5',axis = 1)\n",
"walmart_features = walmart_features.drop('Year',axis = 1)\n",
"walmart_features = walmart_features.drop('IsHoliday',axis = 1)\n",
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 230,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>cop</th>\n",
" <th>Unemployment</th>\n",
" <th>CPI</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Temperature Fuel_Price cop Unemployment CPI\n",
"0 1 42.31 2.572 211.096358 8.106 15\n",
"1 1 38.51 2.548 211.242170 8.106 15\n",
"2 1 39.93 2.514 211.289143 8.106 15\n",
"3 1 46.63 2.561 211.319643 8.106 15\n",
"4 1 46.50 2.625 211.350143 8.106 15"
]
},
"execution_count": 230,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features = walmart_features.rename(columns={\"CPI\": \"cop\"})\n",
"walmart_features['CPI'] = 15 # a new cpi column is added to walmart_features\n",
"mean = walmart_features['cop'].mean() # classification will be applied with respect to mean\n",
"#mapping is done with this loop indeed\n",
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 231,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"171.5783938487799\n"
]
}
],
"source": [
"#walmart_features['CPI'] = walmart_features['cop']\n",
"print(mean)"
]
},
{
"cell_type": "code",
"execution_count": 232,
"metadata": {},
"outputs": [],
"source": [
"'''for i in range(walmart_features['CPI'].size) :\n",
" if walmart_features['cop'][i] < mean :\n",
" walmart_features['CPI'][i] = 0\n",
" else :\n",
" walmart_features['CPI'][i] = 1'''"
]
},
{
"cell_type": "code",
"execution_count": 278,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Fuel_Price</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" <th>Temperature</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2.572</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2.548</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2.514</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2.561</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2.625</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Fuel_Price CPI Unemployment Temperature\n",
"0 1 2.572 211.096358 8.106 0\n",
"1 1 2.548 211.242170 8.106 0\n",
"2 1 2.514 211.289143 8.106 0\n",
"3 1 2.561 211.319643 8.106 0\n",
"4 1 2.625 211.350143 8.106 0"
]
},
"execution_count": 278,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 286,
"metadata": {},
"outputs": [],
"source": [
"cop_mean = walmart_features[\"CPI\"].mean()\n",
"walmart_features[\"cpi_mod\"] = 0\n",
"walmart_features.loc[walmart_features[\"CPI\"] >= cop_mean, \"cpi_mod\"] = 1"
]
},
{
"cell_type": "code",
"execution_count": 288,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 3289\n",
"0 3146\n",
"Name: cpi_mod, dtype: int64"
]
},
"execution_count": 288,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features[\"cpi_mod\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 289,
"metadata": {},
"outputs": [],
"source": [
"#So, what am i interested is guessing the consumer prıce ındex \n",
"#X = walmart_features.drop('cpi_mod',axis = 1) # these are our features\n",
"\n",
"y = walmart_features['cpi_mod'] # this is what we want to predict."
]
},
{
"cell_type": "code",
"execution_count": 296,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Fuel_Price</th>\n",
" <th>Unemployment</th>\n",
" <th>Temperature</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2.572</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2.548</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2.514</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2.561</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2.625</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Fuel_Price Unemployment Temperature\n",
"0 1 2.572 8.106 10\n",
"1 1 2.548 8.106 10\n",
"2 1 2.514 8.106 10\n",
"3 1 2.561 8.106 10\n",
"4 1 2.625 8.106 10"
]
},
"execution_count": 296,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = walmart_features.drop('CPI',axis =1)\n",
"X = X.drop('cpi_mod',axis = 1) # these are our features\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 243,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
"Name: CPI, dtype: int64"
]
},
"execution_count": 243,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y.head()"
]
},
{
"cell_type": "code",
"execution_count": 297,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 298,
"metadata": {},
"outputs": [],
"source": [
"# This will create us train and test datasets from the original dataset that we have where\n",
"# 70% of original dataframe will be train set and 30% of it will be test set to evaluate the Decision Tree\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.30)"
]
},
{
"cell_type": "code",
"execution_count": 299,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import tree\n",
"model = tree.DecisionTreeClassifier()\n"
]
},
{
"cell_type": "code",
"execution_count": 307,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Fuel_Price</th>\n",
" <th>Unemployment</th>\n",
" <th>Temperature</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2.572</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2.548</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2.514</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2.561</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2.625</td>\n",
" <td>8.106</td>\n",
" <td>10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Fuel_Price Unemployment Temperature\n",
"0 1 2.572 8.106 10\n",
"1 1 2.548 8.106 10\n",
"2 1 2.514 8.106 10\n",
"3 1 2.561 8.106 10\n",
"4 1 2.625 8.106 10"
]
},
"execution_count": 307,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 300,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
" max_features=None, max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n",
" splitter='best')"
]
},
"execution_count": 300,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"#i will map nan values for markdowns which actually menas nothing ? -> bunu sor \n",
"#just drop the date object\n",
"\n",
"walmart_features.size\n",
"model.fit(X_train, y_train) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 301,
"metadata": {},
"outputs": [],
"source": [
"y_predict = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 306,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 306,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"accuracy_score(y_test, y_predict)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It's obvious that our model predicted the CPI values with %100 accuracy"
]
},
{
"cell_type": "code",
"execution_count": 303,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[958, 0],\n",
" [ 0, 973]])"
]
},
"execution_count": 303,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics import confusion_matrix\n",
"confusion_matrix(y_test, y_predict)"
]
},
{
"cell_type": "code",
"execution_count": 304,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Predicted below mean</th>\n",
" <th>Predicted above mean</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>True below mean</th>\n",
" <td>958</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True above mean</th>\n",
" <td>0</td>\n",
" <td>973</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Predicted below mean Predicted above mean\n",
"True below mean 958 0\n",
"True above mean 0 973"
]
},
"execution_count": 304,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(\n",
" confusion_matrix(y_test, y_predict),\n",
" columns=['Predicted below mean', 'Predicted above mean'],\n",
" index=['True below mean', 'True above mean'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Random forest algorithm"
]
},
{
"cell_type": "code",
"execution_count": 244,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Temperature Fuel_Price CPI Unemployment\n",
"0 1 42.31 2.572 211.096358 8.106\n",
"1 1 38.51 2.548 211.242170 8.106\n",
"2 1 39.93 2.514 211.289143 8.106\n",
"3 1 46.63 2.561 211.319643 8.106\n",
"4 1 46.50 2.625 211.350143 8.106"
]
},
"execution_count": 244,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features = walmart_features.drop('CPI',axis = 1)\n",
"walmart_features = walmart_features.rename(index=str, columns={\"cop\": \"CPI\"})\n",
"walmart_features.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Temperature is our target label, but it is currently a continuous numeric value rather than a class label. Let's check it below and map it to binary labels (0 = below the mean temperature, 1 = at or above the mean) before proceeding."
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Temperature</th>\n",
" <th>Fuel_Price</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>42.31</td>\n",
" <td>2.572</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>38.51</td>\n",
" <td>2.548</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>39.93</td>\n",
" <td>2.514</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>46.63</td>\n",
" <td>2.561</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>46.50</td>\n",
" <td>2.625</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Temperature Fuel_Price CPI Unemployment\n",
"0 1 42.31 2.572 211.096358 8.106\n",
"1 1 38.51 2.548 211.242170 8.106\n",
"2 1 39.93 2.514 211.289143 8.106\n",
"3 1 46.63 2.561 211.319643 8.106\n",
"4 1 46.50 2.625 211.350143 8.106"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 258,
"metadata": {},
"outputs": [],
"source": [
"# Binarise the target: 0 when the temperature is below the overall mean, 1 otherwise\n",
"# (a missing temperature is not below the mean, so it is labelled 1).\n",
"# The comparison runs on the whole column at once, so it does not rely on looking rows up\n",
"# by integer position and avoids chained assignment such as walmart_features['COP'][i] = ...,\n",
"# which pandas may apply to a temporary copy instead of the frame itself.\n",
"temp_mean = walmart_features['Temperature'].mean()\n",
"walmart_features['COP'] = np.where(walmart_features['Temperature'] < temp_mean, 0, 1)"
]
},
{
"cell_type": "code",
"execution_count": 260,
"metadata": {},
"outputs": [],
"source": [
"# Remove the raw temperature readings and let the binary 'COP' column\n",
"# carry the 'Temperature' name from here on.\n",
"walmart_features = (\n",
"    walmart_features\n",
"    .drop(columns='Temperature')\n",
"    .rename(index=str, columns={'COP': 'Temperature'})\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 261,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Fuel_Price</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" <th>Temperature</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2.572</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2.548</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2.514</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2.561</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2.625</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Fuel_Price CPI Unemployment Temperature\n",
"0 1 2.572 211.096358 8.106 0\n",
"1 1 2.548 211.242170 8.106 0\n",
"2 1 2.514 211.289143 8.106 0\n",
"3 1 2.561 211.319643 8.106 0\n",
"4 1 2.625 211.350143 8.106 0"
]
},
"execution_count": 261,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 262,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Separate the predictors (A) from the binary target (b).\n",
"# b is kept as a one-column DataFrame because later cells index it by column name.\n",
"is_target = walmart_features.columns == 'Temperature'\n",
"A = walmart_features.loc[:, ~is_target]\n",
"b = walmart_features.loc[:, is_target]\n",
"\n",
"# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.\n",
"A_train, A_test, b_train, b_test = train_test_split(\n",
"    A, b, test_size=0.25, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 265,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# n_estimators is spelled out because the library default is version dependent\n",
"# (10 trees up to scikit-learn 0.21, 100 from 0.22 onwards). 10 is the value shown in the\n",
"# fitted model's recorded repr, so pinning it keeps the recorded results reproducible.\n",
"rf = RandomForestClassifier(n_estimators=10, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 266,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store</th>\n",
" <th>Fuel_Price</th>\n",
" <th>CPI</th>\n",
" <th>Unemployment</th>\n",
" <th>Temperature</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2.572</td>\n",
" <td>211.096358</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2.548</td>\n",
" <td>211.242170</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2.514</td>\n",
" <td>211.289143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2.561</td>\n",
" <td>211.319643</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2.625</td>\n",
" <td>211.350143</td>\n",
" <td>8.106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store Fuel_Price CPI Unemployment Temperature\n",
"0 1 2.572 211.096358 8.106 0\n",
"1 1 2.548 211.242170 8.106 0\n",
"2 1 2.514 211.289143 8.106 0\n",
"3 1 2.561 211.319643 8.106 0\n",
"4 1 2.625 211.350143 8.106 0"
]
},
"execution_count": 266,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walmart_features.head()"
]
},
{
"cell_type": "code",
"execution_count": 267,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n",
" oob_score=False, random_state=42, verbose=0, warm_start=False)"
]
},
"execution_count": 267,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rf.fit(A_train, b_train)"
]
},
{
"cell_type": "code",
"execution_count": 272,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 2567\n",
"0 2259\n",
"Name: Temperature, dtype: int64"
]
},
"execution_count": 272,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b_train[\"Temperature\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 273,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 868\n",
"0 741\n",
"Name: Temperature, dtype: int64"
]
},
"execution_count": 273,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b_test[\"Temperature\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 268,
"metadata": {},
"outputs": [],
"source": [
"pred_rf = rf.predict(A_test)"
]
},
{
"cell_type": "code",
"execution_count": 269,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy of the random forest model: 0.9210689869484152\n"
]
}
],
"source": [
"# Checking the performance of the model with accuracy score;\n",
"from sklearn import metrics\n",
"\n",
"print(\"Accuracy of the random forest model: \",metrics.accuracy_score(b_test, pred_rf))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A random forest combines the predictions of many decision trees. On the held-out test set our model classifies about 92% of the rows correctly, so it works very well."
]
},
{
"cell_type": "code",
"execution_count": 274,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.07348599, 0.30663925, 0.43827941, 0.18159536])"
]
},
"execution_count": 274,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A fitted random forest exposes a feature_importances_ attribute (an array, not a function).\n",
"# Let's inspect it to see how much each feature contributes to the model.\n",
"rf.feature_importances_"
]
},
{
"cell_type": "code",
"execution_count": 277,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 864x864 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Creating a bar plot for feature importances\n",
"\n",
"# Firstly creating a Pandas Series to match feature importances values and their indices, also sorting them in decreasing order\n",
"feature_importances = pd.Series(rf.feature_importances_, index=A.columns).sort_values(ascending=False)\n",
"\n",
"plt.figure(figsize=(12, 12))\n",
"sns.barplot(x=feature_importances, y=feature_importances.index)\n",
"\n",
"# Add labels to our graph \n",
"plt.xlabel('Feature Importance Score')\n",
"plt.ylabel('Features')\n",
"plt.title(\"Feature Importance Rankings\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment