Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save abegehr/f620ee57ee2e53df6235812c41d0f132 to your computer and use it in GitHub Desktop.
Save abegehr/f620ee57ee2e53df6235812c41d0f132 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/kaggle/input/covid19-global-forecasting-week-2/train.csv\n",
"/kaggle/input/covid19-global-forecasting-week-2/submission.csv\n",
"/kaggle/input/covid19-global-forecasting-week-2/test.csv\n",
"/kaggle/input/countryinfo/covid19countryinfo.csv\n",
"/kaggle/input/countryinfo/restrictions.csv\n"
]
}
],
"source": [
"import os\n",
"for dirname, _, filenames in os.walk('/kaggle/input'):\n",
" for filename in filenames:\n",
" print(os.path.join(dirname, filename))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 0. Load Data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
"_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
},
"outputs": [],
"source": [
"submission = pd.read_csv(\"../input/covid19-global-forecasting-week-2/submission.csv\")\n",
"test = pd.read_csv(\"../input/covid19-global-forecasting-week-2/test.csv\", parse_dates=['Date'])\n",
"train = pd.read_csv(\"../input/covid19-global-forecasting-week-2/train.csv\", parse_dates=['Date'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Exploratory Data Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1.1 Input"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1.1 train"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>Province_State</th>\n",
" <th>Country_Region</th>\n",
" <th>Date</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-22</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-23</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-24</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-25</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-26</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Id Province_State Country_Region Date ConfirmedCases Fatalities\n",
"0 1 NaN Afghanistan 2020-01-22 0.0 0.0\n",
"1 2 NaN Afghanistan 2020-01-23 0.0 0.0\n",
"2 3 NaN Afghanistan 2020-01-24 0.0 0.0\n",
"3 4 NaN Afghanistan 2020-01-25 0.0 0.0\n",
"4 5 NaN Afghanistan 2020-01-26 0.0 0.0"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>19698.000000</td>\n",
" <td>19698.000000</td>\n",
" <td>19698.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>14684.000000</td>\n",
" <td>418.251447</td>\n",
" <td>16.258097</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>8487.237333</td>\n",
" <td>3985.736398</td>\n",
" <td>228.217866</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>7334.250000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>14684.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>22033.750000</td>\n",
" <td>22.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>29367.000000</td>\n",
" <td>92472.000000</td>\n",
" <td>10023.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Id ConfirmedCases Fatalities\n",
"count 19698.000000 19698.000000 19698.000000\n",
"mean 14684.000000 418.251447 16.258097\n",
"std 8487.237333 3985.736398 228.217866\n",
"min 1.000000 0.000000 0.000000\n",
"25% 7334.250000 0.000000 0.000000\n",
"50% 14684.000000 0.000000 0.000000\n",
"75% 22033.750000 22.000000 0.000000\n",
"max 29367.000000 92472.000000 10023.000000"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Id int64\n",
"Province_State object\n",
"Country_Region object\n",
"Date datetime64[ns]\n",
"ConfirmedCases float64\n",
"Fatalities float64\n",
"dtype: object\n",
"\n",
"\n",
"Number of Country_Region: 173\n",
"Dates go from day 2020-03-28 00:00:00 to day 2020-01-22 00:00:00 , a total of 67 days\n",
"Countries with Province/State informed: ['Australia' 'Canada' 'China' 'Denmark' 'France' 'Netherlands' 'US'\n",
" 'United Kingdom']\n"
]
}
],
"source": [
"display(train.head(5))\n",
"display(train.describe())\n",
"print(train.dtypes)\n",
"print(\"\\n\")\n",
"print(\"Number of Country_Region: \", train['Country_Region'].nunique())\n",
"print(\"Dates go from day\", max(train['Date']), \"to day\", min(train['Date']), \", a total of\", train['Date'].nunique(), \"days\")\n",
"print(\"Countries with Province/State informed: \", train[train['Province_State'].isna()==False]['Country_Region'].unique())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1.2 test"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" <th>Province_State</th>\n",
" <th>Country_Region</th>\n",
" <th>Date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-23</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ForecastId Province_State Country_Region Date\n",
"0 1 NaN Afghanistan 2020-03-19\n",
"1 2 NaN Afghanistan 2020-03-20\n",
"2 3 NaN Afghanistan 2020-03-21\n",
"3 4 NaN Afghanistan 2020-03-22\n",
"4 5 NaN Afghanistan 2020-03-23"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>12642.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>6321.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3649.575386</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>3161.250000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>6321.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>9481.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>12642.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ForecastId\n",
"count 12642.000000\n",
"mean 6321.500000\n",
"std 3649.575386\n",
"min 1.000000\n",
"25% 3161.250000\n",
"50% 6321.500000\n",
"75% 9481.750000\n",
"max 12642.000000"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of Country_Region: 173\n",
"Dates go from day 2020-04-30 00:00:00 to day 2020-03-19 00:00:00 , a total of 43 days\n",
"Countries with Province/State informed: ['Australia' 'Canada' 'China' 'Denmark' 'France' 'Netherlands' 'US'\n",
" 'United Kingdom']\n"
]
}
],
"source": [
"display(test.head(5))\n",
"display(test.describe())\n",
"print(\"Number of Country_Region: \", test['Country_Region'].nunique())\n",
"print(\"Dates go from day\", max(test['Date']), \"to day\", min(test['Date']), \", a total of\", test['Date'].nunique(), \"days\")\n",
"print(\"Countries with Province/State informed: \", test[test['Province_State'].isna()==False]['Country_Region'].unique())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1.3 Submission"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ForecastId ConfirmedCases Fatalities\n",
"0 1 1 1\n",
"1 2 1 1\n",
"2 3 1 1\n",
"3 4 1 1\n",
"4 5 1 1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>12642.000000</td>\n",
" <td>12642.0</td>\n",
" <td>12642.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>6321.500000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3649.575386</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>3161.250000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>6321.500000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>9481.750000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>12642.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ForecastId ConfirmedCases Fatalities\n",
"count 12642.000000 12642.0 12642.0\n",
"mean 6321.500000 1.0 1.0\n",
"std 3649.575386 0.0 0.0\n",
"min 1.000000 1.0 1.0\n",
"25% 3161.250000 1.0 1.0\n",
"50% 6321.500000 1.0 1.0\n",
"75% 9481.750000 1.0 1.0\n",
"max 12642.000000 1.0 1.0"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(submission.head(5))\n",
"display(submission.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1.2 Cases by Region over Time"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2.1 geo_id = Country_Region + Province_State\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>Province_State</th>\n",
" <th>Country_Region</th>\n",
" <th>Date</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" <th>geo_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-22</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-23</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-24</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-25</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-01-26</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Id Province_State Country_Region Date ConfirmedCases Fatalities \\\n",
"0 1 NaN Afghanistan 2020-01-22 0.0 0.0 \n",
"1 2 NaN Afghanistan 2020-01-23 0.0 0.0 \n",
"2 3 NaN Afghanistan 2020-01-24 0.0 0.0 \n",
"3 4 NaN Afghanistan 2020-01-25 0.0 0.0 \n",
"4 5 NaN Afghanistan 2020-01-26 0.0 0.0 \n",
"\n",
" geo_id \n",
"0 Afghanistan_nan \n",
"1 Afghanistan_nan \n",
"2 Afghanistan_nan \n",
"3 Afghanistan_nan \n",
"4 Afghanistan_nan "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train['geo_id'] = train['Country_Region'].astype(str) + '_' + train['Province_State'].astype(str)\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" <th>Province_State</th>\n",
" <th>Country_Region</th>\n",
" <th>Date</th>\n",
" <th>geo_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-19</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-20</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-21</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-22</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-23</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ForecastId Province_State Country_Region Date geo_id\n",
"0 1 NaN Afghanistan 2020-03-19 Afghanistan_nan\n",
"1 2 NaN Afghanistan 2020-03-20 Afghanistan_nan\n",
"2 3 NaN Afghanistan 2020-03-21 Afghanistan_nan\n",
"3 4 NaN Afghanistan 2020-03-22 Afghanistan_nan\n",
"4 5 NaN Afghanistan 2020-03-23 Afghanistan_nan"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test['geo_id'] = test['Country_Region'].astype(str) + '_' + test['Province_State'].astype(str)\n",
"test.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2.2 combined plot"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"geo_id\n",
"Afghanistan_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Albania_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Algeria_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Andorra_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Angola_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
" ... \n",
"Uzbekistan_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Venezuela_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Vietnam_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Zambia_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Zimbabwe_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Length: 294, dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"fig, ax = plt.subplots()\n",
"train.sort_values(by=\"Date\").groupby('geo_id').plot.line(x='Date', y='ConfirmedCases', ax=ax, legend=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2.3 single plots"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#train.groupby('geo_id').plot.line(x='Date', y='ConfirmedCases')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1.3 China"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Anhui', 'Beijing', 'Chongqing', 'Fujian', 'Gansu', 'Guangdong',\n",
" 'Guangxi', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang', 'Henan',\n",
" 'Hong Kong', 'Hubei', 'Hunan', 'Inner Mongolia', 'Jiangsu',\n",
" 'Jiangxi', 'Jilin', 'Liaoning', 'Macau', 'Ningxia', 'Qinghai',\n",
" 'Shaanxi', 'Shandong', 'Shanghai', 'Shanxi', 'Sichuan', 'Tianjin',\n",
" 'Tibet', 'Xinjiang', 'Yunnan', 'Zhejiang'], dtype=object)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train.Country_Region == \"China\"].Province_State.unique()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Province_State</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Anhui</th>\n",
" <td>330578</td>\n",
" <td>52597.0</td>\n",
" <td>281.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Beijing</th>\n",
" <td>337278</td>\n",
" <td>23817.0</td>\n",
" <td>322.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Chongqing</th>\n",
" <td>343978</td>\n",
" <td>31550.0</td>\n",
" <td>287.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fujian</th>\n",
" <td>350678</td>\n",
" <td>16558.0</td>\n",
" <td>38.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Gansu</th>\n",
" <td>357378</td>\n",
" <td>5956.0</td>\n",
" <td>99.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Guangdong</th>\n",
" <td>364078</td>\n",
" <td>74255.0</td>\n",
" <td>305.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Guangxi</th>\n",
" <td>370778</td>\n",
" <td>13622.0</td>\n",
" <td>94.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Guizhou</th>\n",
" <td>377478</td>\n",
" <td>7575.0</td>\n",
" <td>93.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Hainan</th>\n",
" <td>384178</td>\n",
" <td>9084.0</td>\n",
" <td>267.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Hebei</th>\n",
" <td>390878</td>\n",
" <td>16368.0</td>\n",
" <td>274.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Heilongjiang</th>\n",
" <td>397578</td>\n",
" <td>24411.0</td>\n",
" <td>623.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Henan</th>\n",
" <td>404278</td>\n",
" <td>68185.0</td>\n",
" <td>966.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Hong Kong</th>\n",
" <td>410978</td>\n",
" <td>7589.0</td>\n",
" <td>130.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Hubei</th>\n",
" <td>417678</td>\n",
" <td>3233890.0</td>\n",
" <td>130749.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Hunan</th>\n",
" <td>424378</td>\n",
" <td>55346.0</td>\n",
" <td>178.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Inner Mongolia</th>\n",
" <td>431078</td>\n",
" <td>4021.0</td>\n",
" <td>26.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Jiangsu</th>\n",
" <td>437778</td>\n",
" <td>33468.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Jiangxi</th>\n",
" <td>444478</td>\n",
" <td>49861.0</td>\n",
" <td>48.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Jilin</th>\n",
" <td>451178</td>\n",
" <td>4895.0</td>\n",
" <td>51.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Liaoning</th>\n",
" <td>457878</td>\n",
" <td>6920.0</td>\n",
" <td>55.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Macau</th>\n",
" <td>464578</td>\n",
" <td>773.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ningxia</th>\n",
" <td>471278</td>\n",
" <td>3857.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Qinghai</th>\n",
" <td>477978</td>\n",
" <td>1037.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Shaanxi</th>\n",
" <td>484678</td>\n",
" <td>13407.0</td>\n",
" <td>67.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Shandong</th>\n",
" <td>491378</td>\n",
" <td>37457.0</td>\n",
" <td>256.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Shanghai</th>\n",
" <td>498078</td>\n",
" <td>19653.0</td>\n",
" <td>149.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Shanxi</th>\n",
" <td>504778</td>\n",
" <td>7272.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Sichuan</th>\n",
" <td>511478</td>\n",
" <td>28454.0</td>\n",
" <td>144.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Tianjin</th>\n",
" <td>518178</td>\n",
" <td>7222.0</td>\n",
" <td>145.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Tibet</th>\n",
" <td>524878</td>\n",
" <td>59.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Xinjiang</th>\n",
" <td>531578</td>\n",
" <td>3903.0</td>\n",
" <td>111.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Yunnan</th>\n",
" <td>538278</td>\n",
" <td>9704.0</td>\n",
" <td>77.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zhejiang</th>\n",
" <td>544978</td>\n",
" <td>67430.0</td>\n",
" <td>38.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Id ConfirmedCases Fatalities\n",
"Province_State \n",
"Anhui 330578 52597.0 281.0\n",
"Beijing 337278 23817.0 322.0\n",
"Chongqing 343978 31550.0 287.0\n",
"Fujian 350678 16558.0 38.0\n",
"Gansu 357378 5956.0 99.0\n",
"Guangdong 364078 74255.0 305.0\n",
"Guangxi 370778 13622.0 94.0\n",
"Guizhou 377478 7575.0 93.0\n",
"Hainan 384178 9084.0 267.0\n",
"Hebei 390878 16368.0 274.0\n",
"Heilongjiang 397578 24411.0 623.0\n",
"Henan 404278 68185.0 966.0\n",
"Hong Kong 410978 7589.0 130.0\n",
"Hubei 417678 3233890.0 130749.0\n",
"Hunan 424378 55346.0 178.0\n",
"Inner Mongolia 431078 4021.0 26.0\n",
"Jiangsu 437778 33468.0 0.0\n",
"Jiangxi 444478 49861.0 48.0\n",
"Jilin 451178 4895.0 51.0\n",
"Liaoning 457878 6920.0 55.0\n",
"Macau 464578 773.0 0.0\n",
"Ningxia 471278 3857.0 0.0\n",
"Qinghai 477978 1037.0 0.0\n",
"Shaanxi 484678 13407.0 67.0\n",
"Shandong 491378 37457.0 256.0\n",
"Shanghai 498078 19653.0 149.0\n",
"Shanxi 504778 7272.0 0.0\n",
"Sichuan 511478 28454.0 144.0\n",
"Tianjin 518178 7222.0 145.0\n",
"Tibet 524878 59.0 0.0\n",
"Xinjiang 531578 3903.0 111.0\n",
"Yunnan 538278 9704.0 77.0\n",
"Zhejiang 544978 67430.0 38.0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train.Country_Region == \"China\"].groupby(\"Province_State\").sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.3.1 Hubei, China"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>Province_State</th>\n",
" <th>Country_Region</th>\n",
" <th>Date</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" <th>geo_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4154</th>\n",
" <td>6201</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-01-22</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4155</th>\n",
" <td>6202</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-01-23</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4156</th>\n",
" <td>6203</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-01-24</td>\n",
" <td>549.0</td>\n",
" <td>24.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4157</th>\n",
" <td>6204</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-01-25</td>\n",
" <td>761.0</td>\n",
" <td>40.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4158</th>\n",
" <td>6205</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-01-26</td>\n",
" <td>1058.0</td>\n",
" <td>52.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4216</th>\n",
" <td>6263</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-03-24</td>\n",
" <td>67801.0</td>\n",
" <td>3160.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4217</th>\n",
" <td>6264</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-03-25</td>\n",
" <td>67801.0</td>\n",
" <td>3163.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4218</th>\n",
" <td>6265</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-03-26</td>\n",
" <td>67801.0</td>\n",
" <td>3169.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4219</th>\n",
" <td>6266</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-03-27</td>\n",
" <td>67801.0</td>\n",
" <td>3174.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4220</th>\n",
" <td>6267</td>\n",
" <td>Hubei</td>\n",
" <td>China</td>\n",
" <td>2020-03-28</td>\n",
" <td>67801.0</td>\n",
" <td>3177.0</td>\n",
" <td>China_Hubei</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>67 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" Id Province_State Country_Region Date ConfirmedCases \\\n",
"4154 6201 Hubei China 2020-01-22 444.0 \n",
"4155 6202 Hubei China 2020-01-23 444.0 \n",
"4156 6203 Hubei China 2020-01-24 549.0 \n",
"4157 6204 Hubei China 2020-01-25 761.0 \n",
"4158 6205 Hubei China 2020-01-26 1058.0 \n",
"... ... ... ... ... ... \n",
"4216 6263 Hubei China 2020-03-24 67801.0 \n",
"4217 6264 Hubei China 2020-03-25 67801.0 \n",
"4218 6265 Hubei China 2020-03-26 67801.0 \n",
"4219 6266 Hubei China 2020-03-27 67801.0 \n",
"4220 6267 Hubei China 2020-03-28 67801.0 \n",
"\n",
" Fatalities geo_id \n",
"4154 17.0 China_Hubei \n",
"4155 17.0 China_Hubei \n",
"4156 24.0 China_Hubei \n",
"4157 40.0 China_Hubei \n",
"4158 52.0 China_Hubei \n",
"... ... ... \n",
"4216 3160.0 China_Hubei \n",
"4217 3163.0 China_Hubei \n",
"4218 3169.0 China_Hubei \n",
"4219 3174.0 China_Hubei \n",
"4220 3177.0 China_Hubei \n",
"\n",
"[67 rows x 7 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_hubei = train[train['geo_id'] == \"China_Hubei\"]\n",
"train_hubei"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7fbfebd14cf8>"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"fig, ax = plt.subplots()\n",
"train_hubei.plot.line(x='Date', y='ConfirmedCases', ax=ax, legend=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2 Copy Hubei, China\n",
"Idea: Scale number from Hubei, China by population and translate by date of first infection."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.1 Feature Preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1.1 Population Data for Country_Regions\n",
"> https://www.kaggle.com/koryto/countryinfo"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country_Region</th>\n",
" <th>Province_State</th>\n",
" </tr>\n",
" <tr>\n",
" <th>geo_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan_nan</th>\n",
" <td>Afghanistan</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania_nan</th>\n",
" <td>Albania</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria_nan</th>\n",
" <td>Algeria</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra_nan</th>\n",
" <td>Andorra</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola_nan</th>\n",
" <td>Angola</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan_nan</th>\n",
" <td>Uzbekistan</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela_nan</th>\n",
" <td>Venezuela</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam_nan</th>\n",
" <td>Vietnam</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia_nan</th>\n",
" <td>Zambia</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe_nan</th>\n",
" <td>Zimbabwe</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>294 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" Country_Region Province_State\n",
"geo_id \n",
"Afghanistan_nan Afghanistan NaN\n",
"Albania_nan Albania NaN\n",
"Algeria_nan Algeria NaN\n",
"Andorra_nan Andorra NaN\n",
"Angola_nan Angola NaN\n",
"... ... ...\n",
"Uzbekistan_nan Uzbekistan NaN\n",
"Venezuela_nan Venezuela NaN\n",
"Vietnam_nan Vietnam NaN\n",
"Zambia_nan Zambia NaN\n",
"Zimbabwe_nan Zimbabwe NaN\n",
"\n",
"[294 rows x 2 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"area_features = train[['geo_id', 'Country_Region', 'Province_State']].drop_duplicates().set_index('geo_id')\n",
"area_features"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"countryinfo = pd.read_csv(\"../input/countryinfo/covid19countryinfo.csv\", thousands=',')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"region object\n",
"country object\n",
"alpha3code object\n",
"alpha2code object\n",
"pop float64\n",
"tests float64\n",
"testpop float64\n",
"density float64\n",
"medianage float64\n",
"urbanpop float64\n",
"quarantine object\n",
"schools object\n",
"publicplace object\n",
"gatheringlimit object\n",
"gathering object\n",
"nonessential object\n",
"hospibed float64\n",
"smokers float64\n",
"sex0 float64\n",
"sex14 float64\n",
"sex25 float64\n",
"sex54 float64\n",
"sex64 float64\n",
"sex65plus float64\n",
"sexratio float64\n",
"lung float64\n",
"femalelung float64\n",
"malelung float64\n",
"gdp2019 float64\n",
"healthexp float64\n",
"healthperpop float64\n",
"fertility float64\n",
"dtype: object\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>region</th>\n",
" <th>country</th>\n",
" <th>alpha3code</th>\n",
" <th>alpha2code</th>\n",
" <th>pop</th>\n",
" <th>tests</th>\n",
" <th>testpop</th>\n",
" <th>density</th>\n",
" <th>medianage</th>\n",
" <th>urbanpop</th>\n",
" <th>...</th>\n",
" <th>sex64</th>\n",
" <th>sex65plus</th>\n",
" <th>sexratio</th>\n",
" <th>lung</th>\n",
" <th>femalelung</th>\n",
" <th>malelung</th>\n",
" <th>gdp2019</th>\n",
" <th>healthexp</th>\n",
" <th>healthperpop</th>\n",
" <th>fertility</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>AFG</td>\n",
" <td>AF</td>\n",
" <td>38928346.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>60.0</td>\n",
" <td>18.0</td>\n",
" <td>25.0</td>\n",
" <td>...</td>\n",
" <td>0.97</td>\n",
" <td>0.85</td>\n",
" <td>1.03</td>\n",
" <td>37.62</td>\n",
" <td>36.31</td>\n",
" <td>39.33</td>\n",
" <td>18734.0</td>\n",
" <td>184.0</td>\n",
" <td>4.726633</td>\n",
" <td>5.12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>Albania</td>\n",
" <td>ALB</td>\n",
" <td>AL</td>\n",
" <td>2877797.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>105.0</td>\n",
" <td>36.0</td>\n",
" <td>63.0</td>\n",
" <td>...</td>\n",
" <td>0.95</td>\n",
" <td>0.87</td>\n",
" <td>0.98</td>\n",
" <td>11.67</td>\n",
" <td>7.02</td>\n",
" <td>17.04</td>\n",
" <td>15418.0</td>\n",
" <td>774.0</td>\n",
" <td>268.955733</td>\n",
" <td>1.51</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>Algeria</td>\n",
" <td>DZA</td>\n",
" <td>DZ</td>\n",
" <td>43851044.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>18.0</td>\n",
" <td>29.0</td>\n",
" <td>73.0</td>\n",
" <td>...</td>\n",
" <td>1.01</td>\n",
" <td>0.89</td>\n",
" <td>1.03</td>\n",
" <td>8.77</td>\n",
" <td>5.03</td>\n",
" <td>12.81</td>\n",
" <td>172781.0</td>\n",
" <td>1031.0</td>\n",
" <td>23.511413</td>\n",
" <td>2.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>Andorra</td>\n",
" <td>AND</td>\n",
" <td>AD</td>\n",
" <td>77265.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>164.0</td>\n",
" <td>45.0</td>\n",
" <td>88.0</td>\n",
" <td>...</td>\n",
" <td>1.15</td>\n",
" <td>1.02</td>\n",
" <td>1.06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>91527.0</td>\n",
" <td>5949.0</td>\n",
" <td>76994.758300</td>\n",
" <td>1.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>Antigua and Barbuda</td>\n",
" <td>ATG</td>\n",
" <td>AG</td>\n",
" <td>97929.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>223.0</td>\n",
" <td>34.0</td>\n",
" <td>26.0</td>\n",
" <td>...</td>\n",
" <td>0.82</td>\n",
" <td>0.76</td>\n",
" <td>0.90</td>\n",
" <td>11.76</td>\n",
" <td>7.67</td>\n",
" <td>18.78</td>\n",
" <td>1688.0</td>\n",
" <td>1105.0</td>\n",
" <td>11283.685120</td>\n",
" <td>2.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 32 columns</p>\n",
"</div>"
],
"text/plain": [
" region country alpha3code alpha2code pop tests \\\n",
"0 NaN Afghanistan AFG AF 38928346.0 NaN \n",
"1 NaN Albania ALB AL 2877797.0 NaN \n",
"2 NaN Algeria DZA DZ 43851044.0 NaN \n",
"3 NaN Andorra AND AD 77265.0 NaN \n",
"4 NaN Antigua and Barbuda ATG AG 97929.0 NaN \n",
"\n",
" testpop density medianage urbanpop ... sex64 sex65plus sexratio lung \\\n",
"0 NaN 60.0 18.0 25.0 ... 0.97 0.85 1.03 37.62 \n",
"1 NaN 105.0 36.0 63.0 ... 0.95 0.87 0.98 11.67 \n",
"2 NaN 18.0 29.0 73.0 ... 1.01 0.89 1.03 8.77 \n",
"3 NaN 164.0 45.0 88.0 ... 1.15 1.02 1.06 NaN \n",
"4 NaN 223.0 34.0 26.0 ... 0.82 0.76 0.90 11.76 \n",
"\n",
" femalelung malelung gdp2019 healthexp healthperpop fertility \n",
"0 36.31 39.33 18734.0 184.0 4.726633 5.12 \n",
"1 7.02 17.04 15418.0 774.0 268.955733 1.51 \n",
"2 5.03 12.81 172781.0 1031.0 23.511413 2.70 \n",
"3 NaN NaN 91527.0 5949.0 76994.758300 1.40 \n",
"4 7.67 18.78 1688.0 1105.0 11283.685120 2.00 \n",
"\n",
"[5 rows x 32 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>pop</th>\n",
" <th>tests</th>\n",
" <th>testpop</th>\n",
" <th>density</th>\n",
" <th>medianage</th>\n",
" <th>urbanpop</th>\n",
" <th>hospibed</th>\n",
" <th>smokers</th>\n",
" <th>sex0</th>\n",
" <th>sex14</th>\n",
" <th>...</th>\n",
" <th>sex64</th>\n",
" <th>sex65plus</th>\n",
" <th>sexratio</th>\n",
" <th>lung</th>\n",
" <th>femalelung</th>\n",
" <th>malelung</th>\n",
" <th>gdp2019</th>\n",
" <th>healthexp</th>\n",
" <th>healthperpop</th>\n",
" <th>fertility</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1.870000e+02</td>\n",
" <td>52.000000</td>\n",
" <td>52.000000</td>\n",
" <td>187.000000</td>\n",
" <td>187.000000</td>\n",
" <td>187.000000</td>\n",
" <td>186.000000</td>\n",
" <td>123.000000</td>\n",
" <td>187.000000</td>\n",
" <td>187.000000</td>\n",
" <td>...</td>\n",
" <td>187.000000</td>\n",
" <td>187.000000</td>\n",
" <td>186.000000</td>\n",
" <td>164.000000</td>\n",
" <td>164.000000</td>\n",
" <td>164.000000</td>\n",
" <td>1.890000e+02</td>\n",
" <td>176.000000</td>\n",
" <td>183.000000</td>\n",
" <td>185.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>4.065431e+07</td>\n",
" <td>30192.903846</td>\n",
" <td>22281.534633</td>\n",
" <td>412.743316</td>\n",
" <td>32.084492</td>\n",
" <td>64.087166</td>\n",
" <td>3.299401</td>\n",
" <td>21.986992</td>\n",
" <td>1.051390</td>\n",
" <td>1.045989</td>\n",
" <td>...</td>\n",
" <td>0.959679</td>\n",
" <td>0.796952</td>\n",
" <td>1.007204</td>\n",
" <td>24.505549</td>\n",
" <td>18.920122</td>\n",
" <td>32.088841</td>\n",
" <td>4.617362e+05</td>\n",
" <td>1434.931818</td>\n",
" <td>2355.124042</td>\n",
" <td>2.482649</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>1.505850e+08</td>\n",
" <td>65479.055781</td>\n",
" <td>67986.836475</td>\n",
" <td>2083.936842</td>\n",
" <td>9.455375</td>\n",
" <td>22.164482</td>\n",
" <td>2.806039</td>\n",
" <td>9.005344</td>\n",
" <td>0.026664</td>\n",
" <td>0.030311</td>\n",
" <td>...</td>\n",
" <td>0.261363</td>\n",
" <td>0.155203</td>\n",
" <td>0.207092</td>\n",
" <td>15.475574</td>\n",
" <td>13.932863</td>\n",
" <td>19.537331</td>\n",
" <td>1.949343e+06</td>\n",
" <td>1711.726292</td>\n",
" <td>11877.522514</td>\n",
" <td>1.221939</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>8.010000e+02</td>\n",
" <td>500.000000</td>\n",
" <td>35.588092</td>\n",
" <td>0.000000</td>\n",
" <td>15.200000</td>\n",
" <td>15.000000</td>\n",
" <td>0.100000</td>\n",
" <td>3.700000</td>\n",
" <td>0.940000</td>\n",
" <td>0.970000</td>\n",
" <td>...</td>\n",
" <td>0.580000</td>\n",
" <td>0.450000</td>\n",
" <td>0.840000</td>\n",
" <td>6.290000</td>\n",
" <td>0.690000</td>\n",
" <td>9.210000</td>\n",
" <td>5.930000e+02</td>\n",
" <td>32.000000</td>\n",
" <td>0.172463</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1.552280e+06</td>\n",
" <td>1505.500000</td>\n",
" <td>794.582044</td>\n",
" <td>35.500000</td>\n",
" <td>24.700000</td>\n",
" <td>46.500000</td>\n",
" <td>1.300000</td>\n",
" <td>15.150000</td>\n",
" <td>1.035000</td>\n",
" <td>1.030000</td>\n",
" <td>...</td>\n",
" <td>0.870000</td>\n",
" <td>0.715000</td>\n",
" <td>0.960000</td>\n",
" <td>16.320000</td>\n",
" <td>10.902500</td>\n",
" <td>21.540000</td>\n",
" <td>1.168800e+04</td>\n",
" <td>208.250000</td>\n",
" <td>11.883978</td>\n",
" <td>1.690000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>8.278724e+06</td>\n",
" <td>5950.000000</td>\n",
" <td>2811.814632</td>\n",
" <td>94.000000</td>\n",
" <td>32.000000</td>\n",
" <td>66.000000</td>\n",
" <td>2.600000</td>\n",
" <td>22.450000</td>\n",
" <td>1.050000</td>\n",
" <td>1.050000</td>\n",
" <td>...</td>\n",
" <td>0.950000</td>\n",
" <td>0.790000</td>\n",
" <td>0.990000</td>\n",
" <td>21.295000</td>\n",
" <td>16.485000</td>\n",
" <td>26.865000</td>\n",
" <td>4.417200e+04</td>\n",
" <td>774.000000</td>\n",
" <td>89.112200</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>2.878637e+07</td>\n",
" <td>16121.000000</td>\n",
" <td>5943.719566</td>\n",
" <td>233.000000</td>\n",
" <td>40.000000</td>\n",
" <td>83.000000</td>\n",
" <td>4.575000</td>\n",
" <td>27.350000</td>\n",
" <td>1.060000</td>\n",
" <td>1.060000</td>\n",
" <td>...</td>\n",
" <td>0.995000</td>\n",
" <td>0.860000</td>\n",
" <td>1.010000</td>\n",
" <td>27.117500</td>\n",
" <td>23.050000</td>\n",
" <td>34.890000</td>\n",
" <td>2.469530e+05</td>\n",
" <td>1891.000000</td>\n",
" <td>465.193200</td>\n",
" <td>2.820000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.439324e+09</td>\n",
" <td>320000.000000</td>\n",
" <td>415993.107300</td>\n",
" <td>26337.000000</td>\n",
" <td>62.000000</td>\n",
" <td>100.000000</td>\n",
" <td>14.353400</td>\n",
" <td>42.650000</td>\n",
" <td>1.260000</td>\n",
" <td>1.170000</td>\n",
" <td>...</td>\n",
" <td>3.380000</td>\n",
" <td>1.710000</td>\n",
" <td>3.410000</td>\n",
" <td>114.280000</td>\n",
" <td>94.590000</td>\n",
" <td>174.880000</td>\n",
" <td>2.143945e+07</td>\n",
" <td>9536.000000</td>\n",
" <td>108308.036900</td>\n",
" <td>6.950000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" pop tests testpop density medianage \\\n",
"count 1.870000e+02 52.000000 52.000000 187.000000 187.000000 \n",
"mean 4.065431e+07 30192.903846 22281.534633 412.743316 32.084492 \n",
"std 1.505850e+08 65479.055781 67986.836475 2083.936842 9.455375 \n",
"min 8.010000e+02 500.000000 35.588092 0.000000 15.200000 \n",
"25% 1.552280e+06 1505.500000 794.582044 35.500000 24.700000 \n",
"50% 8.278724e+06 5950.000000 2811.814632 94.000000 32.000000 \n",
"75% 2.878637e+07 16121.000000 5943.719566 233.000000 40.000000 \n",
"max 1.439324e+09 320000.000000 415993.107300 26337.000000 62.000000 \n",
"\n",
" urbanpop hospibed smokers sex0 sex14 ... \\\n",
"count 187.000000 186.000000 123.000000 187.000000 187.000000 ... \n",
"mean 64.087166 3.299401 21.986992 1.051390 1.045989 ... \n",
"std 22.164482 2.806039 9.005344 0.026664 0.030311 ... \n",
"min 15.000000 0.100000 3.700000 0.940000 0.970000 ... \n",
"25% 46.500000 1.300000 15.150000 1.035000 1.030000 ... \n",
"50% 66.000000 2.600000 22.450000 1.050000 1.050000 ... \n",
"75% 83.000000 4.575000 27.350000 1.060000 1.060000 ... \n",
"max 100.000000 14.353400 42.650000 1.260000 1.170000 ... \n",
"\n",
" sex64 sex65plus sexratio lung femalelung malelung \\\n",
"count 187.000000 187.000000 186.000000 164.000000 164.000000 164.000000 \n",
"mean 0.959679 0.796952 1.007204 24.505549 18.920122 32.088841 \n",
"std 0.261363 0.155203 0.207092 15.475574 13.932863 19.537331 \n",
"min 0.580000 0.450000 0.840000 6.290000 0.690000 9.210000 \n",
"25% 0.870000 0.715000 0.960000 16.320000 10.902500 21.540000 \n",
"50% 0.950000 0.790000 0.990000 21.295000 16.485000 26.865000 \n",
"75% 0.995000 0.860000 1.010000 27.117500 23.050000 34.890000 \n",
"max 3.380000 1.710000 3.410000 114.280000 94.590000 174.880000 \n",
"\n",
" gdp2019 healthexp healthperpop fertility \n",
"count 1.890000e+02 176.000000 183.000000 185.000000 \n",
"mean 4.617362e+05 1434.931818 2355.124042 2.482649 \n",
"std 1.949343e+06 1711.726292 11877.522514 1.221939 \n",
"min 5.930000e+02 32.000000 0.172463 0.200000 \n",
"25% 1.168800e+04 208.250000 11.883978 1.690000 \n",
"50% 4.417200e+04 774.000000 89.112200 2.000000 \n",
"75% 2.469530e+05 1891.000000 465.193200 2.820000 \n",
"max 2.143945e+07 9536.000000 108308.036900 6.950000 \n",
"\n",
"[8 rows x 22 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"print(countryinfo.dtypes)\n",
"display(countryinfo.head(5))\n",
"display(countryinfo.describe())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>pop_country</th>\n",
" </tr>\n",
" <tr>\n",
" <th>country</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan</th>\n",
" <td>38928346.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania</th>\n",
" <td>2877797.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria</th>\n",
" <td>43851044.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra</th>\n",
" <td>77265.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Antigua and Barbuda</th>\n",
" <td>97929.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan</th>\n",
" <td>33469203.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela</th>\n",
" <td>28435940.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam</th>\n",
" <td>97338579.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia</th>\n",
" <td>18383955.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe</th>\n",
" <td>14862924.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>175 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" pop_country\n",
"country \n",
"Afghanistan 38928346.0\n",
"Albania 2877797.0\n",
"Algeria 43851044.0\n",
"Andorra 77265.0\n",
"Antigua and Barbuda 97929.0\n",
"... ...\n",
"Uzbekistan 33469203.0\n",
"Venezuela 28435940.0\n",
"Vietnam 97338579.0\n",
"Zambia 18383955.0\n",
"Zimbabwe 14862924.0\n",
"\n",
"[175 rows x 1 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# extract population country data\n",
"pop_data = countryinfo[['country', \"pop\"]].rename(columns={'pop': 'pop_country'}).groupby(\"country\").max()\n",
"pop_data"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country_Region</th>\n",
" <th>Province_State</th>\n",
" <th>pop_country</th>\n",
" </tr>\n",
" <tr>\n",
" <th>geo_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan_nan</th>\n",
" <td>Afghanistan</td>\n",
" <td>NaN</td>\n",
" <td>38928346.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania_nan</th>\n",
" <td>Albania</td>\n",
" <td>NaN</td>\n",
" <td>2877797.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria_nan</th>\n",
" <td>Algeria</td>\n",
" <td>NaN</td>\n",
" <td>43851044.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra_nan</th>\n",
" <td>Andorra</td>\n",
" <td>NaN</td>\n",
" <td>77265.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola_nan</th>\n",
" <td>Angola</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan_nan</th>\n",
" <td>Uzbekistan</td>\n",
" <td>NaN</td>\n",
" <td>33469203.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela_nan</th>\n",
" <td>Venezuela</td>\n",
" <td>NaN</td>\n",
" <td>28435940.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam_nan</th>\n",
" <td>Vietnam</td>\n",
" <td>NaN</td>\n",
" <td>97338579.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia_nan</th>\n",
" <td>Zambia</td>\n",
" <td>NaN</td>\n",
" <td>18383955.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe_nan</th>\n",
" <td>Zimbabwe</td>\n",
" <td>NaN</td>\n",
" <td>14862924.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>294 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Country_Region Province_State pop_country\n",
"geo_id \n",
"Afghanistan_nan Afghanistan NaN 38928346.0\n",
"Albania_nan Albania NaN 2877797.0\n",
"Algeria_nan Algeria NaN 43851044.0\n",
"Andorra_nan Andorra NaN 77265.0\n",
"Angola_nan Angola NaN NaN\n",
"... ... ... ...\n",
"Uzbekistan_nan Uzbekistan NaN 33469203.0\n",
"Venezuela_nan Venezuela NaN 28435940.0\n",
"Vietnam_nan Vietnam NaN 97338579.0\n",
"Zambia_nan Zambia NaN 18383955.0\n",
"Zimbabwe_nan Zimbabwe NaN 14862924.0\n",
"\n",
"[294 rows x 3 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# left join pop_country to area_features\n",
"area_features = area_features.join(pop_data, how='left', on=\"Country_Region\")\n",
"area_features"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of countries with population: 171\n",
"Number of countries without population: 2\n",
"Countries without population: ['Angola' 'Papua New Guinea']\n"
]
}
],
"source": [
"print(\"Number of countries with population: \", area_features[area_features['pop_country'].isna()==False].Country_Region.nunique())\n",
"print(\"Number of countries without population: \", area_features[area_features['pop_country'].isna()==True].Country_Region.nunique())\n",
"print(\"Countries without population: \", area_features[area_features['pop_country'].isna()==True].Country_Region.unique())"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# fill country population (pop_country) NA with 100 000\n",
"area_features['pop_country'] = area_features['pop_country'].fillna(100000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1.2 Population Data for Province_States"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Countries with Province/State informed: ['Australia' 'Canada' 'China' 'Denmark' 'France' 'Netherlands' 'US'\n",
" 'United Kingdom']\n"
]
}
],
"source": [
"print(\"Countries with Province/State informed: \", train[train['Province_State'].isna()==False]['Country_Region'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country_Region</th>\n",
" <th>Province_State</th>\n",
" <th>pop_country</th>\n",
" <th>num_states</th>\n",
" </tr>\n",
" <tr>\n",
" <th>geo_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan_nan</th>\n",
" <td>Afghanistan</td>\n",
" <td>NaN</td>\n",
" <td>38928346.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania_nan</th>\n",
" <td>Albania</td>\n",
" <td>NaN</td>\n",
" <td>2877797.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria_nan</th>\n",
" <td>Algeria</td>\n",
" <td>NaN</td>\n",
" <td>43851044.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra_nan</th>\n",
" <td>Andorra</td>\n",
" <td>NaN</td>\n",
" <td>77265.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola_nan</th>\n",
" <td>Angola</td>\n",
" <td>NaN</td>\n",
" <td>100000.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan_nan</th>\n",
" <td>Uzbekistan</td>\n",
" <td>NaN</td>\n",
" <td>33469203.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela_nan</th>\n",
" <td>Venezuela</td>\n",
" <td>NaN</td>\n",
" <td>28435940.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam_nan</th>\n",
" <td>Vietnam</td>\n",
" <td>NaN</td>\n",
" <td>97338579.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia_nan</th>\n",
" <td>Zambia</td>\n",
" <td>NaN</td>\n",
" <td>18383955.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe_nan</th>\n",
" <td>Zimbabwe</td>\n",
" <td>NaN</td>\n",
" <td>14862924.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>294 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" Country_Region Province_State pop_country num_states\n",
"geo_id \n",
"Afghanistan_nan Afghanistan NaN 38928346.0 1\n",
"Albania_nan Albania NaN 2877797.0 1\n",
"Algeria_nan Algeria NaN 43851044.0 1\n",
"Andorra_nan Andorra NaN 77265.0 1\n",
"Angola_nan Angola NaN 100000.0 1\n",
"... ... ... ... ...\n",
"Uzbekistan_nan Uzbekistan NaN 33469203.0 1\n",
"Venezuela_nan Venezuela NaN 28435940.0 1\n",
"Vietnam_nan Vietnam NaN 97338579.0 1\n",
"Zambia_nan Zambia NaN 18383955.0 1\n",
"Zimbabwe_nan Zimbabwe NaN 14862924.0 1\n",
"\n",
"[294 rows x 4 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# add column num_states per country\n",
"num_states = area_features[['Country_Region', 'Province_State']].fillna(\"\").groupby('Country_Region').count().rename(columns={'Province_State': \"num_states\"})\n",
"area_features = area_features.join(num_states, on=\"Country_Region\")\n",
"area_features"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country_Region</th>\n",
" <th>Province_State</th>\n",
" <th>pop_country</th>\n",
" <th>num_states</th>\n",
" <th>pop</th>\n",
" </tr>\n",
" <tr>\n",
" <th>geo_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan_nan</th>\n",
" <td>Afghanistan</td>\n",
" <td>NaN</td>\n",
" <td>38928346.0</td>\n",
" <td>1</td>\n",
" <td>38928346.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania_nan</th>\n",
" <td>Albania</td>\n",
" <td>NaN</td>\n",
" <td>2877797.0</td>\n",
" <td>1</td>\n",
" <td>2877797.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria_nan</th>\n",
" <td>Algeria</td>\n",
" <td>NaN</td>\n",
" <td>43851044.0</td>\n",
" <td>1</td>\n",
" <td>43851044.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra_nan</th>\n",
" <td>Andorra</td>\n",
" <td>NaN</td>\n",
" <td>77265.0</td>\n",
" <td>1</td>\n",
" <td>77265.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola_nan</th>\n",
" <td>Angola</td>\n",
" <td>NaN</td>\n",
" <td>100000.0</td>\n",
" <td>1</td>\n",
" <td>100000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan_nan</th>\n",
" <td>Uzbekistan</td>\n",
" <td>NaN</td>\n",
" <td>33469203.0</td>\n",
" <td>1</td>\n",
" <td>33469203.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela_nan</th>\n",
" <td>Venezuela</td>\n",
" <td>NaN</td>\n",
" <td>28435940.0</td>\n",
" <td>1</td>\n",
" <td>28435940.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam_nan</th>\n",
" <td>Vietnam</td>\n",
" <td>NaN</td>\n",
" <td>97338579.0</td>\n",
" <td>1</td>\n",
" <td>97338579.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia_nan</th>\n",
" <td>Zambia</td>\n",
" <td>NaN</td>\n",
" <td>18383955.0</td>\n",
" <td>1</td>\n",
" <td>18383955.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe_nan</th>\n",
" <td>Zimbabwe</td>\n",
" <td>NaN</td>\n",
" <td>14862924.0</td>\n",
" <td>1</td>\n",
" <td>14862924.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>294 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" Country_Region Province_State pop_country num_states \\\n",
"geo_id \n",
"Afghanistan_nan Afghanistan NaN 38928346.0 1 \n",
"Albania_nan Albania NaN 2877797.0 1 \n",
"Algeria_nan Algeria NaN 43851044.0 1 \n",
"Andorra_nan Andorra NaN 77265.0 1 \n",
"Angola_nan Angola NaN 100000.0 1 \n",
"... ... ... ... ... \n",
"Uzbekistan_nan Uzbekistan NaN 33469203.0 1 \n",
"Venezuela_nan Venezuela NaN 28435940.0 1 \n",
"Vietnam_nan Vietnam NaN 97338579.0 1 \n",
"Zambia_nan Zambia NaN 18383955.0 1 \n",
"Zimbabwe_nan Zimbabwe NaN 14862924.0 1 \n",
"\n",
" pop \n",
"geo_id \n",
"Afghanistan_nan 38928346.0 \n",
"Albania_nan 2877797.0 \n",
"Algeria_nan 43851044.0 \n",
"Andorra_nan 77265.0 \n",
"Angola_nan 100000.0 \n",
"... ... \n",
"Uzbekistan_nan 33469203.0 \n",
"Venezuela_nan 28435940.0 \n",
"Vietnam_nan 97338579.0 \n",
"Zambia_nan 18383955.0 \n",
"Zimbabwe_nan 14862924.0 \n",
"\n",
"[294 rows x 5 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# fill province_state population (pop) with pop_country / num_states\n",
"area_features['pop'] = area_features['pop_country'] / area_features['num_states']\n",
"area_features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1.3 Date of First Infection per Province_State"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date_of_first_infection</th>\n",
" </tr>\n",
" <tr>\n",
" <th>geo_id</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan_nan</th>\n",
" <td>2020-02-24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania_nan</th>\n",
" <td>2020-03-09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria_nan</th>\n",
" <td>2020-02-25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra_nan</th>\n",
" <td>2020-03-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola_nan</th>\n",
" <td>2020-03-20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan_nan</th>\n",
" <td>2020-03-15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela_nan</th>\n",
" <td>2020-03-14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam_nan</th>\n",
" <td>2020-01-23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia_nan</th>\n",
" <td>2020-03-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe_nan</th>\n",
" <td>2020-03-20</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>294 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" date_of_first_infection\n",
"geo_id \n",
"Afghanistan_nan 2020-02-24\n",
"Albania_nan 2020-03-09\n",
"Algeria_nan 2020-02-25\n",
"Andorra_nan 2020-03-02\n",
"Angola_nan 2020-03-20\n",
"... ...\n",
"Uzbekistan_nan 2020-03-15\n",
"Venezuela_nan 2020-03-14\n",
"Vietnam_nan 2020-01-23\n",
"Zambia_nan 2020-03-18\n",
"Zimbabwe_nan 2020-03-20\n",
"\n",
"[294 rows x 1 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"date_of_first_infection = train[train['ConfirmedCases'] > 0].groupby(['geo_id']).agg({'Date': 'min'}).rename(columns={'Date': 'date_of_first_infection'})\n",
"date_of_first_infection"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country_Region</th>\n",
" <th>Province_State</th>\n",
" <th>pop_country</th>\n",
" <th>num_states</th>\n",
" <th>pop</th>\n",
" <th>date_of_first_infection</th>\n",
" </tr>\n",
" <tr>\n",
" <th>geo_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan_nan</th>\n",
" <td>Afghanistan</td>\n",
" <td>NaN</td>\n",
" <td>38928346.0</td>\n",
" <td>1</td>\n",
" <td>38928346.0</td>\n",
" <td>2020-02-24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania_nan</th>\n",
" <td>Albania</td>\n",
" <td>NaN</td>\n",
" <td>2877797.0</td>\n",
" <td>1</td>\n",
" <td>2877797.0</td>\n",
" <td>2020-03-09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria_nan</th>\n",
" <td>Algeria</td>\n",
" <td>NaN</td>\n",
" <td>43851044.0</td>\n",
" <td>1</td>\n",
" <td>43851044.0</td>\n",
" <td>2020-02-25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra_nan</th>\n",
" <td>Andorra</td>\n",
" <td>NaN</td>\n",
" <td>77265.0</td>\n",
" <td>1</td>\n",
" <td>77265.0</td>\n",
" <td>2020-03-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola_nan</th>\n",
" <td>Angola</td>\n",
" <td>NaN</td>\n",
" <td>100000.0</td>\n",
" <td>1</td>\n",
" <td>100000.0</td>\n",
" <td>2020-03-20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan_nan</th>\n",
" <td>Uzbekistan</td>\n",
" <td>NaN</td>\n",
" <td>33469203.0</td>\n",
" <td>1</td>\n",
" <td>33469203.0</td>\n",
" <td>2020-03-15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela_nan</th>\n",
" <td>Venezuela</td>\n",
" <td>NaN</td>\n",
" <td>28435940.0</td>\n",
" <td>1</td>\n",
" <td>28435940.0</td>\n",
" <td>2020-03-14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam_nan</th>\n",
" <td>Vietnam</td>\n",
" <td>NaN</td>\n",
" <td>97338579.0</td>\n",
" <td>1</td>\n",
" <td>97338579.0</td>\n",
" <td>2020-01-23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia_nan</th>\n",
" <td>Zambia</td>\n",
" <td>NaN</td>\n",
" <td>18383955.0</td>\n",
" <td>1</td>\n",
" <td>18383955.0</td>\n",
" <td>2020-03-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe_nan</th>\n",
" <td>Zimbabwe</td>\n",
" <td>NaN</td>\n",
" <td>14862924.0</td>\n",
" <td>1</td>\n",
" <td>14862924.0</td>\n",
" <td>2020-03-20</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>294 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" Country_Region Province_State pop_country num_states \\\n",
"geo_id \n",
"Afghanistan_nan Afghanistan NaN 38928346.0 1 \n",
"Albania_nan Albania NaN 2877797.0 1 \n",
"Algeria_nan Algeria NaN 43851044.0 1 \n",
"Andorra_nan Andorra NaN 77265.0 1 \n",
"Angola_nan Angola NaN 100000.0 1 \n",
"... ... ... ... ... \n",
"Uzbekistan_nan Uzbekistan NaN 33469203.0 1 \n",
"Venezuela_nan Venezuela NaN 28435940.0 1 \n",
"Vietnam_nan Vietnam NaN 97338579.0 1 \n",
"Zambia_nan Zambia NaN 18383955.0 1 \n",
"Zimbabwe_nan Zimbabwe NaN 14862924.0 1 \n",
"\n",
" pop date_of_first_infection \n",
"geo_id \n",
"Afghanistan_nan 38928346.0 2020-02-24 \n",
"Albania_nan 2877797.0 2020-03-09 \n",
"Algeria_nan 43851044.0 2020-02-25 \n",
"Andorra_nan 77265.0 2020-03-02 \n",
"Angola_nan 100000.0 2020-03-20 \n",
"... ... ... \n",
"Uzbekistan_nan 33469203.0 2020-03-15 \n",
"Venezuela_nan 28435940.0 2020-03-14 \n",
"Vietnam_nan 97338579.0 2020-01-23 \n",
"Zambia_nan 18383955.0 2020-03-18 \n",
"Zimbabwe_nan 14862924.0 2020-03-20 \n",
"\n",
"[294 rows x 6 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"area_features = area_features.join(date_of_first_infection, on=\"geo_id\")\n",
"area_features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1.4 Date Delta to Hubei"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2020-01-22</th>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-01-23</th>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-01-24</th>\n",
" <td>549.0</td>\n",
" <td>24.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-01-25</th>\n",
" <td>761.0</td>\n",
" <td>40.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-01-26</th>\n",
" <td>1058.0</td>\n",
" <td>52.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-24</th>\n",
" <td>67801.0</td>\n",
" <td>3160.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-25</th>\n",
" <td>67801.0</td>\n",
" <td>3163.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-26</th>\n",
" <td>67801.0</td>\n",
" <td>3169.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-27</th>\n",
" <td>67801.0</td>\n",
" <td>3174.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-28</th>\n",
" <td>67801.0</td>\n",
" <td>3177.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>67 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" ConfirmedCases Fatalities\n",
"Date \n",
"2020-01-22 444.0 17.0\n",
"2020-01-23 444.0 17.0\n",
"2020-01-24 549.0 24.0\n",
"2020-01-25 761.0 40.0\n",
"2020-01-26 1058.0 52.0\n",
"... ... ...\n",
"2020-03-24 67801.0 3160.0\n",
"2020-03-25 67801.0 3163.0\n",
"2020-03-26 67801.0 3169.0\n",
"2020-03-27 67801.0 3174.0\n",
"2020-03-28 67801.0 3177.0\n",
"\n",
"[67 rows x 2 columns]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hubei_curve = train[(train['Country_Region'] == 'China') & (train['Province_State'] == 'Hubei')]\n",
"hubei_curve = hubei_curve[['Date', 'ConfirmedCases', 'Fatalities']].set_index('Date')\n",
"hubei_curve"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7fbfebc74d30>"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"hubei_curve.plot.line()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"datetime.datetime(2019, 12, 15, 0, 0)"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import datetime\n",
"date_start_hubei = datetime.datetime(2019, 12, 15) #hubei_curve.index.min()\n",
"date_start_hubei"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country_Region</th>\n",
" <th>Province_State</th>\n",
" <th>pop_country</th>\n",
" <th>num_states</th>\n",
" <th>pop</th>\n",
" <th>date_of_first_infection</th>\n",
" <th>date_delta_hubei</th>\n",
" </tr>\n",
" <tr>\n",
" <th>geo_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan_nan</th>\n",
" <td>Afghanistan</td>\n",
" <td>NaN</td>\n",
" <td>38928346.0</td>\n",
" <td>1</td>\n",
" <td>38928346.0</td>\n",
" <td>2020-02-24</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania_nan</th>\n",
" <td>Albania</td>\n",
" <td>NaN</td>\n",
" <td>2877797.0</td>\n",
" <td>1</td>\n",
" <td>2877797.0</td>\n",
" <td>2020-03-09</td>\n",
" <td>85 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria_nan</th>\n",
" <td>Algeria</td>\n",
" <td>NaN</td>\n",
" <td>43851044.0</td>\n",
" <td>1</td>\n",
" <td>43851044.0</td>\n",
" <td>2020-02-25</td>\n",
" <td>72 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra_nan</th>\n",
" <td>Andorra</td>\n",
" <td>NaN</td>\n",
" <td>77265.0</td>\n",
" <td>1</td>\n",
" <td>77265.0</td>\n",
" <td>2020-03-02</td>\n",
" <td>78 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola_nan</th>\n",
" <td>Angola</td>\n",
" <td>NaN</td>\n",
" <td>100000.0</td>\n",
" <td>1</td>\n",
" <td>100000.0</td>\n",
" <td>2020-03-20</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan_nan</th>\n",
" <td>Uzbekistan</td>\n",
" <td>NaN</td>\n",
" <td>33469203.0</td>\n",
" <td>1</td>\n",
" <td>33469203.0</td>\n",
" <td>2020-03-15</td>\n",
" <td>91 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela_nan</th>\n",
" <td>Venezuela</td>\n",
" <td>NaN</td>\n",
" <td>28435940.0</td>\n",
" <td>1</td>\n",
" <td>28435940.0</td>\n",
" <td>2020-03-14</td>\n",
" <td>90 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam_nan</th>\n",
" <td>Vietnam</td>\n",
" <td>NaN</td>\n",
" <td>97338579.0</td>\n",
" <td>1</td>\n",
" <td>97338579.0</td>\n",
" <td>2020-01-23</td>\n",
" <td>39 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia_nan</th>\n",
" <td>Zambia</td>\n",
" <td>NaN</td>\n",
" <td>18383955.0</td>\n",
" <td>1</td>\n",
" <td>18383955.0</td>\n",
" <td>2020-03-18</td>\n",
" <td>94 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe_nan</th>\n",
" <td>Zimbabwe</td>\n",
" <td>NaN</td>\n",
" <td>14862924.0</td>\n",
" <td>1</td>\n",
" <td>14862924.0</td>\n",
" <td>2020-03-20</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>294 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" Country_Region Province_State pop_country num_states \\\n",
"geo_id \n",
"Afghanistan_nan Afghanistan NaN 38928346.0 1 \n",
"Albania_nan Albania NaN 2877797.0 1 \n",
"Algeria_nan Algeria NaN 43851044.0 1 \n",
"Andorra_nan Andorra NaN 77265.0 1 \n",
"Angola_nan Angola NaN 100000.0 1 \n",
"... ... ... ... ... \n",
"Uzbekistan_nan Uzbekistan NaN 33469203.0 1 \n",
"Venezuela_nan Venezuela NaN 28435940.0 1 \n",
"Vietnam_nan Vietnam NaN 97338579.0 1 \n",
"Zambia_nan Zambia NaN 18383955.0 1 \n",
"Zimbabwe_nan Zimbabwe NaN 14862924.0 1 \n",
"\n",
" pop date_of_first_infection date_delta_hubei \n",
"geo_id \n",
"Afghanistan_nan 38928346.0 2020-02-24 71 days \n",
"Albania_nan 2877797.0 2020-03-09 85 days \n",
"Algeria_nan 43851044.0 2020-02-25 72 days \n",
"Andorra_nan 77265.0 2020-03-02 78 days \n",
"Angola_nan 100000.0 2020-03-20 96 days \n",
"... ... ... ... \n",
"Uzbekistan_nan 33469203.0 2020-03-15 91 days \n",
"Venezuela_nan 28435940.0 2020-03-14 90 days \n",
"Vietnam_nan 97338579.0 2020-01-23 39 days \n",
"Zambia_nan 18383955.0 2020-03-18 94 days \n",
"Zimbabwe_nan 14862924.0 2020-03-20 96 days \n",
"\n",
"[294 rows x 7 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"area_features['date_delta_hubei'] = area_features['date_of_first_infection'] - date_start_hubei\n",
"area_features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1.5 Population Scale to Hubei"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"58500000.0"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"population_hubei = 58.5 * 10**6\n",
"population_hubei"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country_Region</th>\n",
" <th>Province_State</th>\n",
" <th>pop_country</th>\n",
" <th>num_states</th>\n",
" <th>pop</th>\n",
" <th>date_of_first_infection</th>\n",
" <th>date_delta_hubei</th>\n",
" <th>pop_scale_hubei</th>\n",
" </tr>\n",
" <tr>\n",
" <th>geo_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan_nan</th>\n",
" <td>Afghanistan</td>\n",
" <td>NaN</td>\n",
" <td>38928346.0</td>\n",
" <td>1</td>\n",
" <td>38928346.0</td>\n",
" <td>2020-02-24</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania_nan</th>\n",
" <td>Albania</td>\n",
" <td>NaN</td>\n",
" <td>2877797.0</td>\n",
" <td>1</td>\n",
" <td>2877797.0</td>\n",
" <td>2020-03-09</td>\n",
" <td>85 days</td>\n",
" <td>0.049193</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria_nan</th>\n",
" <td>Algeria</td>\n",
" <td>NaN</td>\n",
" <td>43851044.0</td>\n",
" <td>1</td>\n",
" <td>43851044.0</td>\n",
" <td>2020-02-25</td>\n",
" <td>72 days</td>\n",
" <td>0.749590</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra_nan</th>\n",
" <td>Andorra</td>\n",
" <td>NaN</td>\n",
" <td>77265.0</td>\n",
" <td>1</td>\n",
" <td>77265.0</td>\n",
" <td>2020-03-02</td>\n",
" <td>78 days</td>\n",
" <td>0.001321</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola_nan</th>\n",
" <td>Angola</td>\n",
" <td>NaN</td>\n",
" <td>100000.0</td>\n",
" <td>1</td>\n",
" <td>100000.0</td>\n",
" <td>2020-03-20</td>\n",
" <td>96 days</td>\n",
" <td>0.001709</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan_nan</th>\n",
" <td>Uzbekistan</td>\n",
" <td>NaN</td>\n",
" <td>33469203.0</td>\n",
" <td>1</td>\n",
" <td>33469203.0</td>\n",
" <td>2020-03-15</td>\n",
" <td>91 days</td>\n",
" <td>0.572123</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela_nan</th>\n",
" <td>Venezuela</td>\n",
" <td>NaN</td>\n",
" <td>28435940.0</td>\n",
" <td>1</td>\n",
" <td>28435940.0</td>\n",
" <td>2020-03-14</td>\n",
" <td>90 days</td>\n",
" <td>0.486084</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam_nan</th>\n",
" <td>Vietnam</td>\n",
" <td>NaN</td>\n",
" <td>97338579.0</td>\n",
" <td>1</td>\n",
" <td>97338579.0</td>\n",
" <td>2020-01-23</td>\n",
" <td>39 days</td>\n",
" <td>1.663907</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia_nan</th>\n",
" <td>Zambia</td>\n",
" <td>NaN</td>\n",
" <td>18383955.0</td>\n",
" <td>1</td>\n",
" <td>18383955.0</td>\n",
" <td>2020-03-18</td>\n",
" <td>94 days</td>\n",
" <td>0.314256</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe_nan</th>\n",
" <td>Zimbabwe</td>\n",
" <td>NaN</td>\n",
" <td>14862924.0</td>\n",
" <td>1</td>\n",
" <td>14862924.0</td>\n",
" <td>2020-03-20</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>294 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" Country_Region Province_State pop_country num_states \\\n",
"geo_id \n",
"Afghanistan_nan Afghanistan NaN 38928346.0 1 \n",
"Albania_nan Albania NaN 2877797.0 1 \n",
"Algeria_nan Algeria NaN 43851044.0 1 \n",
"Andorra_nan Andorra NaN 77265.0 1 \n",
"Angola_nan Angola NaN 100000.0 1 \n",
"... ... ... ... ... \n",
"Uzbekistan_nan Uzbekistan NaN 33469203.0 1 \n",
"Venezuela_nan Venezuela NaN 28435940.0 1 \n",
"Vietnam_nan Vietnam NaN 97338579.0 1 \n",
"Zambia_nan Zambia NaN 18383955.0 1 \n",
"Zimbabwe_nan Zimbabwe NaN 14862924.0 1 \n",
"\n",
" pop date_of_first_infection date_delta_hubei \\\n",
"geo_id \n",
"Afghanistan_nan 38928346.0 2020-02-24 71 days \n",
"Albania_nan 2877797.0 2020-03-09 85 days \n",
"Algeria_nan 43851044.0 2020-02-25 72 days \n",
"Andorra_nan 77265.0 2020-03-02 78 days \n",
"Angola_nan 100000.0 2020-03-20 96 days \n",
"... ... ... ... \n",
"Uzbekistan_nan 33469203.0 2020-03-15 91 days \n",
"Venezuela_nan 28435940.0 2020-03-14 90 days \n",
"Vietnam_nan 97338579.0 2020-01-23 39 days \n",
"Zambia_nan 18383955.0 2020-03-18 94 days \n",
"Zimbabwe_nan 14862924.0 2020-03-20 96 days \n",
"\n",
" pop_scale_hubei \n",
"geo_id \n",
"Afghanistan_nan 0.665442 \n",
"Albania_nan 0.049193 \n",
"Algeria_nan 0.749590 \n",
"Andorra_nan 0.001321 \n",
"Angola_nan 0.001709 \n",
"... ... \n",
"Uzbekistan_nan 0.572123 \n",
"Venezuela_nan 0.486084 \n",
"Vietnam_nan 1.663907 \n",
"Zambia_nan 0.314256 \n",
"Zimbabwe_nan 0.254067 \n",
"\n",
"[294 rows x 8 columns]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"area_features['pop_scale_hubei'] = area_features['pop'] / population_hubei\n",
"area_features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.2 Set Hubei S-Curve for All Areas"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country_Region</th>\n",
" <th>Province_State</th>\n",
" <th>pop_country</th>\n",
" <th>num_states</th>\n",
" <th>pop</th>\n",
" <th>date_of_first_infection</th>\n",
" <th>date_delta_hubei</th>\n",
" <th>pop_scale_hubei</th>\n",
" </tr>\n",
" <tr>\n",
" <th>geo_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan_nan</th>\n",
" <td>Afghanistan</td>\n",
" <td>NaN</td>\n",
" <td>38928346.0</td>\n",
" <td>1</td>\n",
" <td>38928346.0</td>\n",
" <td>2020-02-24</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania_nan</th>\n",
" <td>Albania</td>\n",
" <td>NaN</td>\n",
" <td>2877797.0</td>\n",
" <td>1</td>\n",
" <td>2877797.0</td>\n",
" <td>2020-03-09</td>\n",
" <td>85 days</td>\n",
" <td>0.049193</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria_nan</th>\n",
" <td>Algeria</td>\n",
" <td>NaN</td>\n",
" <td>43851044.0</td>\n",
" <td>1</td>\n",
" <td>43851044.0</td>\n",
" <td>2020-02-25</td>\n",
" <td>72 days</td>\n",
" <td>0.749590</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Andorra_nan</th>\n",
" <td>Andorra</td>\n",
" <td>NaN</td>\n",
" <td>77265.0</td>\n",
" <td>1</td>\n",
" <td>77265.0</td>\n",
" <td>2020-03-02</td>\n",
" <td>78 days</td>\n",
" <td>0.001321</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola_nan</th>\n",
" <td>Angola</td>\n",
" <td>NaN</td>\n",
" <td>100000.0</td>\n",
" <td>1</td>\n",
" <td>100000.0</td>\n",
" <td>2020-03-20</td>\n",
" <td>96 days</td>\n",
" <td>0.001709</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Uzbekistan_nan</th>\n",
" <td>Uzbekistan</td>\n",
" <td>NaN</td>\n",
" <td>33469203.0</td>\n",
" <td>1</td>\n",
" <td>33469203.0</td>\n",
" <td>2020-03-15</td>\n",
" <td>91 days</td>\n",
" <td>0.572123</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Venezuela_nan</th>\n",
" <td>Venezuela</td>\n",
" <td>NaN</td>\n",
" <td>28435940.0</td>\n",
" <td>1</td>\n",
" <td>28435940.0</td>\n",
" <td>2020-03-14</td>\n",
" <td>90 days</td>\n",
" <td>0.486084</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vietnam_nan</th>\n",
" <td>Vietnam</td>\n",
" <td>NaN</td>\n",
" <td>97338579.0</td>\n",
" <td>1</td>\n",
" <td>97338579.0</td>\n",
" <td>2020-01-23</td>\n",
" <td>39 days</td>\n",
" <td>1.663907</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zambia_nan</th>\n",
" <td>Zambia</td>\n",
" <td>NaN</td>\n",
" <td>18383955.0</td>\n",
" <td>1</td>\n",
" <td>18383955.0</td>\n",
" <td>2020-03-18</td>\n",
" <td>94 days</td>\n",
" <td>0.314256</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zimbabwe_nan</th>\n",
" <td>Zimbabwe</td>\n",
" <td>NaN</td>\n",
" <td>14862924.0</td>\n",
" <td>1</td>\n",
" <td>14862924.0</td>\n",
" <td>2020-03-20</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>294 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" Country_Region Province_State pop_country num_states \\\n",
"geo_id \n",
"Afghanistan_nan Afghanistan NaN 38928346.0 1 \n",
"Albania_nan Albania NaN 2877797.0 1 \n",
"Algeria_nan Algeria NaN 43851044.0 1 \n",
"Andorra_nan Andorra NaN 77265.0 1 \n",
"Angola_nan Angola NaN 100000.0 1 \n",
"... ... ... ... ... \n",
"Uzbekistan_nan Uzbekistan NaN 33469203.0 1 \n",
"Venezuela_nan Venezuela NaN 28435940.0 1 \n",
"Vietnam_nan Vietnam NaN 97338579.0 1 \n",
"Zambia_nan Zambia NaN 18383955.0 1 \n",
"Zimbabwe_nan Zimbabwe NaN 14862924.0 1 \n",
"\n",
" pop date_of_first_infection date_delta_hubei \\\n",
"geo_id \n",
"Afghanistan_nan 38928346.0 2020-02-24 71 days \n",
"Albania_nan 2877797.0 2020-03-09 85 days \n",
"Algeria_nan 43851044.0 2020-02-25 72 days \n",
"Andorra_nan 77265.0 2020-03-02 78 days \n",
"Angola_nan 100000.0 2020-03-20 96 days \n",
"... ... ... ... \n",
"Uzbekistan_nan 33469203.0 2020-03-15 91 days \n",
"Venezuela_nan 28435940.0 2020-03-14 90 days \n",
"Vietnam_nan 97338579.0 2020-01-23 39 days \n",
"Zambia_nan 18383955.0 2020-03-18 94 days \n",
"Zimbabwe_nan 14862924.0 2020-03-20 96 days \n",
"\n",
" pop_scale_hubei \n",
"geo_id \n",
"Afghanistan_nan 0.665442 \n",
"Albania_nan 0.049193 \n",
"Algeria_nan 0.749590 \n",
"Andorra_nan 0.001321 \n",
"Angola_nan 0.001709 \n",
"... ... \n",
"Uzbekistan_nan 0.572123 \n",
"Venezuela_nan 0.486084 \n",
"Vietnam_nan 1.663907 \n",
"Zambia_nan 0.314256 \n",
"Zimbabwe_nan 0.254067 \n",
"\n",
"[294 rows x 8 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"area_features"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geo_id</th>\n",
" <th>Date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19693</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19694</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19695</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19696</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19697</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-28</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19698 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" geo_id Date\n",
"0 Afghanistan_nan 2020-01-22\n",
"1 Afghanistan_nan 2020-01-23\n",
"2 Afghanistan_nan 2020-01-24\n",
"3 Afghanistan_nan 2020-01-25\n",
"4 Afghanistan_nan 2020-01-26\n",
"... ... ...\n",
"19693 Zimbabwe_nan 2020-03-24\n",
"19694 Zimbabwe_nan 2020-03-25\n",
"19695 Zimbabwe_nan 2020-03-26\n",
"19696 Zimbabwe_nan 2020-03-27\n",
"19697 Zimbabwe_nan 2020-03-28\n",
"\n",
"[19698 rows x 2 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = train[['geo_id', 'Date']]\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geo_id</th>\n",
" <th>Date</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-22</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-23</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-24</td>\n",
" <td>549.0</td>\n",
" <td>24.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-25</td>\n",
" <td>761.0</td>\n",
" <td>40.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-26</td>\n",
" <td>1058.0</td>\n",
" <td>52.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19693</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-24</td>\n",
" <td>67801.0</td>\n",
" <td>3160.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19694</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-25</td>\n",
" <td>67801.0</td>\n",
" <td>3163.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19695</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-26</td>\n",
" <td>67801.0</td>\n",
" <td>3169.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19696</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-27</td>\n",
" <td>67801.0</td>\n",
" <td>3174.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19697</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-28</td>\n",
" <td>67801.0</td>\n",
" <td>3177.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19698 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" geo_id Date ConfirmedCases Fatalities\n",
"0 Afghanistan_nan 2020-01-22 444.0 17.0\n",
"1 Afghanistan_nan 2020-01-23 444.0 17.0\n",
"2 Afghanistan_nan 2020-01-24 549.0 24.0\n",
"3 Afghanistan_nan 2020-01-25 761.0 40.0\n",
"4 Afghanistan_nan 2020-01-26 1058.0 52.0\n",
"... ... ... ... ...\n",
"19693 Zimbabwe_nan 2020-03-24 67801.0 3160.0\n",
"19694 Zimbabwe_nan 2020-03-25 67801.0 3163.0\n",
"19695 Zimbabwe_nan 2020-03-26 67801.0 3169.0\n",
"19696 Zimbabwe_nan 2020-03-27 67801.0 3174.0\n",
"19697 Zimbabwe_nan 2020-03-28 67801.0 3177.0\n",
"\n",
"[19698 rows x 4 columns]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# add Hubei Curve to all Areas\n",
"data = data.join(hubei_curve, on='Date')\n",
"data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.3 Translate Hubei S-[](http://)Curve\n",
"Translating Hubei curve to date of first infection of each Province_State"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geo_id</th>\n",
" <th>Date</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" <th>date_delta_hubei</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-22</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-23</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-24</td>\n",
" <td>549.0</td>\n",
" <td>24.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-25</td>\n",
" <td>761.0</td>\n",
" <td>40.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-01-26</td>\n",
" <td>1058.0</td>\n",
" <td>52.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19693</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-24</td>\n",
" <td>67801.0</td>\n",
" <td>3160.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19694</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-25</td>\n",
" <td>67801.0</td>\n",
" <td>3163.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19695</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-26</td>\n",
" <td>67801.0</td>\n",
" <td>3169.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19696</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-27</td>\n",
" <td>67801.0</td>\n",
" <td>3174.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19697</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-03-28</td>\n",
" <td>67801.0</td>\n",
" <td>3177.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19698 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" geo_id Date ConfirmedCases Fatalities date_delta_hubei\n",
"0 Afghanistan_nan 2020-01-22 444.0 17.0 71 days\n",
"1 Afghanistan_nan 2020-01-23 444.0 17.0 71 days\n",
"2 Afghanistan_nan 2020-01-24 549.0 24.0 71 days\n",
"3 Afghanistan_nan 2020-01-25 761.0 40.0 71 days\n",
"4 Afghanistan_nan 2020-01-26 1058.0 52.0 71 days\n",
"... ... ... ... ... ...\n",
"19693 Zimbabwe_nan 2020-03-24 67801.0 3160.0 96 days\n",
"19694 Zimbabwe_nan 2020-03-25 67801.0 3163.0 96 days\n",
"19695 Zimbabwe_nan 2020-03-26 67801.0 3169.0 96 days\n",
"19696 Zimbabwe_nan 2020-03-27 67801.0 3174.0 96 days\n",
"19697 Zimbabwe_nan 2020-03-28 67801.0 3177.0 96 days\n",
"\n",
"[19698 rows x 5 columns]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = data.join(area_features[['date_delta_hubei']], on=\"geo_id\")\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geo_id</th>\n",
" <th>Date</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" <th>date_delta_hubei</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-02</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-03</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-04</td>\n",
" <td>549.0</td>\n",
" <td>24.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-05</td>\n",
" <td>761.0</td>\n",
" <td>40.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-06</td>\n",
" <td>1058.0</td>\n",
" <td>52.0</td>\n",
" <td>71 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19693</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-28</td>\n",
" <td>67801.0</td>\n",
" <td>3160.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19694</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-29</td>\n",
" <td>67801.0</td>\n",
" <td>3163.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19695</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-30</td>\n",
" <td>67801.0</td>\n",
" <td>3169.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19696</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-07-01</td>\n",
" <td>67801.0</td>\n",
" <td>3174.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19697</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-07-02</td>\n",
" <td>67801.0</td>\n",
" <td>3177.0</td>\n",
" <td>96 days</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19698 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" geo_id Date ConfirmedCases Fatalities date_delta_hubei\n",
"0 Afghanistan_nan 2020-04-02 444.0 17.0 71 days\n",
"1 Afghanistan_nan 2020-04-03 444.0 17.0 71 days\n",
"2 Afghanistan_nan 2020-04-04 549.0 24.0 71 days\n",
"3 Afghanistan_nan 2020-04-05 761.0 40.0 71 days\n",
"4 Afghanistan_nan 2020-04-06 1058.0 52.0 71 days\n",
"... ... ... ... ... ...\n",
"19693 Zimbabwe_nan 2020-06-28 67801.0 3160.0 96 days\n",
"19694 Zimbabwe_nan 2020-06-29 67801.0 3163.0 96 days\n",
"19695 Zimbabwe_nan 2020-06-30 67801.0 3169.0 96 days\n",
"19696 Zimbabwe_nan 2020-07-01 67801.0 3174.0 96 days\n",
"19697 Zimbabwe_nan 2020-07-02 67801.0 3177.0 96 days\n",
"\n",
"[19698 rows x 5 columns]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# translate by date_delta_hubei\n",
"data['Date'] = data['Date'] + data['date_delta_hubei']\n",
"data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.4 Scale Hubei S-Curve\n",
"Scale Hubei curve to population by each Province_State****"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geo_id</th>\n",
" <th>Date</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" <th>date_delta_hubei</th>\n",
" <th>pop_scale_hubei</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-02</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-03</td>\n",
" <td>444.0</td>\n",
" <td>17.0</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-04</td>\n",
" <td>549.0</td>\n",
" <td>24.0</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-05</td>\n",
" <td>761.0</td>\n",
" <td>40.0</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-06</td>\n",
" <td>1058.0</td>\n",
" <td>52.0</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19693</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-28</td>\n",
" <td>67801.0</td>\n",
" <td>3160.0</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19694</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-29</td>\n",
" <td>67801.0</td>\n",
" <td>3163.0</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19695</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-30</td>\n",
" <td>67801.0</td>\n",
" <td>3169.0</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19696</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-07-01</td>\n",
" <td>67801.0</td>\n",
" <td>3174.0</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19697</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-07-02</td>\n",
" <td>67801.0</td>\n",
" <td>3177.0</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19698 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" geo_id Date ConfirmedCases Fatalities \\\n",
"0 Afghanistan_nan 2020-04-02 444.0 17.0 \n",
"1 Afghanistan_nan 2020-04-03 444.0 17.0 \n",
"2 Afghanistan_nan 2020-04-04 549.0 24.0 \n",
"3 Afghanistan_nan 2020-04-05 761.0 40.0 \n",
"4 Afghanistan_nan 2020-04-06 1058.0 52.0 \n",
"... ... ... ... ... \n",
"19693 Zimbabwe_nan 2020-06-28 67801.0 3160.0 \n",
"19694 Zimbabwe_nan 2020-06-29 67801.0 3163.0 \n",
"19695 Zimbabwe_nan 2020-06-30 67801.0 3169.0 \n",
"19696 Zimbabwe_nan 2020-07-01 67801.0 3174.0 \n",
"19697 Zimbabwe_nan 2020-07-02 67801.0 3177.0 \n",
"\n",
" date_delta_hubei pop_scale_hubei \n",
"0 71 days 0.665442 \n",
"1 71 days 0.665442 \n",
"2 71 days 0.665442 \n",
"3 71 days 0.665442 \n",
"4 71 days 0.665442 \n",
"... ... ... \n",
"19693 96 days 0.254067 \n",
"19694 96 days 0.254067 \n",
"19695 96 days 0.254067 \n",
"19696 96 days 0.254067 \n",
"19697 96 days 0.254067 \n",
"\n",
"[19698 rows x 6 columns]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = data.join(area_features[['pop_scale_hubei']], on=\"geo_id\")\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geo_id</th>\n",
" <th>Date</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" <th>date_delta_hubei</th>\n",
" <th>pop_scale_hubei</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-02</td>\n",
" <td>295.456165</td>\n",
" <td>11.312511</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-03</td>\n",
" <td>295.456165</td>\n",
" <td>11.312511</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-04</td>\n",
" <td>365.327555</td>\n",
" <td>15.970603</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-05</td>\n",
" <td>506.401219</td>\n",
" <td>26.617672</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-06</td>\n",
" <td>704.037437</td>\n",
" <td>34.602974</td>\n",
" <td>71 days</td>\n",
" <td>0.665442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19693</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-28</td>\n",
" <td>17226.001882</td>\n",
" <td>802.851963</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19694</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-29</td>\n",
" <td>17226.001882</td>\n",
" <td>803.614164</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19695</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-30</td>\n",
" <td>17226.001882</td>\n",
" <td>805.138567</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19696</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-07-01</td>\n",
" <td>17226.001882</td>\n",
" <td>806.408902</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19697</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-07-02</td>\n",
" <td>17226.001882</td>\n",
" <td>807.171103</td>\n",
" <td>96 days</td>\n",
" <td>0.254067</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19698 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" geo_id Date ConfirmedCases Fatalities \\\n",
"0 Afghanistan_nan 2020-04-02 295.456165 11.312511 \n",
"1 Afghanistan_nan 2020-04-03 295.456165 11.312511 \n",
"2 Afghanistan_nan 2020-04-04 365.327555 15.970603 \n",
"3 Afghanistan_nan 2020-04-05 506.401219 26.617672 \n",
"4 Afghanistan_nan 2020-04-06 704.037437 34.602974 \n",
"... ... ... ... ... \n",
"19693 Zimbabwe_nan 2020-06-28 17226.001882 802.851963 \n",
"19694 Zimbabwe_nan 2020-06-29 17226.001882 803.614164 \n",
"19695 Zimbabwe_nan 2020-06-30 17226.001882 805.138567 \n",
"19696 Zimbabwe_nan 2020-07-01 17226.001882 806.408902 \n",
"19697 Zimbabwe_nan 2020-07-02 17226.001882 807.171103 \n",
"\n",
" date_delta_hubei pop_scale_hubei \n",
"0 71 days 0.665442 \n",
"1 71 days 0.665442 \n",
"2 71 days 0.665442 \n",
"3 71 days 0.665442 \n",
"4 71 days 0.665442 \n",
"... ... ... \n",
"19693 96 days 0.254067 \n",
"19694 96 days 0.254067 \n",
"19695 96 days 0.254067 \n",
"19696 96 days 0.254067 \n",
"19697 96 days 0.254067 \n",
"\n",
"[19698 rows x 6 columns]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# scale by pop_scale_hubei\n",
"data['ConfirmedCases'] = data['ConfirmedCases'] * data['pop_scale_hubei']\n",
"data['Fatalities'] = data['Fatalities'] * data['pop_scale_hubei']\n",
"data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.5 Generate Submission"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geo_id</th>\n",
" <th>Date</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-02</td>\n",
" <td>295.456165</td>\n",
" <td>11.312511</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-03</td>\n",
" <td>295.456165</td>\n",
" <td>11.312511</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-04</td>\n",
" <td>365.327555</td>\n",
" <td>15.970603</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-05</td>\n",
" <td>506.401219</td>\n",
" <td>26.617672</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Afghanistan_nan</td>\n",
" <td>2020-04-06</td>\n",
" <td>704.037437</td>\n",
" <td>34.602974</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19693</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-28</td>\n",
" <td>17226.001882</td>\n",
" <td>802.851963</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19694</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-29</td>\n",
" <td>17226.001882</td>\n",
" <td>803.614164</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19695</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-06-30</td>\n",
" <td>17226.001882</td>\n",
" <td>805.138567</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19696</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-07-01</td>\n",
" <td>17226.001882</td>\n",
" <td>806.408902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19697</th>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>2020-07-02</td>\n",
" <td>17226.001882</td>\n",
" <td>807.171103</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19698 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" geo_id Date ConfirmedCases Fatalities\n",
"0 Afghanistan_nan 2020-04-02 295.456165 11.312511\n",
"1 Afghanistan_nan 2020-04-03 295.456165 11.312511\n",
"2 Afghanistan_nan 2020-04-04 365.327555 15.970603\n",
"3 Afghanistan_nan 2020-04-05 506.401219 26.617672\n",
"4 Afghanistan_nan 2020-04-06 704.037437 34.602974\n",
"... ... ... ... ...\n",
"19693 Zimbabwe_nan 2020-06-28 17226.001882 802.851963\n",
"19694 Zimbabwe_nan 2020-06-29 17226.001882 803.614164\n",
"19695 Zimbabwe_nan 2020-06-30 17226.001882 805.138567\n",
"19696 Zimbabwe_nan 2020-07-01 17226.001882 806.408902\n",
"19697 Zimbabwe_nan 2020-07-02 17226.001882 807.171103\n",
"\n",
"[19698 rows x 4 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop unneeded columns\n",
"data = data[['geo_id', 'Date', 'ConfirmedCases', 'Fatalities']]\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"geo_id\n",
"Afghanistan_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Albania_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Algeria_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Andorra_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Angola_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
" ... \n",
"Uzbekistan_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Venezuela_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Vietnam_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Zambia_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Zimbabwe_nan AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Length: 294, dtype: object"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"fig, ax = plt.subplots()\n",
"data.sort_values(by=\"Date\").groupby('geo_id').plot.line(x='Date', y='ConfirmedCases', ax=ax, legend=False)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" <th>Province_State</th>\n",
" <th>Country_Region</th>\n",
" <th>Date</th>\n",
" <th>geo_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-19</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-20</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-21</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-22</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-23</td>\n",
" <td>Afghanistan_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12637</th>\n",
" <td>12638</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-26</td>\n",
" <td>Zimbabwe_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12638</th>\n",
" <td>12639</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-27</td>\n",
" <td>Zimbabwe_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12639</th>\n",
" <td>12640</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-28</td>\n",
" <td>Zimbabwe_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12640</th>\n",
" <td>12641</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-29</td>\n",
" <td>Zimbabwe_nan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12641</th>\n",
" <td>12642</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-30</td>\n",
" <td>Zimbabwe_nan</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>12642 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" ForecastId Province_State Country_Region Date geo_id\n",
"0 1 NaN Afghanistan 2020-03-19 Afghanistan_nan\n",
"1 2 NaN Afghanistan 2020-03-20 Afghanistan_nan\n",
"2 3 NaN Afghanistan 2020-03-21 Afghanistan_nan\n",
"3 4 NaN Afghanistan 2020-03-22 Afghanistan_nan\n",
"4 5 NaN Afghanistan 2020-03-23 Afghanistan_nan\n",
"... ... ... ... ... ...\n",
"12637 12638 NaN Zimbabwe 2020-04-26 Zimbabwe_nan\n",
"12638 12639 NaN Zimbabwe 2020-04-27 Zimbabwe_nan\n",
"12639 12640 NaN Zimbabwe 2020-04-28 Zimbabwe_nan\n",
"12640 12641 NaN Zimbabwe 2020-04-29 Zimbabwe_nan\n",
"12641 12642 NaN Zimbabwe 2020-04-30 Zimbabwe_nan\n",
"\n",
"[12642 rows x 5 columns]"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" <th>Province_State</th>\n",
" <th>Country_Region</th>\n",
" <th>Date</th>\n",
" <th>geo_id</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-19</td>\n",
" <td>Afghanistan_nan</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-20</td>\n",
" <td>Afghanistan_nan</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-21</td>\n",
" <td>Afghanistan_nan</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-22</td>\n",
" <td>Afghanistan_nan</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>Afghanistan</td>\n",
" <td>2020-03-23</td>\n",
" <td>Afghanistan_nan</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12637</th>\n",
" <td>12638</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-26</td>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12638</th>\n",
" <td>12639</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-27</td>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>112.805782</td>\n",
" <td>4.319140</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12639</th>\n",
" <td>12640</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-28</td>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>112.805782</td>\n",
" <td>4.319140</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12640</th>\n",
" <td>12641</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-29</td>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>139.482825</td>\n",
" <td>6.097610</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12641</th>\n",
" <td>12642</td>\n",
" <td>NaN</td>\n",
" <td>Zimbabwe</td>\n",
" <td>2020-04-30</td>\n",
" <td>Zimbabwe_nan</td>\n",
" <td>193.345046</td>\n",
" <td>10.162683</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>12642 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" ForecastId Province_State Country_Region Date geo_id \\\n",
"0 1 NaN Afghanistan 2020-03-19 Afghanistan_nan \n",
"1 2 NaN Afghanistan 2020-03-20 Afghanistan_nan \n",
"2 3 NaN Afghanistan 2020-03-21 Afghanistan_nan \n",
"3 4 NaN Afghanistan 2020-03-22 Afghanistan_nan \n",
"4 5 NaN Afghanistan 2020-03-23 Afghanistan_nan \n",
"... ... ... ... ... ... \n",
"12637 12638 NaN Zimbabwe 2020-04-26 Zimbabwe_nan \n",
"12638 12639 NaN Zimbabwe 2020-04-27 Zimbabwe_nan \n",
"12639 12640 NaN Zimbabwe 2020-04-28 Zimbabwe_nan \n",
"12640 12641 NaN Zimbabwe 2020-04-29 Zimbabwe_nan \n",
"12641 12642 NaN Zimbabwe 2020-04-30 Zimbabwe_nan \n",
"\n",
" ConfirmedCases Fatalities \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"... ... ... \n",
"12637 NaN NaN \n",
"12638 112.805782 4.319140 \n",
"12639 112.805782 4.319140 \n",
"12640 139.482825 6.097610 \n",
"12641 193.345046 10.162683 \n",
"\n",
"[12642 rows x 7 columns]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# join \n",
"submission = pd.merge(test, data, how=\"left\", on=[\"geo_id\", 'Date'])\n",
"submission"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12637</th>\n",
" <td>12638</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12638</th>\n",
" <td>12639</td>\n",
" <td>112.805782</td>\n",
" <td>4.319140</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12639</th>\n",
" <td>12640</td>\n",
" <td>112.805782</td>\n",
" <td>4.319140</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12640</th>\n",
" <td>12641</td>\n",
" <td>139.482825</td>\n",
" <td>6.097610</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12641</th>\n",
" <td>12642</td>\n",
" <td>193.345046</td>\n",
" <td>10.162683</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>12642 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" ForecastId ConfirmedCases Fatalities\n",
"0 1 NaN NaN\n",
"1 2 NaN NaN\n",
"2 3 NaN NaN\n",
"3 4 NaN NaN\n",
"4 5 NaN NaN\n",
"... ... ... ...\n",
"12637 12638 NaN NaN\n",
"12638 12639 112.805782 4.319140\n",
"12639 12640 112.805782 4.319140\n",
"12640 12641 139.482825 6.097610\n",
"12641 12642 193.345046 10.162683\n",
"\n",
"[12642 rows x 3 columns]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"submission = submission[['ForecastId', 'ConfirmedCases', 'Fatalities']]\n",
"submission"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py:4259: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" **kwargs\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12637</th>\n",
" <td>12638</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12638</th>\n",
" <td>12639</td>\n",
" <td>112.805782</td>\n",
" <td>4.319140</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12639</th>\n",
" <td>12640</td>\n",
" <td>112.805782</td>\n",
" <td>4.319140</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12640</th>\n",
" <td>12641</td>\n",
" <td>139.482825</td>\n",
" <td>6.097610</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12641</th>\n",
" <td>12642</td>\n",
" <td>193.345046</td>\n",
" <td>10.162683</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>12642 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" ForecastId ConfirmedCases Fatalities\n",
"0 1 0.000000 0.000000\n",
"1 2 0.000000 0.000000\n",
"2 3 0.000000 0.000000\n",
"3 4 0.000000 0.000000\n",
"4 5 0.000000 0.000000\n",
"... ... ... ...\n",
"12637 12638 0.000000 0.000000\n",
"12638 12639 112.805782 4.319140\n",
"12639 12640 112.805782 4.319140\n",
"12640 12641 139.482825 6.097610\n",
"12641 12642 193.345046 10.162683\n",
"\n",
"[12642 rows x 3 columns]"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"submission.fillna(0, inplace=True)\n",
"submission"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ForecastId</th>\n",
" <th>ConfirmedCases</th>\n",
" <th>Fatalities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>12642.000000</td>\n",
" <td>1.264200e+04</td>\n",
" <td>12642.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>6321.500000</td>\n",
" <td>1.431743e+04</td>\n",
" <td>550.230093</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3649.575386</td>\n",
" <td>8.137373e+04</td>\n",
" <td>3220.130825</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>3161.250000</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>6321.500000</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>9481.750000</td>\n",
" <td>3.064835e+03</td>\n",
" <td>93.633302</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>12642.000000</td>\n",
" <td>1.599248e+06</td>\n",
" <td>72774.590218</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ForecastId ConfirmedCases Fatalities\n",
"count 12642.000000 1.264200e+04 12642.000000\n",
"mean 6321.500000 1.431743e+04 550.230093\n",
"std 3649.575386 8.137373e+04 3220.130825\n",
"min 1.000000 0.000000e+00 0.000000\n",
"25% 3161.250000 0.000000e+00 0.000000\n",
"50% 6321.500000 0.000000e+00 0.000000\n",
"75% 9481.750000 3.064835e+03 93.633302\n",
"max 12642.000000 1.599248e+06 72774.590218"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"submission.describe()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"submission.to_csv('submission.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment