Last active
April 7, 2020 22:22
-
-
Save johnjdavisiv/a664e8e4a4f4a833a040bb0a8e406649 to your computer and use it in GitHub Desktop.
Joins simple world development data with Covid-19 data for week 3 on Kaggle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#----------------------------------------------------------------
# Add economic development data to train and test
#-------------------------------------------------------------------

# Read one of the competition CSVs and attach a combined location key.
#   path: path to the CSV file to read.
# Returns the data frame with blank Province_State values replaced by
# "none" and a new `country_and_province` column built by uniting
# Country_Region and Province_State (both source columns are kept).
load_covid_csv <- function(path) {
  raw <- read.csv(path, stringsAsFactors = FALSE)
  raw %>%
    mutate(
      Province_State = ifelse(Province_State == "", "none", Province_State)
    ) %>%
    unite(
      country_and_province,
      Country_Region, Province_State,
      remove = FALSE
    )
}

covid_train <- load_covid_csv("train.csv")
covid_test <- load_covid_csv("test.csv")
# One row per location in the training data, holding the largest
# ConfirmedCases and Fatalities values recorded for that location.
all_countries <- summarise(
  group_by(covid_train, country_and_province, Country_Region, Province_State),
  cases_today = max(ConfirmedCases),
  deaths_today = max(Fatalities)
)
# Country-level development metadata. Only the listed columns are read,
# rows lacking a Region or IncomeGroup are dropped, the two category
# columns are stripped of punctuation/spaces, and ShortName values are
# renamed to the spellings used in the Covid-19 Country_Region column.
world_dev <- read_csv(
  "World Development - Country.csv",
  col_types = cols_only(
    CountryCode = col_character(),
    ShortName = col_character(),
    TableName = col_character(),
    LongName = col_character(),
    Alpha2Code = col_character(),
    Region = col_character(),
    IncomeGroup = col_character()
  )
) %>%
  filter(!(is.na(Region) | is.na(IncomeGroup))) %>%
  mutate(
    # Drop colons, then turn spaces into underscores.
    IncomeGroup = gsub(" ", "_", gsub(":", "", IncomeGroup)),
    # Remove ampersands, spaces and hyphens in a single pass.
    Region = gsub("[& -]", "", Region),
    # Names not listed here pass through unchanged.
    ShortName = recode(
      ShortName,
      "Korea" = "Korea, South",
      "Côte d'Ivoire" = "Cote d'Ivoire",
      "Czech Republic" = "Czechia",
      "United States" = "US",
      "The Bahamas" = "Bahamas",
      "Myanmar" = "Burma",
      "The Gambia" = "Gambia",
      "Kyrgyz Republic" = "Kyrgyzstan",
      "Lao PDR" = "Laos",
      "Syrian Arab Republic" = "Syria",
      "St. Lucia" = "Saint Lucia",
      "Slovak Republic" = "Slovakia",
      "St. Vincent and the Grenadines" = "Saint Vincent and the Grenadines",
      "Swaziland" = "Eswatini",
      "Macedonia" = "North Macedonia",
      "St. Kitts and Nevis" = "Saint Kitts and Nevis",
      "Congo" = "Congo (Brazzaville)",
      "Dem. Rep. Congo" = "Congo (Kinshasa)"
    )
  )
# Attach development metadata to every location by matching the Covid
# country name against the (renamed) ShortName, drop the alternative
# name columns, and give China its own Region label.
all_countries_dev <- all_countries %>%
  left_join(
    world_dev,
    by = c("Country_Region" = "ShortName")
  ) %>%
  select(-c(TableName, LongName, Alpha2Code)) %>%
  mutate(
    Region = ifelse(Country_Region == "China", "China", Region)
  )
# Manual fixes for entries patched by hand after the join:
# Taiwan, Holy See, and the two cruise ships.
taiwan_ind <- which(all_countries_dev$Country_Region == "Taiwan*")
vatican_ind <- which(all_countries_dev$Country_Region == "Holy See")
diamond_ind <- which(all_countries_dev$Country_Region == "Diamond Princess")
zaandam_ind <- which(all_countries_dev$Country_Region == "MS Zaandam")

# Country codes: assign codes to Taiwan and Holy See; the ships have none.
all_countries_dev$CountryCode[taiwan_ind] <- "TWN"
all_countries_dev$CountryCode[vatican_ind] <- "VAT"
all_countries_dev$CountryCode[diamond_ind] <- NA
all_countries_dev$CountryCode[zaandam_ind] <- NA

# Regions: both ships share the "Boat" label. MS Zaandam was previously
# left without a Region even though its index was looked up above.
all_countries_dev$Region[taiwan_ind] <- "EastAsiaPacific"
all_countries_dev$Region[vatican_ind] <- "EuropeCentralAsia"
all_countries_dev$Region[diamond_ind] <- "Boat"
all_countries_dev$Region[zaandam_ind] <- "Boat"
# Slim lookup table keyed by location: country code, region (exposed as
# `geo_region`) and income group. Grouping is removed first so that only
# the selected columns are kept.
all_countries_dev_2 <- all_countries_dev %>%
  ungroup() %>%
  select(
    country_and_province,
    CountryCode,
    geo_region = Region,
    IncomeGroup
  )
# Append the development columns to both datasets via the location key.
covid_train_augmented <- covid_train %>%
  left_join(all_countries_dev_2, by = "country_and_province")
covid_test_augmented <- covid_test %>%
  left_join(all_countries_dev_2, by = "country_and_province")

# Save the augmented tables without row names.
write.csv(covid_train_augmented, "train_augmented.csv", row.names = FALSE)
write.csv(covid_test_augmented, "test_augmented.csv", row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment