Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Joins simple world development data with Covid-19 data for week 3 on Kaggle
# ------------------------------------------------------------------
# Add economic development data to train and test
# ------------------------------------------------------------------

# Training set: blank Province_State values become the literal "none", and a
# country_and_province key (Country_Region and Province_State joined by "_")
# is added while both source columns are kept.
covid_train <- read.csv("train.csv", stringsAsFactors = FALSE) %>%
  mutate(
    Province_State = ifelse(Province_State == "", "none", Province_State)
  ) %>%
  unite(
    col = country_and_province,
    Country_Region, Province_State,
    sep = "_",
    remove = FALSE
  )
# Test set: blank Province_State values become the literal "none", and a
# country_and_province key (Country_Region and Province_State joined by "_")
# is added while both source columns are kept.
covid_test <- read.csv("test.csv", stringsAsFactors = FALSE) %>%
  mutate(
    Province_State = ifelse(Province_State == "", "none", Province_State)
  ) %>%
  unite(
    col = country_and_province,
    Country_Region, Province_State,
    sep = "_",
    remove = FALSE
  )
# Summary of all countries in dataset: one row per country/province key,
# holding the largest ConfirmedCases and Fatalities values found in the
# training data.
all_countries <- covid_train %>%
  group_by(country_and_province, Country_Region, Province_State) %>%
  summarize(
    cases_today = max(ConfirmedCases),
    deaths_today = max(Fatalities)
  ) %>%
  # summarize() only peels off the last grouping level; drop the remaining
  # ones so the grouping does not leak into tables derived from this one.
  ungroup()
# Country metadata from the World Development country table. Only the seven
# listed columns are read, all as character.
world_dev <- read_csv(
  "World Development - Country.csv",
  col_types = cols_only(
    CountryCode = col_character(),
    ShortName = col_character(),
    TableName = col_character(),
    LongName = col_character(),
    Alpha2Code = col_character(),
    Region = col_character(),
    IncomeGroup = col_character()
  )
) %>%
  # Keep only rows that carry both a region and an income group.
  filter(!(is.na(Region) | is.na(IncomeGroup))) %>%
  mutate(
    # Strip colons first, then turn spaces into underscores.
    IncomeGroup = gsub(" ", "_", gsub(":", "", IncomeGroup)),
    # Remove every ampersand, space and hyphen from the region label.
    Region = gsub("[-& ]", "", Region),
    # Rename countries to the spellings used as the join key against
    # Country_Region later in the script; all other names pass through.
    ShortName = case_when(
      ShortName == "Korea" ~ "Korea, South",
      ShortName == "Côte d'Ivoire" ~ "Cote d'Ivoire",
      ShortName == "Czech Republic" ~ "Czechia",
      ShortName == "United States" ~ "US",
      ShortName == "The Bahamas" ~ "Bahamas",
      ShortName == "Myanmar" ~ "Burma",
      ShortName == "The Gambia" ~ "Gambia",
      ShortName == "Kyrgyz Republic" ~ "Kyrgyzstan",
      ShortName == "Lao PDR" ~ "Laos",
      ShortName == "Syrian Arab Republic" ~ "Syria",
      ShortName == "St. Lucia" ~ "Saint Lucia",
      ShortName == "Slovak Republic" ~ "Slovakia",
      ShortName == "St. Vincent and the Grenadines" ~
        "Saint Vincent and the Grenadines",
      ShortName == "Swaziland" ~ "Eswatini",
      ShortName == "Macedonia" ~ "North Macedonia",
      ShortName == "St. Kitts and Nevis" ~ "Saint Kitts and Nevis",
      ShortName == "Congo" ~ "Congo (Brazzaville)",
      ShortName == "Dem. Rep. Congo" ~ "Congo (Kinshasa)",
      TRUE ~ ShortName
    )
  )
# Attach the development metadata to each country/province row by matching
# Country_Region against ShortName, then drop the name columns that are no
# longer needed.
all_countries_dev <- left_join(
  all_countries,
  world_dev,
  by = c("Country_Region" = "ShortName")
) %>%
  select(-c(TableName, LongName, Alpha2Code)) %>%
  # China is given its own region label in place of the joined Region.
  mutate(Region = ifelse(Country_Region == "China", "China", Region))
# Manual fixes for Taiwan, the Holy See and the two cruise ships
# (presumably the entries the join above could not match -- verify against
# the development table).
taiwan_ind <- which(all_countries_dev$Country_Region == "Taiwan*")
vatican_ind <- which(all_countries_dev$Country_Region == "Holy See")
diamond_ind <- which(all_countries_dev$Country_Region == "Diamond Princess")
zaandam_ind <- which(all_countries_dev$Country_Region == "MS Zaandam")

# Country codes: set by hand for Taiwan and the Holy See; the ships are
# explicitly left without one.
all_countries_dev$CountryCode[taiwan_ind] <- "TWN"
all_countries_dev$CountryCode[vatican_ind] <- "VAT"
all_countries_dev$CountryCode[diamond_ind] <- NA_character_
all_countries_dev$CountryCode[zaandam_ind] <- NA_character_

# Regions: both ships share the "Boat" label. MS Zaandam previously had its
# index computed but never received a Region, leaving it NA.
all_countries_dev$Region[taiwan_ind] <- "EastAsiaPacific"
all_countries_dev$Region[vatican_ind] <- "EuropeCentralAsia"
all_countries_dev$Region[diamond_ind] <- "Boat"
all_countries_dev$Region[zaandam_ind] <- "Boat"
# Slim lookup table for the final joins: drop any leftover grouping and keep
# the key plus the three added columns, exposing Region as geo_region.
all_countries_dev_2 <- all_countries_dev %>%
  ungroup() %>%
  select(
    country_and_province,
    CountryCode,
    geo_region = Region,
    IncomeGroup
  )
# Attach the development columns to both sets through the shared
# country_and_province key.
covid_train_augmented <- covid_train %>%
  left_join(all_countries_dev_2, by = "country_and_province")
covid_test_augmented <- covid_test %>%
  left_join(all_countries_dev_2, by = "country_and_province")

# Write the augmented sets out without a row-name column.
write.csv(
  x = covid_train_augmented,
  file = "train_augmented.csv",
  row.names = FALSE
)
write.csv(
  x = covid_test_augmented,
  file = "test_augmented.csv",
  row.names = FALSE
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment