Skip to content

Instantly share code, notes, and snippets.

@johnjdavisiv
Last active March 24, 2020 21:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnjdavisiv/de43decd1c70efcba8e0341d5768d584 to your computer and use it in GitHub Desktop.
Save johnjdavisiv/de43decd1c70efcba8e0341d5768d584 to your computer and use it in GitHub Desktop.
This script pulls JHU data on COVID-19 in the US and uses it to replace the US rows of the Kaggle competition training data dated before March 9th, 2020.
#Kaggle COVID19 prediction model
#JJD
#24 march 2020
#johnjdiv@gmail.com
library(tidyverse)
library(nlme)
library(HRW)
#Get JHU data
#Wide-format JHU CSSE time series (one column per date) for confirmed cases and
#deaths, read directly from GitHub with readr (column names are kept verbatim,
#e.g. `Province/State`).
#NOTE(review): these URLs point at the `time_series_19-covid-*` files on the
#`master` branch - confirm upstream still publishes them under these names.
jhu_cases <- read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
jhu_fatalities <- read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")

#Load data
#Kaggle competition files, expected in the working directory.
#stringsAsFactors = FALSE is set explicitly: before R 4.0.0 the default was TRUE,
#which turned Province.State / Country.Region / Date into factors. Those columns
#are later joined and row-bound against character columns coming from readr, so
#pinning character here gives the same column types on every R version.
#base read.csv (not read_csv) is kept on purpose: later code refers to dotted
#column names such as Country.Region and Province.State.
covid_train <- read.csv("train.csv", stringsAsFactors = FALSE)
covid_test <- read.csv("test.csv", stringsAsFactors = FALSE)
covid_submit_sample <- read.csv("submission.csv", stringsAsFactors = FALSE)
#Clean training data
#Parse Date, then keep every non-US row plus the US rows dated 2020-03-09 or
#later (i.e. drop US rows before that date).
#Justification: the Kaggle data does not appropriately include US data prior to
#03-09-2020, so that part needs to be augmented with JHU data.
covid_world_prelim <- covid_train %>%
  mutate(Date = as.Date(Date, format = "%Y-%m-%d")) %>%
  filter(
    Country.Region != "US" |
      Date >= as.Date("2020-03-09", format = "%Y-%m-%d")
  )

#Lookup vectors: base R's state abbreviations / names with DC appended, kept in
#the same order so an abbreviation's position indexes its full name
state_terr_abb <- append(state.abb, "DC")
state_terr_name <- append(state.name, "District of Columbia")
#Use JHU data to get US statewide data

#Helper: collapse a wide JHU time-series table to one row per US state and date.
#  jhu_wide:   wide JHU table with `Province/State`, `Country/Region`, Lat, Long
#              and one numeric column per date (jhu_cases or jhu_fatalities)
#  value_name: name given to the value column of the long result
#Returns a tibble with columns Province.State, Country.Region, state, Lat, Long,
#Date and <value_name>, restricted to dates before 2020-03-09.
#Reads the script-level lookup vectors state_terr_abb / state_terr_name.
jhu_us_state_long <- function(jhu_wide, value_name) {
  #Keep only entries of the exact form "<place>, <ABB>".
  #The previous pattern expanded to ".AL|AK|...|WY."; alternation binds loosest,
  #so the dots applied only to the first and last alternatives: an entry ending
  #in ", WY" could never match, while any entry merely containing one of the
  #other abbreviations passed. An entry with extra text after the abbreviation
  #then produced a `state` value that failed the lookup below and ended up as a
  #group with NA Province.State. Grouping the alternation and anchoring it to
  #the end of the string removes both problems.
  #state_terr_abb (not state.abb) is used so the filter agrees with the lookup.
  #NOTE(review): if the feed spells the District as e.g. "D.C." it will still
  #not match "DC" - confirm against the data.
  state_pattern <- paste0(
    ",\\s(", paste(state_terr_abb, collapse = "|"), ")$"
  )

  jhu_wide %>%
    filter(`Country/Region` == "US") %>%
    filter(grepl(state_pattern, `Province/State`)) %>%
    separate(`Province/State`, into = c("city", "state"), sep = ",\\s") %>%
    #Sums every numeric column within each state: the date columns (the wanted
    #state totals) and also Lat / Long, whose sums carry no meaning
    group_by(state) %>%
    summarise_if(is.numeric, sum) %>%
    mutate(Country.Region = "US") %>%
    mutate(Province.State = state_terr_name[match(state, state_terr_abb)]) %>%
    select(Province.State, Country.Region, everything()) %>%
    #The first five columns are Province.State, Country.Region, state, Lat,
    #Long; everything after them is a date column
    pivot_longer(-(1:5), names_to = "Date", values_to = value_name) %>%
    mutate(Date = as.Date(Date, "%m/%d/%y")) %>%
    filter(Date < as.Date("2020-03-09", "%Y-%m-%d"))
}

jhu_cases_long <- jhu_us_state_long(jhu_cases, "ConfirmedCases")
jhu_fatalities_long <- jhu_us_state_long(jhu_fatalities, "Fatalities")

#Same column layout as before for these two intermediate objects
jhu_cases_us_states <- jhu_cases_long %>%
  select(Province.State, Country.Region, Lat, Long, Date, ConfirmedCases)
jhu_fatalities_us_states <- jhu_fatalities_long %>%
  select(Fatalities)

#Bind columns together
#Cases and fatalities are matched on state and date instead of being glued
#side by side by row position, so a difference in the rows of the two JHU files
#can no longer pair a fatality count with the wrong state/date.
#Lat / Long are left out here; they are re-attached from the Kaggle data later.
jhu_us_train <- jhu_cases_long %>%
  select(Province.State, Country.Region, Date, ConfirmedCases) %>%
  left_join(
    select(jhu_fatalities_long, Province.State, Date, Fatalities),
    by = c("Province.State", "Date")
  )
#Reindex latitude and longitude
#Kaggle's Id, Lat and Long for each US state/date before 2020-03-09, used as a
#lookup table for the JHU-derived rows
lat_long_Id_ind <- covid_train %>%
  mutate(Date = as.Date(Date, format = "%Y-%m-%d")) %>%
  filter(
    Country.Region == "US",
    Date < as.Date("2020-03-09", format = "%Y-%m-%d")
  ) %>%
  select(Id, Province.State, Lat, Long, Date)

#Reformat to match Kaggle
#Attach Id / Lat / Long by state and date, then put the columns in Kaggle order
jhu_us_train <- select(
  left_join(jhu_us_train, lat_long_Id_ind, by = c("Province.State", "Date")),
  Id, Province.State, Country.Region,
  Lat, Long, Date, ConfirmedCases, Fatalities
)

#Rejoin with Kaggle
#Append the JHU-derived US rows to the cleaned Kaggle rows and sort by Id
covid_world <- arrange(bind_rows(covid_world_prelim, jhu_us_train), Id)
#Show that this is better
#Helper: log10-scale plot of confirmed cases over time, one coloured line per
#US state, using only rows with a positive case count.
#  world_data: data frame with Country.Region, Province.State, Date and
#              ConfirmedCases columns
#  plot_title: title shown above the plot
#Returns the ggplot object (printed automatically when called at top level).
plot_us_state_cases <- function(world_data, plot_title) {
  us_positive <- filter(
    world_data,
    Country.Region == "US",
    ConfirmedCases > 0
  )
  date_window <- as.Date(c("2020-02-15", "2020-03-25"))

  ggplot(
    us_positive,
    aes(x = Date, y = ConfirmedCases, color = Province.State)
  ) +
    geom_point() +
    geom_line() +
    scale_y_log10() +
    coord_cartesian(xlim = date_window) +
    theme(legend.position = "none") +
    ggtitle(plot_title)
}

plot_us_state_cases(
  covid_world_prelim,
  "Cases by US state: Original Kaggle data"
)
plot_us_state_cases(
  covid_world,
  "Cases by US state: Augmented with JHU raw data"
)
#Write data
#Save the augmented training set as CSV in the working directory, without a
#row-name column
write.csv(
  x = covid_world,
  file = "covid19_train_data_us_states_before_march_09.csv",
  row.names = FALSE
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment