Skip to content

Instantly share code, notes, and snippets.

@BatonVatrushka
Last active November 29, 2020 12:50
Show Gist options
  • Save BatonVatrushka/cd2a2e38e92ba8a27be87aa4f1873c1f to your computer and use it in GitHub Desktop.
Save BatonVatrushka/cd2a2e38e92ba8a27be87aa4f1873c1f to your computer and use it in GitHub Desktop.
COVID Viz Code
# This script is used for covid data / maps
# data is from the NY Times Github
library(pacman)
p_load(tidyverse)
p_load(ggmap, maptools, rgeos, car, scales, directlabels)
# read in the data
covid_state = read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
str(covid_state)
#------------------------------------------------------------------
# LINE GRAPHS
#------------------------------------------------------------------
# remove the US territories from covid_state (or don't, if you like)
df1 = filter(covid_state, !(state %in% c('Northern Mariana Islands'
, 'Puerto Rico'
, 'Virgin Islands'
, 'Guam')))
# rename DC using recode from the car package
df1 = mutate(df1, state = recode(state,"'District of Columbia' = 'DC'"))
# create a line graph using facet_wrap
ggplot(df1) + aes(date, cases, group = state, color = cases) +
geom_line(lwd = 1) +
theme_gray() +
theme(legend.position = 'none') +
scale_color_gradient(high = 'darkorange', low = 'darkblue') + facet_wrap(vars(state)) +
# this code gets rid of scientific notation on the y-axis
scale_y_continuous(labels = comma)
# choose states w/ the highest # of caes from the facet_wrap graph
# create a data frame w/ only California, Florida, Illinois, New York, and Texas
df2 = filter(covid_state, (state %in% c('California'
, 'Illinois'
, 'New York'
, 'Texas'
, 'Florida')))
# labels at end of lines: https://stackoverflow.com/questions/29357612/plot-labels-at-ends-of-lines
# set date range: https://stackoverflow.com/questions/14162829/set-date-range-in-ggplot
# get rid of scientific notation: https://statisticsglobe.com/change-formatting-of-numbers-of-ggplot2-plot-axis-in-r
# change row text: https://stackoverflow.com/questions/30219908/change-row-text-using-dplyr
# store the current date in a variable
date = max(df2$date)
# create the line plot
ggplot(df2) + aes(date, cases, group = state, color = cases) + theme_gray() +
geom_line(lwd = 1.5) + theme(legend.position = 'none') +
scale_color_gradient(high = 'darkorange', low = 'darkblue') +
# this code adds labels to the end of the lines using the directlabels package
geom_dl(aes(label=state), method = list(dl.trans(x=x+0.1), 'last.qp')) +
# set the date range for your x-axis
scale_x_date(date_breaks = '1 month',
labels = date_format("%b-%y"),
limits = as.Date(c('2020-01-01', '2021-01-01'))) +
# remove scientific notation
scale_y_continuous(labels = comma) +
# title w/ current date
ggtitle("Total Cases as of:",date)
#------------------------------------------------------------------
# CHOROPLETH
#------------------------------------------------------------------
# aggregate cases and deaths
# use the max function w/ aggregate because the NYT data calculates a rolling sum
cases = aggregate(covid_state$cases, list(covid_state$state), max)
cases = as_tibble(cases)
cases = cases %>% rename(state = Group.1, cases = x)
colnames(cases)
# store a map w/ US States in a variable
states = raster::getData('GADM', country = 'USA', level = 1)
# compare columns between data and map
cbind(sort(states$NAME_1), sort(cases$state))
# Remove non-contiguous states and territories from the cases df
cases = filter(cases, !(state %in% c('Alaska', 'Hawaii', 'Guam'
, 'Northern Mariana Islands'
, 'Puerto Rico'
, 'Virgin Islands')))
# store the map as a df for ggplot and remove Alaska and Hawaii
# caution: this code might take a bit to run
map = fortify(states, region = "NAME_1")
map = filter(map, !(id %in% c('Alaska', 'Hawaii')))
# check if Alaska or Hawaii are still in the map df
any(map$id[map$id == 'Alaska']) # FALSE
any(map$id[map$id == 'Hawaii']) # FALSE
# create a new df for merging
df = cases
df = df %>% rename(id = state)
# merge the dfs together
my.map <- merge(map, df, by = 'id')
# plot the map
ggplot() + geom_map(data = my.map, map = my.map) +
# define the aesthetics
aes(x = long, y = lat, map_id = id, group = group, fill = cases) +
# create the title using the date variable
ggtitle("COVID-19 Cases in the Contiguous United States as of:", date) +
# set the theme
theme_void() +
# set the fill gradient colors
scale_fill_gradient(high = 'darkorange', low = 'darkblue')
#==============================================================
# PER-CAPITA DATA PREP
#==============================================================
# now we need to calculate cases-per-capita
# read in the population data and check columns
state_population = read_csv('csvData.csv')
cbind(sort(state_population$State), sort(unique(covid_state$state)))
# we need to remove some states and territories from the dfs
# we just want the contiguous sates and DC
state_population = filter(state_population, !(State %in% c('Alaska'
, 'Hawaii'
, 'Puerto Rico')))
# make the state column lowercase
state_population = state_population %>% rename(state = State)
# now create a new df from covid_state then filter for the contiguous US
contiguous = filter(covid_state, !(state %in% c('Alaska', 'Hawaii', 'Guam'
, 'Northern Mariana Islands'
, 'Puerto Rico'
, 'Virgin Islands')))
# merge the data frames
percapita = merge(state_population, contiguous, by = 'state')
# now select the relevant columns
percapita = percapita %>% select(c('date', 'state', 'cases', 'Pop')) %>%
# create the cases per-capita column
mutate(cases_per_capita = (cases / Pop) * 100000) %>%
# arrange by the date
arrange(by_group = date) %>%
# rename DC using recode from the car package
mutate(state = recode(state,"'District of Columbia' = 'DC'"))
#------------------------------------------------------------------
# LINE GRAPHS PER CAP
#------------------------------------------------------------------
# create a line graph using facet_wrap
ggplot(percapita) + aes(date, cases_per_capita, group = state, color = cases_per_capita) +
geom_line(lwd = 1) +
theme_gray() +
theme(legend.position = 'none') +
scale_color_gradient(high = 'darkorange', low = 'darkblue') + facet_wrap(vars(state)) +
# this code gets rid of scientific notation on the y-axis
scale_y_continuous(labels = comma)
# create a data frame w/ Iowa, Nebraska, ND, SD, Utah, Wisconsin, Wyoming
df3 = filter(percapita, (state %in% c('Iowa'
, 'Nebraska'
, 'North Dakota'
, 'South Dakota'
, 'Utah'
, 'Wisconsin'
, 'Wyoming')))
# create the line graph w/ state w/ higher cases per-capita
ggplot(df3) + aes(date, cases_per_capita, group = state, color = cases_per_capita) +
theme_gray() +
geom_line(lwd = 1.5) + theme(legend.position = 'none') +
scale_color_gradient(high = 'darkorange', low = 'darkblue') +
# this code adds labels to the end of the lines using the directlabels package
geom_dl(aes(label=state), method = list(dl.trans(x=x+0.05), 'last.qp')) +
# set the date range for your x-axis
scale_x_date(date_breaks = '1 month',
labels = date_format("%b-%y"),
limits = as.Date(c('2020-01-01', '2021-01-01'))) +
# remove scientific notation
scale_y_continuous(labels = comma) +
# title w/ current date
ggtitle("Cases Per-Capita as of:",date)
#------------------------------------------------------------------
# CHOROPLETH PER CAPPIN MANY SUCKAS
#------------------------------------------------------------------
# aggregate the percapita df
agg.percap = aggregate(percapita$cases_per_capita, list(percapita$state), max)
# rename the columns and change DC back to D.O.C to merge properly
agg.percap = agg.percap %>% rename(id = Group.1, cases_per_capita = x) %>%
mutate(cases_per_capita = round(cases_per_capita,0)) %>%
mutate(id, id = recode(id, "'DC'='District of Columbia'"))
# check to make sure the columns match
cbind(sort(unique(map$id)), sort(agg.percap$id))
# merge the dfs
my.map2 = merge(agg.percap, map, by = 'id')
# plot the map
ggplot() + geom_map(data = my.map2, map = my.map2) +
# define the aesthetics
aes(x = long, y = lat, map_id = id, group = group, fill = cases_per_capita) +
# create the title using the date variable
ggtitle("COVID-19 Cases Per-Capita in the Contiguous United States as of:", date) +
# set the theme
theme_void() +
# set the fill gradient colors
scale_fill_gradient(high = 'darkorange', low = 'darkblue')
#================================================
# FURTHER ANALYSIS
#================================================
# what is the correlation between # of cases and population?
cor(percapita$Pop, percapita$cases)
# what about pop and cases per capita?
cor(percapita$Pop, percapita$cases_per_capita)
# let's do another quick aggregation for one last graph
df4 = merge(cases, state_population, by = 'state')
df4 = df4 %>% mutate(cases_per_capita = round((cases / Pop) * 100000),0) %>%
mutate(state, state = recode(state, "'District of Columbia'='DC'"))
# create a color map for states whose number of cases PerCap > mean
col = ifelse(df4$cases_per_capita > mean(df4$cases_per_capita), 'red', 'blue')
# plot cases_per_capita against population
ggplot(df4) + aes(Pop, cases_per_capita) +
# label the points by state
geom_label(aes(label=state), nudge_x = 50, color = col) +
# set the theme
theme_dark() +
# get rid of scientific notation and set the limits
scale_x_continuous(labels = comma, limits = c(-5,45000000)) +
# add labels to the axes
xlab("Population") + ylab("Cases Per-Capita") +
ggtitle("Cases Per-Capita Plotted Against Population")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment