Last active
November 29, 2020 12:50
-
-
Save BatonVatrushka/cd2a2e38e92ba8a27be87aa4f1873c1f to your computer and use it in GitHub Desktop.
COVID Viz Code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script is used for covid data / maps | |
# data is from the NY Times Github | |
library(pacman) | |
p_load(tidyverse) | |
p_load(ggmap, maptools, rgeos, car, scales, directlabels) | |
# read in the data | |
covid_state = read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv') | |
str(covid_state) | |
#------------------------------------------------------------------ | |
# LINE GRAPHS | |
#------------------------------------------------------------------ | |
# remove the US territories from covid_state (or don't, if you like) | |
df1 = filter(covid_state, !(state %in% c('Northern Mariana Islands' | |
, 'Puerto Rico' | |
, 'Virgin Islands' | |
, 'Guam'))) | |
# rename DC using recode from the car package | |
df1 = mutate(df1, state = recode(state,"'District of Columbia' = 'DC'")) | |
# create a line graph using facet_wrap | |
ggplot(df1) + aes(date, cases, group = state, color = cases) + | |
geom_line(lwd = 1) + | |
theme_gray() + | |
theme(legend.position = 'none') + | |
scale_color_gradient(high = 'darkorange', low = 'darkblue') + facet_wrap(vars(state)) + | |
# this code gets rid of scientific notation on the y-axis | |
scale_y_continuous(labels = comma) | |
# choose states w/ the highest # of caes from the facet_wrap graph | |
# create a data frame w/ only California, Florida, Illinois, New York, and Texas | |
df2 = filter(covid_state, (state %in% c('California' | |
, 'Illinois' | |
, 'New York' | |
, 'Texas' | |
, 'Florida'))) | |
# labels at end of lines: https://stackoverflow.com/questions/29357612/plot-labels-at-ends-of-lines | |
# set date range: https://stackoverflow.com/questions/14162829/set-date-range-in-ggplot | |
# get rid of scientific notation: https://statisticsglobe.com/change-formatting-of-numbers-of-ggplot2-plot-axis-in-r | |
# change row text: https://stackoverflow.com/questions/30219908/change-row-text-using-dplyr | |
# store the current date in a variable | |
date = max(df2$date) | |
# create the line plot | |
ggplot(df2) + aes(date, cases, group = state, color = cases) + theme_gray() + | |
geom_line(lwd = 1.5) + theme(legend.position = 'none') + | |
scale_color_gradient(high = 'darkorange', low = 'darkblue') + | |
# this code adds labels to the end of the lines using the directlabels package | |
geom_dl(aes(label=state), method = list(dl.trans(x=x+0.1), 'last.qp')) + | |
# set the date range for your x-axis | |
scale_x_date(date_breaks = '1 month', | |
labels = date_format("%b-%y"), | |
limits = as.Date(c('2020-01-01', '2021-01-01'))) + | |
# remove scientific notation | |
scale_y_continuous(labels = comma) + | |
# title w/ current date | |
ggtitle("Total Cases as of:",date) | |
#------------------------------------------------------------------ | |
# CHOROPLETH | |
#------------------------------------------------------------------ | |
# aggregate cases and deaths | |
# use the max function w/ aggregate because the NYT data calculates a rolling sum | |
cases = aggregate(covid_state$cases, list(covid_state$state), max) | |
cases = as_tibble(cases) | |
cases = cases %>% rename(state = Group.1, cases = x) | |
colnames(cases) | |
# store a map w/ US States in a variable | |
states = raster::getData('GADM', country = 'USA', level = 1) | |
# compare columns between data and map | |
cbind(sort(states$NAME_1), sort(cases$state)) | |
# Remove non-contiguous states and territories from the cases df | |
cases = filter(cases, !(state %in% c('Alaska', 'Hawaii', 'Guam' | |
, 'Northern Mariana Islands' | |
, 'Puerto Rico' | |
, 'Virgin Islands'))) | |
# store the map as a df for ggplot and remove Alaska and Hawaii | |
# caution: this code might take a bit to run | |
map = fortify(states, region = "NAME_1") | |
map = filter(map, !(id %in% c('Alaska', 'Hawaii'))) | |
# check if Alaska or Hawaii are still in the map df | |
any(map$id[map$id == 'Alaska']) # FALSE | |
any(map$id[map$id == 'Hawaii']) # FALSE | |
# create a new df for merging | |
df = cases | |
df = df %>% rename(id = state) | |
# merge the dfs together | |
my.map <- merge(map, df, by = 'id') | |
# plot the map | |
ggplot() + geom_map(data = my.map, map = my.map) + | |
# define the aesthetics | |
aes(x = long, y = lat, map_id = id, group = group, fill = cases) + | |
# create the title using the date variable | |
ggtitle("COVID-19 Cases in the Contiguous United States as of:", date) + | |
# set the theme | |
theme_void() + | |
# set the fill gradient colors | |
scale_fill_gradient(high = 'darkorange', low = 'darkblue') | |
#============================================================== | |
# PER-CAPITA DATA PREP | |
#============================================================== | |
# now we need to calculate cases-per-capita | |
# read in the population data and check columns | |
state_population = read_csv('csvData.csv') | |
cbind(sort(state_population$State), sort(unique(covid_state$state))) | |
# we need to remove some states and territories from the dfs | |
# we just want the contiguous sates and DC | |
state_population = filter(state_population, !(State %in% c('Alaska' | |
, 'Hawaii' | |
, 'Puerto Rico'))) | |
# make the state column lowercase | |
state_population = state_population %>% rename(state = State) | |
# now create a new df from covid_state then filter for the contiguous US | |
contiguous = filter(covid_state, !(state %in% c('Alaska', 'Hawaii', 'Guam' | |
, 'Northern Mariana Islands' | |
, 'Puerto Rico' | |
, 'Virgin Islands'))) | |
# merge the data frames | |
percapita = merge(state_population, contiguous, by = 'state') | |
# now select the relevant columns | |
percapita = percapita %>% select(c('date', 'state', 'cases', 'Pop')) %>% | |
# create the cases per-capita column | |
mutate(cases_per_capita = (cases / Pop) * 100000) %>% | |
# arrange by the date | |
arrange(by_group = date) %>% | |
# rename DC using recode from the car package | |
mutate(state = recode(state,"'District of Columbia' = 'DC'")) | |
#------------------------------------------------------------------ | |
# LINE GRAPHS PER CAP | |
#------------------------------------------------------------------ | |
# create a line graph using facet_wrap | |
ggplot(percapita) + aes(date, cases_per_capita, group = state, color = cases_per_capita) + | |
geom_line(lwd = 1) + | |
theme_gray() + | |
theme(legend.position = 'none') + | |
scale_color_gradient(high = 'darkorange', low = 'darkblue') + facet_wrap(vars(state)) + | |
# this code gets rid of scientific notation on the y-axis | |
scale_y_continuous(labels = comma) | |
# create a data frame w/ Iowa, Nebraska, ND, SD, Utah, Wisconsin, Wyoming | |
df3 = filter(percapita, (state %in% c('Iowa' | |
, 'Nebraska' | |
, 'North Dakota' | |
, 'South Dakota' | |
, 'Utah' | |
, 'Wisconsin' | |
, 'Wyoming'))) | |
# create the line graph w/ state w/ higher cases per-capita | |
ggplot(df3) + aes(date, cases_per_capita, group = state, color = cases_per_capita) + | |
theme_gray() + | |
geom_line(lwd = 1.5) + theme(legend.position = 'none') + | |
scale_color_gradient(high = 'darkorange', low = 'darkblue') + | |
# this code adds labels to the end of the lines using the directlabels package | |
geom_dl(aes(label=state), method = list(dl.trans(x=x+0.05), 'last.qp')) + | |
# set the date range for your x-axis | |
scale_x_date(date_breaks = '1 month', | |
labels = date_format("%b-%y"), | |
limits = as.Date(c('2020-01-01', '2021-01-01'))) + | |
# remove scientific notation | |
scale_y_continuous(labels = comma) + | |
# title w/ current date | |
ggtitle("Cases Per-Capita as of:",date) | |
#------------------------------------------------------------------ | |
# CHOROPLETH PER CAPPIN MANY SUCKAS | |
#------------------------------------------------------------------ | |
# aggregate the percapita df | |
agg.percap = aggregate(percapita$cases_per_capita, list(percapita$state), max) | |
# rename the columns and change DC back to D.O.C to merge properly | |
agg.percap = agg.percap %>% rename(id = Group.1, cases_per_capita = x) %>% | |
mutate(cases_per_capita = round(cases_per_capita,0)) %>% | |
mutate(id, id = recode(id, "'DC'='District of Columbia'")) | |
# check to make sure the columns match | |
cbind(sort(unique(map$id)), sort(agg.percap$id)) | |
# merge the dfs | |
my.map2 = merge(agg.percap, map, by = 'id') | |
# plot the map | |
ggplot() + geom_map(data = my.map2, map = my.map2) + | |
# define the aesthetics | |
aes(x = long, y = lat, map_id = id, group = group, fill = cases_per_capita) + | |
# create the title using the date variable | |
ggtitle("COVID-19 Cases Per-Capita in the Contiguous United States as of:", date) + | |
# set the theme | |
theme_void() + | |
# set the fill gradient colors | |
scale_fill_gradient(high = 'darkorange', low = 'darkblue') | |
#================================================ | |
# FURTHER ANALYSIS | |
#================================================ | |
# what is the correlation between # of cases and population? | |
cor(percapita$Pop, percapita$cases) | |
# what about pop and cases per capita? | |
cor(percapita$Pop, percapita$cases_per_capita) | |
# let's do another quick aggregation for one last graph | |
df4 = merge(cases, state_population, by = 'state') | |
df4 = df4 %>% mutate(cases_per_capita = round((cases / Pop) * 100000),0) %>% | |
mutate(state, state = recode(state, "'District of Columbia'='DC'")) | |
# create a color map for states whose number of cases PerCap > mean | |
col = ifelse(df4$cases_per_capita > mean(df4$cases_per_capita), 'red', 'blue') | |
# plot cases_per_capita against population | |
ggplot(df4) + aes(Pop, cases_per_capita) + | |
# label the points by state | |
geom_label(aes(label=state), nudge_x = 50, color = col) + | |
# set the theme | |
theme_dark() + | |
# get rid of scientific notation and set the limits | |
scale_x_continuous(labels = comma, limits = c(-5,45000000)) + | |
# add labels to the axes | |
xlab("Population") + ylab("Cases Per-Capita") + | |
ggtitle("Cases Per-Capita Plotted Against Population") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment