Skip to content

Instantly share code, notes, and snippets.

@bmschmidt
Created December 13, 2020 17:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bmschmidt/818962befd5c592f4e4e1de502a8ae41 to your computer and use it in GitHub Desktop.
Save bmschmidt/818962befd5c592f4e4e1de502a8ae41 to your computer and use it in GitHub Desktop.
Why does the New York Times hate colleges?
---
title: "R Notebook"
output: html_notebook
---
This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*.
```{r}
library(tidycensus)
library(tidyverse)
library(sf)
options(tigris_use_cache = TRUE)
v18 <- load_variables(2018, "acs5", cache = TRUE)
v18 %>% filter(concept == "SCHOOL ENROLLMENT BY LEVEL OF SCHOOL FOR THE POPULATION 3 YEARS AND OVER")
# high school, college, grad school, total.
cats = c("B14001_007", "B14001_008", "B14001_009", "B01001_001")
g = get_acs("county", cats, geometry = TRUE, cache_table = TRUE)
# ?get_acs
shares = g %>% st_set_geometry(NULL) %>% select(-moe) %>%
pivot_wider(names_from = "variable", values_from="estimate") %>%
mutate(share = (B14001_008 + B14001_009) / B01001_001) %>%
mutate(students = B14001_008 + B14001_009, undergrads = B14001_008) %>%
select(NAME, FIPS = GEOID, share, students, undergrads, total = B01001_001) %>%
mutate(category = ifelse(share > .1, "student county", "non-student county") )
#wider = g %>%
# distinct(GEOID) %>%
# inner_join(shares) %>%
# ggplot() + geom_sf(aes(fill=B14001_001))
# wider + scale_fill_viridis_c(trans="log")
shares %>%
arrange(-total) %>%
mutate(r = 1:n(), cumulative = cumsum(total))%>%
ggplot() + geom_line(aes(x=r, y = cumulative))
shares %>% write_csv("ACS.csv")
confirmed_raw = read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv')
deaths_raw = read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv')
deparse = . %>% mutate(FIPS = str_pad(FIPS, 5, "left", "0")) %>%
pivot_longer(cols = matches("[0-9]/"), names_to = "date", values_to = "count") %>%
mutate(date = lubridate::parse_date_time(date, orders = "m/d/y")) %>% mutate(new = count - lag(count, 1))
confirmed = confirmed_raw %>% deparse %>% mutate(variable = "confirmed")
deaths = deaths_raw %>% deparse %>% mutate(variable = "deaths")
together = confirmed %>% bind_rows(deaths) %>% inner_join(shares)
together %>% filter(variable=="deaths") %>% filter(new > 0) %>%
group_by(category, date, variable) %>% summarize(count = sum(new)/sum(total)) %>%
ggplot() + geom_line(aes(x = date, y = count, color=category)) + labs(title = "Death Rate from COVID-19 by student population") + scale_color_brewer(palette = 2, type="qual") + theme_bw() + facet_wrap(~variable, scales = "free_y") + labs(caption = "Johns Hopkins data")
weekly = together %>% filter(date > lubridate::ymd("2020-03-15")) %>% mutate(week = lubridate::round_date(date, unit = "week")) %>%
group_by(FIPS, variable, Combined_Key, week, students, share, category, total) %>%
summarize(new = sum(new))
weekly %>% group_by(week, variable) %>% summarize(new = sum(new)) %>%
ggplot() + geom_line(aes(x = week, y = new, color = variable)) +
labs(title = "Weekly counts. Sanity test on data.")
```
```{r}
populations = weekly %>% ungroup %>% distinct(FIPS, category, total)
pops = populations %>% count(category, wt = total, name = "total")
weekly %>%
group_by(category, week, variable) %>%
summarize(count = sum(new)) %>%
inner_join(pops) %>%
ggplot() + geom_line(aes(x = week, y = count/total, color = category)) +
facet_wrap(~variable, scales = "free_y")
```
```{r}
overall_rates = weekly %>%
group_by(variable, total) %>%
summarize(total = sum(total))
```
```{r}
weekly %>% ungroup %>%
group_by(week, total, variable, FIPS, category) %>%
summarize(new = sum(new)) %>%
group_by(week, variable, category) %>%
summarize(total = sum(total), new = sum(new)) %>%
mutate(ratio = new/total) %>%
select(-new, -total) %>%
pivot_wider(names_from = "category", values_from = "ratio") %>%
ggplot() + geom_line(aes(x = week, y = `student county`/`non-student county`, color = variable)) +
theme_bw() + labs(title = "Student county rates as share of non-student-county rates.")
```
```{r}
# Counties with low death rates before August 30.
library(lubridate)
weekly_ranks = weekly %>% ungroup %>% filter(week < ymd("2020-08-30")) %>% group_by(FIPS) %>%
filter(variable=="deaths") %>% summarize(deaths = sum(new)) %>% inner_join(populations) %>%
mutate(ratio = deaths/total) %>% arrange(ratio) %>%
mutate(death_rank_pre_september = 1:n()) %>% select(total, FIPS, category, death_rank_pre_september)
weekly
cat2 = weekly_ranks %>%
mutate(pre_sept_class = ifelse(death_rank_pre_september < 1500, "low", "high")) %>%
select(-death_rank_pre_september)
weekly %>% ungroup %>%
left_join(cat2) %>%
mutate(category = pre_sept_class) %>%
group_by(week, total, variable, FIPS, category) %>%
summarize(new = sum(new)) %>%
group_by(week, variable, category) %>%
summarize(total = sum(total), new = sum(new)) %>%
mutate(ratio = new/total) %>%
select(-new, -total) %>%
pivot_wider(names_from = "category", values_from = "ratio") %>%
ggplot() + geom_line(aes(x = week, y = `low`/`high`, color = variable)) +
theme_bw() + labs(title = "The 1500 counties with low death rates before August \ngenerally reverted back up over the mean ") + scale_y_continuous("rate relative to high-incidence counties", labels = scales::percent)
```
```{r}
weekly %>% inner_join(tibble(week = lubridate::ymd("2020-08-02", "2020-12-06")))
weekly %>%
pivot_wider(names_from = "week", values_from = "new") %>%
filter(variable == "confirmed") %>%
mutate(delta = `2020-12-06`/`2020-08-02`) %>%
filter(delta > 0, `2020-08-02` > 5, `2020-12-06` > 5) %>%
arrange(-total) %>%
ggplot() + aes(x = share, y = delta) + geom_point(alpha = 0.1) + geom_text(aes(label = Combined_Key %>% str_replace(", US", "")), check_overlap = TRUE) + scale_y_continuous("Change in Coronavirus cases, Week of 8/02 to week of 12-06", trans="log10") + scale_x_continuous("Student Population share", trans="sqrt", labels = scales::percent) + geom_smooth() + theme_bw() + labs(title = "Is there a relationship between student population and corona rate?", subtitle = "Not that I can see")
```
```{r}
weekly %>%
pivot_wider(names_from = "week", values_from = "new") %>%
filter(variable == "deaths") %>%
mutate(delta = `2020-12-06`/`2020-08-30`) %>%
filter(delta > 0, total > 3000) %>%
arrange(-total) %>%
ggplot() + aes(x = share, y = delta) + geom_point(alpha = 0.1) + geom_text(aes(label = Combined_Key %>% str_replace(", US", "")), check_overlap = TRUE) + scale_y_continuous("Change in Coronavirus deaths, Week of 8/02 to week of 12-06", trans="log10") + scale_x_continuous("Student Population share", trans="sqrt", labels = scales::percent) + geom_smooth() + theme_bw() + labs(title = "Is there a relationship between student population and corona death rate?", subtitle = "Not that I can see")
```
```{r}
weekly %>%
pivot_wider(names_from = "week", values_from = "new") %>%
filter(variable == "deaths") %>%
filter(Combined_Key %>% str_detect("Wisconsin")) %>%
mutate(delta = `2020-12-06`/`2020-08-30`) %>%
arrange(-total) %>%
ggplot() + aes(x = total, y = delta, color=category) + geom_point(alpha = 0.1) + geom_text(aes(label = Combined_Key %>% str_replace(", US", "")), check_overlap = TRUE) + scale_y_continuous("Change in Coronavirus deaths, Week of 8/02 to week of 12-06", trans="log10") + scale_x_continuous("pop", trans="log10", labels = scales::percent)+ theme_bw() + labs(title = "Is there a relationship between student population and corona death rate?", subtitle = "Not that I can see")
```
```
```{r}
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment