Last active
March 29, 2020 23:04
-
-
Save patrickvossler18/7dd0a56652bb8700566ae1bdbbbf8839 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse)
library(tabulizer)

# Local file name the daily report PDF is saved under.
f <- "fl_covid_data.pdf"

# Fetch the report. `mode = "wb"` forces a binary transfer: on Windows the
# default text mode rewrites line endings, which corrupts a PDF.
download.file(
  url = "https://floridadisaster.org/globalassets/covid19/dailies/covid-19-data---daily-report-2020-03-29-1002.pdf",
  destfile = f,
  mode = "wb"
)

# Extract the tables on pages 21-89 with the "stream" method. The result is
# used below as a list with one character matrix per detected table.
# NOTE(review): the page range is hard-coded for this specific report; confirm
# it before pointing the script at a different day's PDF.
out1 <- extract_tables(f, pages = 21:89, method = "stream")
# Every so often the extraction mis-reads a multi-line row and splits it in two.
# This can probably be fixed by looping through the rows and merging a row with
# the one below it whenever the row below is mostly blank.
# Combine every table in `out1` into a single data frame of case rows.
#
# Each element of `out1` is treated as a character matrix whose first three
# rows hold a header that was split over several lines; the remaining rows are
# data. Rows with an empty `Case` value are taken to be continuation fragments
# of the row above and are dropped after their `Travel related` value has been
# copied up.
combined_tables <- map_df(seq_along(out1), function(j) {
  raw <- out1[[j]]

  # Combine the first three rows of each column to make the column names.
  # Brittle: assumes the header always spans exactly three rows.
  column_names <- map_chr(seq_len(ncol(raw)), function(i) {
    str_trim(paste(raw[1:3, i], collapse = " "))
  })

  # `output = "data.frame"` gives weird results, so build the data frame by
  # hand. `drop = FALSE` keeps the matrix shape when only one data row is left;
  # without it the row collapses to a vector and the column names no longer fit.
  table_results <- data.frame(
    raw[-(1:3), , drop = FALSE],
    stringsAsFactors = FALSE
  )
  colnames(table_results) <- column_names

  # Everything below keys off the `Case` column, so fail with a clear message
  # if the header was not parsed as expected.
  if (!"Case" %in% column_names) {
    stop(
      "Table ", j, " has no `Case` column; parsed column names: ",
      paste(column_names, collapse = ", "),
      call. = FALSE
    )
  }

  # Repair mis-read rows: when the row below has an empty `Case`, copy that
  # row's `Travel related` value into the current row. Values are read from
  # row i + 1 and written to row i, so one vectorised shift is equivalent to
  # walking the rows in order.
  if ("Travel related" %in% column_names) {
    continued <- which(table_results[["Case"]][-1] == "")
    travel <- table_results[["Travel related"]]
    travel[continued] <- travel[continued + 1]
    table_results[["Travel related"]] <- travel
  }

  # Drop the continuation fragments now that their content has been merged.
  table_results %>% filter(Case != "")
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment