# patrickvossler18/fl_covid_pdf_to_df.R

## fl_covid_pdf_to_df.R
library(tidyverse)
library(tabulizer)


# Local path the daily report PDF is written to and then read back from.
f <- "fl_covid_data.pdf"

# Fetch the 2020-03-29 daily report. mode = "wb" forces a binary write:
# download.file()'s default text mode can corrupt a PDF on Windows.
download.file(
  url = "https://floridadisaster.org/globalassets/covid19/dailies/covid-19-data---daily-report-2020-03-29-1002.pdf",
  destfile = f,
  mode = "wb"
)

# Extract the tables found on pages 21-89 using tabulizer's "stream" method.
# The result is a list with one entry per detected table; the code further
# down indexes each entry as a character matrix.
out1 <- extract_tables(f, pages = 21:89, method = "stream")

# The extractor occasionally splits a multi-line PDF row across two matrix
# rows, leaving the continuation row with an empty `Case` cell. Each extracted
# table is turned into a data frame, the spilled "Travel related" text is
# copied up into the row it belongs to, and the continuation rows are dropped.
# The per-table results are row-bound into `combined_tables`.
combined_tables <- map_df(out1, function(raw_table) {
  # The header occupies the first three rows of each table, so a column name
  # is those three cells joined with spaces and trimmed.
  # Brittle: assumes every table has a header exactly three rows tall.
  header_rows <- 1:3
  column_names <- map_chr(seq_len(ncol(raw_table)), function(col) {
    str_trim(paste(raw_table[header_rows, col], collapse = " "))
  })

  # `output = "data.frame"` gives weird results, so the data frame is built by
  # hand. drop = FALSE keeps a table with a single data row as a one-row
  # matrix; without it the row collapses to a vector, data.frame() produces a
  # single column, and the colnames assignment below fails.
  table_results <- data.frame(
    raw_table[-header_rows, , drop = FALSE],
    stringsAsFactors = FALSE
  )
  colnames(table_results) <- column_names

  # The amount of whitespace inside the pasted header depends on which header
  # cells were blank, so the travel column is located with a
  # whitespace-tolerant match rather than a hard-coded name.
  travel_col <- grep(
    "^Travel[[:space:]]+related$",
    column_names,
    value = TRUE
  )

  if (length(travel_col) > 0) {
    travel_col <- travel_col[[1]]
    # Rows whose successor has an empty `Case` were split by the extractor.
    # `%in%` treats NA as "not empty" instead of raising an error.
    split_rows <- which(table_results[["Case"]][-1] %in% "")
    # The right-hand side is evaluated before anything is written, so every
    # row receives the continuation row's original value, exactly as a
    # top-to-bottom row-by-row pass would.
    table_results[[travel_col]][split_rows] <-
      table_results[[travel_col]][split_rows + 1]
  }

  # Continuation rows have served their purpose; keep only real case rows.
  table_results %>% filter(Case != "")
})
	library(tidyverse)
	library(tabulizer)


	# Local path the daily report PDF is written to and then read back from.
	f <- "fl_covid_data.pdf"

	# Fetch the 2020-03-29 daily report. mode = "wb" forces a binary write:
	# download.file()'s default text mode can corrupt a PDF on Windows.
	download.file(
	  url = "https://floridadisaster.org/globalassets/covid19/dailies/covid-19-data---daily-report-2020-03-29-1002.pdf",
	  destfile = f,
	  mode = "wb"
	)

	# Extract the tables found on pages 21-89 using tabulizer's "stream" method.
	# The result is a list with one entry per detected table; the code further
	# down indexes each entry as a character matrix.
	out1 <- extract_tables(f, pages = 21:89, method = "stream")

	# The extractor occasionally splits a multi-line PDF row across two matrix
	# rows, leaving the continuation row with an empty `Case` cell. Each
	# extracted table is turned into a data frame, the spilled "Travel related"
	# text is copied up into the row it belongs to, and the continuation rows
	# are dropped. The per-table results are row-bound into `combined_tables`.
	combined_tables <- map_df(out1, function(raw_table) {
	  # The header occupies the first three rows of each table, so a column
	  # name is those three cells joined with spaces and trimmed.
	  # Brittle: assumes every table has a header exactly three rows tall.
	  header_rows <- 1:3
	  column_names <- map_chr(seq_len(ncol(raw_table)), function(col) {
	    str_trim(paste(raw_table[header_rows, col], collapse = " "))
	  })

	  # `output = "data.frame"` gives weird results, so the data frame is built
	  # by hand. drop = FALSE keeps a table with a single data row as a one-row
	  # matrix; without it the row collapses to a vector, data.frame() produces
	  # a single column, and the colnames assignment below fails.
	  table_results <- data.frame(
	    raw_table[-header_rows, , drop = FALSE],
	    stringsAsFactors = FALSE
	  )
	  colnames(table_results) <- column_names

	  # The amount of whitespace inside the pasted header depends on which
	  # header cells were blank, so the travel column is located with a
	  # whitespace-tolerant match rather than a hard-coded name.
	  travel_col <- grep(
	    "^Travel[[:space:]]+related$",
	    column_names,
	    value = TRUE
	  )

	  if (length(travel_col) > 0) {
	    travel_col <- travel_col[[1]]
	    # Rows whose successor has an empty `Case` were split by the extractor.
	    # `%in%` treats NA as "not empty" instead of raising an error.
	    split_rows <- which(table_results[["Case"]][-1] %in% "")
	    # The right-hand side is evaluated before anything is written, so every
	    # row receives the continuation row's original value, exactly as a
	    # top-to-bottom row-by-row pass would.
	    table_results[[travel_col]][split_rows] <-
	      table_results[[travel_col]][split_rows + 1]
	  }

	  # Continuation rows have served their purpose; keep only real case rows.
	  table_results %>% filter(Case != "")
	})