library(httr) | |
library(rvest) | |
library(readxl) | |
library(tidyverse) | |
doe <- read_html("https://www.oe.netl.doe.gov/OE417_annual_summary.aspx") | |
dir.create("~/Data/doe-cache-dir", showWarnings = FALSE) | |
html_nodes(doe, xpath=".//a[contains(., 'XLS')]") %>% | |
html_attr("href") %>% | |
print() %>% # to show the hrefs do not provide nice names | |
{ .pb <<- progress_estimated(length(.)) ; . } %>% # we likely don't rly need progress bars tho | |
walk(~{ | |
.pb$tick()$print() | |
dl_url <- sprintf("https://www.oe.netl.doe.gov/%s", .x) | |
res <- HEAD(dl_url) # so we can get the filename | |
stop_for_status(res) # halt on network errors | |
fil <- str_replace( | |
string = res$headers['content-disposition'], | |
pattern = "attachment; filename=", | |
replacement = "~/Data/doe-cache-dir/" | |
) | |
if (!file.exists(fil)) { # this pattern allows us to issue a lightweight HTTP HEAD request, then cache and refresh w/o wasting server/our bandwidth/cpu | |
res <- GET(dl_url, httr::write_disk(fil)) | |
stop_for_status(res) | |
Sys.sleep(5) # be kind to the server(s) but only if we're downloading data files since HEAD requests don't really tax services | |
} | |
}) | |
# check to see if the files are all the same (spoiler alert: they're not) | |
list.files("~/Data/doe-cache-dir", "xls", full.names=TRUE) %>% | |
map_df(~list( | |
fil = basename(.x), | |
ncols = read_xls(.x, col_names=FALSE, col_types="text") %>% ncol() | |
)) -> cols_profile | |
cols_profile | |
map2(cols_profile$fil, cols_profile$ncols, ~{ | |
if (.y == 8) { # handle 8 cols | |
suppressWarnings(read_xls( | |
path = sprintf("~/Data/doe-cache-dir/%s", .x), | |
col_names = c("date_began", "region", "time", "area", "event_type", "loss", "customers_affected", "date_restored"), | |
col_types = c("date", "text", "text", "text", "text", "text", "text", "date") | |
)) %>% | |
filter(!is.na(date_began)) %>% | |
mutate(date_began = as.Date(date_began)) | |
} else if (.y == 9) { # handle 9 cols | |
suppressWarnings(read_xls( | |
path = sprintf("~/Data/doe-cache-dir/%s", .x), | |
col_names = c("date_began", "time_began", "date_restored", "time_restored", "area", "region", "event_type", "loss", "customers_affected"), | |
col_types = c("date", "guess", "date", "guess", "text", "text", "text", "text", "text") | |
)) %>% | |
filter(!is.na(date_began)) %>% | |
mutate(date_began = as.Date(date_began)) | |
} else if (.y == 11) { # handle 11 cols | |
# note that the date columns aren't uniform in the Excel spreadsheets even in these more data-literate files :-( | |
suppressWarnings(read_xls( | |
path = sprintf("~/Data/doe-cache-dir/%s", .x), | |
col_names = c("month", "date_began", "time_began", "date_restored", "time_restored", "area", "region", "alert_criteria", "event_type", "loss", "customers_affected"), | |
col_types = c("text", "text", "guess", "text", "guess", "text", "text", "text", "text", "text", "text") | |
)) %>% | |
mutate( | |
date_began = case_when( | |
str_detect(date_began, "/") ~ suppressWarnings(as.Date(date_began, format="%m/%d/%Y")), | |
str_detect(date_began, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_began), origin = "1899-12-30")), | |
TRUE ~ suppressWarnings(as.Date(NA)) | |
) | |
) %>% | |
mutate( | |
date_restored = case_when( | |
str_detect(date_restored, "/") ~ suppressWarnings(as.Date(date_restored, format="%m/%d/%Y")), | |
str_detect(date_restored, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_restored), origin = "1899-12-30")), | |
TRUE ~ suppressWarnings(as.Date(NA)) | |
) | |
) %>% | |
filter(!is.na(date_began)) | |
} | |
}) -> reports | |
reports[[1]] | |
reports[[10]] | |
reports[[17]] | |
map_df(reports, ~{ | |
select(.x, date_began, region, area, event_type, customers_affected, date_restored) %>% | |
mutate(date_restored = as.Date(date_restored)) %>% | |
mutate( | |
customers_affected = suppressWarnings( | |
str_replace_all(customers_affected, "\\-.*$|[[:punct:]]+|[[:alpha:]]+", "") %>% | |
as.numeric() | |
) | |
) %>% | |
mutate(date_restored = as.Date(ifelse(is.na(date_restored), date_began, date_restored), origin = "1970-01-01")) | |
}) -> events | |
events | |
count(events, event_type, sort=TRUE) | |
filter(events, grepl("cyber|hack", event_type, ignore.case=TRUE)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment