Last active
January 3, 2020 04:37
-
-
Save hrbrmstr/7bd38defd92979970b187ae533550696 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(httr) | |
library(rvest) | |
library(readxl) | |
library(tidyverse) | |
doe <- read_html("https://www.oe.netl.doe.gov/OE417_annual_summary.aspx") | |
dir.create("~/Data/doe-cache-dir", showWarnings = FALSE) | |
html_nodes(doe, xpath=".//a[contains(., 'XLS')]") %>% | |
html_attr("href") %>% | |
print() %>% # to show the hrefs do not provide nice names | |
{ .pb <<- progress_estimated(length(.)) ; . } %>% # we likely don't rly need progress bars tho | |
walk(~{ | |
.pb$tick()$print() | |
dl_url <- sprintf("https://www.oe.netl.doe.gov/%s", .x) | |
res <- HEAD(dl_url) # so we can get the filename | |
stop_for_status(res) # halt on network errors | |
fil <- str_replace( | |
string = res$headers['content-disposition'], | |
pattern = "attachment; filename=", | |
replacement = "~/Data/doe-cache-dir/" | |
) | |
if (!file.exists(fil)) { # this pattern allows us to issue a lightweight HTTP HEAD request, then cache and refresh w/o wasting server/our bandwidth/cpu | |
res <- GET(dl_url, httr::write_disk(fil)) | |
stop_for_status(res) | |
Sys.sleep(5) # be kind to the server(s) but only if we're downloading data files since HEAD requests don't really tax services | |
} | |
}) | |
# check to see if the files are all the same (spoiler alert: they're not) | |
list.files("~/Data/doe-cache-dir", "xls", full.names=TRUE) %>% | |
map_df(~list( | |
fil = basename(.x), | |
ncols = read_xls(.x, col_names=FALSE, col_types="text") %>% ncol() | |
)) -> cols_profile | |
cols_profile | |
map2(cols_profile$fil, cols_profile$ncols, ~{ | |
if (.y == 8) { # handle 8 cols | |
suppressWarnings(read_xls( | |
path = sprintf("~/Data/doe-cache-dir/%s", .x), | |
col_names = c("date_began", "region", "time", "area", "event_type", "loss", "customers_affected", "date_restored"), | |
col_types = c("date", "text", "text", "text", "text", "text", "text", "date") | |
)) %>% | |
filter(!is.na(date_began)) %>% | |
mutate(date_began = as.Date(date_began)) | |
} else if (.y == 9) { # handle 9 cols | |
suppressWarnings(read_xls( | |
path = sprintf("~/Data/doe-cache-dir/%s", .x), | |
col_names = c("date_began", "time_began", "date_restored", "time_restored", "area", "region", "event_type", "loss", "customers_affected"), | |
col_types = c("date", "guess", "date", "guess", "text", "text", "text", "text", "text") | |
)) %>% | |
filter(!is.na(date_began)) %>% | |
mutate(date_began = as.Date(date_began)) | |
} else if (.y == 11) { # handle 11 cols | |
# note that the date columns aren't uniform in the Excel spreadsheets even in these more data-literate files :-( | |
suppressWarnings(read_xls( | |
path = sprintf("~/Data/doe-cache-dir/%s", .x), | |
col_names = c("month", "date_began", "time_began", "date_restored", "time_restored", "area", "region", "alert_criteria", "event_type", "loss", "customers_affected"), | |
col_types = c("text", "text", "guess", "text", "guess", "text", "text", "text", "text", "text", "text") | |
)) %>% | |
mutate( | |
date_began = case_when( | |
str_detect(date_began, "/") ~ suppressWarnings(as.Date(date_began, format="%m/%d/%Y")), | |
str_detect(date_began, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_began), origin = "1899-12-30")), | |
TRUE ~ suppressWarnings(as.Date(NA)) | |
) | |
) %>% | |
mutate( | |
date_restored = case_when( | |
str_detect(date_restored, "/") ~ suppressWarnings(as.Date(date_restored, format="%m/%d/%Y")), | |
str_detect(date_restored, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_restored), origin = "1899-12-30")), | |
TRUE ~ suppressWarnings(as.Date(NA)) | |
) | |
) %>% | |
filter(!is.na(date_began)) | |
} | |
}) -> reports | |
reports[[1]] | |
reports[[10]] | |
reports[[17]] | |
map_df(reports, ~{ | |
select(.x, date_began, region, area, event_type, customers_affected, date_restored) %>% | |
mutate(date_restored = as.Date(date_restored)) %>% | |
mutate( | |
customers_affected = suppressWarnings( | |
str_replace_all(customers_affected, "\\-.*$|[[:punct:]]+|[[:alpha:]]+", "") %>% | |
as.numeric() | |
) | |
) %>% | |
mutate(date_restored = as.Date(ifelse(is.na(date_restored), date_began, date_restored), origin = "1970-01-01")) | |
}) -> events | |
events | |
count(events, event_type, sort=TRUE) | |
filter(events, grepl("cyber|hack", event_type, ignore.case=TRUE)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment