Skip to content

Instantly share code, notes, and snippets.

@hrbrmstr
Last active January 3, 2020 04:37
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hrbrmstr/7bd38defd92979970b187ae533550696 to your computer and use it in GitHub Desktop.
Save hrbrmstr/7bd38defd92979970b187ae533550696 to your computer and use it in GitHub Desktop.
library(httr)
library(rvest)
library(readxl)
library(tidyverse)
doe <- read_html("https://www.oe.netl.doe.gov/OE417_annual_summary.aspx")
dir.create("~/Data/doe-cache-dir", showWarnings = FALSE)
html_nodes(doe, xpath=".//a[contains(., 'XLS')]") %>%
html_attr("href") %>%
print() %>% # to show the hrefs do not provide nice names
{ .pb <<- progress_estimated(length(.)) ; . } %>% # we likely don't rly need progress bars tho
walk(~{
.pb$tick()$print()
dl_url <- sprintf("https://www.oe.netl.doe.gov/%s", .x)
res <- HEAD(dl_url) # so we can get the filename
stop_for_status(res) # halt on network errors
fil <- str_replace(
string = res$headers['content-disposition'],
pattern = "attachment; filename=",
replacement = "~/Data/doe-cache-dir/"
)
if (!file.exists(fil)) { # this pattern allows us to issue a lightweight HTTP HEAD request, then cache and refresh w/o wasting server/our bandwidth/cpu
res <- GET(dl_url, httr::write_disk(fil))
stop_for_status(res)
Sys.sleep(5) # be kind to the server(s) but only if we're downloading data files since HEAD requests don't really tax services
}
})
# check to see if the files are all the same (spoiler alert: they're not)
list.files("~/Data/doe-cache-dir", "xls", full.names=TRUE) %>%
map_df(~list(
fil = basename(.x),
ncols = read_xls(.x, col_names=FALSE, col_types="text") %>% ncol()
)) -> cols_profile
cols_profile
map2(cols_profile$fil, cols_profile$ncols, ~{
if (.y == 8) { # handle 8 cols
suppressWarnings(read_xls(
path = sprintf("~/Data/doe-cache-dir/%s", .x),
col_names = c("date_began", "region", "time", "area", "event_type", "loss", "customers_affected", "date_restored"),
col_types = c("date", "text", "text", "text", "text", "text", "text", "date")
)) %>%
filter(!is.na(date_began)) %>%
mutate(date_began = as.Date(date_began))
} else if (.y == 9) { # handle 9 cols
suppressWarnings(read_xls(
path = sprintf("~/Data/doe-cache-dir/%s", .x),
col_names = c("date_began", "time_began", "date_restored", "time_restored", "area", "region", "event_type", "loss", "customers_affected"),
col_types = c("date", "guess", "date", "guess", "text", "text", "text", "text", "text")
)) %>%
filter(!is.na(date_began)) %>%
mutate(date_began = as.Date(date_began))
} else if (.y == 11) { # handle 11 cols
# note that the date columns aren't uniform in the Excel spreadsheets even in these more data-literate files :-(
suppressWarnings(read_xls(
path = sprintf("~/Data/doe-cache-dir/%s", .x),
col_names = c("month", "date_began", "time_began", "date_restored", "time_restored", "area", "region", "alert_criteria", "event_type", "loss", "customers_affected"),
col_types = c("text", "text", "guess", "text", "guess", "text", "text", "text", "text", "text", "text")
)) %>%
mutate(
date_began = case_when(
str_detect(date_began, "/") ~ suppressWarnings(as.Date(date_began, format="%m/%d/%Y")),
str_detect(date_began, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_began), origin = "1899-12-30")),
TRUE ~ suppressWarnings(as.Date(NA))
)
) %>%
mutate(
date_restored = case_when(
str_detect(date_restored, "/") ~ suppressWarnings(as.Date(date_restored, format="%m/%d/%Y")),
str_detect(date_restored, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_restored), origin = "1899-12-30")),
TRUE ~ suppressWarnings(as.Date(NA))
)
) %>%
filter(!is.na(date_began))
}
}) -> reports
reports[[1]]
reports[[10]]
reports[[17]]
map_df(reports, ~{
select(.x, date_began, region, area, event_type, customers_affected, date_restored) %>%
mutate(date_restored = as.Date(date_restored)) %>%
mutate(
customers_affected = suppressWarnings(
str_replace_all(customers_affected, "\\-.*$|[[:punct:]]+|[[:alpha:]]+", "") %>%
as.numeric()
)
) %>%
mutate(date_restored = as.Date(ifelse(is.na(date_restored), date_began, date_restored), origin = "1970-01-01"))
}) -> events
events
count(events, event_type, sort=TRUE)
filter(events, grepl("cyber|hack", event_type, ignore.case=TRUE))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment