hrbrmstr/power-mining.R

## power-mining.R
library(httr)
library(rvest)
library(readxl)
library(tidyverse)

doe <- read_html("https://www.oe.netl.doe.gov/OE417_annual_summary.aspx")

dir.create("~/Data/doe-cache-dir", showWarnings = FALSE)

html_nodes(doe, xpath=".//a[contains(., 'XLS')]") %>%
  html_attr("href") %>%
  print() %>% # to show the hrefs do not provide nice names
  { .pb <<- progress_estimated(length(.)) ; . } %>% # we likely don't rly need progress bars tho
  walk(~{

    .pb$tick()$print()

    dl_url <- sprintf("https://www.oe.netl.doe.gov/%s", .x)

    res <- HEAD(dl_url) # so we can get the filename
    stop_for_status(res) # halt on network errors

    fil <- str_replace(
      string = res$headers['content-disposition'],
      pattern = "attachment; filename=",
      replacement = "~/Data/doe-cache-dir/"
    )

    if (!file.exists(fil)) { # this pattern allows us to issue a lightweight HTTP HEAD request, then cache and refresh w/o wasting server/our bandwidth/cpu
      res <- GET(dl_url, httr::write_disk(fil))
      stop_for_status(res)
      Sys.sleep(5) # be kind to the server(s) but only if we're downloading data files since HEAD requests don't really tax services
    }

  })

# check to see if the files are all the same (spoiler alert: they're not)
list.files("~/Data/doe-cache-dir", "xls", full.names=TRUE) %>%
  map_df(~list(
    fil = basename(.x),
    ncols = read_xls(.x, col_names=FALSE, col_types="text") %>% ncol()
  )) -> cols_profile

cols_profile

map2(cols_profile$fil, cols_profile$ncols, ~{

  if (.y == 8) { # handle 8 cols

    suppressWarnings(read_xls(
      path = sprintf("~/Data/doe-cache-dir/%s", .x),
      col_names = c("date_began", "region", "time", "area", "event_type", "loss", "customers_affected", "date_restored"),
      col_types = c("date", "text", "text", "text", "text", "text", "text", "date")
    )) %>%
      filter(!is.na(date_began)) %>%
      mutate(date_began = as.Date(date_began))

  } else if (.y == 9) { # handle 9 cols

    suppressWarnings(read_xls(
      path = sprintf("~/Data/doe-cache-dir/%s", .x),
      col_names = c("date_began", "time_began", "date_restored", "time_restored", "area", "region", "event_type", "loss", "customers_affected"),
      col_types = c("date", "guess", "date", "guess", "text", "text", "text", "text", "text")
    )) %>%
      filter(!is.na(date_began)) %>%
      mutate(date_began = as.Date(date_began))

  } else if (.y == 11) { # handle 11 cols

    # note that the date columns aren't uniform in the Excel spreadsheets even in these more data-literate files :-(

    suppressWarnings(read_xls(
      path = sprintf("~/Data/doe-cache-dir/%s", .x),
      col_names = c("month", "date_began", "time_began", "date_restored", "time_restored", "area", "region", "alert_criteria", "event_type", "loss", "customers_affected"),
      col_types = c("text", "text", "guess", "text", "guess", "text", "text", "text", "text", "text", "text")
    )) %>%
      mutate(
        date_began = case_when(
          str_detect(date_began, "/") ~ suppressWarnings(as.Date(date_began, format="%m/%d/%Y")),
          str_detect(date_began, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_began), origin = "1899-12-30")),
          TRUE ~ suppressWarnings(as.Date(NA))
        )
      ) %>%
      mutate(
        date_restored = case_when(
          str_detect(date_restored, "/") ~ suppressWarnings(as.Date(date_restored, format="%m/%d/%Y")),
          str_detect(date_restored, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_restored), origin = "1899-12-30")),
          TRUE ~ suppressWarnings(as.Date(NA))
        )
      ) %>%
      filter(!is.na(date_began))

  }

}) -> reports

reports[[1]]

reports[[10]]
reports[[17]]

map_df(reports, ~{
  select(.x, date_began, region, area, event_type, customers_affected, date_restored) %>%
    mutate(date_restored = as.Date(date_restored)) %>%
    mutate(
      customers_affected = suppressWarnings(
        str_replace_all(customers_affected, "\\-.*$|[[:punct:]]+|[[:alpha:]]+", "") %>%
          as.numeric()
      )
    ) %>%
    mutate(date_restored = as.Date(ifelse(is.na(date_restored), date_began, date_restored), origin = "1970-01-01"))
}) -> events

events

count(events, event_type, sort=TRUE)

filter(events, grepl("cyber|hack", event_type, ignore.case=TRUE))
	library(httr)
	library(rvest)
	library(readxl)
	library(tidyverse)

	doe <- read_html("https://www.oe.netl.doe.gov/OE417_annual_summary.aspx")

	dir.create("~/Data/doe-cache-dir", showWarnings = FALSE)

	html_nodes(doe, xpath=".//a[contains(., 'XLS')]") %>%
	html_attr("href") %>%
	print() %>% # to show the hrefs do not provide nice names
	{ .pb <<- progress_estimated(length(.)) ; . } %>% # we likely don't rly need progress bars tho
	walk(~{

	.pb$tick()$print()

	dl_url <- sprintf("https://www.oe.netl.doe.gov/%s", .x)

	res <- HEAD(dl_url) # so we can get the filename
	stop_for_status(res) # halt on network errors

	fil <- str_replace(
	string = res$headers['content-disposition'],
	pattern = "attachment; filename=",
	replacement = "~/Data/doe-cache-dir/"
	)

	if (!file.exists(fil)) { # this pattern allows us to issue a lightweight HTTP HEAD request, then cache and refresh w/o wasting server/our bandwidth/cpu
	res <- GET(dl_url, httr::write_disk(fil))
	stop_for_status(res)
	Sys.sleep(5) # be kind to the server(s) but only if we're downloading data files since HEAD requests don't really tax services
	}

	})

	# check to see if the files are all the same (spoiler alert: they're not)
	list.files("~/Data/doe-cache-dir", "xls", full.names=TRUE) %>%
	map_df(~list(
	fil = basename(.x),
	ncols = read_xls(.x, col_names=FALSE, col_types="text") %>% ncol()
	)) -> cols_profile

	cols_profile

	map2(cols_profile$fil, cols_profile$ncols, ~{

	if (.y == 8) { # handle 8 cols

	suppressWarnings(read_xls(
	path = sprintf("~/Data/doe-cache-dir/%s", .x),
	col_names = c("date_began", "region", "time", "area", "event_type", "loss", "customers_affected", "date_restored"),
	col_types = c("date", "text", "text", "text", "text", "text", "text", "date")
	)) %>%
	filter(!is.na(date_began)) %>%
	mutate(date_began = as.Date(date_began))

	} else if (.y == 9) { # handle 9 cols

	suppressWarnings(read_xls(
	path = sprintf("~/Data/doe-cache-dir/%s", .x),
	col_names = c("date_began", "time_began", "date_restored", "time_restored", "area", "region", "event_type", "loss", "customers_affected"),
	col_types = c("date", "guess", "date", "guess", "text", "text", "text", "text", "text")
	)) %>%
	filter(!is.na(date_began)) %>%
	mutate(date_began = as.Date(date_began))

	} else if (.y == 11) { # handle 11 cols

	# note that the date columns aren't uniform in the Excel spreadsheets even in these more data-literate files :-(

	suppressWarnings(read_xls(
	path = sprintf("~/Data/doe-cache-dir/%s", .x),
	col_names = c("month", "date_began", "time_began", "date_restored", "time_restored", "area", "region", "alert_criteria", "event_type", "loss", "customers_affected"),
	col_types = c("text", "text", "guess", "text", "guess", "text", "text", "text", "text", "text", "text")
	)) %>%
	mutate(
	date_began = case_when(
	str_detect(date_began, "/") ~ suppressWarnings(as.Date(date_began, format="%m/%d/%Y")),
	str_detect(date_began, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_began), origin = "1899-12-30")),
	TRUE ~ suppressWarnings(as.Date(NA))
	)
	) %>%
	mutate(
	date_restored = case_when(
	str_detect(date_restored, "/") ~ suppressWarnings(as.Date(date_restored, format="%m/%d/%Y")),
	str_detect(date_restored, "^[[:digit:]]+$") ~ suppressWarnings(as.Date(as.integer(date_restored), origin = "1899-12-30")),
	TRUE ~ suppressWarnings(as.Date(NA))
	)
	) %>%
	filter(!is.na(date_began))

	}

	}) -> reports

	reports[[1]]

	reports[[10]]
	reports[[17]]

	map_df(reports, ~{
	select(.x, date_began, region, area, event_type, customers_affected, date_restored) %>%
	mutate(date_restored = as.Date(date_restored)) %>%
	mutate(
	customers_affected = suppressWarnings(
	str_replace_all(customers_affected, "\\-.*$\|[[:punct:]]+\|[[:alpha:]]+", "") %>%
	as.numeric()
	)
	) %>%
	mutate(date_restored = as.Date(ifelse(is.na(date_restored), date_began, date_restored), origin = "1970-01-01"))
	}) -> events

	events

	count(events, event_type, sort=TRUE)

	filter(events, grepl("cyber\|hack", event_type, ignore.case=TRUE))