Skip to content

Instantly share code, notes, and snippets.

@juba
Last active March 1, 2019 14:02
Show Gist options
  • Save juba/e85e8ffaed5d2e3635d5a07727a32bef to your computer and use it in GitHub Desktop.
Save juba/e85e8ffaed5d2e3635d5a07727a32bef to your computer and use it in GitHub Desktop.
#Sys.setenv(DEBUGME = "crrri")
#library(debugme)
library(promises)
library(crrri)
library(purrr)
library(dplyr)
library(progress)
getDOMs <- function(url, chrome, pb, delay = 30) {
## Caching results
if (!is.null(res) && url %in% res$url) {
cached <<- cached + 1
pb$tick(tokens = list(success = success, error = error, cached = cached))
return(res[res$url == url,])
}
promise_race(
timeout(delay),
chrome %...T>% {
Sys.sleep(2) } %>%
Page.navigate(url) %>%
Page.loadEventFired() %>%
DOM.getDocument() %>%
DOM.getOuterHTML(nodeId = ~.$root$nodeId)
) %>%
## If rejected -> timeout
then(onRejected = function(err) {
error <<- error + 1
data.frame(url = url,
status = "error",
value = "",
error = err$message,
stringsAsFactors = FALSE)
},
## Else -> success
onFulfilled = function(value) {
success <<- success + 1
data.frame(url = url,
status = "ok",
value = value$result$outerHTML,
error = "",
stringsAsFactors = FALSE)
}) %>%
## Finally, update progress bar
finally(~pb$tick(tokens = list(success = success, error = error, cached = cached)))
}
urls <- c("https://cran.rstudio.com/web/packages/progress/",
"https://cran.rstudio.com/web/packages/dplyr",
"https://www.dsdcsdc.fgf/",
"https://cran.rstudio.com/web/packages/promises/",
"https://cran.rstudio.com/web/packages/ggplot256/",
"https://cran.rstudio.com/web/packages/ggplot2/")
success <- 0
error <- 0
cached <- 0
res <- NULL
status <- NULL
chrome <- chr_connect(bin = "google-chrome", headless = FALSE)
pb <- progress_bar$new(total = length(urls),
format = "Succ::success / Err::error / Cach::cached [:bar] :current/:total (:percent)",
clear = FALSE, show_after = 0)
configured <- chrome %>%
Page.enable() %>%
Network.enable() %>%
DOM.enable() %>%
Network.setUserAgentOverride("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36")
intercepted <- configured %>%
Network.responseReceived(.callback = function(msg) {
url <- msg$params$response$url
stripped_url <- sub("/$", "", url)
if (stripped_url %in% urls) url <- stripped_url
if (url %in% urls) {
status <<- rbind(status,
data.frame(url = url,
http_status = msg$params$response$status,
stringsAsFactors = FALSE))
}
})
promise_map(urls, getDOMs, configured, pb) %...>%
dplyr::bind_rows %...>%
assign("res", ., envir=.GlobalEnv) %>%
finally(~{
cat("\n--- Done :", success, "successes /", error, "errors /", cached, "cached ---\n")
pb$terminate()
chr_disconnect(chrome)
View(res %>% left_join(status, by="url"))
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment