@vinicius-ianni
Created February 18, 2018 00:03
Distributed Web Scraping with R
### SETUP =====================================================================
# Install and load the Wikipedia API client if necessary
if (!require("WikipediR")) {
  install.packages("WikipediR")
  library(WikipediR)
}
# Load the pipe operator used throughout
library(magrittr)
# Set seed for reproducibility
set.seed(1234)
### LINKS =====================================================================
# Recursive function to get multiple levels of links
get_links <- function(start, levels, so_far = character(0)) {
  # End of recursion
  if (levels == 0) {
    return(so_far)
  }
  # Extract links from the Wikipedia API response
  links <- page_links("en", "wikipedia", page = start) %>%
    purrr::pluck("query", "pages", 1, "links") %>%
    purrr::map(purrr::keep, .p = is.character) %>%
    purrr::flatten() %>%
    purrr::flatten_chr()
  # Join links obtained so far with current links
  links <- unique(c(links, so_far))
  # Run `get_links` for each link found
  purrr::map(links, get_links, levels - 1, links)
}
# Get two levels of links starting with "R"
links <- "R (programming language)" %>%
  get_links(2) %>%
  rlang::squash_chr() %>%
  unique() %>%
  purrr::map(~ page_info("en", "wikipedia", page = .x)) %>%
  purrr::map(purrr::pluck, "query", "pages", 1, "fullurl") %>%
  rlang::squash_chr()
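# A quick sanity check (not in the original gist): at this point `links` should
# be a character vector of full article URLs, and peeking at a few makes the
# download steps below easier to follow.
length(links)
head(links, 3)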
### SEQUENTIAL ================================================================
# Function to download Wikipedia articles
download_wiki <- function(url, path) {
  # Convert URL into a file name
  file <- url %>%
    utils::URLdecode() %>%
    stringr::str_extract("(?<=/)[^/]+$") %>%
    stringr::str_replace_all("[:punct:]", "") %>%
    stringr::str_to_lower() %>%
    stringr::str_c(normalizePath(path), "/", ., ".html")
  # Save page to disk
  httr::GET(url, httr::write_disk(file, TRUE))
  return(file)
}
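# A small illustration (not in the original gist) of the URL-to-file-name
# mapping used inside download_wiki(), stopping just short of prepending the
# target directory:
"https://en.wikipedia.org/wiki/R_(programming_language)" %>%
  utils::URLdecode() %>%
  stringr::str_extract("(?<=/)[^/]+$") %>%
  stringr::str_replace_all("[:punct:]", "") %>%
  stringr::str_to_lower()
#> [1] "rprogramminglanguage"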
# Download all files sequentially
files <- purrr::map_chr(links, download_wiki, "~/Desktop/Wiki")
# Remove all downloaded files for completeness
purrr::walk(files, file.remove)
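# A minimal timing sketch (not part of the original gist) to get a baseline for
# the sequential run; the numbers depend entirely on your network and on how
# many links the crawl returned, so treat the result as illustrative only.
seq_time <- system.time({
  files <- purrr::map_chr(links, download_wiki, "~/Desktop/Wiki")
  purrr::walk(files, file.remove)
})
seq_time["elapsed"]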
### PARALLEL ==================================================================
# Create simplified version of download_wiki() with the path pre-filled
download_wiki_ <- purrr::partial(download_wiki, path = "~/Desktop/Wiki")
# Download all files in parallel
files <- parallel::mcmapply(
  download_wiki_, links, SIMPLIFY = TRUE, mc.cores = 4)
# Remove all downloaded files for completeness
purrr::walk(files, file.remove)
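# A hedged note (not in the original gist): mcmapply() relies on forking, so
# mc.cores > 1 is not available on Windows, and hard-coding 4 cores may not
# suit every machine. One option is to detect the core count instead:
n_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
files <- parallel::mcmapply(
  download_wiki_, links, SIMPLIFY = TRUE, mc.cores = n_cores)
purrr::walk(files, file.remove)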
### DISTRIBUTED ===============================================================
# # Code for the python server on each worker (Python 2; start it with the
# # same port used by the endpoints below, e.g. 8000)
# #!/usr/bin/env python
#
# from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
# import SocketServer
# from subprocess import call
#
# class S(BaseHTTPRequestHandler):
#     def _set_headers(self):
#         self.send_response(200)
#         self.send_header('Content-type', 'text/html')
#         self.end_headers()
#
#     def do_POST(self):
#         content_length = int(self.headers['Content-Length'])  # <--- Gets the size of data
#         post_data = self.rfile.read(content_length)  # <--- Gets the data itself
#         call(["Rscript", "/home/ctlente/script.R", post_data])
#
# def run(server_class=HTTPServer, handler_class=S, port=80):
#     server_address = ('', port)
#     httpd = server_class(server_address, handler_class)
#     print 'Starting httpd...'
#     httpd.serve_forever()
#
# if __name__ == "__main__":
#     from sys import argv
#
#     if len(argv) == 2:
#         run(port=int(argv[1]))
#     else:
#         run()
# Code for the R script on each worker
# #!/usr/bin/env Rscript
# args <- commandArgs(trailingOnly = TRUE)
# library(magrittr)
#
# links <- stringr::str_split(args[1], " ")[[1]]
#
# download_wiki <- function(url, path) {
#
#   file <- url %>%
#     utils::URLdecode() %>%
#     stringr::str_extract("(?<=/)[^/]+$") %>%
#     stringr::str_replace_all("[:punct:]", "") %>%
#     stringr::str_to_lower() %>%
#     stringr::str_c(normalizePath(path), "/", ., ".html")
#
#   httr::GET(url, httr::write_disk(file, TRUE))
#
#   return(file)
# }
#
# download_wiki_ <- purrr::partial(download_wiki, path = "~/wiki")
#
# parallel::mcmapply(
#   download_wiki_, links, SIMPLIFY = TRUE, mc.cores = 4)
# Split links into (roughly equal) groups, one per worker
num_workers <- 3
links_split <- links %>%
  split(., ceiling(seq_along(.) / (length(.) / num_workers))) %>%
  purrr::map(stringr::str_c, collapse = " ")
# Endpoint data
workers <- "localhost" # Insert your workers' IPs here (one per worker)
endpoints <- stringr::str_c("http://", workers, ":8000")
# Call each worker but don't wait for them
for (i in seq_len(num_workers)) {
  system(paste0("curl -d '", links_split[[i]], "' ", endpoints[i]), wait = FALSE)
}
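# An alternative sketch (not in the original gist): the same non-blocking
# dispatch with system2(), which avoids pasting the shell command together by
# hand and uses shQuote() so URLs containing quotes or spaces are escaped.
for (i in seq_len(num_workers)) {
  system2(
    "curl",
    args = c("-d", shQuote(links_split[[i]]), endpoints[i]),
    wait = FALSE
  )
}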