@vinicius-ianni
Created February 18, 2018 00:03
Distributed Web Scraping with R
### SETUP =====================================================================
# Install and load the Wikipedia API client if necessary
if (!require("WikipediR")) {
  install.packages("WikipediR")
  library(WikipediR)
}
# Load the pipe operator used throughout
library(magrittr)
# Set seed for reproducibility
set.seed(1234)
### LINKS =====================================================================
# Recursive function to get multiple levels of links
get_links <- function(start, levels, so_far = character(0)) {
  # End of recursion
  if (levels == 0) {
    return(so_far)
  }
  # Extract links from the Wikipedia API response
  links <- page_links("en", "wikipedia", page = start) %>%
    purrr::pluck("query", "pages", 1, "links") %>%
    purrr::map(purrr::keep, .p = is.character) %>%
    purrr::flatten() %>%
    purrr::flatten_chr()
  # Join links obtained so far with current links
  links <- unique(c(links, so_far))
  # Run `get_links` for each link found
  purrr::map(links, get_links, levels - 1, links)
}
# Get two levels of links starting with "R"
links <- "R (programming language)" %>%
  get_links(2) %>%
  rlang::squash_chr() %>%
  unique() %>%
  purrr::map(~ page_info("en", "wikipedia", page = .x)) %>%
  purrr::map(purrr::pluck, "query", "pages", 1, "fullurl") %>%
  rlang::squash_chr()
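# A quick sanity check (not in the original gist): at this point `links` should
# be a character vector of full article URLs, and peeking at a few makes the
# download steps below easier to follow.
length(links)
head(links, 3)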
### SEQUENTIAL ================================================================
# Function to download Wikipedia articles
download_wiki <- function(url, path) {
  # Convert URL into a file name
  file <- url %>%
    utils::URLdecode() %>%
    stringr::str_extract("(?<=/)[^/]+$") %>%
    stringr::str_replace_all("[:punct:]", "") %>%
    stringr::str_to_lower() %>%
    stringr::str_c(normalizePath(path), "/", ., ".html")
  # Save page to disk
  httr::GET(url, httr::write_disk(file, TRUE))
  return(file)
}
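# A small illustration (not in the original gist) of the URL-to-file-name
# mapping used inside download_wiki(), stopping just short of prepending the
# target directory:
"https://en.wikipedia.org/wiki/R_(programming_language)" %>%
  utils::URLdecode() %>%
  stringr::str_extract("(?<=/)[^/]+$") %>%
  stringr::str_replace_all("[:punct:]", "") %>%
  stringr::str_to_lower()
#> [1] "rprogramminglanguage"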
# Download all files sequentially
files <- purrr::map_chr(links, download_wiki, "~/Desktop/Wiki")
# Remove all downloaded files for completeness
purrr::walk(files, file.remove)
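# A minimal timing sketch (not part of the original gist) to get a baseline for
# the sequential run; the numbers depend entirely on your network and on how
# many links the crawl returned, so treat the result as illustrative only.
seq_time <- system.time({
  files <- purrr::map_chr(links, download_wiki, "~/Desktop/Wiki")
  purrr::walk(files, file.remove)
})
seq_time["elapsed"]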
### PARALLEL ==================================================================
# Create simplified version of download_wiki() with the path pre-filled
download_wiki_ <- purrr::partial(download_wiki, path = "~/Desktop/Wiki")
# Download all files in parallel
files <- parallel::mcmapply(
  download_wiki_, links, SIMPLIFY = TRUE, mc.cores = 4)
# Remove all downloaded files for completeness
purrr::walk(files, file.remove)
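# A hedged note (not in the original gist): mcmapply() relies on forking, so
# mc.cores > 1 is not available on Windows, and hard-coding 4 cores may not
# suit every machine. One option is to detect the core count instead:
n_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
files <- parallel::mcmapply(
  download_wiki_, links, SIMPLIFY = TRUE, mc.cores = n_cores)
purrr::walk(files, file.remove)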
### DISTRIBUTED ===============================================================
# # Code for the python server on each worker (Python 2; start it with the
# # same port used by the endpoints below, e.g. 8000)
# #!/usr/bin/env python
#
# from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
# import SocketServer
# from subprocess import call
#
# class S(BaseHTTPRequestHandler):
#     def _set_headers(self):
#         self.send_response(200)
#         self.send_header('Content-type', 'text/html')
#         self.end_headers()
#
#     def do_POST(self):
#         content_length = int(self.headers['Content-Length'])  # <--- Gets the size of data
#         post_data = self.rfile.read(content_length)  # <--- Gets the data itself
#         call(["Rscript", "/home/ctlente/script.R", post_data])
#
# def run(server_class=HTTPServer, handler_class=S, port=80):
#     server_address = ('', port)
#     httpd = server_class(server_address, handler_class)
#     print 'Starting httpd...'
#     httpd.serve_forever()
#
# if __name__ == "__main__":
#     from sys import argv
#
#     if len(argv) == 2:
#         run(port=int(argv[1]))
#     else:
#         run()
# Code for the R script on each worker
# #!/usr/bin/env Rscript
# args <- commandArgs(trailingOnly = TRUE)
# library(magrittr)
#
# links <- stringr::str_split(args[1], " ")[[1]]
#
# download_wiki <- function(url, path) {
#
#   file <- url %>%
#     utils::URLdecode() %>%
#     stringr::str_extract("(?<=/)[^/]+$") %>%
#     stringr::str_replace_all("[:punct:]", "") %>%
#     stringr::str_to_lower() %>%
#     stringr::str_c(normalizePath(path), "/", ., ".html")
#
#   httr::GET(url, httr::write_disk(file, TRUE))
#
#   return(file)
# }
#
# download_wiki_ <- purrr::partial(download_wiki, path = "~/wiki")
#
# parallel::mcmapply(
#   download_wiki_, links, SIMPLIFY = TRUE, mc.cores = 4)
# Split links into (roughly equal) groups, one per worker
num_workers <- 3
links_split <- links %>%
  split(., ceiling(seq_along(.) / (length(.) / num_workers))) %>%
  purrr::map(stringr::str_c, collapse = " ")
# Endpoint data
workers <- "localhost" # Insert your workers' IPs here (one per worker)
endpoints <- stringr::str_c("http://", workers, ":8000")
# Call each worker but don't wait for them
for (i in seq_len(num_workers)) {
  system(paste0("curl -d '", links_split[[i]], "' ", endpoints[i]), wait = FALSE)
}
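# An alternative sketch (not in the original gist): the same non-blocking
# dispatch with system2(), which avoids pasting the shell command together by
# hand and uses shQuote() so URLs containing quotes or spaces are escaped.
for (i in seq_len(num_workers)) {
  system2(
    "curl",
    args = c("-d", shQuote(links_split[[i]]), endpoints[i]),
    wait = FALSE
  )
}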