@knbknb
Last active June 7, 2023 19:04
R: scrape a table from Wikipedia
library(tidyverse)
library(lubridate)
library(rvest)
library(httr) # for GET()/HEAD(), used by the url_tester variants below
# Prefer the Internet Archive, since live web pages change over time
url <- "https://web.archive.org/web/20220908211042/https://en.wikipedia.org/wiki/..."
wiki_raw <- read_html(url)
wiki_raw
wiki_extracted <- wiki_raw %>%
  html_elements(xpath = "//table[2]") %>%
  html_table() %>%
  bind_rows() # combine the list of tibbles into one data frame
wiki_extracted
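Since lubridate is already loaded, a typical next step is to normalize the scraped headers and parse any date column. The target URL above is truncated, so the column name `date` here is a placeholder assumption, not the real table schema:

```r
# Hypothetical cleanup sketch: assumes the scraped table has a "Date" column
# with values like "7 June 2023" -- adjust to the actual table.
wiki_clean <- wiki_extracted %>%
  rename_with(str_to_lower) %>%  # normalize header case
  mutate(date = dmy(date))       # parse day-month-year dates with lubridate
```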
# Returns either the results or the error messages, depending on `type`
url_tester <- function(url_list, type = c("result", "error")) {
  type <- match.arg(type)
  url_list %>%
    # safely() wraps read_lines() so a failure yields NULL instead of aborting
    map(safely(read_lines)) %>%
    set_names(url_list) %>%
    # Transpose into a list with $result and $error components
    transpose() %>%
    pluck(type)
}
Try this function on a vector of URLs and return only the bad ones:
# Try this function on the urls object
url_tester(urls, type = "error") %>% compact()
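The gist never defines `urls`; a minimal sketch of an input, with one URL deliberately unresolvable so the error branch has something to report (both URLs are illustrative assumptions):

```r
# Hypothetical input: one reachable URL, one guaranteed-to-fail URL
urls <- c(
  "https://en.wikipedia.org/wiki/R_(programming_language)",
  "https://example.invalid/no-such-page"
)
# Only the failing entry survives compact(); its name is the offending URL
url_tester(urls, type = "error") %>% compact() %>% names()
```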
# Variant: return only the status codes.
# Use GET() wrapped in possibly() so a failure returns NULL instead of an error.
url_tester <- function(url_list) {
  url_list %>%
    # Map a version of GET() that returns NULL on failure
    map(possibly(GET, otherwise = NULL)) %>%
    # impure intermediate step: name each response after its URL
    set_names(url_list) %>%
    # Remove the NULLs (failed requests)
    compact() %>%
    # Extract the "status_code" element from each response
    map("status_code")
}
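Because this variant returns a named list of status codes, purrr's `keep()` can filter it down to the problem URLs. A usage sketch (the `urls` vector is assumed to exist, as above):

```r
# Hypothetical follow-up: keep only URLs that did not answer 200 OK
url_tester(urls) %>%
  keep(~ .x != 200) %>%
  names()
```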