## shilohfling/web_scrape_r1_uni.R

## web_scrape_r1_uni.R
##################################################
# An RScript for scraping the R1 Universities    #
# and their established dates from Wikipedia.    #
##################################################
# Created: October 14, 2019                      #
##################################################

## Load packages -----
library(tidyverse)
library(here)

## Functions -----
# Scrape a four-digit year from the Wikipedia infobox of each university.
#
# For every entry of `link$LINK` the page
# "https://en.wikipedia.org/wiki/<entry>" is read as raw HTML lines. Lines that
# open the "infobox vcard" table are kept, and the first run of four digits on
# such a line is taken as the established year. This is a text heuristic; it is
# not validated against the page content.
#
# Args:
#   link: data frame with a `LINK` column of Wikipedia article names.
#
# Returns:
#   A data frame with a single character column named "." holding exactly one
#   row per row of `link`, in the same order. The value is NA when the page
#   cannot be read (a warning is raised) or when no infobox line containing a
#   four-digit run is found. The row index is printed after each page as a
#   progress indicator.
scrape_uni_est_year <- function(link) {
  infobox_pattern <-
    "[<]table class[=]\"infobox vcard\" style[=]\"width[:]22em\""
  pages <- as.character(link$LINK)

  # Preallocate one slot per page; NA marks "nothing found".
  years <- rep(NA_character_, length(pages))

  # seq_along() iterates zero times for an empty `link`, unlike 1:nrow(link).
  for (i in seq_along(pages)) {
    my_url <- paste0("https://en.wikipedia.org/wiki/", pages[i])

    # readLines() opens and closes the connection itself when given a URL
    # string, so nothing is left open if the download fails. A failed page
    # becomes NA instead of aborting the whole scrape.
    my_html <- tryCatch(
      readLines(my_url, -1),
      error = function(e) {
        warning(
          "Could not read ", my_url, ": ", conditionMessage(e),
          call. = FALSE
        )
        character(0)
      }
    )

    found <- my_html %>%
      str_subset(pattern = infobox_pattern) %>%
      str_extract(pattern = "[0-9]{4}")
    found <- found[!is.na(found)]

    # Keep exactly one value per page so rows stay aligned with `link`.
    if (length(found) > 0) {
      years[i] <- found[1]
    }
    print(i)
  }

  # Callers rename the "." column, so the column name is kept as is.
  setNames(data.frame(years, stringsAsFactors = FALSE), ".")
}

## Get the data -----
# Download the Wikipedia list of R1 universities as raw HTML lines. Given a URL
# string, readLines() opens and closes the connection itself, so nothing is
# left open if the download fails.
my_url <- "https://en.wikipedia.org/wiki/Research_I_university"
my_html <- readLines(my_url, -1)

# University names: keep table cells that start with a wiki link, strip the
# HTML tags, and normalise characters that are awkward in a URL.
names <- my_html %>%
  str_subset(pattern = "[<]td[>][<]a href[=]\"[/]wiki[/]") %>%
  str_replace_all("<.*?>", "") %>%
  # Only matches a link tag left unclosed on its line; closed tags are already
  # gone after the previous step.
  str_remove("[<]a href[=]\"[/]wiki[/]G") %>%
  # Replace every "&amp;" in a name, not just the first one.
  str_replace_all("[&]amp[;]", "%26") %>%
  # The first alternative is the literal the script has always replaced
  # (apparently an en dash decoded with the wrong encoding); the second is the
  # correctly decoded en dash, so the result does not depend on the platform.
  str_replace_all("â€“|\u2013", "-") %>%
  data.frame() %>%
  rename("NAMES" = ".") %>%
  filter(NAMES != "")

# Wikipedia article names use underscores instead of spaces.
link <- str_replace_all(names$NAMES, " ", "_") %>%
  data.frame() %>%
  rename("LINK" = ".")

est <- scrape_uni_est_year(link) %>%
  data.frame() %>%
  rename("EST" = ".")

# bind_cols() pairs rows by position and recycles a single row, so refuse to
# continue unless there is exactly one scraped value per university.
if (nrow(est) != nrow(names)) {
  stop(
    "Scraped ", nrow(est), " year rows for ", nrow(names),
    " universities; the columns would be misaligned.",
    call. = FALSE
  )
}

df <- bind_cols(names, link) %>%
  bind_cols(est)

# str_extract_all(est, "[0-9]{4}")

## Export the data -----
write.csv(df, here(paste0("r1_uni_est_date_", Sys.Date(), ".csv")))
	##################################################
	# An RScript for scraping the R1 Universities #
	# and their established dates from Wikipedia. #
	##################################################
	# Created: October 14, 2019 #
	##################################################

	## Load packages -----
	library(tidyverse)
	library(here)

	## Functions -----
	# Scrape a four-digit year from the Wikipedia infobox of each university.
	#
	# For every entry of `link$LINK` the page
	# "https://en.wikipedia.org/wiki/<entry>" is read as raw HTML lines. Lines
	# that open the "infobox vcard" table are kept, and the first run of four
	# digits on such a line is taken as the established year. This is a text
	# heuristic; it is not validated against the page content.
	#
	# Args:
	#   link: data frame with a `LINK` column of Wikipedia article names.
	#
	# Returns:
	#   A data frame with a single character column named "." holding exactly
	#   one row per row of `link`, in the same order. The value is NA when the
	#   page cannot be read (a warning is raised) or when no infobox line
	#   containing a four-digit run is found. The row index is printed after
	#   each page as a progress indicator.
	scrape_uni_est_year <- function(link) {
	  infobox_pattern <-
	    "[<]table class[=]\"infobox vcard\" style[=]\"width[:]22em\""
	  pages <- as.character(link$LINK)

	  # Preallocate one slot per page; NA marks "nothing found".
	  years <- rep(NA_character_, length(pages))

	  # seq_along() iterates zero times for an empty `link`, unlike
	  # 1:nrow(link).
	  for (i in seq_along(pages)) {
	    my_url <- paste0("https://en.wikipedia.org/wiki/", pages[i])

	    # readLines() opens and closes the connection itself when given a URL
	    # string, so nothing is left open if the download fails. A failed page
	    # becomes NA instead of aborting the whole scrape.
	    my_html <- tryCatch(
	      readLines(my_url, -1),
	      error = function(e) {
	        warning(
	          "Could not read ", my_url, ": ", conditionMessage(e),
	          call. = FALSE
	        )
	        character(0)
	      }
	    )

	    found <- my_html %>%
	      str_subset(pattern = infobox_pattern) %>%
	      str_extract(pattern = "[0-9]{4}")
	    found <- found[!is.na(found)]

	    # Keep exactly one value per page so rows stay aligned with `link`.
	    if (length(found) > 0) {
	      years[i] <- found[1]
	    }
	    print(i)
	  }

	  # Callers rename the "." column, so the column name is kept as is.
	  setNames(data.frame(years, stringsAsFactors = FALSE), ".")
	}

	## Get the data -----
	# Download the Wikipedia list of R1 universities as raw HTML lines. Given a
	# URL string, readLines() opens and closes the connection itself, so nothing
	# is left open if the download fails.
	my_url <- "https://en.wikipedia.org/wiki/Research_I_university"
	my_html <- readLines(my_url, -1)

	# University names: keep table cells that start with a wiki link, strip the
	# HTML tags, and normalise characters that are awkward in a URL.
	names <- my_html %>%
	  str_subset(pattern = "[<]td[>][<]a href[=]\"[/]wiki[/]") %>%
	  str_replace_all("<.*?>", "") %>%
	  # Only matches a link tag left unclosed on its line; closed tags are
	  # already gone after the previous step.
	  str_remove("[<]a href[=]\"[/]wiki[/]G") %>%
	  # Replace every "&amp;" in a name, not just the first one.
	  str_replace_all("[&]amp[;]", "%26") %>%
	  # The first alternative is the literal the script has always replaced
	  # (apparently an en dash decoded with the wrong encoding); the second is
	  # the correctly decoded en dash, so the result does not depend on the
	  # platform.
	  str_replace_all("â€“|\u2013", "-") %>%
	  data.frame() %>%
	  rename("NAMES" = ".") %>%
	  filter(NAMES != "")

	# Wikipedia article names use underscores instead of spaces.
	link <- str_replace_all(names$NAMES, " ", "_") %>%
	  data.frame() %>%
	  rename("LINK" = ".")

	est <- scrape_uni_est_year(link) %>%
	  data.frame() %>%
	  rename("EST" = ".")

	# bind_cols() pairs rows by position and recycles a single row, so refuse to
	# continue unless there is exactly one scraped value per university.
	if (nrow(est) != nrow(names)) {
	  stop(
	    "Scraped ", nrow(est), " year rows for ", nrow(names),
	    " universities; the columns would be misaligned.",
	    call. = FALSE
	  )
	}

	df <- bind_cols(names, link) %>%
	  bind_cols(est)

	# str_extract_all(est, "[0-9]{4}")

	## Export the data -----
	write.csv(df, here(paste0("r1_uni_est_date_", Sys.Date(), ".csv")))