Skip to content

Instantly share code, notes, and snippets.

@shilohfling
Created October 14, 2019 19:58
Show Gist options
  • Save shilohfling/9fd0823cc42e027d97d87e03d1bbb458 to your computer and use it in GitHub Desktop.
Save shilohfling/9fd0823cc42e027d97d87e03d1bbb458 to your computer and use it in GitHub Desktop.
An R script for scraping the R1 universities and the year each was established from Wikipedia. Some universities come out with an incorrect or NA year because of the regex, but you get the gist.
##################################################
# An RScript for scraping the R1 Universities #
# and their established dates from Wikipedia. #
##################################################
# Created: October 14, 2019 #
##################################################
## Load packages -----
library(tidyverse)
library(here)
## Functions -----
# Read every line of the page at `page_url` as a character vector.
# The connection is closed even when reading fails part-way through.
.read_page_lines <- function(page_url) {
  con <- file(page_url, "r")
  on.exit(close(con), add = TRUE)
  readLines(con, n = -1L, warn = FALSE, encoding = "UTF-8")
}

# Return the first four-digit number found on a line that opens the
# "infobox vcard" table, or NA_character_ when no such line holds one.
.extract_est_year <- function(html) {
  infobox_pattern <-
    "[<]table class[=]\"infobox vcard\" style[=]\"width[:]22em\""
  infobox_lines <- grep(infobox_pattern, html, value = TRUE)
  # regmatches() silently drops lines without a match, so `found` holds
  # only real hits, in page order.
  found <- regmatches(infobox_lines, regexpr("[0-9]{4}", infobox_lines))
  if (length(found) > 0L) found[[1L]] else NA_character_
}

# Scrape the year found in the infobox of each university's Wikipedia page.
#
# Args:
#   link:     data frame with a column `LINK` holding page names that are
#             appended to `base_url` (e.g. "Harvard_University").
#   base_url: prefix prepended to every `LINK` value. Defaults to the
#             English Wikipedia article root.
#
# Returns:
#   A data frame with a single character column named "." and exactly one
#   row per row of `link`, in the same order. A row is NA when the page
#   could not be read or no year was found, so the result can always be
#   column-bound to `link`.
#
# Side effects:
#   Prints the index of each processed row as a progress indicator and
#   raises a warning (not an error) for every page that fails to download.
scrape_uni_est_year <- function(link,
                                base_url = "https://en.wikipedia.org/wiki/") {
  n_links <- nrow(link)
  # Preallocate with NA so failed or year-less pages keep their slot.
  years <- rep(NA_character_, n_links)
  # seq_len() yields an empty loop for zero rows, unlike 1:nrow(link).
  for (i in seq_len(n_links)) {
    page_url <- paste0(base_url, link$LINK[i])
    years[i] <- tryCatch(
      .extract_est_year(.read_page_lines(page_url)),
      error = function(e) {
        warning(
          "Could not scrape ", page_url, ": ", conditionMessage(e),
          call. = FALSE
        )
        NA_character_
      }
    )
    print(i)
  }
  dt <- data.frame(years, stringsAsFactors = FALSE)
  # Callers rename the column from ".", so keep that name.
  colnames(dt) <- "."
  dt
}
## Get the data -----
# Download the Wikipedia list of R1 universities as raw HTML lines.
my_url <- "https://en.wikipedia.org/wiki/Research_I_university"
my_con <- file(my_url, open = "r")
my_html <- readLines(my_con, n = -1)
close(my_con)

# Build the university names step by step: keep table cells that start
# with a wiki link, strip the markup, then normalise "&amp;" and the
# en dash before dropping empty entries.
names <- str_subset(my_html, pattern = "[<]td[>][<]a href[=]\"[/]wiki[/]")
names <- str_replace_all(names, "<.*?>", "")
names <- str_remove(names, "[<]a href[=]\"[/]wiki[/]G")
names <- str_replace(names, "[&]amp[;]", "%26")
names <- str_replace(names, "–", "-")
names <- data.frame(NAMES = names)
names <- filter(names, NAMES != "")

# Page names are the display names with spaces turned into underscores.
link <- data.frame(LINK = str_replace_all(names$NAMES, " ", "_"))

# Look up the established year for every page name.
est <- data.frame(scrape_uni_est_year(link))
est <- rename(est, "EST" = ".")

# Combine names, page names and years side by side.
df <- bind_cols(names, link, est)
# str_extract_all(est, "[0-9]{4}")

## Export the data -----
write.csv(
  df,
  file = here(paste0("r1_uni_est_date_", Sys.Date(), ".csv"))
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment