Created
October 14, 2019 19:58
-
-
Save shilohfling/9fd0823cc42e027d97d87e03d1bbb458 to your computer and use it in GitHub Desktop.
An R script that scrapes the list of R1 universities from Wikipedia, along with the year each one was established. Some universities' years are incorrect or NA because of the regex used, but you get the gist.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################## | |
# An RScript for scraping the R1 Universities # | |
# and their established dates from Wikipedia. # | |
################################################## | |
# Created: October 14, 2019 # | |
################################################## | |
## Load packages ----- | |
library(tidyverse) | |
library(here) | |
## Functions ----- | |
# Scrape one 4-digit year per university from its Wikipedia page.
#
# For every entry of `link$LINK` the page
# "https://en.wikipedia.org/wiki/<LINK>" is downloaded, the first HTML line
# containing the opening tag of the "infobox vcard" table is located, and the
# first run of four digits on that line is taken as the year.
#
# Args:
#   link: A data frame with a column `LINK` holding Wikipedia page names
#         (the part of the URL after "/wiki/").
#
# Returns:
#   A data frame with exactly one row per row of `link`, in the same order,
#   and a single character column named "." (the caller renames it). A row
#   is NA when the link is NA, the page could not be read, no infobox line
#   was found, or that line contains no 4-digit number.
#
# Side effects:
#   Prints the index of each processed link as a progress indicator and emits
#   a warning for every page that could not be read.
scrape_uni_est_year <- function(link) {
  infobox_pattern <-
    "[<]table class[=]\"infobox vcard\" style[=]\"width[:]22em\""

  # Read all lines of a URL; the connection is closed even if reading fails.
  read_page <- function(page_url) {
    page_con <- file(page_url, "r")
    on.exit(close(page_con), add = TRUE)
    readLines(page_con, -1)
  }

  # Return exactly one value (possibly NA) for a single page name.
  fetch_est_year <- function(page) {
    if (is.na(page)) {
      return(NA_character_)
    }
    page_url <- paste0("https://en.wikipedia.org/wiki/", page)
    page_html <- tryCatch(
      read_page(page_url),
      error = function(e) {
        # One unreachable page must not discard the years already scraped.
        warning(
          "Could not read ", page_url, ": ", conditionMessage(e),
          call. = FALSE
        )
        character(0)
      }
    )
    infobox_lines <- str_subset(page_html, pattern = infobox_pattern)
    if (length(infobox_lines) == 0) {
      return(NA_character_)
    }
    # Only the first matching line is used so each page yields one row.
    str_extract(infobox_lines[1], pattern = "[0-9]{4}")
  }

  pages <- as.character(link$LINK)
  est_years <- rep(NA_character_, length(pages))
  # seq_along() makes an empty `link` a no-op instead of iterating over 1:0.
  for (i in seq_along(pages)) {
    est_years[i] <- fetch_est_year(pages[i])
    print(i)
  }

  dt <- data.frame(est_years, stringsAsFactors = FALSE)
  # Callers rely on the single column being called "." .
  names(dt) <- "."
  dt
}
## Get the data -----
# Download the Wikipedia list of R1 universities as a vector of HTML lines.
my_url <- "https://en.wikipedia.org/wiki/Research_I_university"
my_con <- file(my_url, "r")
# `finally` closes the connection even when reading fails.
my_html <- tryCatch(
  readLines(my_con, -1),
  finally = close(my_con)
)

# Keep the lines where a table cell starts with a wiki link, strip every HTML
# tag, and make the remaining text usable inside a URL: "&amp;" becomes the
# percent-encoded ampersand and en dashes become plain hyphens. All
# occurrences are replaced, not just the first one in each name.
names <- my_html %>%
  str_subset(pattern = "[<]td[>][<]a href[=]\"[/]wiki[/]") %>%
  str_replace_all("<.*?>", "") %>%
  str_replace_all("[&]amp[;]", "%26") %>%
  str_replace_all("–", "-") %>%
  data.frame() %>%
  rename("NAMES" = ".") %>%
  filter(NAMES != "")

# Wikipedia page names use underscores in place of spaces.
link <- str_replace_all(names$NAMES, " ", "_") %>%
  data.frame() %>%
  rename("LINK" = ".")

# Look up one year per link (this requests one page per university).
est <- scrape_uni_est_year(link) %>%
  data.frame() %>%
  rename("EST" = ".")

# The three tables are combined by position, so they must have equal length.
if (nrow(est) != nrow(names)) {
  stop(
    "Scraped ", nrow(est), " establishment years for ", nrow(names),
    " universities; the columns cannot be aligned.",
    call. = FALSE
  )
}

df <- bind_cols(names, link) %>%
  bind_cols(est)
# str_extract_all(est, "[0-9]{4}")

## Export the data -----
# The output file name carries the date of the run.
write.csv(df, here(paste0("r1_uni_est_date_", Sys.Date(), ".csv")))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment