Created
January 17, 2022 17:49
-
-
Save shawngraham/a3f8a74493a5f108466c4474125d49a1 to your computer and use it in GitHub Desktop.
Script to generate a table of metadata about the theses in the CURVE repository. CURVE is changing soon, so this script will become dated.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(dplyr) | |
library(xml2) | |
library(stringr) | |
library(purrr) | |
## part one, collect title / author / year / degree / link from the search results
base_url <- "https://curve.carleton.ca"

# Search-results URL for History theses; the page number is appended to it.
search_url <- "https://curve.carleton.ca/search?f%5B0%5D=parent_collection%253A41&f%5B1%5D=thesis_degree_discipline%253AHistory&f%5B2%5D=thesis_degree_discipline%3AHistory&page="

# there are 316 theses as of jan 17 2022 in the CURVE repository; pages 0 to 15
# are requested, i.e. 16 requests in total
result_pages <- 0:15

# Scrape one page of search results.
#
# page_result: page number appended to `search_url`.
# Returns a data frame with the columns title, author, year, degree and
# thesis_url (one row per listed thesis), or NULL when the page lists nothing.
scrape_results_page <- function(page_result) {
  page <- read_html(paste0(search_url, page_result))

  title <- page %>% html_nodes("h2.field-content") %>% html_text()
  thesis_link <- page %>%
    html_nodes("h2:nth-child(1) > a:nth-child(1)") %>%
    html_attr("href")
  author <- page %>%
    html_nodes("div.views-field-dcterms-creator-1") %>%
    html_text()
  degree <- page %>%
    html_nodes("div.views-field-thesis-degree-name") %>%
    html_text()
  year <- page %>%
    html_nodes("div.views-field-views-conditional-3") %>%
    html_text()

  print(paste("Page:", page_result))

  # paste0() on a zero-length vector still returns one string, so building the
  # data frame for a page without results would fail with mismatched column
  # lengths. Skip such pages instead.
  if (length(title) == 0 && length(thesis_link) == 0) {
    return(NULL)
  }

  data.frame(
    title,
    author,
    year,
    degree,
    thesis_url = paste0(base_url, thesis_link),
    stringsAsFactors = FALSE
  )
}

# Collect the per-page data frames first and combine them once, rather than
# growing `theses` with rbind() on every iteration. bind_rows() drops NULLs.
theses <- bind_rows(lapply(result_pages, scrape_results_page))

# Strip the field labels that come along with the scraped text.
theses <- theses %>%
  mutate(
    author = str_replace(author, " Creator: ", ""),
    degree = str_replace(degree, " Degree Name: ", ""),
    year = str_replace(year, " Date: ", "")
  )

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(theses)
}

write.csv(theses, "history_theses.csv")
## part two, get the abstracts & pdf file links | |
# https://stackoverflow.com/questions/57786334/webscraping-from-list-of-urls-in-dataframe-in-r | |
#' Fetch the abstract text from a thesis page.
#'
#' @param url Character scalar holding the URL of a thesis page.
#' @return A character scalar: the text of every matched abstract paragraph
#'   joined with a single space; the string "null" when nothing matches the
#'   XPath; `NA_character_` when `url` is missing or empty, or when the page
#'   cannot be read (a warning is raised in that last case).
get_thesis_abstract <- function(url) {
  # nchar(NA_character_) is NA, which makes a bare `if` fail, so test for a
  # missing URL explicitly before testing for an empty one.
  if (is.na(url) || !nzchar(url)) {
    return(NA_character_)
  }

  # A single unreadable page should not abort the whole run.
  page <- tryCatch(
    read_html(url),
    error = function(e) {
      warning("Could not read ", url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(page)) {
    return(NA_character_)
  }

  paragraphs <- page %>%
    html_nodes(xpath = "/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/section[3]/div/div/p") %>%
    html_text()

  if (length(paragraphs) == 0) {
    return("null")
  }

  # The XPath can match several <p> nodes; keep them all rather than only the
  # first one, while still returning a single string.
  paste(paragraphs, collapse = " ")
}
#' Fetch the PDF link from a thesis page.
#'
#' @param url Character scalar holding the URL of a thesis page.
#' @return A character scalar: the `href` of the first matched link; the
#'   string "null" when nothing matches the XPath; `NA_character_` when `url`
#'   is missing or empty, or when the page cannot be read (a warning is raised
#'   in that last case).
get_thesis_pdf <- function(url) {
  # nchar(NA_character_) is NA, which makes a bare `if` fail, so test for a
  # missing URL explicitly before testing for an empty one.
  if (is.na(url) || !nzchar(url)) {
    return(NA_character_)
  }

  # A single unreadable page should not abort the whole run.
  page <- tryCatch(
    read_html(url),
    error = function(e) {
      warning("Could not read ", url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(page)) {
    return(NA_character_)
  }

  links <- page %>%
    html_nodes(xpath = "/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/div/div/div/a") %>%
    html_attr("href")

  if (length(links) == 0) {
    return("null")
  }

  # Several links may match; return exactly one string, the first match.
  links[[1]]
}
#' Fetch the subject text from a thesis page.
#'
#' @param url Character scalar holding the URL of a thesis page.
#' @return A character scalar: the text of the first matched node; the string
#'   "null" when nothing matches the XPath; `NA_character_` when `url` is
#'   missing or empty, or when the page cannot be read (a warning is raised in
#'   that last case).
get_thesis_subject <- function(url) {
  # nchar(NA_character_) is NA, which makes a bare `if` fail, so test for a
  # missing URL explicitly before testing for an empty one.
  if (is.na(url) || !nzchar(url)) {
    return(NA_character_)
  }

  # A single unreadable page should not abort the whole run.
  page <- tryCatch(
    read_html(url),
    error = function(e) {
      warning("Could not read ", url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(page)) {
    return(NA_character_)
  }

  subjects <- page %>%
    html_nodes(xpath = "/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/section[4]/div") %>%
    html_text()

  if (length(subjects) == 0) {
    return("null")
  }

  # Several nodes may match; return exactly one string, the first match.
  subjects[[1]]
}
# Look up the abstract, the PDF link and the subject text for every thesis URL.
# Each helper is applied to each URL in turn and yields one string per thesis.
thesis_abstract <- theses$thesis_url %>%
  as.character() %>%
  purrr::map_chr(get_thesis_abstract)

thesis_pdf <- theses$thesis_url %>%
  as.character() %>%
  purrr::map_chr(get_thesis_pdf)

thesis_subject <- theses$thesis_url %>%
  as.character() %>%
  purrr::map_chr(get_thesis_subject)

# Insert "; " wherever a lowercase letter is directly followed by an uppercase
# one (presumably where separate subject terms were run together).
thesis_subject <- gsub(
  pattern = "([a-z])([A-Z])",
  replacement = "\\1; \\2",
  x = thesis_subject
)

# Append the three new columns to the table.
theses <- cbind(theses, thesis_abstract, thesis_pdf, thesis_subject)

# part three, write to csv!
write.csv(x = theses, file = "history_theses.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment