@shawngraham
Created January 17, 2022 17:49
Script to generate a table of metadata concerning theses in the CURVE repository; CURVE is changing soon, so this will become dated.
library(rvest)
library(dplyr)
library(xml2)
library(stringr)
library(purrr)
base_url <- "https://curve.carleton.ca"
theses <- data.frame()

# There are 316 theses as of Jan 17, 2022, in the CURVE repository, spread over
# result pages 0 through 15 (an optional sketch for discovering the page count
# automatically follows the first write.csv below).
for (page_result in seq(from = 0, to = 15, by = 1)) {
  link <- paste("https://curve.carleton.ca/search?f%5B0%5D=parent_collection%253A41&f%5B1%5D=thesis_degree_discipline%253AHistory&f%5B2%5D=thesis_degree_discipline%3AHistory&page=", page_result, sep = '')
  page <- read_html(link)

  # pull each field from the search-result listing
  title       <- page %>% html_nodes("h2.field-content") %>% html_text()
  thesis_link <- page %>% html_nodes("h2:nth-child(1) > a:nth-child(1)") %>% html_attr("href")
  author      <- page %>% html_nodes("div.views-field-dcterms-creator-1") %>% html_text()
  degree      <- page %>% html_nodes("div.views-field-thesis-degree-name") %>% html_text()
  year        <- page %>% html_nodes("div.views-field-views-conditional-3") %>% html_text()

  theses <- rbind(theses,
                  data.frame(title, author, year, degree,
                             thesis_url = paste(base_url, thesis_link, sep = ""),
                             stringsAsFactors = FALSE))
  print(paste("Page:", page_result))
}
# strip the field labels that ride along with the scraped text
theses <- theses %>%
  mutate_at("author", str_replace, " Creator: ", "") %>%
  mutate_at("degree", str_replace, " Degree Name: ", "") %>%
  mutate_at("year", str_replace, " Date: ", "")

View(theses)
write.csv(theses, "history_theses.csv")
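## Optional, not part of the original gist: the loop above hard-codes the page
## count, and the collection will keep growing. One way to discover the last page
## is to request pages until one returns no result titles. This is a sketch only;
## it assumes an out-of-range page simply yields zero "h2.field-content" nodes.
count_result_pages <- function(search_url) {
  page_result <- 0
  repeat {
    titles <- read_html(paste0(search_url, "&page=", page_result)) %>%
      html_nodes("h2.field-content") %>%
      html_text()
    if (length(titles) == 0) break  # an empty page means we have run past the end
    page_result <- page_result + 1
  }
  page_result  # number of non-empty pages; loop over seq(0, page_result - 1)
}
# e.g. count_result_pages("https://curve.carleton.ca/search?f%5B0%5D=parent_collection%253A41&f%5B1%5D=thesis_degree_discipline%253AHistory&f%5B2%5D=thesis_degree_discipline%3AHistory")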
## part two, get the abstracts & pdf file links
# https://stackoverflow.com/questions/57786334/webscraping-from-list-of-urls-in-dataframe-in-r
get_thesis_abstract <- function(url) {
  if (!nchar(url)) return(NA_character_)
  page <- url %>%
    read_html()
  ## try to read the abstract paragraph (full XPath taken from a thesis record page)
  abstract <- page %>%
    html_nodes(xpath = '/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/section[3]/div/div/p') %>%
    html_text()
  if (!length(abstract)) {
    abstract <- "null"  # keep a placeholder when a record has no abstract
  }
  ifelse(length(abstract), abstract, NA_character_)
}
get_thesis_pdf <- function(url) {
  if (!nchar(url)) return(NA_character_)
  page <- url %>%
    read_html()
  ## try to read the link to the pdf of the thesis
  t_pdf <- page %>%
    html_nodes(xpath = '/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/div/div/div/a') %>%
    html_attr("href")
  if (!length(t_pdf)) {
    t_pdf <- "null"  # keep a placeholder when no pdf link is found
  }
  ifelse(length(t_pdf), t_pdf, NA_character_)
}
get_thesis_subject <- function(url) {
  if (!nchar(url)) return(NA_character_)
  page <- url %>%
    read_html()
  ## try to read the subject keywords
  t_subject <- page %>%
    html_nodes(xpath = '/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/section[4]/div') %>%
    html_text()
  if (!length(t_subject)) {
    t_subject <- "null"  # keep a placeholder when no subjects are listed
  }
  ifelse(length(t_subject), t_subject, NA_character_)
}
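## Optional, not part of the original gist: the three helpers above differ only in
## their XPath and in whether they read text or an href attribute, so they could be
## collapsed into one function. A sketch under that assumption:
scrape_thesis_field <- function(url, xpath, attr = NULL) {
  if (!nchar(url)) return(NA_character_)
  nodes <- read_html(url) %>% html_nodes(xpath = xpath)
  value <- if (is.null(attr)) html_text(nodes) else html_attr(nodes, attr)
  if (!length(value)) "null" else value[1]
}
# e.g. scrape_thesis_field(theses$thesis_url[1],
#        xpath = '/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/div/div/div/a',
#        attr = "href")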
# visit each thesis page and pull the abstract, pdf link, and subject keywords
# (each map_chr makes one request per thesis, so this takes a while)
thesis_abstract <- purrr::map_chr(as.character(theses$thesis_url), get_thesis_abstract)
thesis_pdf <- purrr::map_chr(as.character(theses$thesis_url), get_thesis_pdf)
thesis_subject <- purrr::map_chr(as.character(theses$thesis_url), get_thesis_subject)

# subjects come back run together; insert "; " wherever a lowercase letter is
# immediately followed by an uppercase one, i.e. between adjacent subject terms
thesis_subject <- gsub("([a-z])([A-Z])", "\\1; \\2", thesis_subject)

theses <- cbind(theses, thesis_abstract)
theses <- cbind(theses, thesis_pdf)
theses <- cbind(theses, thesis_subject)
# part three, write to csv!
write.csv(theses, "history_theses.csv")
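## Optional, not part of the original gist: a quick sanity check that the table
## reads back cleanly, counting theses per year (assumes the year column cleaned
## up as expected; the variable name is illustrative).
check <- read.csv("history_theses.csv", stringsAsFactors = FALSE)
check %>% count(year, sort = TRUE)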