# shawngraham/retrieve-theses-metadata.R

## retrieve-theses-metadata.R
library(rvest)
library(dplyr)
library(xml2)
library(stringr)
library(purrr)

# Part one: walk the CURVE search results for History theses and collect one
# row per thesis (title, author, year, degree, thesis_url), then save it.

# Prefix glued onto every scraped thesis link (presumably the links in the
# search results are site-relative -- confirm against the page markup).
base_url <- "https://curve.carleton.ca"

# Search query for the History theses; only the page number is appended.
search_url <- paste0(
  "https://curve.carleton.ca/search?f%5B0%5D=parent_collection%253A41",
  "&f%5B1%5D=thesis_degree_discipline%253AHistory",
  "&f%5B2%5D=thesis_degree_discipline%3AHistory&page="
)

# There were 316 theses as of Jan 17 2022 in the CURVE repository. The loop
# requests page=0 through page=15, i.e. 16 result pages in total.
page_numbers <- 0:15

# One data frame per result page, combined once after the loop instead of
# re-copying a growing data frame on every iteration.
page_tables <- vector("list", length(page_numbers))

for (i in seq_along(page_numbers)) {
  page_result <- page_numbers[[i]]
  link <- paste0(search_url, page_result)
  page <- read_html(link)

  title <- page %>% html_nodes("h2.field-content") %>% html_text()
  thesis_link <- page %>%
    html_nodes("h2:nth-child(1) > a:nth-child(1)") %>%
    html_attr("href")
  author <- page %>%
    html_nodes("div.views-field-dcterms-creator-1") %>%
    html_text()
  degree <- page %>%
    html_nodes("div.views-field-thesis-degree-name") %>%
    html_text()
  year <- page %>%
    html_nodes("div.views-field-views-conditional-3") %>%
    html_text()

  # data.frame() stops with an error if the selectors above matched different
  # numbers of nodes on this page.
  page_tables[[i]] <- data.frame(
    title, author, year, degree,
    thesis_url = paste0(base_url, thesis_link),
    stringsAsFactors = FALSE
  )

  print(paste("Page:", page_result))
}

theses <- bind_rows(page_tables)

# Strip the field label from the front of each value. The patterns tolerate
# any amount of whitespace around the label, so they keep working if the
# site's padding is not exactly the number of spaces once hard-coded here.
theses <- theses %>%
  mutate(
    author = str_replace(author, "^\\s*Creator:\\s*", ""),
    degree = str_replace(degree, "^\\s*Degree Name:\\s*", ""),
    year = str_replace(year, "^\\s*Date:\\s*", "")
  )

# View() needs an interactive session; skip it when run via Rscript.
if (interactive()) {
  View(theses)
}
write.csv(theses, "history_theses.csv")

## part two, get the abstracts & pdf file links
# https://stackoverflow.com/questions/57786334/webscraping-from-list-of-urls-in-dataframe-in-r

# Fetch one thesis page and return the text of its abstract.
#
# url: character scalar, address of the thesis page. An empty string or NA
#      returns NA_character_ without any network access.
#
# Returns a character scalar: the text of the first node matched by the
# abstract XPath, or the literal string "null" when nothing matches.
# Errors raised by read_html() (e.g. network failures) are not caught.
get_thesis_abstract <- function(url) {
  # nchar(NA_character_) is NA, which would make a bare `if` fail, so NA is
  # tested explicitly before the empty-string check.
  if (is.na(url) || !nzchar(url)) {
    return(NA_character_)
  }

  page <- read_html(url)

  # Absolute XPath: tied to the exact page layout.
  abstract <- page %>%
    html_nodes(xpath = "/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/section[3]/div/div/p") %>%
    html_text()

  if (length(abstract) == 0) {
    return("null")
  }

  # Only the first match is kept, so the result is always length one.
  abstract[[1]]
}

# Fetch one thesis page and return the link to its PDF file.
#
# url: character scalar, address of the thesis page. An empty string or NA
#      returns NA_character_ without any network access.
#
# Returns a character scalar: the href attribute of the first <a> node
# matched by the XPath (NA if that node has no href), or the literal string
# "null" when nothing matches. The href is returned as found on the page.
# Errors raised by read_html() (e.g. network failures) are not caught.
get_thesis_pdf <- function(url) {
  # nchar(NA_character_) is NA, which would make a bare `if` fail, so NA is
  # tested explicitly before the empty-string check.
  if (is.na(url) || !nzchar(url)) {
    return(NA_character_)
  }

  page <- read_html(url)

  # Absolute XPath: tied to the exact page layout.
  t_pdf <- page %>%
    html_nodes(xpath = "/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/div/div/div/a") %>%
    html_attr("href")

  if (length(t_pdf) == 0) {
    return("null")
  }

  # Only the first match is kept, so the result is always length one.
  t_pdf[[1]]
}

# Fetch one thesis page and return the text of its subject section.
#
# url: character scalar, address of the thesis page. An empty string or NA
#      returns NA_character_ without any network access.
#
# Returns a character scalar: the text of the first node matched by the
# subject XPath, or the literal string "null" when nothing matches.
# Errors raised by read_html() (e.g. network failures) are not caught.
get_thesis_subject <- function(url) {
  # nchar(NA_character_) is NA, which would make a bare `if` fail, so NA is
  # tested explicitly before the empty-string check.
  if (is.na(url) || !nzchar(url)) {
    return(NA_character_)
  }

  page <- read_html(url)

  # Absolute XPath: tied to the exact page layout.
  t_subject <- page %>%
    html_nodes(xpath = "/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/section[4]/div") %>%
    html_text()

  if (length(t_subject) == 0) {
    return("null")
  }

  # Only the first match is kept, so the result is always length one.
  t_subject[[1]]
}


# Visit every thesis page and pull one value per thesis for each new column.
# Each helper downloads the page itself, so every URL is requested three times.
thesis_abstract <- as.character(theses$thesis_url) %>%
  purrr::map_chr(get_thesis_abstract)
thesis_pdf <- as.character(theses$thesis_url) %>%
  purrr::map_chr(get_thesis_pdf)
thesis_subject <- as.character(theses$thesis_url) %>%
  purrr::map_chr(get_thesis_subject)

# Insert "; " wherever a lower-case letter is directly followed by an
# upper-case one (presumably to separate subject terms that html_text() ran
# together -- confirm against a sample page).
thesis_subject <- gsub("([a-z])([A-Z])", "\\1; \\2", thesis_subject)

# Append the three new columns; each takes its name from the variable.
theses <- cbind(theses, thesis_abstract, thesis_pdf, thesis_subject)

# part three, write to csv!
# This overwrites the history_theses.csv saved at the end of part one.
write.csv(theses, "history_theses.csv")
	library(rvest)
	library(dplyr)
	library(xml2)
	library(stringr)
	library(purrr)

	# NOTE(review): this block repeats the listing scrape that already appears
	# earlier in this file; running the whole file performs the scrape twice.
	#
	# Part one: walk the CURVE search results for History theses and collect one
	# row per thesis (title, author, year, degree, thesis_url), then save it.

	# Prefix glued onto every scraped thesis link (presumably the links in the
	# search results are site-relative -- confirm against the page markup).
	base_url <- "https://curve.carleton.ca"

	# Search query for the History theses; only the page number is appended.
	search_url <- paste0(
	  "https://curve.carleton.ca/search?f%5B0%5D=parent_collection%253A41",
	  "&f%5B1%5D=thesis_degree_discipline%253AHistory",
	  "&f%5B2%5D=thesis_degree_discipline%3AHistory&page="
	)

	# There were 316 theses as of Jan 17 2022 in the CURVE repository. The loop
	# requests page=0 through page=15, i.e. 16 result pages in total.
	page_numbers <- 0:15

	# One data frame per result page, combined once after the loop instead of
	# re-copying a growing data frame on every iteration.
	page_tables <- vector("list", length(page_numbers))

	for (i in seq_along(page_numbers)) {
	  page_result <- page_numbers[[i]]
	  link <- paste0(search_url, page_result)
	  page <- read_html(link)

	  title <- page %>% html_nodes("h2.field-content") %>% html_text()
	  thesis_link <- page %>%
	    html_nodes("h2:nth-child(1) > a:nth-child(1)") %>%
	    html_attr("href")
	  author <- page %>%
	    html_nodes("div.views-field-dcterms-creator-1") %>%
	    html_text()
	  degree <- page %>%
	    html_nodes("div.views-field-thesis-degree-name") %>%
	    html_text()
	  year <- page %>%
	    html_nodes("div.views-field-views-conditional-3") %>%
	    html_text()

	  # data.frame() stops with an error if the selectors above matched
	  # different numbers of nodes on this page.
	  page_tables[[i]] <- data.frame(
	    title, author, year, degree,
	    thesis_url = paste0(base_url, thesis_link),
	    stringsAsFactors = FALSE
	  )

	  print(paste("Page:", page_result))
	}

	theses <- bind_rows(page_tables)

	# Strip the field label from the front of each value. The patterns tolerate
	# any amount of whitespace around the label, so they keep working whatever
	# padding the site puts around it.
	theses <- theses %>%
	  mutate(
	    author = str_replace(author, "^\\s*Creator:\\s*", ""),
	    degree = str_replace(degree, "^\\s*Degree Name:\\s*", ""),
	    year = str_replace(year, "^\\s*Date:\\s*", "")
	  )

	# View() needs an interactive session; skip it when run via Rscript.
	if (interactive()) {
	  View(theses)
	}
	write.csv(theses, "history_theses.csv")

	## part two, get the abstracts & pdf file links
	# https://stackoverflow.com/questions/57786334/webscraping-from-list-of-urls-in-dataframe-in-r

	# NOTE(review): a function with this name is also defined earlier in this
	# file; whichever definition is evaluated last is the one in effect.
	#
	# Fetch one thesis page and return the text of its abstract.
	#
	# url: character scalar, address of the thesis page. An empty string or NA
	#      returns NA_character_ without any network access.
	#
	# Returns a character scalar: the text of the first node matched by the
	# abstract XPath, or the literal string "null" when nothing matches.
	# Errors raised by read_html() (e.g. network failures) are not caught.
	get_thesis_abstract <- function(url) {
	  # nchar(NA_character_) is NA, which would make a bare `if` fail, so NA is
	  # tested explicitly before the empty-string check.
	  if (is.na(url) || !nzchar(url)) {
	    return(NA_character_)
	  }

	  page <- read_html(url)

	  # Absolute XPath: tied to the exact page layout.
	  abstract <- page %>%
	    html_nodes(xpath = "/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/section[3]/div/div/p") %>%
	    html_text()

	  if (length(abstract) == 0) {
	    return("null")
	  }

	  # Only the first match is kept, so the result is always length one.
	  abstract[[1]]
	}

	# NOTE(review): a function with this name is also defined earlier in this
	# file; whichever definition is evaluated last is the one in effect.
	#
	# Fetch one thesis page and return the link to its PDF file.
	#
	# url: character scalar, address of the thesis page. An empty string or NA
	#      returns NA_character_ without any network access.
	#
	# Returns a character scalar: the href attribute of the first <a> node
	# matched by the XPath (NA if that node has no href), or the literal string
	# "null" when nothing matches. The href is returned as found on the page.
	# Errors raised by read_html() (e.g. network failures) are not caught.
	get_thesis_pdf <- function(url) {
	  # nchar(NA_character_) is NA, which would make a bare `if` fail, so NA is
	  # tested explicitly before the empty-string check.
	  if (is.na(url) || !nzchar(url)) {
	    return(NA_character_)
	  }

	  page <- read_html(url)

	  # Absolute XPath: tied to the exact page layout.
	  t_pdf <- page %>%
	    html_nodes(xpath = "/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/div/div/div/a") %>%
	    html_attr("href")

	  if (length(t_pdf) == 0) {
	    return("null")
	  }

	  # Only the first match is kept, so the result is always length one.
	  t_pdf[[1]]
	}

	# NOTE(review): a function with this name is also defined earlier in this
	# file; whichever definition is evaluated last is the one in effect.
	#
	# Fetch one thesis page and return the text of its subject section.
	#
	# url: character scalar, address of the thesis page. An empty string or NA
	#      returns NA_character_ without any network access.
	#
	# Returns a character scalar: the text of the first node matched by the
	# subject XPath, or the literal string "null" when nothing matches.
	# Errors raised by read_html() (e.g. network failures) are not caught.
	get_thesis_subject <- function(url) {
	  # nchar(NA_character_) is NA, which would make a bare `if` fail, so NA is
	  # tested explicitly before the empty-string check.
	  if (is.na(url) || !nzchar(url)) {
	    return(NA_character_)
	  }

	  page <- read_html(url)

	  # Absolute XPath: tied to the exact page layout.
	  t_subject <- page %>%
	    html_nodes(xpath = "/html/body/div[2]/div/div[4]/div/div/div/div[2]/div/section/div/div/article/div/div/div/section[4]/div") %>%
	    html_text()

	  if (length(t_subject) == 0) {
	    return("null")
	  }

	  # Only the first match is kept, so the result is always length one.
	  t_subject[[1]]
	}




	# Visit every thesis page and pull one value per thesis for each new column.
	# Each helper downloads the page itself, so every URL is requested three
	# times.
	thesis_abstract <- as.character(theses$thesis_url) %>%
	  purrr::map_chr(get_thesis_abstract)
	thesis_pdf <- as.character(theses$thesis_url) %>%
	  purrr::map_chr(get_thesis_pdf)
	thesis_subject <- as.character(theses$thesis_url) %>%
	  purrr::map_chr(get_thesis_subject)

	# Insert "; " wherever a lower-case letter is directly followed by an
	# upper-case one (presumably to separate subject terms that html_text() ran
	# together -- confirm against a sample page).
	thesis_subject <- gsub("([a-z])([A-Z])", "\\1; \\2", thesis_subject)

	# Append the three new columns; each takes its name from the variable.
	theses <- cbind(theses, thesis_abstract, thesis_pdf, thesis_subject)

	# part three, write to csv!
	# This overwrites the history_theses.csv saved by the preceding write.csv().
	write.csv(theses, "history_theses.csv")