Skip to content

Instantly share code, notes, and snippets.

@giannetti
Last active May 23, 2025 15:31
Show Gist options
  • Select an option

  • Save giannetti/752ff7760f633f7cbfd194a5a1212948 to your computer and use it in GitHub Desktop.

Select an option

Save giannetti/752ff7760f633f7cbfd194a5a1212948 to your computer and use it in GitHub Desktop.
A script to join library catalog data to HathiTrust bibliographic data for the purpose of weeding library collections
# a script to join RUL catalog data to HathiTrust bibliographic data for the purposes of weeding
# libraries
library(dplyr)
library(readr)
library(stringr)
library(tidyr)
# use awk to filter hathi_full list for just pd and pdus
# awk -F "\t" '{ if($3 ~ /pd|pdus/) { print >> "hathi_full_pd.txt" }}' hathi_full_20250401.txt
# this splits the resulting text file every 250000 lines to make it more manageable
# split -l 250000 hathi_full_pd.txt segment
# use this to count rows in output file
# wc -l file
# Load the RUL catalog export and normalize the OCLC column name so later
# joins against the HathiTrust data can use a bare "oclc" key.
alex_pclass <- read_csv("alex_alstackc_pclass_fewerthan5circs_pre1930pub_oclc.csv") %>%
  rename(oclc = `OCLC Control Number (035a)`)
# Records without an OCLC number cannot be matched here and will have to be
# checked manually later.
alex_pclass_oclc <- filter(alex_pclass, !is.na(oclc))
# HTDL hathifile field names. One small change from the HTDL spec: column 8
# is called "oclc" instead of "oclc_num" so the join functions below need no
# `by` mapping.
headers <- c(
  "htid", "access", "rights", "ht_bib_key", "description", "source",
  "source_bib_num", "oclc", "isbn", "issn", "lccn", "title", "imprint",
  "rights_reason_code", "rights_timestamp", "us_gov_doc_flag",
  "rights_date_used", "pub_place", "lang", "bib_fmt", "collection_code",
  "content_provider_code", "responsible_entity_code",
  "digitization_agent_code", "access_profile_code", "author"
)
# Read one tab-delimited HathiTrust segment file, forcing every column to
# character (OCLC/ISBN fields can hold multiple comma-separated values), and
# attach the supplied header names.
#
# file_path: path to a headerless tab-delimited segment file
# column_names: character vector of names to assign to the columns
# Returns the segment as a data frame with named columns.
ht_read <- function(file_path, column_names) {
  segment <- read_delim(
    file_path,
    col_names = FALSE,
    delim = "\t",
    col_types = cols(.default = "c")
  )
  colnames(segment) <- column_names
  segment
}
# Read every HT segment file produced by `split` into a named list of data
# frames, then explode them into individual data frames in the global
# environment (htseg01, htseg02, ...).
files <- Sys.glob("segment*")
file_list <- lapply(files, function(file) ht_read(file, headers))
# seq_along() is safe when the glob matches nothing; 1:length(file_list)
# would yield c(1, 0) and mis-name the (empty) list
names(file_list) <- sprintf("htseg%02d", seq_along(file_list))
list2env(file_list, envir = .GlobalEnv)
# Split multi-valued (comma-separated) OCLC strings into a primary numeric
# `oclc` column plus a character `oclc2` column holding the second value.
#
# data: a data frame containing `column_name`
# column_name: name of the column holding comma-separated OCLC numbers
# Returns `data` with `oclc` overwritten as numeric (first value, or NA when
# the cell is empty/NA) and a new `oclc2` character column (second value, or
# NA when there is only one).
oclc_split <- function(data, column_name) {
  split_values <- strsplit(as.character(data[[column_name]]), ",")
  # vapply instead of sapply: the result type stays character(0) even when
  # the input has zero rows, instead of silently becoming a list
  first_val <- vapply(split_values, function(x) {
    if (length(x) >= 1) x[1] else NA_character_
  }, character(1))
  data$oclc <- as.numeric(first_val)
  data$oclc2 <- vapply(split_values, function(x) {
    if (length(x) >= 2) x[2] else NA_character_
  }, character(1))
  data
}
# Split the oclc column in every segment data frame. Deriving the names from
# the global environment avoids hard-coding the segment count (31), which
# silently breaks if `split` produced a different number of files.
for (df_name in ls(pattern = "^htseg\\d{2}$", envir = .GlobalEnv)) {
  assign(df_name,
         oclc_split(get(df_name, envir = .GlobalEnv), "oclc"),
         envir = .GlobalEnv)
}
# Filtering semi-join: return the rows of dataset1 whose value in
# shared_column also occurs in dataset2. No columns from dataset2 are kept.
#
# dataset1: data frame to filter (the catalog data)
# dataset2: data frame providing the match keys (an HT segment)
# shared_column: character name of the join key
oclc_compare <- function(dataset1, dataset2, shared_column) {
  dataset1 %>% semi_join(dataset2, by = shared_column)
}
# Replace each segment data frame with the catalog rows whose OCLC number
# appears in that segment. Segment names are derived from the global
# environment rather than hard-coding the count (31), which silently breaks
# if `split` produced a different number of files.
for (df_name in ls(pattern = "^htseg\\d{2}$", envir = .GlobalEnv)) {
  matched <- oclc_compare(alex_pclass_oclc,
                          get(df_name, envir = .GlobalEnv),
                          "oclc")
  assign(df_name, matched, envir = .GlobalEnv)
}
# Roll the per-segment match results into one data frame and save it.
# mget() fetches all segment data frames at once, replacing the manual
# get()-in-a-loop and the hard-coded segment count (31).
seg_names <- ls(pattern = "^htseg\\d{2}$", envir = .GlobalEnv)
combined_data <- bind_rows(mget(seg_names, envir = .GlobalEnv))
write_csv(combined_data, "alex_pclass_in_htdl_fullview.csv")
# Check for duplicated print holdings in RUL: pull everything in P class
# that is NOT in Alexander Library, dropping the low-use filters because
# they are not important for this question. Rows without an OCLC number
# cannot be matched and are dropped.
notalex_pclass <- read_csv("NOTalex_pclass_pre1930.csv") %>%
  rename(oclc = `OCLC Control Number (035a)`) %>%
  filter(!is.na(oclc))
# Join catalog rows to HTDL rows on a shared key, retaining the HTDL fields,
# and build a handle.net URI for each matched HathiTrust volume.
# `multiple = "any"` keeps one arbitrary HT match per catalog row.
#
# dataset1, dataset2: data frames to inner-join
# shared_column: character name of the join key
# Returns the joined data frame with an added `uri` column.
oclc_match <- function(dataset1, dataset2, shared_column) {
  inner_join(dataset1, dataset2, by = shared_column, multiple = "any") %>%
    # paste0() takes no `sep` argument; the previous `sep = ""` was silently
    # absorbed into `...` as an extra (empty) pasted string
    mutate(uri = paste0("https://hdl.handle.net/2027/", htid))
}
# Titles held both in the HT full-view matches and elsewhere in P class,
# i.e. candidate duplicated print holdings. One arbitrary match per row.
print_overlap <- inner_join(
  combined_data,
  notalex_pclass,
  by = "oclc",
  multiple = "any"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment