Last active
May 23, 2025 15:31
-
-
Save giannetti/752ff7760f633f7cbfd194a5a1212948 to your computer and use it in GitHub Desktop.
a script to join library catalog data to HathiTrust bibliographic data for the purposes of weeding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # a script to join RUL catalog data to HathiTrust bibliographic data for the purposes of weeding | |
| # libraries | |
| library(dplyr) | |
| library(stringr) | |
| library(tidyr) | |
| # use awk to filter hathi_full list for just pd and pdus | |
| # awk -F "\t" '{ if($3 ~ /pd|pdus/) { print >> "hathi_full_pd.txt" }}' hathi_full_20250401.txt | |
| # this splits the resulting text file every 250000 lines to make it more manageable | |
| # split -l 250000 hathi_full_pd.txt segment | |
| # use this to count rows in output file | |
| # wc -l file | |
# load library data
# readr is not attached by the library() calls above, so its functions are
# namespace-qualified here
alex_pclass <- readr::read_csv(
  "alex_alstackc_pclass_fewerthan5circs_pre1930pub_oclc.csv"
) %>%
  rename(oclc = `OCLC Control Number (035a)`)
# some don't have oclc numbers and will have to be manually checked later
alex_pclass_oclc <- alex_pclass %>%
  filter(!is.na(oclc))
# HTDL field names; ht_read() assigns them to the segment columns by position
# one small change from the HTDL: the column is called "oclc" instead of
# "oclc_num" so the join functions can use one shared column name
headers <- c(
  "htid", "access", "rights", "ht_bib_key", "description", "source",
  "source_bib_num", "oclc", "isbn", "issn", "lccn", "title", "imprint",
  "rights_reason_code", "rights_timestamp", "us_gov_doc_flag",
  "rights_date_used", "pub_place", "lang", "bib_fmt", "collection_code",
  "content_provider_code", "responsible_entity_code",
  "digitization_agent_code", "access_profile_code", "author"
)
# Read one headerless, tab-delimited file and label its columns.
#
# file_path:    path to the tab-delimited file to read
# column_names: character vector of names, one per column in the file
#
# Returns a tibble with every column read as character and named after
# `column_names`. Stops with an error if the column counts do not agree.
ht_read <- function(file_path, column_names) {
  # readr is not attached by this script's library() calls, so qualify it
  data <- readr::read_delim(
    file_path,
    delim = "\t",
    col_names = FALSE,
    col_types = readr::cols(.default = "c")
  )
  # fail with a readable message instead of a generic names<- error
  if (ncol(data) != length(column_names)) {
    stop(
      "expected ", length(column_names), " columns in ", file_path,
      " but found ", ncol(data),
      call. = FALSE
    )
  }
  colnames(data) <- column_names
  data
}
# read all the HT segment datasets in as a named list of dataframes and
# explode into individual dfs (htseg01, htseg02, ...) in the global environment
files <- Sys.glob("segment*")
if (length(files) == 0) {
  stop("no files matching 'segment*' found in ", getwd(), call. = FALSE)
}
file_list <- lapply(files, ht_read, column_names = headers)
# seq_along() rather than 1:length() so the names always match the list length
names(file_list) <- sprintf("htseg%02d", seq_along(file_list))
list2env(file_list, envir = .GlobalEnv)
# separate oclc values, if more than one, and re-type as numeric
#
# data:        data frame holding the column to split
# column_name: name (string) of the column with comma-separated values
#
# Returns `data` with `oclc` set to the first value as numeric and `oclc2` set
# to the second value as character; NA where a value is absent. Values after
# the second are dropped.
oclc_split <- function(data, column_name) {
  split_values <- strsplit(as.character(data[[column_name]]), ",")
  # indexing past the end of a character vector yields NA, so no length test
  # is needed; vapply() keeps both results character even when every second
  # value is missing or the input has zero rows
  first_value <- vapply(split_values, function(x) x[1], character(1))
  second_value <- vapply(split_values, function(x) x[2], character(1))
  data$oclc <- as.numeric(first_value)
  data$oclc2 <- second_value
  data
}
# split oclc column in each df
# iterate over the segments that were actually loaded instead of a fixed 1:31
for (df_name in names(file_list)) {
  segment <- get(df_name, envir = .GlobalEnv)
  assign(df_name, oclc_split(segment, "oclc"), envir = .GlobalEnv)
}
# join with alex p class data
#
# dataset1:      data frame whose rows are filtered
# dataset2:      data frame supplying the keys to keep
# shared_column: name (string) of the key column present in both
#
# Returns the rows of dataset1 that have a match in dataset2; no columns from
# dataset2 are added.
oclc_compare <- function(dataset1, dataset2, shared_column) {
  dataset1 %>%
    semi_join(dataset2, by = shared_column)
}
# apply oclc_compare to each df
# each htsegNN is overwritten with the alex rows whose oclc occurs in that
# segment; iterate over the loaded segments instead of a fixed 1:31
for (df_name in names(file_list)) {
  segment <- get(df_name, envir = .GlobalEnv)
  matched <- oclc_compare(alex_pclass_oclc, segment, "oclc")
  assign(df_name, matched, envir = .GlobalEnv)
}
# roll the results into one dataframe
# collect every loaded segment's result in one call instead of growing a list
# over a fixed 1:31
data_list <- unname(mget(names(file_list), envir = .GlobalEnv))
# an alex row whose oclc occurs in several segments is returned once per
# segment, so drop the exact duplicates that segmentation introduces
combined_data <- bind_rows(data_list) %>%
  distinct()
# readr is not attached by this script's library() calls, so qualify it
readr::write_csv(combined_data, "alex_pclass_in_htdl_fullview.csv")
# check to see if there are duplicated print holdings in RUL
# draw everything in p class that is not in Alex, dropping low use filters b/c
# not important
# readr is not attached by this script's library() calls, so qualify it
notalex_pclass <- readr::read_csv("NOTalex_pclass_pre1930.csv") %>%
  rename(oclc = `OCLC Control Number (035a)`) %>%
  filter(!is.na(oclc))
# trying this differently to retain fields from HTDL
#
# dataset1:      left-hand data frame
# dataset2:      right-hand data frame; the joined result must contain `htid`
# shared_column: name (string) of the key column present in both
#
# Returns the inner join of the two datasets, keeping one arbitrary match per
# dataset1 row (multiple = "any"), plus a `uri` column built from `htid`.
oclc_match <- function(dataset1, dataset2, shared_column) {
  joined <- inner_join(
    dataset1,
    dataset2,
    by = shared_column,
    multiple = "any"
  )
  # paste0() has no `sep` argument; the former `sep = ""` was only pasted on
  # as an extra empty string, so dropping it leaves the uri unchanged
  joined %>%
    mutate(uri = paste0("https://hdl.handle.net/2027/", htid))
}
# rows of combined_data that share an oclc with the non-Alex p class holdings,
# keeping one arbitrary non-Alex match per row
print_overlap <- inner_join(
  combined_data,
  notalex_pclass,
  by = "oclc",
  multiple = "any"
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment