Skip to content

Instantly share code, notes, and snippets.

@MichaelChirico
Created July 7, 2023 16:15
Show Gist options
  • Save MichaelChirico/ad3785afce84128683f3e12c05866196 to your computer and use it in GitHub Desktop.
Save MichaelChirico/ad3785afce84128683f3e12c05866196 to your computer and use it in GitHub Desktop.
Citations count in CA Math Framework
# As downloaded 2023-07-06
library(data.table)
library(xml2)
# A .docx file is "just" a zip archive; extract it into the session temp directory
unzip("mathfwappendixbsbe.docx", exdir=tempdir())
# The script reads word/document.xml from the extracted archive
doc_xml = file.path(tempdir(), "word", "document.xml") |> read_xml()
# XPath chosen by inspection: collect every <w:p> node
paragraphs = doc_xml |> xml_find_all("//w:p")
paragraphs_text = xml_text(paragraphs)
# Positions of paragraphs whose entire text is 'Chapter <n>'. Per the original
# note, this excludes the ToC copies, which have 'PAGEREF' tags.
chapter_markers = which(grepl("^Chapter [0-9]+$", paragraphs_text))
# One row per paragraph; 'chapter' is the number of markers at or before that
# row, so it is 0 for everything ahead of the first marker
citations = data.table(text = paragraphs_text)
citations[, chapter := findInterval(seq_len(.N), chapter_markers)]
# Keep rows that fall under some chapter, minus the chapter/appendix heading rows
citations = citations[
  chapter > 0
  & !(grepl("^Chapter [0-9]+$", text) | grepl("^Appendix [A-Z]+$", text))
]
# From here on the parsing is heuristic: the citations are not formatted consistently
citations[, author_string := {
  # Split each entry at year-like tokens. Three alternatives are recognised:
  #   - whitespace, 19xx/20xx, optional lowercase letter, then '.', ',', ' ' or end
  #   - a parenthesised 19xx/20xx
  #   - 'n.d.' ("no date"), optionally followed by a letter and a period
  # Most entries look like '$AUTHOR. $YEAR. $REST.', but '$YEAR,' and '($YEAR)' also occur.
  year_pattern = R"{\s+(19|20)[0-9]{2}[a-z]?([., ]|$)|\((19|20)[0-9]{2}\)|n\.d\.([a-z]\.)?}"
  pieces = strsplit(text, year_pattern)
  # The author portion is whatever comes before the first split point
  lead = vapply(pieces, \(piece) piece[1L], character(1L))
  lead = trimws(lead)
  # A single piece with no 4-digit number at the end of the entry means no year
  # was found, so there is nothing reliable to call the author
  unsplit = lengths(pieces) == 1L
  ends_in_year = grepl("[0-9]{4}[.]?$", text)
  lead[unsplit & !ends_in_year] = NA_character_
  # Strip ', <Month>' fragments (with optional trailing comma) and a final period
  month_or_period = sprintf(", (%s),?|\\.$", paste(month.name, collapse = "|"))
  gsub(month_or_period, "", lead)
}]
# Mostly, authors appear like '$AUTHOR1_LAST, $AUTHOR1_FIRST, $AUTHOR2, ... and $AUTHORn'
# Builds a list column holding one character vector of author names per citation.
citations[, author_list := {
  # Drop the connective ' and ', then break the string apart at commas.
  # Each piece is trimmed because splitting on ',' keeps the space that followed
  # the comma; without trimming, rebuilt names start with stray whitespace and
  # tallies depend on how the source text happened to be spaced.
  author_names = lapply(
    strsplit(gsub(" and ", " ", author_string), ",", fixed=TRUE),
    trimws
  )
  # With more than one piece, the first two are taken to be 'Last', 'First' of
  # the lead author: swap them into 'First Last' and keep the rest as they are
  multiname = lengths(author_names) > 1L
  author_names[multiname] =
    lapply(author_names[multiname], \(x) c(paste(x[2L], x[1L]), tail(x, -2L)))
  author_names
}]
# Count appearances per author (names cut to their first 30 characters),
# printed in increasing order of frequency
citations[, unlist(author_list) |> substring(1, 30) |> table() |> sort()]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment