Skip to content

Instantly share code, notes, and snippets.

@steveharoz
Created January 7, 2023 08:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save steveharoz/f4b87890aa971b6d279907c8beaabe1f to your computer and use it in GitHub Desktop.
Save steveharoz/f4b87890aa971b6d279907c8beaabe1f to your computer and use it in GitHub Desktop.
Extract IEEE citation numbers from a PDF
# Extract all citation numbers such as [1] from a PDF's text
# It also includes cases for multiples [1, 3] and ranges [1-5]
# It tries to exclude confidence intervals by skipping bracketed text that contains a decimal point or percent sign
#
# written by Steve Haroz with help from ChatGPT
# MIT license
library(tidyverse)
library(pdftools)
# Path to the PDF whose citations should be checked
filename <- "path/to/file.pdf"

# Read the PDF's text and join it into one newline-separated string
text <- pdftools::pdf_text(filename)
text <- paste(text, collapse = "\n")

# Find every bracketed citation: one or more ", "-separated entries, each
# either a number ("[7]") or a range written with a hyphen or en dash and
# optional surrounding whitespace ("[1-3, 7]", "[3 - 5]").
# Decimal points and percent signs are not part of the pattern, so brackets
# such as "[0.2, 7.4]" are never matched here.
# simplify = TRUE returns the matches as a character matrix.
citation_strings <- str_extract_all(
  text,
  "\\[\\d+(?:\\s*[-\u2013]\\s*\\d+)?(?:, \\d+(?:\\s*[-\u2013]\\s*\\d+)?)*\\]",
  simplify = TRUE
)
#' Convert one bracketed citation string into the citation numbers it names
#'
#' Handles single numbers ("[7]"), comma-separated lists ("[1, 3]"), ranges
#' written with a hyphen or en dash ("[3 - 5]"), and any mixture of these.
#' Whitespace (including line breaks) around commas and dashes is ignored.
#' A string containing a decimal point or percent sign is treated as a
#' confidence interval rather than a citation.
#'
#' Uses base R only, so the helper can be sourced and tested without loading
#' any package.
#'
#' @param citation_string A single string including its enclosing brackets,
#'   e.g. "[1, 3 - 5, 8]".
#' @return An integer vector of citation numbers in order of appearance, with
#'   ranges expanded (a descending range such as "5-3" expands to 5 4 3).
#'   Returns `NA_integer_` when the string looks like a confidence interval or
#'   has nothing between the brackets; an entry without any digits
#'   contributes a single `NA`.
extract_citation_numbers <- function(citation_string) {
  # Drop the enclosing brackets (the first and last characters)
  inner <- substr(citation_string, 2L, nchar(citation_string) - 1L)

  # A decimal point or percent sign marks a confidence interval
  if (grepl("[.%]", inner)) {
    return(NA_integer_)
  }

  # Split on commas, tolerating missing or repeated whitespace around them,
  # so "1,3" and "1,  3" are read the same way as "1, 3"
  entries <- trimws(strsplit(inner, "\\s*,\\s*")[[1]])
  if (length(entries) == 0L) {
    return(NA_integer_)
  }

  # Turn one entry ("8" or "3 - 5") into its integer(s)
  parse_entry <- function(entry) {
    numbers <- as.integer(regmatches(entry, gregexpr("[0-9]+", entry))[[1]])
    if (length(numbers) == 0L || anyNA(numbers)) {
      return(NA_integer_)
    }
    is_range <- grepl("[0-9]+\\s*[-\u2013]\\s*[0-9]+", entry)
    if (is_range) {
      # Expand from the first to the last number in the entry
      numbers[1]:numbers[length(numbers)]
    } else {
      numbers[1]
    }
  }

  unlist(lapply(entries, parse_entry), use.names = FALSE)
}
# Sanity-check the parser on a list containing a range and on a confidence
# interval
extract_citation_numbers("[1, 3 - 5, 8]")
# Output: 1 3 4 5 8
extract_citation_numbers("[0.2 - 7.4]")
# Output: NA

# Parse every bracketed string found in the PDF.
# lapply() always returns a list (sapply()'s return type depends on its
# input), and as.integer() turns the NULL that unlist() produces when there
# are no matches into integer(0), so the tibble below always has a
# citation_number column.
citation_numbers <- lapply(citation_strings, extract_citation_numbers) %>%
  unlist(use.names = FALSE) %>%
  as.integer()

# Count how often each citation number occurs, dropping the NA placeholders
# returned for confidence intervals
citation_numbers <- tibble(citation_number = citation_numbers) %>%
  filter(!is.na(citation_number)) %>%
  count(citation_number) %>%
  arrange(citation_number)
print(citation_numbers, n = 999)

# a citation should appear at least twice (the citation and the reference)
citation_numbers %>%
  filter(n < 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment