Created
January 7, 2023 08:07
-
-
Save steveharoz/f4b87890aa971b6d279907c8beaabe1f to your computer and use it in GitHub Desktop.
Extract IEEE citation numbers from a PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract all citation numbers such as [1] from a PDF's text
# It also includes cases for multiples [1, 3] and ranges [1-5]
# It tries to exclude confidence intervals by skipping bracketed text that
# contains a decimal point or a percent sign
#
# written by Steve Haroz with help from ChatGPT
# MIT license
library(tidyverse)
library(pdftools)

# Path of the PDF to scan; edit this before running the script.
filename <- "path/to/file.pdf"

# get text: pdf_text() returns one string per page, so the pages are joined
# into a single string that one regex pass can cover.
text <- pdftools::pdf_text(filename)
text <- paste(text, collapse = "\n")

# Extract the citation strings using regular expressions.
# One item is a number, or a range of two numbers joined by a hyphen or an
# en dash with optional spaces around it (5, 3-5, 3 - 5).
citation_item <- "\\d+(?:\\s*[-\u2013]\\s*\\d+)?"
# A citation is a bracketed list of such items separated by ", ", so ranges
# may appear anywhere in the list: [1], [1, 3], [1-5], [1, 3 - 5, 8].
# A decimal point or percent sign can never match, so bracketed values such
# as [0.2 - 7.4] are not extracted.
citation_pattern <- paste0(
  "\\[", citation_item, "(?:, ", citation_item, ")*\\]"
)

# Whitespace runs (including line breaks) are squished to single spaces for
# matching only, so a citation wrapped across lines ("[1,\n3]") still uses
# the ", " separator the pattern expects. `text` itself is left unchanged.
# With a single input string, `simplify = TRUE` yields a one-row character
# matrix holding every match.
citation_strings <- str_extract_all(
  str_squish(text),
  citation_pattern,
  simplify = TRUE
)
#' Convert one bracketed citation string to the citation numbers it names
#'
#' Written with base R only, so the parser can be sourced and tested without
#' any package attached.
#'
#' @param citation_string A single string including its surrounding brackets,
#'   e.g. "[1, 3 - 5, 8]".
#' @return An integer vector with every range expanded ("3 - 5" becomes
#'   3, 4, 5). A single `NA` is returned when the input is `NA`, is empty
#'   between the brackets, or contains a decimal point or percent sign
#'   (treated as a confidence interval rather than a citation). An entry
#'   without any digits contributes one `NA`.
extract_citation_numbers <- function(citation_string) {
  stopifnot(length(citation_string) == 1L)
  if (is.na(citation_string)) {
    return(NA_integer_)
  }

  # Drop the brackets (first and last character)
  inner <- substr(citation_string, 2L, nchar(citation_string) - 1L)

  # Detect confidence interval (a decimal point or percent sign)
  if (grepl("[.%]", inner)) {
    return(NA_integer_)
  }

  # Split on bare commas and trim each entry, so "1,3", "1, 3" and "1,  3"
  # all produce the same entries.
  entries <- trimws(strsplit(inner, ",", fixed = TRUE)[[1]])
  if (length(entries) == 0L) {
    return(NA_integer_)
  }

  # Two numbers joined by a hyphen or en dash, optionally padded with spaces.
  range_pattern <- "([0-9]+)[[:space:]]*[-\u2013][[:space:]]*([0-9]+)"

  # Turn one entry into its numbers: a range expands, anything else yields
  # its first run of digits.
  parse_entry <- function(entry) {
    bounds <- regmatches(entry, regexec(range_pattern, entry))[[1]]
    if (length(bounds) == 3L) {
      start <- as.integer(bounds[2])
      end <- as.integer(bounds[3])
      # as.integer() gives NA for values too large for an integer; `:` would
      # then raise an error, so report NA instead.
      if (anyNA(c(start, end))) {
        return(NA_integer_)
      }
      return(start:end)
    }
    digits <- regmatches(entry, regexpr("[0-9]+", entry))
    if (length(digits) == 0L) NA_integer_ else as.integer(digits)
  }

  # Parse all entries at once instead of growing a vector inside a loop.
  unlist(lapply(entries, parse_entry), use.names = FALSE)
}
# Test the function
# The calls are wrapped in print() so the results are shown even when the
# script is run with source(), which does not auto-print top-level values.
print(extract_citation_numbers("[1, 3 - 5, 8]"))
# Output: 1 3 4 5 8
print(extract_citation_numbers("[0.2 - 7.4]"))
# Output: NA
# Parse every extracted bracket string and pool the numbers into one vector.
# lapply() + unlist() + as.integer() always yields an integer vector, even
# when nothing was extracted (integer(0)); sapply() would give NULL in that
# case, the tibble below would have no column, and filter() would fail.
all_citation_numbers <- as.integer(
  unlist(lapply(citation_strings, extract_citation_numbers), use.names = FALSE)
)

# Count how often each citation number occurs, ignoring the NA placeholders
# returned for skipped strings.
citation_numbers <- tibble(citation_number = all_citation_numbers) %>%
  filter(!is.na(citation_number)) %>%
  count(citation_number) %>%
  arrange(citation_number)
print(citation_numbers, n = 999)

# a citation should appear at least twice (the citation and the reference)
# Printed explicitly, with the same row limit as above, so the result is
# visible under source() and is not truncated to the first ten rows.
citation_numbers %>%
  filter(n < 2) %>%
  print(n = 999)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment