# steveharoz/extract citation numbers.R

## extract citation numbers.R
# Extract all citation numbers such as [1] from a PDF's text
# It also includes cases for multiples [1, 3] and ranges [1-5]
# It tries to exclude confidence intervals by skipping any bracketed text that contains a decimal point or a percent sign
#
# written by Steve Haroz with help from ChatGPT
# MIT license

library(tidyverse)
library(pdftools)

# Path of the PDF to scan; change this before running the script
filename <- "path/to/file.pdf"

# get text: pdf_text() gives a character vector (one element per page),
# which is joined into a single string so the whole document is searched
text <- pdftools::pdf_text(filename)
text <- paste(text, collapse = "\n")

# Extract the citation strings using a regular expression.
# Matches [1], [1, 3], [1-5] and mixed lists such as [1, 3 - 5, 8]: every
# comma-separated entry is a number or a range, and a range may use a hyphen
# or an en dash with optional whitespace around it.
citation_strings <- str_extract_all(
  text,
  "\\[\\d+(?:\\s*[-–]\\s*\\d+)?(?:, \\d+(?:\\s*[-–]\\s*\\d+)?)*\\]",
  simplify = TRUE
)

# Convert one bracketed citation string to the citation numbers it refers to.
#
# Arguments:
#   citation_string: a single string including its surrounding brackets,
#                    e.g. "[1, 3 - 5, 8]".
#
# Returns:
#   An integer vector with every range expanded (e.g. 1 3 4 5 8).
#   A single NA is returned when the input is missing or looks like a
#   confidence interval (it contains a decimal point or a percent sign).
#   An entry that cannot be parsed contributes an NA.
extract_citation_numbers <- function(citation_string) {
  stopifnot(length(citation_string) == 1L)

  # A missing value cannot be parsed (and `if (NA)` would raise an error)
  if (is.na(citation_string)) {
    return(NA_integer_)
  }

  # Drop the brackets
  inner <- stringr::str_sub(citation_string, 2, -2)

  # Detect confidence interval (a decimal point or percent sign)
  if (stringr::str_detect(inner, "[\\.%]")) {
    return(NA_integer_)
  }

  # Split on commas with any surrounding whitespace, so "1,3-5" is separated
  # the same way as "1, 3-5" instead of being read as the range 1-5
  entries <- stringr::str_split(inner, "\\s*,\\s*")[[1]]
  entries <- stringr::str_trim(entries)

  # Turn a single entry into its numbers: "3 - 5" -> 3 4 5, "8" -> 8
  expand_entry <- function(entry) {
    is_range <- stringr::str_detect(entry, "\\d+\\s*[-–]\\s*\\d+")
    if (!is_range) {
      return(as.integer(stringr::str_extract(entry, "\\d+")))
    }

    start <- as.integer(stringr::str_extract(entry, "^\\d+"))
    end <- as.integer(stringr::str_extract(entry, "\\d+$"))

    # `NA:5` is an error, so report an unparseable range as NA instead
    if (is.na(start) || is.na(end)) {
      return(NA_integer_)
    }
    start:end
  }

  # Build the result in one step rather than growing a vector in a loop
  unlist(lapply(entries, expand_entry), use.names = FALSE)
}

# Test the function
extract_citation_numbers("[1, 3 - 5, 8]")
# Output: 1 3 4 5 8
extract_citation_numbers("[0.2 - 7.4]")
# Output: NA

# Parse every extracted string. lapply() always returns a list, whereas
# sapply() simplifies equal-length results into a matrix, which the
# filter() step below cannot handle.
citation_numbers <- lapply(citation_strings, extract_citation_numbers) %>%
  unlist(use.names = FALSE)

# unlist() gives NULL when nothing was extracted; use an empty integer
# vector so the tibble below still has a citation_number column
if (is.null(citation_numbers)) {
  citation_numbers <- integer(0)
}

# Count how often each citation number appears, dropping the NA results
citation_numbers <- tibble(citation_number = citation_numbers) %>%
  filter(!is.na(citation_number)) %>%
  count(citation_number) %>%
  arrange(citation_number)

print(citation_numbers, n = 999)

# a citation should appear at least twice (the citation and the reference),
# so list the numbers that were found only once
citation_numbers %>%
  filter(n < 2)
	# Extract all citation numbers such as [1] from a PDF's text
	# It also includes cases for multiples [1, 3] and ranges [1-5]
	# It tries to exclude confidence intervals by skipping any bracketed text that contains a decimal point or a percent sign
	#
	# written by Steve Haroz with help from ChatGPT
	# MIT license

	library(tidyverse)
	library(pdftools)

	# Path of the PDF to scan; change this before running the script
	filename <- "path/to/file.pdf"

	# get text: pdf_text() gives a character vector (one element per page),
	# which is joined into a single string so the whole document is searched
	text <- pdftools::pdf_text(filename)
	text <- paste(text, collapse = "\n")

	# Extract the citation strings using a regular expression.
	# Matches [1], [1, 3], [1-5] and mixed lists such as [1, 3 - 5, 8]: every
	# comma-separated entry is a number or a range, and a range may use a hyphen
	# or an en dash with optional whitespace around it.
	citation_strings <- str_extract_all(
	  text,
	  "\\[\\d+(?:\\s*[-–]\\s*\\d+)?(?:, \\d+(?:\\s*[-–]\\s*\\d+)?)*\\]",
	  simplify = TRUE
	)

	# Convert one bracketed citation string to the citation numbers it refers to.
	#
	# Arguments:
	#   citation_string: a single string including its surrounding brackets,
	#                    e.g. "[1, 3 - 5, 8]".
	#
	# Returns:
	#   An integer vector with every range expanded (e.g. 1 3 4 5 8).
	#   A single NA is returned when the input is missing or looks like a
	#   confidence interval (it contains a decimal point or a percent sign).
	#   An entry that cannot be parsed contributes an NA.
	extract_citation_numbers <- function(citation_string) {
	  stopifnot(length(citation_string) == 1L)

	  # A missing value cannot be parsed (and `if (NA)` would raise an error)
	  if (is.na(citation_string)) {
	    return(NA_integer_)
	  }

	  # Drop the brackets
	  inner <- stringr::str_sub(citation_string, 2, -2)

	  # Detect confidence interval (a decimal point or percent sign)
	  if (stringr::str_detect(inner, "[\\.%]")) {
	    return(NA_integer_)
	  }

	  # Split on commas with any surrounding whitespace, so "1,3-5" is separated
	  # the same way as "1, 3-5" instead of being read as the range 1-5
	  entries <- stringr::str_split(inner, "\\s*,\\s*")[[1]]
	  entries <- stringr::str_trim(entries)

	  # Turn a single entry into its numbers: "3 - 5" -> 3 4 5, "8" -> 8
	  expand_entry <- function(entry) {
	    # Whitespace around the dash is optional, so "1-5" is a range too
	    is_range <- stringr::str_detect(entry, "\\d+\\s*[-–]\\s*\\d+")
	    if (!is_range) {
	      return(as.integer(stringr::str_extract(entry, "\\d+")))
	    }

	    start <- as.integer(stringr::str_extract(entry, "^\\d+"))
	    end <- as.integer(stringr::str_extract(entry, "\\d+$"))

	    # `NA:5` is an error, so report an unparseable range as NA instead
	    if (is.na(start) || is.na(end)) {
	      return(NA_integer_)
	    }
	    start:end
	  }

	  # Build the result in one step rather than growing a vector in a loop
	  unlist(lapply(entries, expand_entry), use.names = FALSE)
	}

	# Test the function
	extract_citation_numbers("[1, 3 - 5, 8]")
	# Output: 1 3 4 5 8
	extract_citation_numbers("[0.2 - 7.4]")
	# Output: NA

	# Parse every extracted string. lapply() always returns a list, whereas
	# sapply() simplifies equal-length results into a matrix, which the
	# filter() step below cannot handle.
	citation_numbers <- lapply(citation_strings, extract_citation_numbers) %>%
	  unlist(use.names = FALSE)

	# unlist() gives NULL when nothing was extracted; use an empty integer
	# vector so the tibble below still has a citation_number column
	if (is.null(citation_numbers)) {
	  citation_numbers <- integer(0)
	}

	# Count how often each citation number appears, dropping the NA results
	citation_numbers <- tibble(citation_number = citation_numbers) %>%
	  filter(!is.na(citation_number)) %>%
	  count(citation_number) %>%
	  arrange(citation_number)

	print(citation_numbers, n = 999)

	# a citation should appear at least twice (the citation and the reference),
	# so list the numbers that were found only once
	citation_numbers %>%
	  filter(n < 2)