# zackbatist/FetchAbstracts.R

## FetchAbstracts.R
# install and run packages
# install.packages("bib2df")
# install.packages("rcrossref")
library(bib2df)
library(rcrossref)

# Import the BibTeX file into a data frame.
# biblio.bib is a file in the working directory.
df <- bib2df("biblio.bib")

# Alternatively, fetch the BibTeX file from a URL:
# url <- "https://gist.githubusercontent.com/zackbatist/46c14011fd5dd4e2763842cd98627927/raw/e8678589cbb9f73ada52e7944bf617e588e1a5fe/GS01ax.bib"
# df <- bib2df(url)

# Entries are looked up by DOI. If the file has no DOI field at all, treat
# every entry as having no DOI instead of failing further down.
dois <- if ("DOI" %in% names(df)) {
  as.character(df$DOI)
} else {
  rep(NA_character_, nrow(df))
}

# Look up the abstract for each DOI. Every lookup yields exactly one string:
# a missing DOI or a failed request (e.g. "HTTP 404", "no abstract found for")
# becomes NA, so error text is never stored as an abstract and the result
# always has one element per row of df.
x <- vapply(
  dois,
  function(doi) {
    # Nothing to look up: skip the request entirely.
    if (is.na(doi) || !nzchar(doi)) {
      return(NA_character_)
    }
    tryCatch(
      {
        abstract <- cr_abstract(doi)
        if (length(abstract) == 0L) {
          NA_character_
        } else {
          as.character(abstract[[1L]])
        }
      },
      error = function(e) NA_character_
    )
  },
  character(1),
  USE.NAMES = FALSE
)

# The file is overwritten below, so keep an abstract that is already present
# whenever the lookup found nothing; a failed request must not erase data.
if ("ABSTRACT" %in% names(df)) {
  not_found <- is.na(x)
  x[not_found] <- as.character(df$ABSTRACT)[not_found]
}

# Write the results to the ABSTRACT field as character strings.
df$ABSTRACT <- x

# Clean up the abstract field by stripping leftover HTML tags.
# Add any other tags to this list as you see fit; they are matched literally.
html_tags <- c(
  "<p>", "</p>", "<strong>", "</strong>", "<li>", "</li>",
  "<ul>", "</ul>", "<em>", "</em>"
)
for (tag in html_tags) {
  df$ABSTRACT <- gsub(tag, "", df$ABSTRACT, fixed = TRUE)
}

# Write back to the BibTeX file, replacing its previous contents.
# Following BibTeX formatting rules, an entry whose abstract is empty (NA)
# gets no abstract field written at all.
df2bib(df, file = "biblio.bib", append = FALSE)
	# install and run packages
	# install.packages("bib2df")
	# install.packages("rcrossref")
	library(bib2df)
	library(rcrossref)

	# Import the BibTeX file into a data frame.
	# biblio.bib is a file in the working directory.
	df <- bib2df("biblio.bib")

	# Alternatively, fetch the BibTeX file from a URL:
	# url <- "https://gist.githubusercontent.com/zackbatist/46c14011fd5dd4e2763842cd98627927/raw/e8678589cbb9f73ada52e7944bf617e588e1a5fe/GS01ax.bib"
	# df <- bib2df(url)

	# Entries are looked up by DOI. If the file has no DOI field at all, treat
	# every entry as having no DOI instead of failing further down.
	dois <- if ("DOI" %in% names(df)) {
	  as.character(df$DOI)
	} else {
	  rep(NA_character_, nrow(df))
	}

	# Look up the abstract for each DOI. Every lookup yields exactly one string:
	# a missing DOI or a failed request (e.g. "HTTP 404", "no abstract found for")
	# becomes NA, so error text is never stored as an abstract and the result
	# always has one element per row of df.
	x <- vapply(
	  dois,
	  function(doi) {
	    # Nothing to look up: skip the request entirely.
	    if (is.na(doi) || !nzchar(doi)) {
	      return(NA_character_)
	    }
	    tryCatch(
	      {
	        abstract <- cr_abstract(doi)
	        if (length(abstract) == 0L) {
	          NA_character_
	        } else {
	          as.character(abstract[[1L]])
	        }
	      },
	      error = function(e) NA_character_
	    )
	  },
	  character(1),
	  USE.NAMES = FALSE
	)

	# The file is overwritten below, so keep an abstract that is already present
	# whenever the lookup found nothing; a failed request must not erase data.
	if ("ABSTRACT" %in% names(df)) {
	  not_found <- is.na(x)
	  x[not_found] <- as.character(df$ABSTRACT)[not_found]
	}

	# Write the results to the ABSTRACT field as character strings.
	df$ABSTRACT <- x

	# Clean up the abstract field by stripping leftover HTML tags.
	# Add any other tags to this list as you see fit; they are matched literally.
	html_tags <- c(
	  "<p>", "</p>", "<strong>", "</strong>", "<li>", "</li>",
	  "<ul>", "</ul>", "<em>", "</em>"
	)
	for (tag in html_tags) {
	  df$ABSTRACT <- gsub(tag, "", df$ABSTRACT, fixed = TRUE)
	}

	# Write back to the BibTeX file, replacing its previous contents.
	# Following BibTeX formatting rules, an entry whose abstract is empty (NA)
	# gets no abstract field written at all.
	df2bib(df, file = "biblio.bib", append = FALSE)