E-utilities pipeline for exploration and retrieval of PubMed citations
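The pipeline below runs in four stages: an ESearch query collects PMIDs matching the topic, a short R script parses them out of the XML, ESummary (via the History server) retrieves document summaries, and ELink finds the articles citing each hit. A final R script aggregates the summaries, plots papers per year and per journal, and writes the citation lists to disk.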
#!/bin/bash
# find interesting articles PMIDs
db='pubmed'
query='Metagenome[mesh]+AND+Gastrointestinal+Microbiome[mesh]+AND+2008:2018[pdat]+AND+Journal+Article[ptyp]'
base='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url=$base"esearch.fcgi?db=$db&term=$query&retmax=100000"
curl $url -g -o esearch_pmid.xml
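# NB: NCBI limits E-utilities clients without an API key to ~3 requests/second;
# if you have one, append &api_key=... to each request URL to raise the limit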
# save list of PMIDs to a text file
cat > parsePMIDxml.R << 'EOF'
#!/usr/bin/env Rscript
if (!requireNamespace("xml2", quietly = TRUE))
install.packages("xml2")
library(xml2)
args <- commandArgs(trailingOnly=TRUE)
# test if there is at least one argument
if (length(args) == 0) {
  stop("The script takes a single argument: the input file.\n", call. = FALSE)
}
if (length(args) < 2) args[2] <- "PMIDs.txt"  # default output file
xmlinput <- read_xml(args[1])
ids <- xml_text(xml_find_all(xmlinput, "//Id"))
writeLines(ids, args[2])
EOF
Rscript --vanilla parsePMIDxml.R esearch_pmid.xml
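# PMIDs.txt now contains one PMID per line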
# find summary for interesting articles
url_search="${base}esearch.fcgi?db=$db&term=$query&usehistory=y"
curl -g "$url_search" -o esearch.xml
# extract the History-server tokens wherever they appear in the XML,
# rather than assuming they sit on line 3
web=$(sed -nE 's/.*<WebEnv>([^<]+)<\/WebEnv>.*/\1/p' esearch.xml)
key=$(sed -nE 's/.*<QueryKey>([^<]+)<\/QueryKey>.*/\1/p' esearch.xml)
# request the ESummary per-call maximum of 10,000 DocSums explicitly
url_summary="${base}esummary.fcgi?db=$db&query_key=$key&WebEnv=$web&retmax=10000"
curl -g "$url_summary" -o esummary.xml
# find articles citing interesting articles
# join PMIDs into a single &id=...&id=... query string (GNU sed)
ids=$(tr -d '\r' < PMIDs.txt | sed ':a;N;$!ba;s/\n/\&id=/g')
url_link_byid="${base}elink.fcgi?dbfrom=$db&linkname=pubmed_pubmed_citedin&id=$ids"
curl "$url_link_byid" -o elink.byid.xml
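# For long PMID lists the GET URL can exceed server limits; the E-utilities also
# accept the same parameters via HTTP POST, e.g. (sketch, reusing the $ids string):
# curl -d "dbfrom=$db&linkname=pubmed_pubmed_citedin&id=$ids" "${base}elink.fcgi" -o elink.byid.xml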
# post-processing
# quote the heredoc delimiter so bash does not expand the $ in the R code below
cat > processPMIDxml.R << 'EOF'
#!/usr/bin/env Rscript
pkgs <- c("xml2", "data.table", "ggplot2", "stringr")
for (i in pkgs) {
  if (!requireNamespace(i, quietly = TRUE))
    install.packages(i)
}
library(xml2)
library(data.table)
library(ggplot2)
# stringr is used below via stringr::str_extract(), so it is not attached here
# load data
esearch <- read_xml("esearch_pmid.xml")
esummary <- read_xml("esummary.xml")
elink <- read_xml("elink.byid.xml")
# extract data
ids <- xml_text(xml_find_all(esearch, "//Id"))
paper_info <- lapply(xml_find_all(esummary, "//DocSum"), function(x) {
  # get summary items
  items <- xml_find_all(x, xpath = ".//Item")
  # for each summary item, get the item name
  attnames <- sapply(xml_attrs(items), function(ats) ats["Name"])
  # and the item content
  atts <- xml_text(items)
  names(atts) <- attnames
  atts
})
names(paper_info) <- ids
# keep only the summary fields present for every paper, so rows can be bound
infos <- Reduce(intersect, lapply(paper_info, names))
summary_dt <- lapply(paper_info, function(x) {
  as.data.table(as.list(x[infos]))
})
papersDT <- rbindlist(summary_dt, idcol = "PMID")
fwrite(papersDT, "papersSummary.csv")
# per year
papersDT[, year := {
  y <- stringr::str_extract(PubDate, "[0-9]{4}")
  factor(y, levels = 2008:2018)
}][, Citations := as.integer(PmcRefCount)][
  , AvgCitationYear := mean(Citations, na.rm = TRUE), by = year]
cbPalette <- c("#E69F00", "#00bf8b", "#CC79A7", "#56B4E9", "#F0E442", "#0072B2", "#D55E00", "#999999")
ggp_year <- ggplot(papersDT, aes(year)) +
  geom_bar(aes(fill = PubType)) +
  scale_x_discrete(limits = levels(papersDT$year)) +
  scale_fill_manual(name = "Publication type:", breaks = unique(papersDT$PubType), values = cbPalette)
# tallest stacked bar, read from the built plot data
maxy <- max(ggplot_build(ggp_year)$data[[1]]$y)
maxy2 <- max(papersDT$AvgCitationYear, na.rm = TRUE)
scaling <- round(maxy / maxy2)
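# sec_axis() only supports a fixed transformation of the primary axis, so the
# citation line is scaled up here and divided back out in sec_axis() below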
ggp_year <- ggp_year +
  geom_line(aes(year, AvgCitationYear * scaling, colour = ""), group = 1, linetype = 1) +
  geom_point(aes(year, AvgCitationYear * scaling, colour = "")) +
  scale_color_discrete("Average PMC citations: ") +
  scale_y_continuous(
    name = "papers\n",
    sec.axis = sec_axis(~ . / scaling, name = "average PMC citations\n")) +
  theme_light() + theme(
    text = element_text(size = 12),
    panel.grid = element_line(colour = "lightgray", linetype = 3),
    legend.box = "vertical", legend.direction = "horizontal", legend.position = "bottom"
  )
# per journal
# count papers per journal (by Source, which is also used for the factor levels
# and the y-axis limit below) and order journal levels by that count
papersDT[, JournalN := .N, by = Source][
  , Journal := factor(Source, levels = unique(.SD[order(-JournalN), Source]))][
  , AvgCitationJournal := mean(Citations, na.rm = TRUE), by = Source]
ggp_journal <- ggplot(papersDT[JournalN > 1], aes(Journal, fill = AvgCitationJournal)) +
  geom_bar() +
  scale_fill_gradient(name = "Average\nPMC\ncitations", low = "#68c0f2", high = "#00599e") +
  scale_y_continuous(
    name = "papers\n",
    limits = c(0, max(papersDT$JournalN)), expand = c(0, 0)) +
  theme_light() + theme(
    text = element_text(size = 12),
    panel.grid = element_line(colour = "lightgray", linetype = 3),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# save output to pdf
pdf("eutilplots.pdf", width=8,height=5)
ggp_year
ggp_journal
dev.off()
# citations
paper_citations <- lapply(xml_find_all(elink, "//LinkSet"), function(x) {
  xml_text(xml_find_all(x, ".//Link/Id"))
})
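# NB: this assumes ELink returns one LinkSet per submitted PMID, in input order;
# a more defensive version would match on each LinkSet's own IdList/Id element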
names(paper_citations) <- ids
# save citations for papers
# format: each line starting with '#' contains PMID for a paper,
# and the line below contains PMIDs for all the papers citing it
tofile <- sapply(ids, function(x) paste0("#", x, "\n", paste(paper_citations[[x]], collapse = ",")))
writeLines(tofile, "papersCitations.txt")
EOF
Rscript --vanilla processPMIDxml.R
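# final outputs: PMIDs.txt, papersSummary.csv, eutilplots.pdf, papersCitations.txt
# (plus the intermediate esearch/esummary/elink XML files)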