Skip to content

Instantly share code, notes, and snippets.

@knbknb
Forked from leipzig/microarray.R
Last active December 24, 2020 11:49
Show Gist options
  • Save knbknb/1638899 to your computer and use it in GitHub Desktop.
Save knbknb/1638899 to your computer and use it in GitHub Desktop.
Pubmed Queries: last paper on microarray bioinformatics, when to appear?
# Tidyverse-R-code from 2020
# (this code is also artificially slowed down, with purrr::slowly(),
# to avoid HTTP 429 errors (too many requests)
library(tidyverse)
library(httr)
library(xml2)
library(lubridate)
theme_set(theme_bw())
httr::set_config(httr::config(http_version = 0))
# time frame we are considering:
# year 1997 - today
start_year <- 1995
#start_year <- 2015
end_year <- lubridate::year( today())
end_prediction <- 2023 # a few years into the future
y_max <- 1000
delay_seconds <- 1
loess_smoothness <- 0.5 # 1 - smooth, 0 - wiggly
years <- start_year:end_year
#terms <- c("term1"="microarray[title]", "term2"="((next generation sequencing[title]) OR (high throughput sequencing[title]))")
#term_labels <- c("term1"="microarray", "term2"="next generation sequencing (NGS)")
#terms <- c("term1"="COVID-19[title]", "term2"="((MERS[title]) OR (SARS[title]))", "term3"="(influenza[title])")
#term_labels <- c("COVID-19"="blue", "MERS/SARS"="red", "Influenza"="brown")
#terms <- c("term1"="Zika[title]", "term2"="(Chikungunya[title])", "term3"="(ebola[title])")
#term_labels <- c("Zika"="blue", "Chikungunya"="red", "Ebola"="brown")
#terms <- c("term1"="Influenza[title]", "term2"="(Malaria[title])", "term3"="(Dengue[title])")
#term_labels <- c("Influenza"="blue", "Malaria"="red", "Dengue"="brown")
terms <- c("term1"="COVID-19[title] AND superspread*[title]",
"term2"="((virus[title]) AND (infectious[title]))",
"term3"="(COVID-19[title] AND infecti*[title])")
term_labels <- c("COVID-19/superspread*"="blue", "virus/infectious"="red", "COVID-19/infecti*"="brown")
base_url <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
query_params <- list("db" = "pubmed")
dfr_lbl <- data.frame(
year = c(end_year),
value = c(y_max),
text = c("current year")
)
# optional function: show NCPI endpoint url in warnings
show_warning <- function(base_url, query_params) {
l <- purrr::reduce(map2(names(query_params), query_params, str_c, sep = "="), str_c, sep = "&")
l <- str_c(base_url, l, sep = "?")
warning(l)
}
# call purrr::slowly() avoid HTTP error 429 - too many requests
throttled_get <- slowly(~ GET(url = .x, query = .y), rate = rate_delay(delay_seconds), quiet = FALSE)
get_count <- function(year, x) {
# add to list
query_params <- x[["query_params"]]
term <- x[["term"]]
query_params["term"] = sprintf("%s AND %s[pdat]", term, year)
resp <- throttled_get(url = x[["base_url"]], query= query_params)
xml <- read_html(resp)
err <- xml_contents(xml) %>%
xml2::xml_find_first(x = ., xpath = ".//error/text()") %>%
as.character() %>% str_length() %>% coalesce(., 0)
if(err) return(-1)
rv <- xml_contents(xml) %>%
xml2::xml_find_first(x = ., xpath = ".//count/text()") %>%
as.character() %>%
as.integer()
rv
}
#show_warning(base_url, query_params)
argslist <- map(c("term1", "term2", "term3"), function(x, terms){
list("base_url" = base_url,
"term" = terms[[x]],
"query_params" = query_params)
}, terms)
ncbi_data <- tibble(
type = "observed", `year` = years,
term1 = map(years, get_count, argslist[[1]]),
term2 = map(years, get_count, argslist[[2]]),
term3 = map(years, get_count, argslist[[3]])
) %>%
unnest(c(term1, term2, term3))
dfr <- ncbi_data %>%
filter(year >= start_year)
mdf <- pivot_longer(dfr, cols=c("term1", "term2", "term3"), names_to = "search_term")
mdf
format_my_plot <- function(gg, maxval_y, to_year = end_year){
# some constant values are defined out of the scope of this function
gg +
geom_vline(xintercept = end_year, linetype = "dashed", size = 0.5, color ="dodgerblue") +
geom_text(data = dfr_lbl, aes(x = year, y = value, label= text), color = "dodgerblue", hjust = 1.1) +
scale_y_continuous(breaks = seq(from = 0, to = maxval_y %/% 200 * 200 + 200, by = 200),
limits = c(0, y_max)) +
scale_x_continuous(breaks = seq(from = start_year, to = to_year, by = 2),
limits = c(start_year, end_prediction)) +
stat_smooth(aes(y = value, color = search_term),
data = mdf, #subset(mdf, search_term == "term3"),
method = "loess", se = FALSE, span = loess_smoothness) +
theme(axis.text.x = element_text(size = 7),
legend.position = "bottom") +
scale_color_manual(labels = names(term_labels), values = as.character(term_labels)) +
guides(color=guide_legend("Search terms")) + # add guide properties by aesthetic
labs(y = "# papers / year",
title = "Pubmed Search: Comparing search terms: ",
subtitle = glue::glue("{ names(term_labels[1]) }, { names(term_labels[2]) }, { names(term_labels[3]) }\nNumbers of articles published, { min(years) }-{ max(years) }"),
caption = "Source: NCBI/PubMed")
}
p <- ggplot(mdf, aes(x = year))
p <- p + geom_point(aes(y = value, color = search_term), size = 3)
mx <- max(mdf$value)
mx <- min(y_max, mx)
p <- format_my_plot(p, mx)
print(p)
# let's peek into the future
mdl_loess_term1 <- loess(term1 ~ year, data = dfr, span = loess_smoothness,
control = loess.control(surface = "direct"))
# when will it stop?
term1_predict <- predict(
mdl_loess_term1,
newdata = tibble(
year = (max(years) + 1):end_prediction)
) %>%
as.integer()
# predict term2 growth
extrap_years <- tibble(year = (max(years) + 1):end_prediction)
mdl_loess_term2 <- loess(term2 ~ year, dfr, control = loess.control(surface = "direct"), span = loess_smoothness)
term2_predict <- predict(mdl_loess_term2,
newdata = extrap_years) %>%
as.integer()
mdl_loess_term3 <- loess(term3 ~ year, dfr, control = loess.control(surface = "direct"), span = loess_smoothness)
term3_predict <- predict(mdl_loess_term3,
newdata = extrap_years) %>%
as.integer()
pred_df <- tibble(type = "predicted",
year = extrap_years$year,
term1 = term1_predict,
term2 = term2_predict,
term3 = term3_predict)
df2 <- bind_rows(dfr, pred_df)
n_curr <- dfr %>% filter(year == end_year) %>% pull(term1)
dfr_lbl <- dfr_lbl %>%
mutate(text = glue::glue("{text} ({end_year}):\nn={n_curr} papers about {names(term_labels[1])}"))
mdf2 <- pivot_longer(df2, cols=c("term1", "term2", "term3"), names_to = "search_term")
mx2 <- max(mdf2$value)
p2 <- ggplot(mdf2, aes(x = year))
p2 <- p2 + geom_point(aes(y = value, color = search_term, shape = type), size = 3)
p2 <- format_my_plot(p2, mx2, to_year = end_prediction)
print(p2)
library("plyr")
library("reshape2")
library("XML")
library("ggplot2")
#Concatenate SQL-style
concat<-function(...,sep="",collapse=NULL){
strings<-list(...)
#NULL, NA
if(
all(unlist(llply(strings,length))>0)
&&
all(!is.na(unlist(strings)))
){
do.call("paste", c(strings, list(sep = sep, collapse = collapse)))
}else{
NULL
}
}
getCount<-function(term){function(year){
nihUrl<-concat("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",term," AND ",year,"[pdat]")
#cleanurl<-gsub('\\]','%5D',gsub('\\[','%5B',x=url))
#http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=microarray%5btitle%5d+2003%5bpdat%5d
#xml<-xmlTreeParse(URLencode(nihUrl),isURL=TRUE)
xml<-xmlTreeParse(nihUrl,isURL=TRUE)
#Data Mashups in R, pg17
as.numeric(xmlValue(xml$doc$children$eSearchResult$children$Count$children$text))
}}
years<-1995:2015
df<-data.frame(type="obs",year=years,
mic=sapply(years,function(x){do.call(getCount('microarray[title]'),list(x))}),
ngs=sapply(years,function(x){do.call(getCount('((next generation sequencing[title]) OR (high throughput sequencing[title]))'),list(x))})
)
df
#97 is a fair start
df<-subset(df,year>=1997)
mdf<-melt(df,id.vars=c("type","year"),variable_name="citation")
mdf
c<-ggplot(mdf,aes(x=year))
p<-c+geom_point(aes(y=value,color=variable),size=3) +
ylab("# papers / year") +
stat_smooth(aes(y=value,color=variable),data=subset(mdf,variable=="mic"),method="loess") +
scale_x_continuous(breaks=seq(from=1997,to=max(years),by=2))
print(p)
#Return 0 for negative elements
# noNeg(c(3,2,1,0,-1,-2,2))
# [1] 3 2 1 0 0 0 2
noNeg<-function(v){sapply(v,function(x){max(x,0)})}
#Return up to the first negative/zero element inclusive
# toZeroNoNeg(c(3,2,1,0,-1,-2,2))
# [1] 3 2 1 0
toZeroNoNeg<-function(v){noNeg(v)[1:firstZero(noNeg(v))]}
#return index of first zero
firstZero<-function(v){which(noNeg(v)==0)[1]}
#let's peer into the future
#df.lo.mic<-loess(mic ~ year,df,control=loess.control(surface="direct"))
df.lo.mic<-loess(mic ~ year,data=df,control=loess.control(surface="direct"))
#when will it stop?
mic_predict<-as.integer(predict(df.lo.mic,data.frame(year=(max(years)+1):2035),se=FALSE))
zero_year<-max(years)+firstZero(mic_predict)
cat(concat("LOESS projects ",sum(toZeroNoNeg(mic_predict))," more damn microarray papers."))
cat(concat("The last damn microarray paper is projected to be in ",(zero_year-1),"."))
#predict ngs growth
df.lo.ngs<-loess(ngs ~ year,df,control=loess.control(surface="direct"))
ngs_predict<-as.integer(predict(df.lo.ngs,data.frame(year=(max(years)+1):zero_year),se=FALSE))
head(df,1)
pred_df<-data.frame(type="pred",year=c((max(years)+1):zero_year),mic=c(toZeroNoNeg(mic_predict)),ngs=ngs_predict)
df2<-rbind(df,pred_df)
mdf2<-melt(df2,id.vars=c("type","year"),variable_name="citation")
c2<-ggplot(mdf2,aes(x=year))
p2 <- c2+geom_point(aes(y=value,color=variable,shape=type),size=3) +
ylab("# papers / year") +
scale_y_continuous(breaks=seq(from=0,to=max(mdf2$value) %/% 200 * 200 + 200, by=200))+
scale_x_continuous(breaks=seq(from=1997,to=zero_year,by=2))
print(p2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment