knbknb/query-pubmed-3-terms.R

## query-pubmed-3-terms.R
# Tidyverse-R-code from 2020
# (this code is also artificially slowed down, with purrr::slowly(),
# to avoid HTTP 429 errors (too many requests)

library(tidyverse)
library(httr)
library(xml2)
library(lubridate)
theme_set(theme_bw())

httr::set_config(httr::config(http_version = 0))

# time frame we are considering:
# year 1997 -  today
start_year     <- 1995
#start_year     <- 2015
end_year       <- lubridate::year( today())
end_prediction <- 2023 # a few years into the future
y_max          <- 1000
delay_seconds  <- 1
loess_smoothness <- 0.5 # 1 - smooth, 0 - wiggly

years  <- start_year:end_year
#terms  <- c("term1"="microarray[title]", "term2"="((next generation sequencing[title]) OR (high throughput sequencing[title]))")
#term_labels <- c("term1"="microarray", "term2"="next generation sequencing (NGS)")

#terms  <- c("term1"="COVID-19[title]", "term2"="((MERS[title]) OR (SARS[title]))", "term3"="(influenza[title])")
#term_labels <- c("COVID-19"="blue", "MERS/SARS"="red", "Influenza"="brown")

#terms  <- c("term1"="Zika[title]", "term2"="(Chikungunya[title])", "term3"="(ebola[title])")
#term_labels <- c("Zika"="blue", "Chikungunya"="red", "Ebola"="brown")

#terms  <- c("term1"="Influenza[title]", "term2"="(Malaria[title])", "term3"="(Dengue[title])")
#term_labels <- c("Influenza"="blue", "Malaria"="red", "Dengue"="brown")

terms  <- c("term1"="COVID-19[title] AND superspread*[title]",
            "term2"="((virus[title]) AND (infectious[title]))",
            "term3"="(COVID-19[title] AND infecti*[title])")
term_labels <- c("COVID-19/superspread*"="blue", "virus/infectious"="red", "COVID-19/infecti*"="brown")

base_url <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
query_params <- list("db" = "pubmed")


dfr_lbl <- data.frame(
  year = c(end_year),
  value = c(y_max),
  text = c("current year")
)

# optional function: show NCPI endpoint url in warnings
show_warning <- function(base_url, query_params) {
  l <- purrr::reduce(map2(names(query_params), query_params, str_c, sep = "="), str_c, sep = "&")
  l <- str_c(base_url, l, sep = "?")
  warning(l)
}

# call purrr::slowly() avoid HTTP error 429 - too many requests
throttled_get <- slowly(~ GET(url = .x, query = .y), rate = rate_delay(delay_seconds), quiet = FALSE)

get_count <- function(year, x) {
  # add to list

  query_params <- x[["query_params"]]
  term <- x[["term"]]

  query_params["term"] = sprintf("%s AND %s[pdat]", term, year)


  resp <- throttled_get(url = x[["base_url"]], query= query_params)
  xml <- read_html(resp)
  err <- xml_contents(xml) %>%
    xml2::xml_find_first(x = ., xpath = ".//error/text()") %>%
    as.character() %>% str_length() %>% coalesce(., 0)

  if(err) return(-1)
  rv <- xml_contents(xml) %>%
    xml2::xml_find_first(x = ., xpath = ".//count/text()") %>%
    as.character() %>%
    as.integer()
  rv
}

#show_warning(base_url, query_params)
argslist <- map(c("term1", "term2", "term3"), function(x, terms){
  list("base_url" = base_url,
       "term" = terms[[x]],
       "query_params" = query_params)
  }, terms)


ncbi_data <- tibble(
  type = "observed", `year` = years,
  term1 = map(years, get_count, argslist[[1]]),
  term2 = map(years, get_count, argslist[[2]]),
  term3 = map(years, get_count, argslist[[3]])
) %>%
  unnest(c(term1, term2, term3))


dfr <- ncbi_data %>%
  filter(year >= start_year)
mdf <- pivot_longer(dfr, cols=c("term1", "term2", "term3"), names_to = "search_term")
mdf


format_my_plot <- function(gg, maxval_y, to_year = end_year){
# some constant values are defined out of the scope of this function
 gg +
    geom_vline(xintercept = end_year, linetype = "dashed", size = 0.5, color ="dodgerblue") +
    geom_text(data = dfr_lbl, aes(x = year, y = value, label= text), color = "dodgerblue", hjust = 1.1) +
    scale_y_continuous(breaks = seq(from = 0, to = maxval_y %/% 200 * 200 + 200, by = 200),
                       limits = c(0, y_max)) +
    scale_x_continuous(breaks = seq(from = start_year, to = to_year, by = 2),
                       limits = c(start_year, end_prediction)) +
    stat_smooth(aes(y = value, color = search_term),
                data = mdf, #subset(mdf, search_term == "term3"),
                method = "loess", se = FALSE, span = loess_smoothness) +
    theme(axis.text.x = element_text(size = 7),
          legend.position = "bottom") +
    scale_color_manual(labels = names(term_labels), values = as.character(term_labels)) +
    guides(color=guide_legend("Search terms")) +  # add guide properties by aesthetic
    labs(y = "# papers / year",
         title = "Pubmed Search: Comparing search terms: ",
         subtitle = glue::glue("{ names(term_labels[1]) },   { names(term_labels[2]) },  { names(term_labels[3]) }\nNumbers of articles published, { min(years) }-{ max(years) }"),
         caption = "Source: NCBI/PubMed")

}

p <- ggplot(mdf, aes(x = year))
p <- p + geom_point(aes(y = value, color = search_term), size = 3)
mx <- max(mdf$value)
mx <- min(y_max, mx)
p <- format_my_plot(p, mx)
print(p)


# let's peek into the future
mdl_loess_term1 <- loess(term1 ~ year, data = dfr, span = loess_smoothness,
                         control = loess.control(surface = "direct"))

# when will it stop?
term1_predict <- predict(
  mdl_loess_term1,
  newdata = tibble(
  year = (max(years) + 1):end_prediction)
) %>%
  as.integer()

# predict term2 growth
extrap_years    <- tibble(year = (max(years) + 1):end_prediction)
mdl_loess_term2 <- loess(term2 ~ year, dfr, control = loess.control(surface = "direct"), span = loess_smoothness)
term2_predict   <- predict(mdl_loess_term2,
                       newdata = extrap_years) %>%
                  as.integer()

mdl_loess_term3 <- loess(term3 ~ year, dfr, control = loess.control(surface = "direct"), span = loess_smoothness)
term3_predict   <- predict(mdl_loess_term3,
      newdata = extrap_years) %>%
      as.integer()
pred_df         <- tibble(type = "predicted",
                           year = extrap_years$year,
                           term1 = term1_predict,
                           term2 = term2_predict,
                           term3 = term3_predict)
df2 <- bind_rows(dfr, pred_df)


n_curr <- dfr %>% filter(year == end_year) %>% pull(term1)
dfr_lbl <- dfr_lbl %>%
  mutate(text = glue::glue("{text} ({end_year}):\nn={n_curr} papers about {names(term_labels[1])}"))
mdf2 <- pivot_longer(df2, cols=c("term1", "term2", "term3"), names_to = "search_term")
mx2 <- max(mdf2$value)

p2 <- ggplot(mdf2, aes(x = year))
p2 <- p2 + geom_point(aes(y = value, color = search_term, shape = type), size = 3)
p2 <- format_my_plot(p2, mx2, to_year = end_prediction)
print(p2)

## query-pubmed-micoarray-ngs.R
library("plyr")
library("reshape2")
library("XML")
library("ggplot2")

#Concatenate SQL-style
concat<-function(...,sep="",collapse=NULL){
        strings<-list(...)
        #NULL, NA
        if(
                all(unlist(llply(strings,length))>0)
                &&
                all(!is.na(unlist(strings)))
        ){
                do.call("paste", c(strings, list(sep = sep, collapse = collapse)))
        }else{
                NULL
        }
}

getCount<-function(term){function(year){
        nihUrl<-concat("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",term," AND ",year,"[pdat]")
        #cleanurl<-gsub('\\]','%5D',gsub('\\[','%5B',x=url))
        #http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=microarray%5btitle%5d+2003%5bpdat%5d
        #xml<-xmlTreeParse(URLencode(nihUrl),isURL=TRUE)
        xml<-xmlTreeParse(nihUrl,isURL=TRUE)
        #Data Mashups in R, pg17
        as.numeric(xmlValue(xml$doc$children$eSearchResult$children$Count$children$text))

}}

years<-1995:2015
df<-data.frame(type="obs",year=years,
               mic=sapply(years,function(x){do.call(getCount('microarray[title]'),list(x))}),
               ngs=sapply(years,function(x){do.call(getCount('((next generation sequencing[title]) OR (high throughput sequencing[title]))'),list(x))})
)
df
#97 is a fair start
df<-subset(df,year>=1997)
mdf<-melt(df,id.vars=c("type","year"),variable_name="citation")
mdf
c<-ggplot(mdf,aes(x=year))
p<-c+geom_point(aes(y=value,color=variable),size=3) +
        ylab("# papers / year") +
        stat_smooth(aes(y=value,color=variable),data=subset(mdf,variable=="mic"),method="loess") +
        scale_x_continuous(breaks=seq(from=1997,to=max(years),by=2))
print(p)

#Return 0 for negative elements
# noNeg(c(3,2,1,0,-1,-2,2))
# [1] 3 2 1 0 0 0 2
noNeg<-function(v){sapply(v,function(x){max(x,0)})}

#Return up to the first negative/zero element inclusive
# toZeroNoNeg(c(3,2,1,0,-1,-2,2))
# [1] 3 2 1 0
toZeroNoNeg<-function(v){noNeg(v)[1:firstZero(noNeg(v))]}

#return index of first zero
firstZero<-function(v){which(noNeg(v)==0)[1]}

#let's peer into the future
#df.lo.mic<-loess(mic ~ year,df,control=loess.control(surface="direct"))

df.lo.mic<-loess(mic ~ year,data=df,control=loess.control(surface="direct"))

#when will it stop?
mic_predict<-as.integer(predict(df.lo.mic,data.frame(year=(max(years)+1):2035),se=FALSE))
zero_year<-max(years)+firstZero(mic_predict)
cat(concat("LOESS projects ",sum(toZeroNoNeg(mic_predict))," more damn microarray papers."))
cat(concat("The last damn microarray paper is projected to be in ",(zero_year-1),"."))

#predict ngs growth
df.lo.ngs<-loess(ngs ~ year,df,control=loess.control(surface="direct"))
ngs_predict<-as.integer(predict(df.lo.ngs,data.frame(year=(max(years)+1):zero_year),se=FALSE))
head(df,1)
pred_df<-data.frame(type="pred",year=c((max(years)+1):zero_year),mic=c(toZeroNoNeg(mic_predict)),ngs=ngs_predict)
df2<-rbind(df,pred_df)

mdf2<-melt(df2,id.vars=c("type","year"),variable_name="citation")

c2<-ggplot(mdf2,aes(x=year))
p2 <- c2+geom_point(aes(y=value,color=variable,shape=type),size=3) +
        ylab("# papers / year") +
        scale_y_continuous(breaks=seq(from=0,to=max(mdf2$value) %/% 200 * 200 + 200, by=200))+
        scale_x_continuous(breaks=seq(from=1997,to=zero_year,by=2))
print(p2)
	# Tidyverse-R-code from 2020
	# (this code is also artificially slowed down, with purrr::slowly(),
	# to avoid HTTP 429 errors (too many requests)

	library(tidyverse)
	library(httr)
	library(xml2)
	library(lubridate)
	theme_set(theme_bw())

	httr::set_config(httr::config(http_version = 0))

	# time frame we are considering:
	# year 1997 - today
	start_year <- 1995
	#start_year <- 2015
	end_year <- lubridate::year( today())
	end_prediction <- 2023 # a few years into the future
	y_max <- 1000
	delay_seconds <- 1
	loess_smoothness <- 0.5 # 1 - smooth, 0 - wiggly

	years <- start_year:end_year
	#terms <- c("term1"="microarray[title]", "term2"="((next generation sequencing[title]) OR (high throughput sequencing[title]))")
	#term_labels <- c("term1"="microarray", "term2"="next generation sequencing (NGS)")

	#terms <- c("term1"="COVID-19[title]", "term2"="((MERS[title]) OR (SARS[title]))", "term3"="(influenza[title])")
	#term_labels <- c("COVID-19"="blue", "MERS/SARS"="red", "Influenza"="brown")

	#terms <- c("term1"="Zika[title]", "term2"="(Chikungunya[title])", "term3"="(ebola[title])")
	#term_labels <- c("Zika"="blue", "Chikungunya"="red", "Ebola"="brown")

	#terms <- c("term1"="Influenza[title]", "term2"="(Malaria[title])", "term3"="(Dengue[title])")
	#term_labels <- c("Influenza"="blue", "Malaria"="red", "Dengue"="brown")

	terms <- c("term1"="COVID-19[title] AND superspread*[title]",
	"term2"="((virus[title]) AND (infectious[title]))",
	"term3"="(COVID-19[title] AND infecti*[title])")
	term_labels <- c("COVID-19/superspread"="blue", "virus/infectious"="red", "COVID-19/infecti"="brown")

	base_url <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
	query_params <- list("db" = "pubmed")


	dfr_lbl <- data.frame(
	year = c(end_year),
	value = c(y_max),
	text = c("current year")
	)

	# optional function: show NCPI endpoint url in warnings
	show_warning <- function(base_url, query_params) {
	l <- purrr::reduce(map2(names(query_params), query_params, str_c, sep = "="), str_c, sep = "&")
	l <- str_c(base_url, l, sep = "?")
	warning(l)
	}

	# call purrr::slowly() avoid HTTP error 429 - too many requests
	throttled_get <- slowly(~ GET(url = .x, query = .y), rate = rate_delay(delay_seconds), quiet = FALSE)

	get_count <- function(year, x) {
	# add to list

	query_params <- x[["query_params"]]
	term <- x[["term"]]

	query_params["term"] = sprintf("%s AND %s[pdat]", term, year)


	resp <- throttled_get(url = x[["base_url"]], query= query_params)
	xml <- read_html(resp)
	err <- xml_contents(xml) %>%
	xml2::xml_find_first(x = ., xpath = ".//error/text()") %>%
	as.character() %>% str_length() %>% coalesce(., 0)

	if(err) return(-1)
	rv <- xml_contents(xml) %>%
	xml2::xml_find_first(x = ., xpath = ".//count/text()") %>%
	as.character() %>%
	as.integer()
	rv
	}

	#show_warning(base_url, query_params)
	argslist <- map(c("term1", "term2", "term3"), function(x, terms){
	list("base_url" = base_url,
	"term" = terms[[x]],
	"query_params" = query_params)
	}, terms)


	ncbi_data <- tibble(
	type = "observed", `year` = years,
	term1 = map(years, get_count, argslist[[1]]),
	term2 = map(years, get_count, argslist[[2]]),
	term3 = map(years, get_count, argslist[[3]])
	) %>%
	unnest(c(term1, term2, term3))



	dfr <- ncbi_data %>%
	filter(year >= start_year)
	mdf <- pivot_longer(dfr, cols=c("term1", "term2", "term3"), names_to = "search_term")
	mdf


	format_my_plot <- function(gg, maxval_y, to_year = end_year){
	# some constant values are defined out of the scope of this function
	gg +
	geom_vline(xintercept = end_year, linetype = "dashed", size = 0.5, color ="dodgerblue") +
	geom_text(data = dfr_lbl, aes(x = year, y = value, label= text), color = "dodgerblue", hjust = 1.1) +
	scale_y_continuous(breaks = seq(from = 0, to = maxval_y %/% 200 * 200 + 200, by = 200),
	limits = c(0, y_max)) +
	scale_x_continuous(breaks = seq(from = start_year, to = to_year, by = 2),
	limits = c(start_year, end_prediction)) +
	stat_smooth(aes(y = value, color = search_term),
	data = mdf, #subset(mdf, search_term == "term3"),
	method = "loess", se = FALSE, span = loess_smoothness) +
	theme(axis.text.x = element_text(size = 7),
	legend.position = "bottom") +
	scale_color_manual(labels = names(term_labels), values = as.character(term_labels)) +
	guides(color=guide_legend("Search terms")) + # add guide properties by aesthetic
	labs(y = "# papers / year",
	title = "Pubmed Search: Comparing search terms: ",
	subtitle = glue::glue("{ names(term_labels[1]) }, { names(term_labels[2]) }, { names(term_labels[3]) }\nNumbers of articles published, { min(years) }-{ max(years) }"),
	caption = "Source: NCBI/PubMed")

	}

	p <- ggplot(mdf, aes(x = year))
	p <- p + geom_point(aes(y = value, color = search_term), size = 3)
	mx <- max(mdf$value)
	mx <- min(y_max, mx)
	p <- format_my_plot(p, mx)
	print(p)


	# let's peek into the future
	mdl_loess_term1 <- loess(term1 ~ year, data = dfr, span = loess_smoothness,
	control = loess.control(surface = "direct"))

	# when will it stop?
	term1_predict <- predict(
	mdl_loess_term1,
	newdata = tibble(
	year = (max(years) + 1):end_prediction)
	) %>%
	as.integer()

	# predict term2 growth
	extrap_years <- tibble(year = (max(years) + 1):end_prediction)
	mdl_loess_term2 <- loess(term2 ~ year, dfr, control = loess.control(surface = "direct"), span = loess_smoothness)
	term2_predict <- predict(mdl_loess_term2,
	newdata = extrap_years) %>%
	as.integer()

	mdl_loess_term3 <- loess(term3 ~ year, dfr, control = loess.control(surface = "direct"), span = loess_smoothness)
	term3_predict <- predict(mdl_loess_term3,
	newdata = extrap_years) %>%
	as.integer()
	pred_df <- tibble(type = "predicted",
	year = extrap_years$year,
	term1 = term1_predict,
	term2 = term2_predict,
	term3 = term3_predict)
	df2 <- bind_rows(dfr, pred_df)


	n_curr <- dfr %>% filter(year == end_year) %>% pull(term1)
	dfr_lbl <- dfr_lbl %>%
	mutate(text = glue::glue("{text} ({end_year}):\nn={n_curr} papers about {names(term_labels[1])}"))
	mdf2 <- pivot_longer(df2, cols=c("term1", "term2", "term3"), names_to = "search_term")
	mx2 <- max(mdf2$value)

	p2 <- ggplot(mdf2, aes(x = year))
	p2 <- p2 + geom_point(aes(y = value, color = search_term, shape = type), size = 3)
	p2 <- format_my_plot(p2, mx2, to_year = end_prediction)
	print(p2)
	library("plyr")
	library("reshape2")
	library("XML")
	library("ggplot2")

	#Concatenate SQL-style
	concat<-function(...,sep="",collapse=NULL){
	strings<-list(...)
	#NULL, NA
	if(
	all(unlist(llply(strings,length))>0)
	&&
	all(!is.na(unlist(strings)))
	){
	do.call("paste", c(strings, list(sep = sep, collapse = collapse)))
	}else{
	NULL
	}
	}

	getCount<-function(term){function(year){
	nihUrl<-concat("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",term," AND ",year,"[pdat]")
	#cleanurl<-gsub('\\]','%5D',gsub('\\[','%5B',x=url))
	#http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=microarray%5btitle%5d+2003%5bpdat%5d
	#xml<-xmlTreeParse(URLencode(nihUrl),isURL=TRUE)
	xml<-xmlTreeParse(nihUrl,isURL=TRUE)
	#Data Mashups in R, pg17
	as.numeric(xmlValue(xml$doc$children$eSearchResult$children$Count$children$text))

	}}

	years<-1995:2015
	df<-data.frame(type="obs",year=years,
	mic=sapply(years,function(x){do.call(getCount('microarray[title]'),list(x))}),
	ngs=sapply(years,function(x){do.call(getCount('((next generation sequencing[title]) OR (high throughput sequencing[title]))'),list(x))})
	)
	df
	#97 is a fair start
	df<-subset(df,year>=1997)
	mdf<-melt(df,id.vars=c("type","year"),variable_name="citation")
	mdf
	c<-ggplot(mdf,aes(x=year))
	p<-c+geom_point(aes(y=value,color=variable),size=3) +
	ylab("# papers / year") +
	stat_smooth(aes(y=value,color=variable),data=subset(mdf,variable=="mic"),method="loess") +
	scale_x_continuous(breaks=seq(from=1997,to=max(years),by=2))
	print(p)

	#Return 0 for negative elements
	# noNeg(c(3,2,1,0,-1,-2,2))
	# [1] 3 2 1 0 0 0 2
	noNeg<-function(v){sapply(v,function(x){max(x,0)})}

	#Return up to the first negative/zero element inclusive
	# toZeroNoNeg(c(3,2,1,0,-1,-2,2))
	# [1] 3 2 1 0
	toZeroNoNeg<-function(v){noNeg(v)[1:firstZero(noNeg(v))]}

	#return index of first zero
	firstZero<-function(v){which(noNeg(v)==0)[1]}

	#let's peer into the future
	#df.lo.mic<-loess(mic ~ year,df,control=loess.control(surface="direct"))

	df.lo.mic<-loess(mic ~ year,data=df,control=loess.control(surface="direct"))

	#when will it stop?
	mic_predict<-as.integer(predict(df.lo.mic,data.frame(year=(max(years)+1):2035),se=FALSE))
	zero_year<-max(years)+firstZero(mic_predict)
	cat(concat("LOESS projects ",sum(toZeroNoNeg(mic_predict))," more damn microarray papers."))
	cat(concat("The last damn microarray paper is projected to be in ",(zero_year-1),"."))

	#predict ngs growth
	df.lo.ngs<-loess(ngs ~ year,df,control=loess.control(surface="direct"))
	ngs_predict<-as.integer(predict(df.lo.ngs,data.frame(year=(max(years)+1):zero_year),se=FALSE))
	head(df,1)
	pred_df<-data.frame(type="pred",year=c((max(years)+1):zero_year),mic=c(toZeroNoNeg(mic_predict)),ngs=ngs_predict)
	df2<-rbind(df,pred_df)

	mdf2<-melt(df2,id.vars=c("type","year"),variable_name="citation")

	c2<-ggplot(mdf2,aes(x=year))
	p2 <- c2+geom_point(aes(y=value,color=variable,shape=type),size=3) +
	ylab("# papers / year") +
	scale_y_continuous(breaks=seq(from=0,to=max(mdf2$value) %/% 200 * 200 + 200, by=200))+
	scale_x_continuous(breaks=seq(from=1997,to=zero_year,by=2))
	print(p2)