When can we expect the last damn microarray?

microarray.R
library("plyr")
library("XML")
library("ggplot2")
 
#Concatenate SQL-style: returns NULL if any argument is NULL/empty or NA
concat<-function(...,sep="",collapse=NULL){
  strings<-list(...)
  #guard against NULL/empty and NA arguments
  if(
    all(unlist(llply(strings,length))>0)
    &&
    all(!is.na(unlist(strings)))
  ){
    do.call("paste", c(strings, list(sep = sep, collapse = collapse)))
  }else{
    NULL
  }
}
getCount<-function(term){function(year){
  nihUrl<-concat("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",term,"+",year,"[pdat]")
  #cleanurl<-gsub('\\]','%5D',gsub('\\[','%5B',x=url))
  #http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=microarray%5btitle%5d+2003%5bpdat%5d
  xml<-xmlTreeParse(URLencode(nihUrl),isURL=TRUE)
  #Data Mashups in R, pg17
  as.numeric(xmlValue(xml$doc$children$eSearchResult$children$Count$children$text))
}}
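#Example usage: getCount returns a closure, so a single term/year can be
#spot-checked with getCount('microarray[title]')(2005); the count it returns
#depends on when the query is run.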
 
years<-1995:2011
df<-data.frame(type="obs",year=years,
  mic=sapply(years,function(x){do.call(getCount('microarray[title]'),list(x))}),
  ngs=sapply(years,function(x){do.call(getCount('"next generation sequencing"[title] OR "high-throughput sequencing"[title]'),list(x))})
)
 
#97 is a fair start
df<-subset(df,year>=1997)
mdf<-melt(df,id.vars=c("type","year"),variable_name="citation")
 
c<-ggplot(mdf,aes(x=year))
p<-c+geom_point(aes(y=value,color=citation),size=3) +
  ylab("papers") +
  stat_smooth(aes(y=value,color=citation),data=subset(mdf,citation=="mic"),method="loess") +
  scale_x_continuous(breaks=seq(from=1997,to=2011,by=2))
print(p)
 
#Return 0 for negative elements
# noNeg(c(3,2,1,0,-1,-2,2))
# [1] 3 2 1 0 0 0 2
noNeg<-function(v){sapply(v,function(x){max(x,0)})}
 
#Return up to the first negative/zero element inclusive
# toZeroNoNeg(c(3,2,1,0,-1,-2,2))
# [1] 3 2 1 0
toZeroNoNeg<-function(v){noNeg(v)[1:firstZero(noNeg(v))]}
 
#return index of first zero
firstZero<-function(v){which(noNeg(v)==0)[1]}
 
#let's peer into the future
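#surface="direct" is what lets predict() extrapolate beyond the observed years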
df.lo.mic<-loess(mic ~ year,df,control=loess.control(surface="direct"))
 
#when will it stop?
mic_predict<-as.integer(predict(df.lo.mic,data.frame(year=2012:2020),se=FALSE))
zero_year<-2011+firstZero(mic_predict)
cat(concat("LOESS projects ",sum(toZeroNoNeg(mic_predict))," more damn microarray papers."))
cat(concat("The last damn microarray paper is projected to be in ",(zero_year-1),"."))
 
#predict ngs growth
df.lo.ngs<-loess(ngs ~ year,df,control=loess.control(surface="direct"))
ngs_predict<-as.integer(predict(df.lo.ngs,data.frame(year=2012:zero_year),se=FALSE))
 
pred_df<-data.frame(type="pred",year=c(2012:zero_year),mic=toZeroNoNeg(mic_predict),ngs=ngs_predict)
df2<-rbind(df,pred_df)
 
mdf2<-melt(df2,id.vars=c("type","year"),variable_name="citation")
 
c2<-ggplot(mdf2,aes(x=year))
p2<-c2+geom_point(aes(y=value,color=citation,shape=type),size=3) +
  ylab("papers") +
  scale_y_continuous(breaks=seq(from=0,to=1600,by=200))+
  scale_x_continuous(breaks=seq(from=1997,to=zero_year,by=2))
print(p2)

Thank you for this. The comment on line 25 made me buy (and nearly work through) Data Mashups in R. I got the Philly foreclosures heat map working with the new Yahoo PlaceFinder API, which replaced the Geocode API and is in turn about to be replaced by something called BOSS, but the general recipe still worked. I couldn't have done it from scratch. Unfortunately, Chapter 2, where you draw some correlations between foreclosures and Census data from FactFinder, is too much work for the payoff: the new FactFinder2 doesn't offer a ready way to reproduce the old FactFinder-style downloads.

Anyway, microarray.R also takes some tweaking (I'm running R 2.15.2 on a Mac). First, you need to add library("reshape2") at the top, or else R will object to the melt() function. Second, melt() no longer honors the variable_name argument; it simply names the melted column "variable". I had to reset names(mdf) and names(mdf2) right after each call to melt(), as in

names(mdf) <- c("type","year","citation","value")
...
names(mdf2) <- c("type","year","citation","value")

With these two small changes, microarray.R works great.
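
For what it's worth, reshape2's melt() also accepts a variable.name argument (with value.name defaulting to "value"), so the renaming could probably be avoided (untested) with something like

mdf <- melt(df, id.vars=c("type","year"), variable.name="citation")
...
mdf2 <- melt(df2, id.vars=c("type","year"), variable.name="citation")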

Thanks for your tips.

I have created a repository with the Data Mashups in R code using the wretched Yahoo! BOSS API. It also includes the supplemental census data and shapefiles that have gone missing:
https://github.com/leipzig/datamashupsinr
