Skip to content

Instantly share code, notes, and snippets.

@jtleek
Created January 16, 2015 04:43
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 8 You must be signed in to fork a gist
  • Save jtleek/c5158965d77c21ade424 to your computer and use it in GitHub Desktop.
Save jtleek/c5158965d77c21ade424 to your computer and use it in GitHub Desktop.
## Load libraries
library(XML)
library(dplyr)
library(RCurl)
## Get the results for a specific term
##
## Fetches the first `npages` Google Scholar result pages for `search_term`
## and extracts publication year, citation count and title for each hit.
##
## Arguments:
##   search_term  a single character string; words separated by spaces are
##                joined with "+" to form the query.
##   npages       number of result pages to fetch (10 hits per page).
##                0 is allowed and yields an empty data frame.
##
## Returns a data frame with 10 * npages rows and the columns
## pub_year (integer), cites (numeric) and title (character). Any value that
## could not be extracted from the page is NA.
scrape_term <- function(search_term, npages) {
  stopifnot(
    is.character(search_term), length(search_term) == 1L,
    is.numeric(npages), length(npages) == 1L, !is.na(npages), npages >= 0
  )

  hits_per_page <- 10L
  base_url <- "http://scholar.google.com/scholar?"
  query_words <- strsplit(search_term, " ")[[1]]
  search_string <- paste0("q=", paste0(query_words, collapse = "+"))

  ## Preallocate one typed row per expected hit. (The previous
  ## data.frame(NA, nrow = ..., ncol = 3) call built a 1-row frame whose
  ## columns happened to be called nrow/ncol and relied on it growing.)
  n_rows <- hits_per_page * npages
  dat <- data.frame(
    pub_year = rep(NA_integer_, n_rows),
    cites = rep(NA_real_, n_rows),
    title = rep(NA_character_, n_rows),
    stringsAsFactors = FALSE
  )

  ## xpathSApply() gives list() when nothing matches; flatten to character and
  ## pad/truncate to exactly one page so the row assignment below always fits.
  pad_to_page <- function(x) {
    as.character(unlist(x))[seq_len(hits_per_page)]
  }

  cites_xpath <- '//*[contains(concat( " ", @class, " " ), concat( " ", "gs_ri", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "gs_fl", " " ))]//a[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]'
  year_pattern <- ".*\\s(\\d{4})\\s.*"

  ## seq_len() so that npages = 0 performs no requests (1:0 would run twice).
  for (i in seq_len(npages)) {
    if (i == 1) {
      url1 <- paste0(base_url, search_string)
    } else {
      start_string <- paste0("&start=", (i - 1) * hits_per_page)
      url1 <- paste0(base_url, search_string, start_string)
    }
    doc <- htmlParse(url1, encoding = "UTF-8")

    titles <- pad_to_page(xpathSApply(doc, "//h3[@class='gs_rt']", xmlValue))

    ## Keep every third matched link (positions 1, 4, ..., 28), as before;
    ## positions beyond the end of the match list become NA.
    cite_links <- as.character(unlist(xpathSApply(doc, cites_xpath, xmlValue)))
    cite_links <- cite_links[seq_len(hits_per_page) * 3 - 2]
    cites <- as.numeric(vapply(
      cite_links,
      function(x) strsplit(x, "Cited by ")[[1]][2],
      character(1),
      USE.NAMES = FALSE
    ))

    ## Only convert entries that actually contain a 4-digit year; otherwise
    ## gsub() would hand the whole string to as.integer() and warn.
    pub <- pad_to_page(xpathSApply(doc, "//div[@class='gs_a']", xmlValue))
    has_year <- grepl(year_pattern, pub)
    pub_years <- rep(NA_integer_, hits_per_page)
    pub_years[has_year] <- as.integer(gsub(year_pattern, "\\1", pub[has_year]))

    ind <- ((i - 1) * hits_per_page + 1):(i * hits_per_page)
    dat[ind, 1] <- pub_years
    dat[ind, 2] <- cites
    dat[ind, 3] <- titles
  }
  dat
}
## Search for these terms
terms <- c(
  'empirical processes',
  'proportional hazards model',
  'generalized linear model',
  'semiparametric',
  'generalized estimating equation',
  'false discovery rate',
  'microarray statistics',
  'lasso shrinkage',
  'rna-seq statistics'
)
nterms <- length(terms)
npages <- 3
## One list slot per term; each slot will hold that term's scraped data frame
term_data <- vector(mode = "list", length = nterms)
for (i in seq_along(terms)) {
  ## Scrape the hits and tag every row with the term that produced it
  term_data[[i]] <- cbind(
    scrape_term(terms[i], npages),
    term = rep(terms[i], npages * 10)
  )
  ## Pause between terms before issuing the next batch of requests
  Sys.sleep(3)
  ## Progress indicator: print the index of the term just finished
  cat(i)
}
## Flatten the per-term data frames into one vector per variable
## (as.character() guards against the term/title columns being factors)
term_vec <- unlist(lapply(term_data, function(x) as.character(x$term)))
## Put the term factor in order for the boxplot
## Levels follow the order of `terms`, so this no longer assumes exactly
## 9 terms with 30 hits each.
term_vec <- factor(term_vec, levels = terms)
## Make the axis abbreviated by changing labels
short_labels <- c("Emp. Proc.", "Prop. Haz.", "GLM", "Semi-param.", "GEE", "FDR", "microarray", "lasso", "rna-seq")
## The abbreviations are positional: there must be one per search term
stopifnot(length(short_labels) == nlevels(term_vec))
levels(term_vec) <- short_labels
pubyear_vec <- unlist(lapply(term_data, function(x) x$pub_year))
title_vec <- unlist(lapply(term_data, function(x) as.character(x$title)))
## Create the plot
png(file = "citations-boxplot.png", height = 400, width = 600)
par(
  bg = "black", fg = "white", col.axis = "white",
  col.lab = "white", col.main = "white",
  mar = c(6, 4, 4, 2)
)
boxcol <- "#20B2E3"
pointcol <- "white"
## First pass sets up the coordinate system, title and y label, and returns
## the group names used for the x axis. (The original referred to an
## undefined `term_vec2`; the factor built above is `term_vec`.)
tmp <- boxplot(
  pubyear_vec ~ term_vec,
  col = boxcol,
  bty = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "year",
  main = "Publication Year of First 30 G.S. Hits", frame.plot = FALSE
)
## y-axis label positions (`at2` was never defined): use the default ticks
at2 <- axTicks(2)
grid(nx = NA, ny = NULL)
## Redraw the boxes on top of the grid; add = TRUE keeps the current page
## instead of starting a new one and discarding the grid lines.
boxplot(
  pubyear_vec ~ term_vec,
  col = boxcol,
  xaxt = "n", yaxt = "n", frame.plot = FALSE, add = TRUE
)
stripchart(
  pubyear_vec ~ term_vec,
  vertical = TRUE, method = "jitter", add = TRUE,
  pch = 19, col = pointcol, cex = 0.5
)
axis(side = 1, at = seq_along(tmp$names), labels = tmp$names, tick = FALSE, las = 2)
axis(side = 2, at = at2, tick = FALSE)
## add_simply_logo() is not defined in this script; call it only when the
## surrounding session provides it so the device is always closed below.
if (exists("add_simply_logo", mode = "function")) {
  add_simply_logo("black")
}
dev.off()
## Mean publication year per term
tapply(pubyear_vec, term_vec, function(x) {
  mean(x, na.rm = TRUE)
})
@dorinstanciu
Copy link

There is an issue with the XML package. A workaround would be
library(httr)
doc <- htmlParse(rawToChar(GET(url)$content))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment