#########################################################################################
# Some functions to quantify your Google Scholar citations page.
# R functions Copyright (C) 2011 John Muschelli (jmuschel@jhsph.edu), Andrew Jaffe (ajaffe@jhsph.edu),
# Jeffrey Leek (jtleek@gmail.com), and the Simply Statistics Blog
# (http://simplystatistics.tumblr.com, http://twitter.com/simplystats)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details: <http://www.gnu.org/licenses/>.
#
#
# These functions depend on the packages: wordcloud, tm, sendmailR, and RColorBrewer. The script
# will attempt to install any that are missing when you source this file.
#
#
# How to use:
# # Source the function
# source("http://biostat.jhsph.edu/~jleek/code/googleCite.r")
#
# # Get the url for a scholar (this is the one for Rafa Irizarry: http://scholar.google.com/citations?user=nFW-2Q8AAAAJ&hl=en)
# # and run the googleCite function. You can plot word clouds of the co-authors and paper titles by setting plotIt=TRUE.
# # If you supply pdfname (e.g. pdfname="yourname_wordcloud.pdf"), the word clouds are written to a pdf file at that location.
# # When you run this function, your Google Scholar data will be sent to our email account, so that we can see who is running
# # the function and perform population-level analyses. The variable out will contain a table with data from your Google Scholar citation page.
#
# out <- googleCite("http://scholar.google.com/citations?user=nFW-2Q8AAAAJ&hl=en", pdfname="rafa_cloud.pdf")
#
#
# # To calculate some popular citation indices you can now apply gcSummary to the output
# gcSummary(out)
#
#
# # You can also search for a specific individual by name using the function searchCite
#
# out2 <- searchCite("Rafa Irizarry", pdfname="rafa_cloud.pdf")
#
########################################################################################
getPckg <- function(pckg) install.packages(pckg, repos = "http://cran.r-project.org")
pckg = try(require(wordcloud))
if(!pckg) {
  cat("Installing 'wordcloud' from CRAN\n")
  getPckg("wordcloud")
  require("wordcloud")
}
pckg = try(require(tm))
if(!pckg) {
  cat("Installing 'tm' from CRAN\n")
  getPckg("tm")
  require("tm")
}
pckg = try(require(sendmailR))
if(!pckg) {
  cat("Installing 'sendmailR' from CRAN\n")
  getPckg("sendmailR")
  require("sendmailR")
}
pckg = try(require(RColorBrewer))
if(!pckg) {
  cat("Installing 'RColorBrewer' from CRAN\n")
  getPckg("RColorBrewer")
  require("RColorBrewer")
}
# main function and helpers
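# googleCite: scrape a Google Scholar citations page (100 records at a time), tag each paper
# with the scholar's authorship position, optionally plot word clouds, and return the table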
googleCite = function(theurl, plotIt = TRUE, pdfname = NULL) {
  theurl = strsplit(theurl, "&hl")[[1]][1]
  alldata <- NULL
  author = getAuthor(paste(theurl, "&view_op=list_works&pagesize=100&cstart=", 0, sep = ""))
  # Page through the profile 100 records at a time until getcites signals the last page
  for (ipage in 0:1000) {
    checker <- ipage * 100
    page = paste(theurl, "&view_op=list_works&pagesize=100&cstart=", checker, sep = "")
    temper <- getcites(page, checkcite = checker)
    alldata <- rbind(alldata, temper$data)
    if (temper$stopit == 1) break
  }
  # Split each author string to record where the scholar appears in the author list
  alldata$"First Author" <- NA
  alldata$"Second Author" <- NA
  alldata$"Last Author" <- NA
  alldata$"N Authors" <- NA
  for (irow in 1:nrow(alldata)) {
    tmp = strsplit(alldata$Author[irow], ",")[[1]]
    alldata$"First Author"[irow] <- tmp[1]
    alldata$"Second Author"[irow] <- tmp[2]
    alldata$"Last Author"[irow] <- tmp[length(tmp)]
    alldata$"N Authors"[irow] <- length(tmp)
  }
  alldata$Is_First <- grepl(alldata$"First Author", pattern = author)
  alldata$Is_Second <- grepl(alldata$"Second Author", pattern = author)
  alldata$Is_Last <- grepl(alldata$"Last Author", pattern = author)
  alldata$"First Author" <- NULL
  alldata$"Second Author" <- NULL
  alldata$"Last Author" <- NULL
  if (plotIt) {
    if (!is.null(pdfname)) pdf(pdfname, h = 6, w = 12)
    par(mfrow = c(1, 2))
    makeAuthorCloud(alldata)
    makePaperCloud(alldata)
    if (!is.null(pdfname)) dev.off()
  }
  # Email a copy of the table to the Simply Statistics account (see the header above);
  # failures are silently ignored
  from <- sprintf("<sendmailR@%s>", Sys.info()[4])
  to <- "<simplystatisticsgs@gmail.com>"
  subject <- author
  body <- list(theurl, mime_part(alldata))
  tmpEmail = try(email <- sendmail(from, to, subject, body, control = list(smtpServer = "ASPMX.L.GOOGLE.COM")), silent = TRUE)
  return(alldata)
}
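# getAuthor: pull the scholar's name (the first two words of the page <title>) from a citations page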
getAuthor <- function(webpage) {
  options(warn = -1)
  con <- url(webpage)
  x <- readLines(con, encoding = "UTF-8")
  y <- strsplit(x, split = "<")
  z <- y[[1]]
  out <- paste(strsplit(strsplit(z[5], "title>")[[1]][2], " ")[[1]][1:2], collapse = " ")
  close(con)
  return(out)
}
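# getcites: parse one page of the citation table into a data frame and flag whether this is
# the last page of results (stopit = 1)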
getcites <- function(page, checkcite) {
  old.locale <- Sys.getlocale()
  Sys.setlocale(locale = "C")
  options(warn = -1)
  con <- url(page)
  x <- readLines(con)
  x <- strsplit(x, split = "<")
  x <- x[[1]]
  ### grab the end of citations
  endcites <- x[grep(pattern = "margin: 0 0.5em 0 0.5em;\">", x = x)[1]]
  endcites <- strsplit(endcites, split = "margin: 0 0.5em 0 0.5em;\">")[[1]][2]
  endcites <- as.numeric(strsplit(endcites, split = "-")[[1]][2])
  stopit <- 0
  if (is.na(endcites)) {
    close(con)
    Sys.setlocale(locale = old.locale)
    return(list(data = NULL, stopit = 1))
  }
  if (endcites < checkcite) stopit <- 1
  keepers <- grep(pattern = "cit-table", x)
  keepers <- keepers[-(1:2)]
  keepers <- c(keepers, length(x))
  x <- x[keepers[2]:keepers[length(keepers)]]
  cites <- grep(x, pattern = "cit-table item")
  cites <- unique(c(cites, length(x)))
  ncites <- length(cites) - 1
  data <- NULL
  # Pull the title, authors, journal, year, and citation count out of each table row
  for (icite in 1:ncites) {
    temp_data <- data.frame(matrix(nrow = 1, ncol = 5))
    temp <- x[cites[icite]:cites[icite + 1]]
    tites <- grep(pattern = "cit-dark-large-link", temp)
    if (length(tites) > 0) temp_data[1, 1] <- strsplit(temp[tites], split = "cit-dark-large-link\">")[[1]][2]
    tites <- grep(pattern = "cit-gray", temp)
    temp2 <- strsplit(temp[tites], split = "\"cit-gray\">")
    if (length(tites) > 0) temp_data[1, 2] <- temp2[[1]][2]
    if (length(temp2) > 1) temp_data[1, 3] <- temp2[[2]][2]
    tites <- grep(pattern = "col-year", temp)
    if (length(tites) > 0) temp_data[1, 4] <- strsplit(temp[tites], split = "col-year\">")[[1]][2]
    tites <- grep(pattern = "col-citedby", temp) + 1
    if (length(tites) > 0) temp_data[1, 5] <- strsplit(temp[tites], split = "\">")[[1]][2]
    data <- rbind(data, temp_data)
  }
  colnames(data) <- c("Paper", "Author", "Journal", "Year", "Citations")
  # Clean up HTML entities and stray characters in the titles and author lists
  data[, "Paper"] <- gsub(x = data[, "Paper"], pattern = "\227", replacement = "--", fixed = TRUE)
  data[, "Paper"] <- gsub(x = data[, "Paper"], pattern = "&#8208;", replacement = "-", fixed = TRUE)
  data[, "Paper"] <- gsub(x = data[, "Paper"], pattern = "&#39;", replacement = "'", fixed = TRUE)
  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "\227", replacement = "--", fixed = TRUE)
  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "&#8208;", replacement = "-", fixed = TRUE)
  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "&#39;", replacement = "'", fixed = TRUE)
  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "\305", replacement = "A", fixed = TRUE)
  close(con)
  Sys.setlocale(locale = old.locale)
  return(list(data = data, stopit = stopit))
}
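# makeAuthorCloud: word cloud of co-author last names (the second word of each author string);
# the most frequent name, typically the scholar themself, is dropped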
makeAuthorCloud = function(tab) {
  colIndex = which(names(tab) == "Author")
  tmp = strsplit(as.character(tab[, colIndex]), ", ")
  out = sapply(tmp, function(x) {
    x = strsplit(x, " ")
    x = sapply(x, function(x) x[2])
    x = tolower(x)
    return(x)
  })
  out = unlist(out)
  tmp2 = table(out)
  tmp2 = tmp2[!(names(tmp2) == "...")]
  d = data.frame(word = names(tmp2), freq = tmp2, row.names = NULL)
  d = d[order(d$freq, decreasing = TRUE), ]
  d = d[-1, ]
  pal = brewer.pal(9, "BuGn")
  pal <- pal[-(1:4)]
  wordcloud(words = d$word, freq = d$freq,
            min.freq = 1, max.words = Inf,
            random.order = FALSE,
            colors = pal, vfont = c("sans serif", "plain"))
}
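# makePaperCloud: word cloud of the words in the paper titles, with punctuation and
# English stopwords removed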
makePaperCloud = function(tab) {
  colIndex = which(names(tab) == "Paper")
  corpus <- Corpus(DataframeSource(data.frame(tab[, colIndex])))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, function(x) removeWords(x, stopwords("english")))
  tdm <- TermDocumentMatrix(corpus)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)
  pal = brewer.pal(9, "RdPu")
  pal <- pal[-(1:4)]
  wordcloud(words = d$word, freq = d$freq,
            min.freq = 1, max.words = Inf,
            random.order = FALSE, colors = pal, vfont = c("sans serif", "plain"))
}
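# searchCite: look up an author by name on Google Scholar and run googleCite on the first hit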
searchCite <- function(Author, ...) {
  auth.names <- strsplit(Author, " ")[[1]]
  auth.names <- paste(auth.names, sep = "", collapse = "+")
  search.page <- paste("http://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=", auth.names, sep = "")
  thepage <- url(search.page)
  x <- readLines(thepage)
  close(thepage)
  x <- strsplit(x[[1]], split = "user=")[[1]]
  if (length(x) > 1) {
    ### if the search returns a hit, grab the first one
    x <- x[2]
    x <- strsplit(x, split = "&amp;")[[1]][1]
    theurl <- paste("http://scholar.google.com/citations?hl=en&user=", x, sep = "")
    print(theurl)
    return(googleCite(theurl, ...))
  } else stop("No Author found")
}
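# gcSummary: print citation indices (h-index, g-index, m-index, and authorship-position
# variants of the h-index) from a googleCite table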
gcSummary <- function(alldata) {
  citations = as.numeric(alldata$Citations)
  citations[is.na(citations)] = 0
  nauthors = as.numeric(alldata$"N Authors")
  n = dim(alldata)[1]
  nF = sum(alldata$Is_First)
  nL = sum(alldata$Is_Last)
  nFL = sum(alldata$Is_Last | alldata$Is_First)
  nFS = sum(alldata$Is_First | alldata$Is_Second)
  totalPapers = dim(alldata)[1]
  totalCites = sum(citations, na.rm = TRUE)
  medianCites = median(citations, na.rm = TRUE)
  medianAuthorCites = median(citations / nauthors, na.rm = TRUE)
  # h-index: the largest h such that h papers have at least h citations each;
  # citation counts are sorted in decreasing order before the comparison
  hindex = sum(sort(citations, decreasing = TRUE) >= 1:n)
  hindexF = sum(sort(citations[alldata$Is_First], decreasing = TRUE) >= 1:nF)
  hindexL = sum(sort(citations[alldata$Is_Last], decreasing = TRUE) >= 1:nL)
  hindexFL = sum(sort(citations[alldata$Is_Last | alldata$Is_First], decreasing = TRUE) >= 1:nFL)
  hindexFS = sum(sort(citations[alldata$Is_First | alldata$Is_Second], decreasing = TRUE) >= 1:nFS)
  # g-index: the largest g such that the g most-cited papers have at least g^2 citations in total
  tmp = cumsum(sort(citations, decreasing = TRUE))
  gindex = sum(tmp >= (1:n)^2)
  # m-index: h-index divided by the number of years since the first publication
  nyears = as.numeric(format(Sys.time(), "%Y")) - min(as.numeric(alldata$Year), na.rm = TRUE)
  mindex = hindex / nyears
  cat("Total papers = ", totalPapers, "\n", sep = "")
  cat("Median citations per paper = ", medianCites, "\n", sep = "")
  cat("Median (citations / # of authors) per paper = ", medianAuthorCites, "\n", sep = "")
  cat("H-index = ", hindex, "\n", sep = "")
  cat("G-index = ", gindex, "\n", sep = "")
  cat("M-index = ", mindex, "\n", sep = "")
  cat("First author H-index = ", hindexF, "\n", sep = "")
  cat("Last author H-index = ", hindexL, "\n", sep = "")
  cat("First or last author H-index = ", hindexFL, "\n", sep = "")
  cat("First or second author H-index = ", hindexFS, "\n", sep = "")
}
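# A worked example of the index definitions above, using hypothetical citation counts:
# citations sorted in decreasing order: 10, 5, 3, 1
#   h-index = 3 (three papers with at least 3 citations each)
#   g-index = 4 (cumulative sums 10, 15, 18, 19 vs. g^2 = 1, 4, 9, 16, and 19 >= 16)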