## chr1swallace/orcidcloud.R

## orcidcloud.R
## install packages using the following
## install.packages("devtools")
## library(devtools)
## install_github("ropensci/rorcid")
## install.packages(c("tm","wordcloud"))

## load libraries
library(rorcid)
library(wordcloud)

## functions
## Publication years of the works stored in `x`.
## x: list-like record with a `works` element holding a
##    `publication-date.year.value` column.
## Returns the column coerced to numeric.
get.years <- function(x) {
  works <- x[["works"]]
  as.numeric(works[["publication-date.year.value"]])
}
## Titles of the works stored in `x`.
## x: list-like record with a `works` element holding a `title.title.value` column.
## Returns that column unchanged.
get.titles <- function(x) {
  works <- x[["works"]]
  works[["title.title.value"]]
}
## Contributor credit names for every work in `x`.
## x:        list-like record; contributors are read from
##           x[["works"]][["work-contributors.contributor"]].
## surnames: if TRUE, reduce each name to a single word: everything from the
##           first comma onwards is dropped, then everything up to the last
##           space ("Wallace, Chris" -> "Wallace", "John Smith" -> "Smith").
##           If FALSE, the first space in each name is replaced by a comma.
## Returns a list with one character vector per work.
get.authors <- function(x, surnames = FALSE) {
  contributors <- x[["works"]][["work-contributors.contributor"]]
  credit.names <- lapply(contributors, function(entry) entry[["credit-name.value"]])
  tidy <- if (surnames) {
    function(nm) sub(".* ", "", sub(",.*", "", nm))
  } else {
    function(nm) sub(" ", ",", nm)
  }
  lapply(credit.names, tidy)
}

## adapted this function from http://entrenchant.blogspot.co.uk/2013/06/english-word-clouds-in-r.html
## Fold plural terms into their singular form.
## m: numeric matrix with one row per term (rownames are the terms).
## For each term, in row order, any LATER row named <term>s or <term>es is
## added into the term's row and then dropped. A row that has already been
## folded into an earlier term does not collect plurals of its own.
## Returns the reduced matrix (columns unchanged).
aggregate.plurals <- function (m) {
  terms <- rownames(m)
  n.terms <- length(terms)
  positions <- seq_len(n.terms)
  absorbed <- logical(n.terms)          # TRUE once a row is merged into an earlier one
  groups <- vector("list", n.terms)     # rows to be summed into each surviving row

  for (idx in positions) {
    if (absorbed[idx]) next
    candidates <- paste0(terms[[idx]], c("s", "es"))
    hits <- which(positions > idx & terms %in% candidates)
    groups[[idx]] <- c(idx, hits)
    absorbed[hits] <- TRUE
  }

  group.size <- vapply(groups, length, integer(1))
  for (idx in which(group.size > 1)) {
    m[idx, ] <- colSums(m[groups[[idx]], , drop = FALSE])
  }
  m[group.size > 0, , drop = FALSE]
}

## Extra words removed by make.cloud() on top of the standard English stopwords.
mystopwords <- c(
  "using", "near",
  "one", "two", "three", "ten",
  "without"
)

## Draw a word cloud from a collection of text.
##
## text:     character vector (or list of character vectors); each element is
##           handed to tm as one document.
## by:       optional grouping vector parallel to `text`. When supplied, the
##           text within each group is pasted together and the groups are
##           contrasted with comparison.cloud(); when NULL a single
##           wordcloud() is drawn.
## group:    optional bin width; `by` is coerced to numeric and rounded down
##           to a multiple of `group` (e.g. group=5 maps 2013 to 2010).
## min.freq: terms whose total count across all documents is below this value
##           are dropped before plotting.
##
## Returns the value of wordcloud() / comparison.cloud(); called for the plot.
make.cloud <- function(text,by=NULL,group=NULL,min.freq=3) {
  if(!is.null(group) && !is.null(by))
    by <- group * (as.numeric(by) %/% group)
  if(!is.null(by))
    text <- tapply(text,by,paste,collapse=" ")

  ## tm functions are called through the namespace: this file attaches only
  ## rorcid and wordcloud, so unqualified tm names would not be found.
  corp <- tm::VCorpus(tm::VectorSource(text))
  corp <- tm::tm_map(corp, tm::content_transformer(tolower))
  corp <- tm::tm_map(corp, tm::removePunctuation)
  corp <- tm::tm_map(corp, tm::removeNumbers)
  corp <- tm::tm_map(corp, tm::removeWords, c(tm::stopwords("english"),mystopwords))

  ## term x document count matrix, with plural rows folded into singulars
  tdm <- tm::TermDocumentMatrix(corp)
  m <- as.matrix(tdm)
  colnames(m) <- names(text)
  m <- aggregate.plurals(m)

  ## most frequent terms first, then drop the rare ones
  m <- m[order(rowSums(m),decreasing=TRUE),,drop=FALSE ]
  m <- m[rowSums(m)>=min.freq, ,drop=FALSE]

  if(is.null(by)) {
    ## no grouping: one cloud sized by total frequency
    freq <- sort(rowSums(m),decreasing=TRUE)
    return(wordcloud(names(freq),freq,colors=brewer.pal(6,"Dark2"),random.order=FALSE))
  }

  ## grouped: one section of the cloud per column of m
  comparison.cloud(m)
}

## Plot one aspect of an ORCID download.
## data: list whose first element is the record passed to get.titles(),
##       get.authors() or get.years().
## what: "titles"  - word cloud of work titles (default)
##       "authors" - word cloud of contributor surnames
##       "years"   - histogram of publication years
##       Any other string only emits a "<what> not recognised" message.
orcid.cloud <- function(data, what = "titles") {
  switch(
    what,
    titles = {
      titles <- get.titles(data[[1]])
      make.cloud(titles)
    },
    authors = {
      surnames <- get.authors(data[[1]], surnames = TRUE)
      make.cloud(surnames)
    },
    ## expression left untouched: hist() builds its default title from it
    years = hist(get.years(data[[1]])),
    message(what, " not recognised")
  )
}
	## install packages using the following
	## install.packages("devtools")
	## library(devtools)
	## install_github("ropensci/rorcid")
	## install.packages(c("tm","wordcloud"))

	## load libraries
	library(rorcid)
	library(wordcloud)

	## functions
	## Publication years of the works stored in `x`.
	## x: list-like record with a `works` element holding a
	##    `publication-date.year.value` column.
	## Returns the column coerced to numeric.
	get.years <- function(x) {
	  works <- x[["works"]]
	  as.numeric(works[["publication-date.year.value"]])
	}
	## Titles of the works stored in `x`.
	## x: list-like record with a `works` element holding a `title.title.value` column.
	## Returns that column unchanged.
	get.titles <- function(x) {
	  works <- x[["works"]]
	  works[["title.title.value"]]
	}
	## Contributor credit names for every work in `x`.
	## x:        list-like record; contributors are read from
	##           x[["works"]][["work-contributors.contributor"]].
	## surnames: if TRUE, reduce each name to a single word: everything from the
	##           first comma onwards is dropped, then everything up to the last
	##           space ("Wallace, Chris" -> "Wallace", "John Smith" -> "Smith").
	##           If FALSE, the first space in each name is replaced by a comma.
	## Returns a list with one character vector per work.
	get.authors <- function(x, surnames = FALSE) {
	  contributors <- x[["works"]][["work-contributors.contributor"]]
	  credit.names <- lapply(contributors, function(entry) entry[["credit-name.value"]])
	  tidy <- if (surnames) {
	    function(nm) sub(".* ", "", sub(",.*", "", nm))
	  } else {
	    function(nm) sub(" ", ",", nm)
	  }
	  lapply(credit.names, tidy)
	}

	## adapted this function from http://entrenchant.blogspot.co.uk/2013/06/english-word-clouds-in-r.html
	## Fold plural terms into their singular form.
	## m: numeric matrix with one row per term (rownames are the terms).
	## For each term, in row order, any LATER row named <term>s or <term>es is
	## added into the term's row and then dropped. A row that has already been
	## folded into an earlier term does not collect plurals of its own.
	## Returns the reduced matrix (columns unchanged).
	aggregate.plurals <- function (m) {
	  terms <- rownames(m)
	  n.terms <- length(terms)
	  positions <- seq_len(n.terms)
	  absorbed <- logical(n.terms)          # TRUE once a row is merged into an earlier one
	  groups <- vector("list", n.terms)     # rows to be summed into each surviving row

	  for (idx in positions) {
	    if (absorbed[idx]) next
	    candidates <- paste0(terms[[idx]], c("s", "es"))
	    hits <- which(positions > idx & terms %in% candidates)
	    groups[[idx]] <- c(idx, hits)
	    absorbed[hits] <- TRUE
	  }

	  group.size <- vapply(groups, length, integer(1))
	  for (idx in which(group.size > 1)) {
	    m[idx, ] <- colSums(m[groups[[idx]], , drop = FALSE])
	  }
	  m[group.size > 0, , drop = FALSE]
	}

	## Extra words removed by make.cloud() on top of the standard English stopwords.
	mystopwords <- c(
	  "using", "near",
	  "one", "two", "three", "ten",
	  "without"
	)

	## Draw a word cloud from a collection of text.
	##
	## text:     character vector (or list of character vectors); each element is
	##           handed to tm as one document.
	## by:       optional grouping vector parallel to `text`. When supplied, the
	##           text within each group is pasted together and the groups are
	##           contrasted with comparison.cloud(); when NULL a single
	##           wordcloud() is drawn.
	## group:    optional bin width; `by` is coerced to numeric and rounded down
	##           to a multiple of `group` (e.g. group=5 maps 2013 to 2010).
	## min.freq: terms whose total count across all documents is below this value
	##           are dropped before plotting.
	##
	## Returns the value of wordcloud() / comparison.cloud(); called for the plot.
	make.cloud <- function(text,by=NULL,group=NULL,min.freq=3) {
	  if(!is.null(group) && !is.null(by))
	    by <- group * (as.numeric(by) %/% group)
	  if(!is.null(by))
	    text <- tapply(text,by,paste,collapse=" ")

	  ## tm functions are called through the namespace: this file attaches only
	  ## rorcid and wordcloud, so unqualified tm names would not be found.
	  corp <- tm::VCorpus(tm::VectorSource(text))
	  corp <- tm::tm_map(corp, tm::content_transformer(tolower))
	  corp <- tm::tm_map(corp, tm::removePunctuation)
	  corp <- tm::tm_map(corp, tm::removeNumbers)
	  corp <- tm::tm_map(corp, tm::removeWords, c(tm::stopwords("english"),mystopwords))

	  ## term x document count matrix, with plural rows folded into singulars
	  tdm <- tm::TermDocumentMatrix(corp)
	  m <- as.matrix(tdm)
	  colnames(m) <- names(text)
	  m <- aggregate.plurals(m)

	  ## most frequent terms first, then drop the rare ones
	  m <- m[order(rowSums(m),decreasing=TRUE),,drop=FALSE ]
	  m <- m[rowSums(m)>=min.freq, ,drop=FALSE]

	  if(is.null(by)) {
	    ## no grouping: one cloud sized by total frequency
	    freq <- sort(rowSums(m),decreasing=TRUE)
	    return(wordcloud(names(freq),freq,colors=brewer.pal(6,"Dark2"),random.order=FALSE))
	  }

	  ## grouped: one section of the cloud per column of m
	  comparison.cloud(m)
	}

	## Plot one aspect of an ORCID download.
	## data: list whose first element is the record passed to get.titles(),
	##       get.authors() or get.years().
	## what: "titles"  - word cloud of work titles (default)
	##       "authors" - word cloud of contributor surnames
	##       "years"   - histogram of publication years
	##       Any other string only emits a "<what> not recognised" message.
	orcid.cloud <- function(data, what = "titles") {
	  switch(
	    what,
	    titles = {
	      titles <- get.titles(data[[1]])
	      make.cloud(titles)
	    },
	    authors = {
	      surnames <- get.authors(data[[1]], surnames = TRUE)
	      make.cloud(surnames)
	    },
	    ## expression left untouched: hist() builds its default title from it
	    years = hist(get.years(data[[1]])),
	    message(what, " not recognised")
	  )
	}