Worked example to duplicate analyses from Westgate et al. (2015) Conservation Biology 29(6): 1606-1614
# Worked example to use revtools (https://github.com/mjwestgate/revtools)
# to duplicate analyses from the following paper:
# MJ Westgate, PS Barton, JC Pierson & DB Lindenmayer (2015) Text analysis tools
# for identification of emerging topics and research gaps in conservation science. 
# Conservation Biology 29(6): 1606-1614. http://doi.org/10.1111/cobi.12605
# NOTE: This code has been re-written; it is NOT the code from the original paper.
# As a result there may be bugs here not present in the published version.
# This is a preliminary upload ahead of conversion to a tutorial for revtools.
# Specify required packages
library(revtools) # bibliographic data manipulation
library(ggplot2) # plotting
library(Rmpfr) # setting precision
library(topicmodels) # topic models (unsurprisingly!)
library(lme4) # mixed models
library(ggdendro) # dendrogram
library(ggcorrplot) # plotting of a correlation matrix
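# NOTE: revtools may need to be installed from GitHub rather than CRAN;
# one option (assuming the devtools package is available) is:
# devtools::install_github("mjwestgate/revtools")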
## STEP 1: DATA IMPORT AND PREPARATION
data<-read_bibliography("your_dataset.ris")
# summary(data) # check
remove_list<-c(
"wiley", "elsevier", "copyright", "ltd",
"first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth",
"one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten")
dtm<-make_DTM(data, stop_words=remove_list) # create DTM
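# Optional check: make_DTM should return a document-term matrix, so its
# dimensions give the number of articles (rows) and retained terms (columns)
# (assumes an object with a dim() method, e.g. a DocumentTermMatrix)
# dim(dtm)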
## STEP 2: DETERMINE OPTIMAL NUMBER OF TOPICS
# This section uses code given by Ben Marwick on Stack Overflow:
# http://stackoverflow.com/questions/21355156/topic-models-cross-validation-with-loglikelihood-or-perplexity/21394092#21394092
# create function for calculating harmonic mean
harmonicMean <- function(logLikelihoods, precision = 2000L) {
  llMed <- median(logLikelihoods)
  as.double(llMed - log(mean(exp(-mpfr(logLikelihoods, prec = precision) + llMed))))
}
# run multiple models - NOTE THIS MAY TAKE A LONG TIME TO RUN!!
k_values <- seq(5, 40, by = 5)
burnin <- 1000
iterations <- 10000
keep <- 50
models <- lapply(k_values, function(k){
  topicmodels::LDA(
    dtm,
    k,
    method = "Gibbs",
    control = list(burnin = burnin, iter = iterations, keep = keep)
  )
})
logLiks <- lapply(models, function(L) L@logLiks[-c(1:(burnin/keep))])
model_selection<-data.frame(
  k = k_values,
  mean_log_lik = sapply(logLiks, function(h) harmonicMean(h))
)
# end borrowed code
# plot these results
ggplot(model_selection, aes(x=k, y=mean_log_lik)) + geom_point()
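# One simple (assumed) rule of thumb is to pick the k that maximises the
# harmonic-mean log-likelihood; the single model below uses k = 15, as in the paper
k_best <- k_values[which.max(model_selection$mean_log_lik)]
k_best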
## STEP 3: CALCULATE 'BEST' TOPIC MODEL, SUMMARIZE
# run a single model with high number of iterations etc.
model<-topicmodels::LDA(
  dtm,
  k=15,
  method="Gibbs",
  control=list(burnin = 1000, iter = 6000, keep = 100)
)
# similarity
M1 <- t(modeltools::posterior(model)$terms) # word x topic
M2 <- modeltools::posterior(model)$topics # article x topic
word.dist <- dist(as.data.frame(t(log10(M1+1))), method="euclidean", diag=FALSE, upper=FALSE) # topic-topic distance in word space
article.dist <- dist(as.data.frame(log10(t(M2+1))), method="euclidean", diag=FALSE, upper=FALSE) # topic-topic distance in article space
attr(word.dist, "Labels")<-apply(terms(model, 5), 2, function(a){paste(a, collapse=", ")})
similarity<-hclust(word.dist)
ggdendrogram(similarity, rotate=TRUE)
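# The dendrogram labels only show the top 5 terms per topic; to inspect more
# terms directly, topicmodels::terms() can be used, e.g. the top 10 per topic:
terms(model, 10)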
# popularity
articles<-as.data.frame(data)
articles$topic<-topics(model)
popularity<-as.data.frame(
  xtabs(~ year + topic, data=articles, drop.unused.levels=FALSE),
  stringsAsFactors=FALSE)
popularity$year<-scale(as.numeric(popularity$year))
popularity$topic<-as.factor(popularity$topic)
popularity_model<-glmer(Freq ~ 1 + (1 | topic) + (year - 1 | topic),
  family=poisson(link="log"),
  data=popularity)
# summary(popularity_model)
popularity_results<-ranef(popularity_model)$topic
colnames(popularity_results)<-c("intercept", "slope")
ggplot(popularity_results, aes(x=intercept, y=slope)) + geom_point()
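# Labelling points with their topic number (a plotting tweak, not from the
# original paper) makes it easier to see which topics are increasing over time:
popularity_results$topic <- rownames(popularity_results)
ggplot(popularity_results, aes(x=intercept, y=slope, label=topic)) +
  geom_point() +
  geom_text(vjust=-0.7, size=3)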
# generality
generality<-lapply(c(1:model@k), function(a, dist, topics){
  x<-as.numeric(dist[, a])
  c(included=mean(x[topics==a]), excluded=mean(x[topics!=a]))
}, topics=topics(model), dist=M2)
generality_result<-as.data.frame(do.call(rbind, generality))
generality_result$topic<-c(1:model@k)
ggplot(generality_result, aes(x=included, y=excluded)) + geom_point()
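# As above, adding topic labels (an assumed tweak) helps identify the most
# 'general' topics, i.e. those with a high mean proportion in excluded articles:
ggplot(generality_result, aes(x=included, y=excluded, label=topic)) +
  geom_point() +
  geom_text(vjust=-0.7, size=3)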
# overlap
scale.zero.one<-function(x){x<-x-min(x, na.rm=TRUE); x<-x*100; x<-x/max(x, na.rm=TRUE); return(x)}
gap.width<-scale.zero.one(word.dist) * scale.zero.one(article.dist)
ggcorrplot(as.matrix(gap.width),
  type="lower",
  hc.order=TRUE)
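# To pull out the pair of topics with the largest combined distance (i.e. the
# widest 'gap') rather than reading it off the plot (an assumed helper, not
# part of the original analysis):
gap_matrix <- as.matrix(gap.width)
which(gap_matrix == max(gap_matrix), arr.ind = TRUE)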