Worked example to duplicate analyses from Westgate et al. (2015) Conservation Biology 29(6): 1606-1614
# Worked example to use revtools (https://github.com/mjwestgate/revtools)
# to duplicate analyses from the following paper:
# MJ Westgate, PS Barton, JC Pierson & DB Lindenmayer (2015) Text analysis tools
# for identification of emerging topics and research gaps in conservation science. 
# Conservation Biology 29(6): 1606-1614. http://doi.org/10.1111/cobi.12605
# NOTE: This code has been re-written; it is NOT the code from the original paper.
# As a result there may be bugs here not present in the published version.
# This is a preliminary upload ahead of conversion to a tutorial for revtools.
# Specify required packages
library(revtools) # bibliographic data manipulation
library(ggplot2) # plotting
library(Rmpfr) # setting precision
library(topicmodels) # topic models (unsurprisingly!)
library(lme4) # mixed models
library(ggdendro) # dendrogram
library(ggcorrplot) # plotting of a correlation matrix
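# NOTE: revtools may need to be installed from GitHub rather than CRAN;
# one option (assuming the devtools package is available) is:
# devtools::install_github("mjwestgate/revtools")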
## STEP 1: DATA IMPORT AND PREPARATION
data<-read_bibliography("your_dataset.ris")
# summary(data) # check
remove_list<-c(
"wiley", "elsevier", "copyright", "ltd",
"first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth",
"one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten")
dtm<-make_DTM(data, stop_words=remove_list) # create DTM
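# Optional check: make_DTM should return a document-term matrix, so its
# dimensions give the number of articles (rows) and retained terms (columns)
# (assumes an object with a dim() method, e.g. a DocumentTermMatrix)
# dim(dtm)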
## STEP 2: DETERMINE OPTIMAL NUMBER OF TOPICS
# This section uses code given by Ben Marwick on Stack Overflow:
# http://stackoverflow.com/questions/21355156/topic-models-cross-validation-with-loglikelihood-or-perplexity/21394092#21394092
# create function for calculating harmonic mean
harmonicMean <- function(logLikelihoods, precision = 2000L) {
  llMed <- median(logLikelihoods)
  as.double(llMed - log(mean(exp(-mpfr(logLikelihoods, prec = precision) + llMed))))
}
# run multiple models - NOTE THIS MAY TAKE A LONG TIME TO RUN!!
k_values <- seq(5, 40, by = 5)
burnin <- 1000
iterations <- 10000
keep <- 50
models <- lapply(k_values, function(k){
  topicmodels::LDA(
    dtm,
    k,
    method = "Gibbs",
    control = list(burnin = burnin, iter = iterations, keep = keep)
  )
})
logLiks <- lapply(models, function(L) L@logLiks[-c(1:(burnin/keep))])
model_selection<-data.frame(
  k = k_values,
  mean_log_lik = sapply(logLiks, function(h) harmonicMean(h))
)
# end borrowed code
# plot these results
ggplot(model_selection, aes(x=k, y=mean_log_lik)) + geom_point()
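# One simple (assumed) rule of thumb is to pick the k that maximises the
# harmonic-mean log-likelihood; the single model below uses k = 15, as in the paper
k_best <- k_values[which.max(model_selection$mean_log_lik)]
k_best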
## STEP 3: CALCULATE 'BEST' TOPIC MODEL, SUMMARIZE
# run a single model with high number of iterations etc.
model<-topicmodels::LDA(
  dtm,
  k=15,
  method="Gibbs",
  control=list(burnin = 1000, iter = 6000, keep = 100)
)
# similarity
M1 <- t(modeltools::posterior(model)$terms) # word x topic
M2 <- modeltools::posterior(model)$topics # article x topic
word.dist <- dist(as.data.frame(t(log10(M1+1))), method="euclidean", diag=FALSE, upper=FALSE) # topic-topic distance in word space
article.dist <- dist(as.data.frame(log10(t(M2+1))), method="euclidean", diag=FALSE, upper=FALSE) # topic-topic distance in article space
attr(word.dist, "Labels")<-apply(terms(model, 5), 2, function(a){paste(a, collapse=", ")})
similarity<-hclust(word.dist)
ggdendrogram(similarity, rotate=TRUE)
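# The dendrogram labels only show the top 5 terms per topic; to inspect more
# terms directly, topicmodels::terms() can be used, e.g. the top 10 per topic:
terms(model, 10)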
# popularity
articles<-as.data.frame(data)
articles$topic<-topics(model)
popularity<-as.data.frame(
  xtabs(~ year + topic, data=articles, drop.unused.levels=FALSE),
  stringsAsFactors=FALSE)
popularity$year<-scale(as.numeric(popularity$year))
popularity$topic<-as.factor(popularity$topic)
popularity_model<-glmer(Freq ~ 1 + (1 | topic) + (year - 1 | topic),
  family=poisson(link="log"),
  data=popularity)
# summary(popularity_model)
popularity_results<-ranef(popularity_model)$topic
colnames(popularity_results)<-c("intercept", "slope")
ggplot(popularity_results, aes(x=intercept, y=slope)) + geom_point()
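# Labelling points with their topic number (a plotting tweak, not from the
# original paper) makes it easier to see which topics are increasing over time:
popularity_results$topic <- rownames(popularity_results)
ggplot(popularity_results, aes(x=intercept, y=slope, label=topic)) +
  geom_point() +
  geom_text(vjust=-0.7, size=3)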
# generality
generality<-lapply(c(1:model@k), function(a, dist, topics){
  x<-as.numeric(dist[, a])
  c(included=mean(x[topics==a]), excluded=mean(x[topics!=a]))
}, topics=topics(model), dist=M2)
generality_result<-as.data.frame(do.call(rbind, generality))
generality_result$topic<-c(1:model@k)
ggplot(generality_result, aes(x=included, y=excluded)) + geom_point()
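# As above, adding topic labels (an assumed tweak) helps identify the most
# 'general' topics, i.e. those with a high mean proportion in excluded articles:
ggplot(generality_result, aes(x=included, y=excluded, label=topic)) +
  geom_point() +
  geom_text(vjust=-0.7, size=3)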
# overlap
scale.zero.one<-function(x){x<-x-min(x, na.rm=TRUE); x<-x*100; x<-x/max(x, na.rm=TRUE); return(x)}
gap.width<-scale.zero.one(word.dist) * scale.zero.one(article.dist)
ggcorrplot(as.matrix(gap.width),
  type="lower",
  hc.order=TRUE)
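# To pull out the pair of topics with the largest combined distance (i.e. the
# widest 'gap') rather than reading it off the plot (an assumed helper, not
# part of the original analysis):
gap_matrix <- as.matrix(gap.width)
which(gap_matrix == max(gap_matrix), arr.ind = TRUE)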