Created
March 28, 2018 06:18
-
-
Save mjwestgate/981909640d13c362897e57fcf466b58b to your computer and use it in GitHub Desktop.
Worked example to duplicate analyses from Westgate et al. (2015) Conservation Biology 29(6): 1606-1614
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Worked example to use revtools (https://github.com/mjwestgate/revtools) | |
# to duplicate analyses from the following paper: | |
# MJ Westgate, PS Barton, JC Pierson & DB Lindenmayer (2015) Text analysis tools | |
# for identification of emerging topics and research gaps in conservation science. | |
# Conservation Biology 29(6): 1606-1614. http://doi.org/10.1111/cobi.12605 | |
# NOTE: This has code has been re-written; it is NOT the code from the original paper. | |
# As a result there may be bugs here not present in the published version. | |
# This is a preliminary upload ahead of conversion to a tutorial for revtools. | |
# Specify required packages | |
library(revtools) # bibliographic data manipulation | |
library(ggplot2) # plotting | |
library(Rmpfr) # setting precision | |
library(topicmodels) # topic models (unsurprisingly!) | |
library(lme4) # mixed models | |
library(ggdendro) # dendrogram | |
library(ggcorrplot) # plotting of a correlation matrix | |
## STEP 1: DATA IMPORT AND PREPARATION | |
data<-read_bibliography("your_dataset.ris") | |
# summary(data) # check | |
remove_list<-c( | |
"wiley", "elsevier", "copyright", "ltd", | |
"first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", | |
"one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten") | |
dtm<-make_DTM(data, stop_words=remove_list) # create DTM | |
## STEP 2: DETERMINE OPTIMAL NUMBER OF TOPICS | |
# This section uses code given by Ben Marmick on SO: | |
# http://stackoverflow.com/questions/21355156/topic-models-cross-validation-with-loglikelihood-or-perplexity/21394092#21394092 | |
# create function for calculating harmonic mean | |
harmonicMean <- function(logLikelihoods, precision = 2000L) { | |
llMed <- median(logLikelihoods) | |
as.double(llMed - log(mean(exp(-mpfr(logLikelihoods, prec = precision) + llMed)))) | |
} | |
# run multiple models - NOTE THIS MAY TAKE A LONG TIME TO RUN!! | |
k_values <- seq(5, 40, by = 5) | |
burnin <- 1000 | |
iterations <- 10000 | |
keep <- 50 | |
models <- lapply(k_values, function(k){ | |
topicmodels::LDA( | |
dtm, | |
k, | |
method = "Gibbs", | |
control = list(burnin = burnin, iter = iterations, keep = keep) | |
) | |
}) | |
logLiks <- lapply(models, function(L) L@logLiks[-c(1:(burnin/keep))]) | |
model_selection<-data.frame( | |
k = k_values, | |
mean_log_lik = sapply(logLiks, function(h) harmonicMean(h)) | |
) | |
# end borrowed code | |
# plot these results | |
ggplot(model_selection, aes(x=k, y=mean_log_lik)) + geom_point() | |
## STEP 3: CALCUALATE 'BEST' TOPIC MODEL, SUMMARIZE | |
# run a single model with high number of iterations etc. | |
model<-topicmodels::LDA( | |
dtm, | |
k=15, | |
LDA.control=list(burnin = 1000, iter = 6000, keep = 100) | |
) | |
# similarity | |
M1 <-t(modeltools::posterior(model)$terms) # word x topic | |
M2 <-modeltools::posterior(model)$topics # article x topic | |
word.dist<-dist(as.data.frame(t(log10(M1+1))), method="euclidean", diag=FALSE, upper=FALSE) | |
article.dist <-dist(as.data.frame(log10(t(M2+1))), method="euclidean", diag=FALSE, upper=FALSE) | |
attr(word.dist, "Labels")<-apply(terms(model, 5), 2, function(a){paste(a, collapse=", ")}) | |
similarity<-hclust(word.dist) | |
ggdendrogram(similarity, rotate=TRUE) | |
# popularity | |
articles<-as.data.frame(data) | |
articles$topic<-topics(model) | |
popularity<-as.data.frame( | |
xtabs(~ year + topic, data=articles, drop.unused.levels=FALSE), | |
stringsAsFactors=FALSE) | |
popularity$year<-scale(as.numeric(popularity$year)) | |
popularity$topic<-as.factor(popularity$topic) | |
popularity_model<-glmer(Freq ~ 1 + (1 | topic) + (year -1 | topic), | |
family=poisson(link="log"), | |
data=popularity) | |
# summary(popularity_model) | |
popularity_results<-ranef(popularity_model)$topic | |
colnames(popularity_results)<-c("intercept", "slope") | |
ggplot(popularity_results, aes(x=intercept, y=slope)) + geom_point() | |
# generality | |
generality<-lapply(c(1:model@k), function(a, dist, topics){ | |
x<-as.numeric(M2[, a]) | |
c(included=mean(x[topics==a]), excluded=mean(x[topics!=a])) | |
},topics=topics(model), dist=M2) | |
generality_result<-as.data.frame(do.call(rbind, generality)) | |
generality_result$topic<-c(1:model@k) | |
ggplot(generality_result, aes(x=included, y=excluded)) + geom_point() | |
# overlap | |
scale.zero.one<-function(x){x<-x-min(x, na.rm=TRUE); x<-x*100; x<-x/max(x, na.rm=TRUE); return(x)} | |
gap.width<-scale.zero.one(word.dist)* scale.zero.one(article.dist) | |
ggcorrplot(as.matrix(gap.width), | |
type="lower", | |
hc.order=TRUE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment