Speed test of different methods of parallel processing to generate topic models with different numbers of topics. Coded for a single Windows 7 laptop with a four-core processor (i.e. not a networked cluster) and data from the topicmodels package.
# Speed tests of different parallel and non-parallel methods
# for iterating over different numbers of topics with
# topicmodels
# clear workspace
rm(list = ls(all.names = TRUE))
gc()
# load packages
library(topicmodels)
library(plyr)
library(snowfall)
library(foreach)
library(doSNOW) # also attaches snow, which provides makeCluster()
library(lda)    # for the collapsed Gibbs sampler section at the end
# stop any cluster instances left over from a previous run
sfStop()
# get data
data("AssociatedPress", package = "topicmodels")
# set number of topics to start with
k <- 20
# set model options
control_LDA_VEM <-
  list(estimate.alpha = TRUE, alpha = 50/k, estimate.beta = TRUE,
       verbose = 0, prefix = tempfile(), save = 0, keep = 0,
       seed = as.integer(100), nstart = 1, best = TRUE,
       var = list(iter.max = 10, tol = 10^-6),
       em = list(iter.max = 10, tol = 10^-4),
       initialize = "random")
# set sequence of topic numbers to iterate over
seq <- seq(2, 500, by = 100) # note: masks base::seq, though seq() still resolves when called as a function
# set parallel processing options
# initiate cores
sfInit(parallel=TRUE, cpus=4, type="SOCK") # for snowfall
cl <- makeCluster(4, type = "SOCK") # for snow
registerDoSNOW(cl) # for snow
# send data and packages to the worker processes
sfExport("AssociatedPress", "control_LDA_VEM") # for snowfall
sfLibrary(topicmodels) # for snowfall
# again for snow
clusterEvalQ(cl, library(topicmodels)) # for snow
clusterExport(cl, c("AssociatedPress", "control_LDA_VEM")) # for snow
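# sanity check (an added sketch, not in the original gist): confirm the snow
# workers can actually see the exported data before launching the long jobs
clusterEvalQ(cl, exists("AssociatedPress")) # should return TRUE for every worker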
# non-parallel methods
# base
BASE <- system.time(best.model.BASE <<- lapply(seq, function(d){LDA(AssociatedPress[1:20,], control = control_LDA_VEM, k = d)}))
# plyr non-parallel
PLYR_S <- system.time(best.model.PLYR_S <<- llply(seq, function(d){LDA(AssociatedPress[1:20,], control = control_LDA_VEM, k = d)}, .progress = "text"))
# parallel methods
# wrapper for the methods below, which are easiest to call with a single-argument function
wrapper <- function(d) topicmodels::LDA(AssociatedPress[1:20,], control = control_LDA_VEM, k = d)
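# quick local check of the wrapper (an added sketch, not in the original gist):
# fit one tiny model in the master process so any errors surface here rather
# than inside an opaque worker traceback
system.time(wrapper(2))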
# using parLapply
PARLAP <- system.time(best.model.PARLAP <<- parLapply(cl, seq, wrapper))
# using dopar
DOPAR <- system.time(best.model.DOPAR <<- foreach(i = seq,
  .export = c("AssociatedPress", "control_LDA_VEM"),
  .packages = "topicmodels", .verbose = TRUE) %dopar%
  (LDA(AssociatedPress[1:20,], control = control_LDA_VEM, k = i)))
# using sfLapply
SFLAPP <- system.time(best.model.SFLAPP <<- sfLapply(seq, function(d){topicmodels::LDA(AssociatedPress[1:20,], control = control_LDA_VEM, k = d)}))
# using sfClusterApplyLB
SFCLU <- system.time(best.model.SFCLU <<- sfClusterApplyLB(seq, function(d){topicmodels::LDA(AssociatedPress[1:20,], control = control_LDA_VEM, k = d)}))
# using plyr in parallel (needs snow options)
PLYRP <- system.time(best.model.PLYRP <<- llply(seq, function(d){topicmodels::LDA(AssociatedPress[1:20,], control = control_LDA_VEM, k = d)}, .parallel = TRUE))
# inspect results
rbind(BASE, PLYR_S, PARLAP, DOPAR, SFLAPP, SFCLU, PLYRP)
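# compare model fit across topic numbers (an added sketch, not in the original
# gist): topicmodels provides a logLik() method for fitted LDA models
ll <- sapply(best.model.BASE, logLik)
plot(seq, ll, type = "b", xlab = "number of topics", ylab = "log-likelihood")
# shut down the clusters now that the timing runs are done
stopCluster(cl) # for snow
sfStop()        # for snowfall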
### using the lda package
AP <- dtm2ldaformat(AssociatedPress[1:30,]) # convert the first 30 documents to lda format
wc <- word.counts(AP$documents) # get word counts
AP.filtered <- filter.words(AP$documents, as.numeric(names(wc)[wc <= 2])) # remove words occurring two or fewer times (note: AP.filtered is not used below)
# fit a latent Dirichlet allocation (LDA) model with a collapsed Gibbs sampler;
# consider alpha = 0.01, per http://www.bytemining.com/2011/08/sigkdd-2011-conference-day-1-graph-mining-and-david-bleitopic-models/
AP.lex.lda <- lda.collapsed.gibbs.sampler(AP$documents,
                                          k,        # number of topics
                                          AP$vocab,
                                          30,       # number of Gibbs iterations
                                          0.1,      # alpha
                                          0.1,      # eta
                                          compute.log.likelihood = TRUE)
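# inspect the fitted topics (an added sketch, not in the original gist):
# top.topic.words() from the lda package returns the highest-weighted words in
# each topic
top.topic.words(AP.lex.lda$topics, num.words = 5, by.score = TRUE)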
# speed test on many topics - it's very fast!
sequ <- seq(2, 202, 10)
system.time(lda.ll <- llply(sequ, function(d) lda.collapsed.gibbs.sampler(AP$documents, d, AP$vocab, 30, 0.1, 0.1, compute.log.likelihood = TRUE), .progress = "text"))
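# picking a topic number from these fits (an added sketch, not in the original
# gist): with compute.log.likelihood = TRUE each result carries a
# $log.likelihoods matrix (2 rows x num.iterations columns); take the full
# log-likelihood at the final iteration and plot it against the number of topics
ll.final <- sapply(lda.ll, function(m) m$log.likelihoods[1, ncol(m$log.likelihoods)])
plot(sequ, ll.final, type = "b", xlab = "number of topics",
     ylab = "log-likelihood (final iteration)")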