Speed test of different methods of parallel processing for generating topic models with different numbers of topics. Written for a single Windows 7 laptop with a four-core processor (i.e. not a networked cluster), using data from the topicmodels package.
# Speed tests of different parallel and non-parallel methods
# for iterating over different numbers of topics with
# topicmodels
# load packages
library(topicmodels)
library(plyr)
library(snowfall)
library(foreach)
library(doSNOW)
# clear workspace and stop any previous cluster instances
# (snowfall must be loaded before sfStop() can be called, so the
# packages are loaded first)
rm(list = ls(all.names = TRUE))
gc()
sfStop()
# get data
data("AssociatedPress", package = "topicmodels")
# set number of topics to start with
k <- 20
# set model options
control_LDA_VEM <-
list(estimate.alpha = TRUE, alpha = 50/k, estimate.beta = TRUE,
verbose = 0, prefix = tempfile(), save = 0, keep = 0,
seed = as.integer(100), nstart = 1, best = TRUE,
var = list(iter.max = 10, tol = 10^-6),
em = list(iter.max = 10, tol = 10^-4),
initialize = "random")
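# note (an added observation, not in the original): alpha is computed from
# the initial k = 20 and is not recomputed for the other topic counts fitted
# below; with estimate.alpha = TRUE it is re-estimated during fitting anyway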
# set sequence of topic numbers to iterate over
k_seq <- seq(2, 500, by = 100) # i.e. 2, 102, 202, 302, 402; renamed so base::seq isn't masked
# set parallel processing options
# initiate cores
sfInit(parallel = TRUE, cpus = 4, type = "SOCK") # for snowfall
cl <- makeCluster(4, type = "SOCK") # for snow
registerDoSNOW(cl) # register the snow cluster as the foreach backend
# send data and packages to the worker processes
sfExport("AssociatedPress", "control_LDA_VEM") # for snowfall
sfLibrary(topicmodels) # for snowfall
# again for snow
clusterEvalQ(cl, library(topicmodels)) # for snow
clusterExport(cl, c("AssociatedPress", "control_LDA_VEM")) # for snow
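# optional sanity check (an addition, not in the original): confirm the
# exports reached each snow worker before launching the long-running jobs
clusterEvalQ(cl, exists("AssociatedPress") && exists("control_LDA_VEM"))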
# non-parallel methods
# base lapply
BASE <- system.time(best.model.BASE <<- lapply(k_seq, function(d) LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)))
# plyr non-parallel
PLYR_S <- system.time(best.model.PLYR_S <<- llply(k_seq, function(d) LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM), .progress = "text"))
# parallel methods
# wrapper function for the methods that need a predefined, exported function
wrapper <- function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)
# using parLapply
PARLAP <- system.time(best.model.PARLAP <<- parLapply(cl, k_seq, wrapper))
# using foreach %dopar%
DOPAR <- system.time(best.model.DOPAR <<- foreach(i = k_seq, .export = c("AssociatedPress", "control_LDA_VEM"), .packages = "topicmodels", .verbose = TRUE) %dopar% LDA(AssociatedPress[1:20, ], k = i, control = control_LDA_VEM))
# using sfLapply
SFLAPP <- system.time(best.model.SFLAPP <<- sfLapply(k_seq, function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)))
# using sfClusterApplyLB
SFCLU <- system.time(best.model.SFCLU <<- sfClusterApplyLB(k_seq, function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)))
# using plyr in parallel (uses the doSNOW backend registered above)
PLYRP <- system.time(best.model.PLYRP <<- llply(k_seq, function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM), .parallel = TRUE))
# inspect results
rbind(BASE, PLYR_S, PARLAP, DOPAR, SFLAPP, SFCLU, PLYRP)
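# a hedged sketch (an addition, not part of the original timing comparison):
# the fitted models can be compared across numbers of topics via logLik(),
# here using the models from the parLapply run
LL <- sapply(best.model.PARLAP, function(m) as.numeric(logLik(m)))
plot(k_seq, LL, type = "b", xlab = "number of topics", ylab = "log-likelihood")
# shut down both clusters once the parallel runs are done
stopCluster(cl) # for snow
sfStop() # for snowfall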
### using the lda package
library(lda) # word.counts, filter.words and the Gibbs sampler are from lda; dtm2ldaformat is from topicmodels
AP <- dtm2ldaformat(AssociatedPress[1:30, ])
wc <- word.counts(AP$documents) # get word counts
AP.filtered <- filter.words(AP$documents, as.numeric(names(wc)[wc <= 2])) # remove words occurring two or fewer times
AP.lex.lda <- lda.collapsed.gibbs.sampler(AP.filtered, # use the filtered documents computed above
                                          K = k,
                                          vocab = AP$vocab,
                                          num.iterations = 30,
                                          alpha = 0.1,
                                          eta = 0.1,
                                          compute.log.likelihood = TRUE)
# fits a latent Dirichlet allocation (LDA) model with a collapsed Gibbs
# sampler; consider 0.01 for alpha:
# http://www.bytemining.com/2011/08/sigkdd-2011-conference-day-1-graph-mining-and-david-bleitopic-models/
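# optional peek (an addition): top.topic.words() from the lda package lists
# the highest-weight words in each fitted topic
top.topic.words(AP.lex.lda$topics, num.words = 5, by.score = TRUE)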
# speed test on many topics - it's very fast!
sequ <- seq(2, 202, by = 10)
system.time(lda.ll <- llply(sequ, function(d) lda.collapsed.gibbs.sampler(AP$documents, K = d, vocab = AP$vocab, num.iterations = 30, alpha = 0.1, eta = 0.1, compute.log.likelihood = TRUE), .progress = "text"))
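# a hedged sketch (an addition): with compute.log.likelihood = TRUE the
# sampler returns a 2 x num.iterations $log.likelihoods matrix (full log
# likelihood in row 1), so the final value can be plotted against the
# number of topics
ll.final <- sapply(lda.ll, function(m) m$log.likelihoods[1, ncol(m$log.likelihoods)])
plot(sequ, ll.final, type = "b", xlab = "number of topics", ylab = "full log-likelihood")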