pLSA
# Modifies the code in the GitHub thread
# https://gist.github.com/ratsgo/c25deb6d79f3ab8b0b050af751fbbdb8#file-plsa-r-L78
# to speed it up, replacing nested for loops with vectorised alternatives
# and avoiding cumbersome string lookups.
# Assuming this is based on Hofmann (2001), https://doi.org/10.1023/A:1007617005950,
# p. 182, eqns 6, 11 and 12, recast for the parametrisation of eq. 5.
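# For reference, the quantities the code below estimates (my summary of the model,
# in the parametrisation mentioned above):
#   P(d,w)   = sum_z P(z) * P(d|z) * P(w|z)
# E-step:
#   P(z|d,w) = P(z)P(d|z)P(w|z) / sum_z' P(z')P(d|z')P(w|z')
# M-step:
#   P(w|z) proportional to sum_d n(d,w) * P(z|d,w)
#   P(d|z) proportional to sum_w n(d,w) * P(z|d,w)
#   P(z)   = sum_{d,w} n(d,w) * P(z|d,w) / sum_{d,w} n(d,w)
# where n(d,w) is the count of term w in document d (the 'corpus' matrix below).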
# prepare corpus
corpus <- matrix(c(1,2,0,0,0,0,
                   3,1,0,0,0,0,
                   2,0,0,0,0,0,
                   3,3,2,3,2,4,
                   0,0,3,2,0,0,
                   0,0,4,1,0,0,
                   0,0,0,0,4,3,
                   0,0,0,0,2,1,
                   0,0,0,0,3,2,
                   0,0,1,0,2,3), ncol=6, byrow=TRUE)
# set labels
docnames <- c('Doc1','Doc2','Doc3','Doc4','Doc5','Doc6')
termnames <- c('Baseball','Basketball','Boxing','Money','Interest','Rate','Democrat','Republican','Caucus','President')
colnames(corpus) <- docnames
rownames(corpus) <- termnames
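# Quick sanity check (my addition, not in the original gist): the corpus should be
# 10 terms x 6 documents; the column sums are the document lengths.
stopifnot(all(dim(corpus) == c(length(termnames), length(docnames))))
colSums(corpus)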
# set counters
nterms <- length(termnames)
ndocs <- length(docnames)
# set the number of topics
ntopic <- 3
topicnames <- paste0('topic',1:ntopic)
#### vectorisation setup ####
sum_corpus <- sum(corpus) # a constant reused later
# vectorise row and column indexes
a_plsa <- 1:ncol(corpus) # document indexes
b_plsa <- 1:nrow(corpus) # term (semantic unit) indexes
# pre-generate every (doc, term) index combination to look up; in the original thread this
# step is a nested for loop coupled with cumbersome unlisting
for_idx_plsa <- expand.grid(a_plsa,b_plsa)
colnames(for_idx_plsa) <- c("doc","semant_unit")
# same grid with the roles swapped: fix the doc and vary the semantic unit fastest
for_idx_plsa_swapped <- expand.grid(b_plsa,a_plsa)
colnames(for_idx_plsa_swapped) <- c("semant_unit","doc")
# vectorise the corpus so that it lines up with each expand.grid ordering
vect_corpus_PLSA <- c(t(corpus)) # doc varies fastest; used to compute pwz
vect_corpus_PLSA_2 <- c(corpus) # term varies fastest; used to compute pdz (and pz)
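# Alignment check (my addition): each row of an index grid must pair with the matching count
# in its vectorised corpus, i.e. row i of for_idx_plsa refers to n(term_i, doc_i).
stopifnot(nrow(for_idx_plsa) == length(vect_corpus_PLSA),
          nrow(for_idx_plsa_swapped) == length(vect_corpus_PLSA_2))
head(cbind(for_idx_plsa, count = vect_corpus_PLSA)) # doc index varies fastest, matching c(t(corpus))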
#### random initialisation ####
set.seed(123)
pz.init <- matrix(runif(ntopic,min=0,max=1),ncol=ntopic)
colnames(pz.init) <- topicnames
pdz.init <- matrix(runif(ndocs*ntopic,min=0,max=1),ncol=ntopic)
colnames(pdz.init) <- topicnames
rownames(pdz.init) <- docnames
pwz.init <- matrix(runif(nterms*ntopic,min=0,max=1),ncol=ntopic)
colnames(pwz.init) <- topicnames
rownames(pwz.init) <- termnames
parameter.init <- list(pwz.init,pdz.init,pz.init)
#### Expectation Step - without for loops ####
# x: vector of document indexes, y: vector of term indexes (one entry per (doc, term) pair)
estep2 <- function(x, y, parameter){
  pwz <- parameter[[1]]
  pdz <- parameter[[2]]
  pz <- parameter[[3]]
  a <- pwz[y,] # P(w|z) for each observation
  b <- pdz[x,] # P(d|z) for each observation
  # replicate pz across rows so it is commensurate with a and b
  # thread: https://stackoverflow.com/questions/14927507/duplicate-vector-into-matrix-r
  c <- matrix(pz, nrow=length(y), ncol=length(pz), byrow=TRUE)
  numerator <- c * b * a # P(z)P(d|z)P(w|z), one row per (doc, term) pair
  denominator <- apply(numerator,1,sum)
  numerator/denominator # posterior P(z|d,w): each row sums to 1
}
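# Sanity check on the vectorised E-step (my addition): with the initial parameters, each row
# of the posterior P(z|d,w) should sum to 1, one row per (doc, term) pair of the swapped grid.
posterior.check <- estep2(for_idx_plsa_swapped$doc, for_idx_plsa_swapped$semant_unit, parameter.init)
stopifnot(nrow(posterior.check) == nrow(for_idx_plsa_swapped),
          all(abs(rowSums(posterior.check) - 1) < 1e-12))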
#### Maximization Step - without for loops ####
# x: vector of term indexes aligned row-by-row with 'posterior' (for_idx_plsa ordering)
mstep_pwz <- function(x, posterior){
  # posterior has (docs*terms) rows and one column per topic; weight each column by the counts
  temp_result_1 <- apply(posterior,2,function(w){
    w*vect_corpus_PLSA # element-wise n(d,w) * P(z|d,w), to be summed over documents
  })
  temp_result_2 <- cbind(x, temp_result_1)
  colnames(temp_result_2) <- c("semant_unit", colnames(temp_result_1))
  # group-wise sum over documents, a kind of "sumif"
  # thread: https://stackoverflow.com/questions/21607464/what-is-the-equivalent-of-the-sumif-function-in-r
  temp_result_3 <- aggregate(. ~semant_unit, data=as.data.frame(temp_result_2), sum)
  pwznumerator <- temp_result_3[,-1] # drop the grouping column
  pwzdenominator <- apply(pwznumerator, 2, sum)
  apply(pwznumerator,1, function(x){
    x * 1/pwzdenominator # normalise each topic column; the result comes back transposed (topics in rows)
  })
}
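# Side note (illustrative example, my addition): the aggregate(. ~ semant_unit, ...) call above
# is a grouped "sum-if"; base R's rowsum() returns the same per-group column totals.
ex_vals <- matrix(1:6, ncol = 2)
ex_grp <- c(1, 1, 2)
aggregate(. ~ g, data = data.frame(g = ex_grp, ex_vals), sum) # grouped sums via aggregate
rowsum(ex_vals, ex_grp) # same totals via rowsum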
# y: vector of document indexes aligned row-by-row with 'posterior' (for_idx_plsa_swapped ordering)
mstep_pdz <- function(y, posterior){
  temp_result_4 <- apply(posterior,2,function(z){
    z*vect_corpus_PLSA_2 # element-wise n(d,w) * P(z|d,w), to be summed over terms
  })
  temp_result_5 <- cbind(y, temp_result_4)
  colnames(temp_result_5) <- c("doc", colnames(temp_result_4))
  # group-wise sum over terms (same "sumif" trick as above)
  temp_result_6 <- aggregate(. ~doc, data=as.data.frame(temp_result_5), sum)
  pdznumerator <- temp_result_6[,-1] # drop the grouping column
  pdzdenominator <- apply(pdznumerator, 2, sum)
  apply(pdznumerator,1, function(x){
    x * 1/pdzdenominator # normalise each topic column; the result comes back transposed (topics in rows)
  })
}
mstep_pz <- function(posterior){
  # posterior here must follow the for_idx_plsa_swapped ordering, matching vect_corpus_PLSA_2
  temp_result_7 <- apply(posterior,2,function(w){
    w*vect_corpus_PLSA_2 # element-wise n(d,w) * P(z|d,w)
  })
  pznumerator <- apply(temp_result_7,2,sum)
  pzdenominator <- sum_corpus # a constant: the total count in the corpus
  pznumerator/pzdenominator
}
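# Normalisation check on the M-step (my addition), re-using posterior.check from the E-step
# check above: the updated P(z) must sum to 1. pwz and pdz are checked after the EM loop below.
stopifnot(abs(sum(mstep_pz(posterior.check)) - 1) < 1e-12)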
#### calculate probs ####
max_iter <- 50 # Hofmann 2001, p. 184: apparently 20-50 iterations are typically sufficient
parameter.iter <- parameter.init
for(i_loop in 1:max_iter){
  # E-step: update the posterior, rows ordered as in for_idx_plsa_swapped
  posterior.iter2 <- estep2(for_idx_plsa_swapped$doc, for_idx_plsa_swapped$semant_unit, parameter.iter)
  posterior_index_rownames <- paste0("d",for_idx_plsa_swapped$doc, "_", "t", for_idx_plsa_swapped$semant_unit)
  rownames(posterior.iter2) <- posterior_index_rownames
  # re-order the posterior rows to the for_idx_plsa ordering (doc varies fastest),
  # which is what mstep_pwz and vect_corpus_PLSA expect
  posterior_reindex_rownames <- paste0("d",for_idx_plsa$doc, "_", "t", for_idx_plsa$semant_unit)
  re_idx_posterior <- match(posterior_reindex_rownames,posterior_index_rownames)
  posterior_reindex <- posterior.iter2[re_idx_posterior,]
  # M-step: update the parameters one by one
  pwz.iter <- mstep_pwz(for_idx_plsa$semant_unit, posterior_reindex)
  colnames(pwz.iter) <- termnames
  pwz.iter <- t(pwz.iter) # back to terms x topics
  pdz.iter <- mstep_pdz(for_idx_plsa_swapped$doc, posterior.iter2)
  colnames(pdz.iter) <- docnames
  pdz.iter <- t(pdz.iter) # back to docs x topics
  pz.iter <- mstep_pz(posterior.iter2)
  # assemble for the next iteration
  parameter.iter <- list(pwz.iter, pdz.iter, pz.iter)
}
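# Inspect the fitted model (my addition). Each parameter should be a proper distribution, and
# the log-likelihood L = sum_{d,w} n(d,w) * log(sum_z P(z)P(d|z)P(w|z)) can be computed to
# compare runs (EM increases it monotonically).
pwz.final <- parameter.iter[[1]] # terms x topics
pdz.final <- parameter.iter[[2]] # docs x topics
pz.final <- parameter.iter[[3]] # topics
stopifnot(all(abs(colSums(pwz.final) - 1) < 1e-12),
          all(abs(colSums(pdz.final) - 1) < 1e-12),
          abs(sum(pz.final) - 1) < 1e-12)
pdw.final <- pwz.final %*% diag(as.numeric(pz.final)) %*% t(pdz.final) # P(d,w) for each (term, doc) cell
loglik <- sum(corpus[corpus > 0] * log(pdw.final[corpus > 0]))
loglik
apply(pwz.final, 2, function(p) termnames[order(p, decreasing = TRUE)[1:3]]) # top 3 terms per topic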