Skip to content

Instantly share code, notes, and snippets.

@ratsgo
Last active April 4, 2021 11:17
Show Gist options
  • Save ratsgo/c25deb6d79f3ab8b0b050af751fbbdb8 to your computer and use it in GitHub Desktop.
Save ratsgo/c25deb6d79f3ab8b0b050af751fbbdb8 to your computer and use it in GitHub Desktop.
pLSA
library(stringr)
# prepare corpus
corpus <- matrix(c(1,2,0,0,0,0,
3,1,0,0,0,0,
2,0,0,0,0,0,
3,3,2,3,2,4,
0,0,3,2,0,0,
0,0,4,1,0,0,
0,0,0,0,4,3,
0,0,0,0,2,1,
0,0,0,0,3,2,
0,0,1,0,2,3), ncol=6, byrow=T)
# initialize parameters
ntopic <- 3
docnames <- c('Doc1','Doc2','Doc3','Doc4','Doc5','Doc6')
termnames <- c('Baseball','Basketball','Boxing','Money','Interest','Rate','Democrat','Republican','Cocus','President')
colnames(corpus) <- docnames
ndocs <- length(docnames)
rownames(corpus) <- termnames
nterms <- length(termnames)
topicnames <- paste0('topic',1:ntopic)
dtnames <- c()
for (i in 1:dim(corpus)[2]) {
dtnames <- c(dtnames,paste0(docnames[i],' ',termnames))
}
posterior.init <- matrix(runif(dim(corpus)[1]*dim(corpus)[2]*ntopic,min=0,max=1),ncol=ntopic)
colnames(posterior.init) <- topicnames
rownames(posterior.init) <- dtnames
pz.init <- matrix(runif(ntopic,min=0,max=1),ncol=ntopic)
colnames(pz.init) <- topicnames
pdz.init <- matrix(runif(dim(corpus)[2]*ntopic,min=0,max=1),ncol=ntopic)
colnames(pdz.init) <- topicnames
rownames(pdz.init) <- docnames
pwz.init <- matrix(runif(dim(corpus)[1]*ntopic,min=0,max=1),ncol=ntopic)
colnames(pwz.init) <- topicnames
rownames(pwz.init) <- termnames
parameter.init <- list(pwz.init,pdz.init,pz.init)
# Expectation Step
estep <- function(parameter,posterior) {
pwz <- parameter[[1]]
pdz <- parameter[[2]]
pz <- parameter[[3]]
for (i in 1:(dim(corpus)[1]*dim(corpus)[2])) {
doc <- unlist(strsplit(dtnames[i],' '))[1]
term <- unlist(strsplit(dtnames[i],' '))[2]
denominator <- sum(pz * pwz[which(rownames(pwz)==term),] * pdz[which(rownames(pdz)==doc),])
for (j in 1:ntopic) {
numerator <- pz[1,j] * pdz[which(rownames(pdz)==doc),j] * pwz[which(rownames(pwz)==term),j]
posterior[i,j] <- numerator/denominator
}
}
return(posterior)
}
# Maximization Step
mstep <- function(posterior, parameter) {
pwz <- parameter[[1]]
pdz <- parameter[[2]]
pz <- parameter[[3]]
for (i in 1:dim(pwz)[1]) {
for (j in 1:dim(pwz)[2]) {
pwznumerator <- sum(corpus[i,] * posterior[which(str_detect(rownames(posterior), termnames[i])),j])
pwzdenominator <- sum(corpus * posterior[,j])
pwz[i,j] <- pwznumerator/pwzdenominator
}
}
for (i in 1:dim(pdz)[1]) {
for (j in 1:dim(pdz)[2]) {
pdznumerator <- sum(corpus[,i] * posterior[which(str_detect(rownames(posterior), docnames[i])),j])
pdzdenominator <- sum(corpus * posterior[,j])
pdz[i,j] <- pdznumerator/pdzdenominator
}
}
for (i in 1:dim(pz)[2]) {
pznumerator <- sum(posterior[,i] * corpus)
pzdenominator <- sum(corpus)
pz[1,i] <- pznumerator/pzdenominator
}
return(list(pwz,pdz,pz))
}
# calculate probs
posterior.iter <- estep(parameter.init, posterior.init)
parameter.iter <- mstep(posterior.init, parameter.init)
while(i<100) {
posterior.iter <- estep(parameter.iter, posterior.iter)
parameter.iter <- mstep(posterior.iter, parameter.iter)
i <- i + 1
}
@Dr-Eti
Copy link

Dr-Eti commented Apr 4, 2021

Hi I've found this interpretation of PLSA very interesting but it slows down a lot for even moderately-sized corpora due to the nested for loops and the operations on strings. I've forked it to introduce some vectorisation strategy hoping to speed it up a bit. Also, any thoughts on how it compares with FAST_PSA in the SVS package?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment