public
Created

Code that simulates an LDA corpus.

  • Download Gist
SimLDACorpus.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## Perform Inference on Simulated LDA Corpus
 
library(MCMCpack)
library(ggplot2)
library(tm)
library(topicmodels)
library(plyr)
setwd('/home/rmealey/Dropbox/LaPlacianAmbitions/10-TopicModels')
 
##################################
### 1. Simulate Data
 
# Default Values
M <- 1000 # number of documents
nTerms <- 100 # number of terms
 
## document lengths all identical at 100
docLengths <- rep(100,M)
 
## document lengths (word counts) distributed according to poisson(100)
#docLengths <- rpois(M,100)
 
## Set additional hyperparameters to some customary values used in LDA priors
#K <- round(nTerms/M) # Number of Topics
K <- 10 # Number of Topics
alphA <- 1/K # parameter for symmetric Document/Topic dirichlet distribution
betA <- 1/K # parameter for Topic/Term dirichlet distribution
AlphA <- rep(alphA, K) # number-of-topics length vector set to symmetric alpha paramater across all topics
BetA <- rep(betA, nTerms) # number-of-terms length vector set to symmetric beta paramater across all terms
 
## generate simulated corpus (See script SimulateCorpus.R)
# Returns corpus list object, default 100 documents, 1000 terms, 1000/100=10 topics
# with documents and "true" values for doc/topic matrix and topic/term matrix
source('SimulateCorpus.R')
corpus <- simulateCorpus(M, nTerms, docLengths, K, alphA, betA)
 
##################################
 
 
##################################
### 2. Inference
 
# A. Using R Package topicmodels:
# Labels
Terms <- paste("Term",seq(nTerms))
Topics <- paste("Topic", seq(K))
Documents <- paste("Document", seq(M))
 
LDA(corpus[['termFreqMatrix']], K, control=list(alpha=alphA, beta=betA), method='Gibbs') -> lda1
 
# "Estimated" Document/Topic distribution matrix
Theta_est <- posterior(lda1,corpus[['termFreqMatrix']])$topics
colnames(Theta_est) <- Topics
rownames(Theta_est) <- Documents
 
# "Estimated" Topic/Term Distribution Matrix
Phi_est <- posterior(lda1,corpus[['termFreqMatrix']])$terms
colnames(Phi_est) <- Terms
rownames(Phi_est) <- Topics
 
##################################
 
##################################
### 3. Compare "True" to Estimate
 
Theta_true <- corpus[['Theta']]
Phi_true <- corpus[['Phi']]
 
##################################
SimLDACorpusSource.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
### Basic LDA Topic Model Simulation ###
### Generate Simulated Corpus ###
library(ggplot2)
library(tm)
library(MCMCpack)
 
simulateCorpus <- function( M, # number of documents
nTerms,
docLengths,
K, # Number of Topics
alphA, # parameter for symmetric Document/Topic dirichlet distribution
betA, # parameter for Topic/Term dirichlet distribution
Alpha=rep(alphA,K), # number-of-topics length vector set to symmetric alpha parameter across all topics
Beta=rep(betA,nTerms)) # number-of-terms length vector set to symmetric beta paramater across all terms
{
# Labels
Terms <- paste("Term",seq(nTerms))
Topics <- paste("Topic", seq(K))
Documents <- paste("Document", seq(M))
 
## Generate latent topic and term distributions
# "True" Document/Topic distribution matrix
Theta <- rdirichlet(M, Alpha)
colnames(Theta) <- Topics
rownames(Theta) <- Documents
 
# "True" Topic/Term Distribution Matrix
Phi <- rdirichlet(K, Beta)
colnames(Phi) <- Terms
rownames(Phi) <- Topics
 
## Function to generate individual document
generateDoc <- function(docLength, topic_dist, terms_topics_dist){
# docLength is specific document length
# topic_dist is specific topic distribution for this document
# terms_topics_dist is terms distribution matrix over all topics
document <- c()
for (i in seq(docLength)){
# For each word in a document, choose a topic from that document's topic distribution
topic <- rmultinom(1, 1, topic_dist)
 
# Then choose a term from that topic's term distribution
term <- rmultinom(1, 1, terms_topics_dist[topic,])
 
# and append term to document vector
document <- c(document, colnames(terms_topics_dist)[which.max(term)])
}
return(document)
}
 
## generate "observed" corpus as list of terms
corpus <- list()
for (i in seq(M)){
corpus[[i]] <- generateDoc(docLengths[i], Theta[i,], Phi)
}
 
## convert document term vectors to frequency vectors
freqsLists <- llply(corpus, table)
 
## write values to termFreqMatrix
termFreqMatrix <- matrix(nrow=M, ncol=nTerms, 0)
colnames(termFreqMatrix) <- Terms
rownames(termFreqMatrix) <- Documents
for (i in seq(M)){
termFreqMatrix[i,names(freqsLists[[i]])] <- freqsLists[[i]]
}
 
stopifnot(rowSums(termFreqMatrix) == docLengths)
 
return(list("docs"=corpus, 'termFreqMatrix'=termFreqMatrix, "Theta"=Theta, "Phi"=Phi))
 
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.