@robbymeals · Created October 31, 2012
Code that simulates an LDA corpus and performs inference on it.
## Perform Inference on Simulated LDA Corpus
library(MCMCpack)
library(ggplot2)
library(tm)
library(topicmodels)
library(plyr)
setwd('/home/rmealey/Dropbox/LaPlacianAmbitions/10-TopicModels')
##################################
### 1. Simulate Data
# Default Values
M <- 1000 # number of documents
nTerms <- 100 # number of terms
## document lengths all identical at 100
docLengths <- rep(100,M)
## document lengths (word counts) distributed according to poisson(100)
#docLengths <- rpois(M,100)
## Set additional hyperparameters to some customary values used in LDA priors
#K <- round(M/nTerms) # alternative: derive number of topics from corpus shape (1000/100 = 10)
K <- 10 # Number of Topics
alphA <- 1/K # parameter for symmetric Document/Topic dirichlet distribution
betA <- 1/K # parameter for Topic/Term dirichlet distribution
AlphA <- rep(alphA, K) # number-of-topics length vector set to symmetric alpha parameter across all topics
BetA <- rep(betA, nTerms) # number-of-terms length vector set to symmetric beta parameter across all terms
## generate simulated corpus (see SimulateCorpus.R, included below)
# Returns corpus list object: here 1000 documents, 100 terms, K=10 topics,
# with documents and "true" values for doc/topic matrix and topic/term matrix
source('SimulateCorpus.R')
corpus <- simulateCorpus(M, nTerms, docLengths, K, alphA, betA)
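## Quick sanity checks on the simulated corpus (a small addition, not in the
## original gist): every row of Theta and Phi is a probability distribution,
## so each should sum to 1 up to floating-point error.
stopifnot(abs(rowSums(corpus[['Theta']]) - 1) < 1e-8)
stopifnot(abs(rowSums(corpus[['Phi']]) - 1) < 1e-8)
dim(corpus[['termFreqMatrix']]) # M x nTerms = 1000 x 100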
##################################
##################################
### 2. Inference
# A. Using R Package topicmodels:
# Labels
Terms <- paste("Term",seq(nTerms))
Topics <- paste("Topic", seq(K))
Documents <- paste("Document", seq(M))
# Note: topicmodels' Gibbs sampler names the topic/term prior "delta" rather than "beta"
lda1 <- LDA(corpus[['termFreqMatrix']], K, control=list(alpha=alphA, delta=betA), method='Gibbs')
# "Estimated" Document/Topic distribution matrix
post <- posterior(lda1) # extract posterior estimates once from the fitted model
Theta_est <- post$topics
colnames(Theta_est) <- Topics
rownames(Theta_est) <- Documents
# "Estimated" Topic/Term Distribution Matrix
Phi_est <- post$terms
colnames(Phi_est) <- Terms
rownames(Phi_est) <- Topics
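## Optional quick look at the fit (not in the original gist): topicmodels'
## terms() lists the most probable terms per topic, and topics() the most
## likely topic per document.
terms(lda1, 5)
table(topics(lda1))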
##################################
##################################
### 3. Compare "True" to Estimate
Theta_true <- corpus[['Theta']]
Phi_true <- corpus[['Phi']]
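## One possible comparison (a sketch, not part of the original gist): the
## estimated topics are identified only up to relabeling, so first match each
## "true" topic to its most correlated estimate. This is a simple greedy
## matching on row correlations of the Topic/Term matrices; a Hungarian-
## algorithm assignment would be more rigorous if the greedy matching ever
## reuses an estimated topic.
corPhi <- cor(t(Phi_true), t(Phi_est)) # K x K: true topics vs. estimated topics
matchTopics <- apply(corPhi, 1, which.max)
Phi_est_matched <- Phi_est[matchTopics,]
Theta_est_matched <- Theta_est[,matchTopics]
# Mean absolute error between "true" and matched estimated distributions
mean(abs(Phi_true - Phi_est_matched))
mean(abs(Theta_true - Theta_est_matched))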
##################################
### SimulateCorpus.R ###
### Basic LDA Topic Model Simulation ###
### Generate Simulated Corpus ###
library(ggplot2)
library(tm)
library(MCMCpack) # for rdirichlet()
library(plyr)     # for llply()
simulateCorpus <- function(M,          # number of documents
                           nTerms,     # number of terms in the vocabulary
                           docLengths, # vector of per-document lengths (word counts)
                           K,          # number of topics
                           alphA,      # parameter for symmetric Document/Topic dirichlet distribution
                           betA,       # parameter for Topic/Term dirichlet distribution
                           Alpha=rep(alphA, K),     # length-K vector: symmetric alpha parameter across all topics
                           Beta=rep(betA, nTerms)){ # length-nTerms vector: symmetric beta parameter across all terms
  # Labels
  Terms <- paste("Term", seq(nTerms))
  Topics <- paste("Topic", seq(K))
  Documents <- paste("Document", seq(M))
  ## Generate latent topic and term distributions
  # "True" Document/Topic distribution matrix
  Theta <- rdirichlet(M, Alpha)
  colnames(Theta) <- Topics
  rownames(Theta) <- Documents
  # "True" Topic/Term Distribution Matrix
  Phi <- rdirichlet(K, Beta)
  colnames(Phi) <- Terms
  rownames(Phi) <- Topics
  ## Function to generate an individual document
  generateDoc <- function(docLength, topic_dist, terms_topics_dist){
    # docLength is this document's length in words
    # topic_dist is this document's topic distribution (a row of Theta)
    # terms_topics_dist is the topic/term distribution matrix (Phi)
    document <- c()
    for (i in seq(docLength)){
      # For each word, draw a topic index from this document's topic distribution
      # (which.max converts the multinomial indicator vector to a row index)
      topic <- which.max(rmultinom(1, 1, topic_dist))
      # Then draw a term from that topic's term distribution
      term <- rmultinom(1, 1, terms_topics_dist[topic,])
      # and append the chosen term to the document vector
      document <- c(document, colnames(terms_topics_dist)[which.max(term)])
    }
    return(document)
  }
  ## generate "observed" corpus as a list of term vectors
  corpus <- list()
  for (i in seq(M)){
    corpus[[i]] <- generateDoc(docLengths[i], Theta[i,], Phi)
  }
  ## convert document term vectors to frequency vectors
  freqsLists <- llply(corpus, table)
  ## write values to termFreqMatrix
  termFreqMatrix <- matrix(0, nrow=M, ncol=nTerms)
  colnames(termFreqMatrix) <- Terms
  rownames(termFreqMatrix) <- Documents
  for (i in seq(M)){
    termFreqMatrix[i, names(freqsLists[[i]])] <- freqsLists[[i]]
  }
  stopifnot(rowSums(termFreqMatrix) == docLengths)
  return(list("docs"=corpus, "termFreqMatrix"=termFreqMatrix, "Theta"=Theta, "Phi"=Phi))
}
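## Example call with small, hypothetical values -- a quick smoke test of this
## file, kept commented out so that source()-ing it stays side-effect free:
# tiny <- simulateCorpus(M=10, nTerms=20, docLengths=rep(30,10), K=3, alphA=1/3, betA=1/3)
# str(tiny[['docs']][[1]])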