Created
October 31, 2012 06:37
-
-
Save robbymeals/3985469 to your computer and use it in GitHub Desktop.
Code that simulates an LDA corpus.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Perform Inference on Simulated LDA Corpus | |
library(MCMCpack) | |
library(ggplot2) | |
library(tm) | |
library(topicmodels) | |
library(plyr) | |
setwd('/home/rmealey/Dropbox/LaPlacianAmbitions/10-TopicModels') | |
################################## | |
### 1. Simulate Data | |
# Default Values | |
M <- 1000 # number of documents | |
nTerms <- 100 # number of terms | |
## document lengths all identical at 100 | |
docLengths <- rep(100,M) | |
## document lengths (word counts) distributed according to poisson(100) | |
#docLengths <- rpois(M,100) | |
## Set additional hyperparameters to some customary values used in LDA priors | |
#K <- round(nTerms/M) # Number of Topics | |
K <- 10 # Number of Topics | |
alphA <- 1/K # parameter for symmetric Document/Topic dirichlet distribution | |
betA <- 1/K # parameter for Topic/Term dirichlet distribution | |
AlphA <- rep(alphA, K) # number-of-topics length vector set to symmetric alpha paramater across all topics | |
BetA <- rep(betA, nTerms) # number-of-terms length vector set to symmetric beta paramater across all terms | |
## generate simulated corpus (See script SimulateCorpus.R) | |
# Returns corpus list object, default 100 documents, 1000 terms, 1000/100=10 topics | |
# with documents and "true" values for doc/topic matrix and topic/term matrix | |
source('SimulateCorpus.R') | |
corpus <- simulateCorpus(M, nTerms, docLengths, K, alphA, betA) | |
################################## | |
################################## | |
### 2. Inference | |
# A. Using R Package topicmodels: | |
# Labels | |
Terms <- paste("Term",seq(nTerms)) | |
Topics <- paste("Topic", seq(K)) | |
Documents <- paste("Document", seq(M)) | |
LDA(corpus[['termFreqMatrix']], K, control=list(alpha=alphA, beta=betA), method='Gibbs') -> lda1 | |
# "Estimated" Document/Topic distribution matrix | |
Theta_est <- posterior(lda1,corpus[['termFreqMatrix']])$topics | |
colnames(Theta_est) <- Topics | |
rownames(Theta_est) <- Documents | |
# "Estimated" Topic/Term Distribution Matrix | |
Phi_est <- posterior(lda1,corpus[['termFreqMatrix']])$terms | |
colnames(Phi_est) <- Terms | |
rownames(Phi_est) <- Topics | |
################################## | |
################################## | |
### 3. Compare "True" to Estimate | |
Theta_true <- corpus[['Theta']] | |
Phi_true <- corpus[['Phi']] | |
################################## |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Basic LDA Topic Model Simulation ### | |
### Generate Simulated Corpus ### | |
library(ggplot2) | |
library(tm) | |
library(MCMCpack) | |
simulateCorpus <- function( M, # number of documents | |
nTerms, | |
docLengths, | |
K, # Number of Topics | |
alphA, # parameter for symmetric Document/Topic dirichlet distribution | |
betA, # parameter for Topic/Term dirichlet distribution | |
Alpha=rep(alphA,K), # number-of-topics length vector set to symmetric alpha parameter across all topics | |
Beta=rep(betA,nTerms)) # number-of-terms length vector set to symmetric beta paramater across all terms | |
{ | |
# Labels | |
Terms <- paste("Term",seq(nTerms)) | |
Topics <- paste("Topic", seq(K)) | |
Documents <- paste("Document", seq(M)) | |
## Generate latent topic and term distributions | |
# "True" Document/Topic distribution matrix | |
Theta <- rdirichlet(M, Alpha) | |
colnames(Theta) <- Topics | |
rownames(Theta) <- Documents | |
# "True" Topic/Term Distribution Matrix | |
Phi <- rdirichlet(K, Beta) | |
colnames(Phi) <- Terms | |
rownames(Phi) <- Topics | |
## Function to generate individual document | |
generateDoc <- function(docLength, topic_dist, terms_topics_dist){ | |
# docLength is specific document length | |
# topic_dist is specific topic distribution for this document | |
# terms_topics_dist is terms distribution matrix over all topics | |
document <- c() | |
for (i in seq(docLength)){ | |
# For each word in a document, choose a topic from that document's topic distribution | |
topic <- rmultinom(1, 1, topic_dist) | |
# Then choose a term from that topic's term distribution | |
term <- rmultinom(1, 1, terms_topics_dist[topic,]) | |
# and append term to document vector | |
document <- c(document, colnames(terms_topics_dist)[which.max(term)]) | |
} | |
return(document) | |
} | |
## generate "observed" corpus as list of terms | |
corpus <- list() | |
for (i in seq(M)){ | |
corpus[[i]] <- generateDoc(docLengths[i], Theta[i,], Phi) | |
} | |
## convert document term vectors to frequency vectors | |
freqsLists <- llply(corpus, table) | |
## write values to termFreqMatrix | |
termFreqMatrix <- matrix(nrow=M, ncol=nTerms, 0) | |
colnames(termFreqMatrix) <- Terms | |
rownames(termFreqMatrix) <- Documents | |
for (i in seq(M)){ | |
termFreqMatrix[i,names(freqsLists[[i]])] <- freqsLists[[i]] | |
} | |
stopifnot(rowSums(termFreqMatrix) == docLengths) | |
return(list("docs"=corpus, 'termFreqMatrix'=termFreqMatrix, "Theta"=Theta, "Phi"=Phi)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment