@cpsievert
Last active August 29, 2015 14:05
Some proposed changes to @noamross's LDAvis post -- http://www.noamross.net/blog/2014/8/22/topicmodeling.html
# Load/install necessary packages
library(devtools)
#install_github("cpsievert/LDAvis")
library(LDAvis)
library(mallet)
library(Rmpfr)
library(plyr)
library(XML)
library(stringi)
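# The abstracts are read from a local mirror of the ESA 2014 web program
# (assumed to have been downloaded beforehand, e.g. with wget, so that the
# "eco.confex.com/eco/2014/webprogram" directory exists under the working directory).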
ppath <- "eco.confex.com/eco/2014/webprogram"
paper_files <- list.files(ppath, recursive = TRUE, pattern = "Paper\\d+\\.html")
abs <- alply(paper_files, 1, function(paper) {
  paper_xml <- htmlTreeParse(file.path(ppath, paper), useInternalNodes = TRUE, trim = TRUE)
  ab <- try(xmlValue(paper_xml[['//div[@class="abstract"]']]), silent = TRUE)
  if (!inherits(ab, "try-error")) {
    ab <- stringi::stri_replace_all_fixed(ab, "Background/Question/Methods", "")
    ab <- stringi::stri_replace_all_fixed(ab, "Results/Conclusions", "")
    return(ab)
  } else {
    return(NULL)
  }
}, .progress = "time")
abs <- unlist(compact(abs))
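# optional sanity check: how many abstracts were parsed successfully
length(abs)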
# grab some common stopwords and import the abstracts to mallet
download.file("http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop", "stopwords.txt")
instance <- mallet.import(names(abs), abs, "stopwords.txt")
model <- MalletLDA(num.topics = 50)
model$loadDocuments(instance)
freqs <- mallet.word.freqs(model)
# add infrequent words to the list of stopwords and re-import
stopwords <- as.character(subset(freqs, term.freq <= 9)$words)
writeLines(c(readLines("stopwords.txt"), stopwords), "stopwords2.txt")
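# optional check: how many infrequent terms were added to the stoplist
length(stopwords)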
instance2 <- mallet.import(names(abs), abs, "stopwords2.txt")
model2 <- MalletLDA(num.topics = 50)
model2$loadDocuments(instance2)
freqs2 <- mallet.word.freqs(model2)
# sanity check: make sure no terms with frequency <= 9 (or zero) remain after re-importing
subset(freqs2, term.freq <= 9)
# fit the model (5000 Gibbs sampling iterations is probably more than necessary here)
model2$train(5000)
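# Note: training uses Gibbs sampling, which is stochastic, so repeated runs
# will produce slightly different topics.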
# Here, we compute the estimated topic-term distribution, incorporating the effect
# of the prior using 'smoothed = TRUE'.
phi <- t(mallet.topic.words(model2, smoothed = TRUE, normalized = TRUE))
phi.count <- t(mallet.topic.words(model2, smoothed = TRUE, normalized = FALSE))
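# phi is a vocabulary-by-topic matrix of term probabilities (each column sums to 1);
# phi.count holds the corresponding smoothed counts, whose row sums are used
# below as the overall term frequencies.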
# Now compute the smoothed token counts for each topic, which give the overall topic proportions:
topic.words <- mallet.topic.words(model2, smoothed = TRUE, normalized = FALSE)
# 'count' of the number of tokens per topic (including pseudo-tokens from the priors)
topic.counts <- rowSums(topic.words)
topic.proportions <- topic.counts/sum(topic.counts)
vocab <- model2$getVocabulary()
out <- check.inputs(K = 50, W = length(vocab), phi = phi,
                    term.frequency = apply(phi.count, 1, sum),
                    vocab = vocab, topic.proportion = topic.proportions)
# Relabel topics so they are numbered in decreasing order of frequency.
colnames(out$phi) <- seq_len(out$K)
json <- with(out, createJSON(K = 50, phi, term.frequency,
                             vocab, topic.proportion))
serVis(json, out.dir = 'vis', open.browser = FALSE)
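# serVis() writes the interactive visualization (index.html plus supporting
# js/css/json files) to the 'vis' directory; open vis/index.html in a browser
# (or set open.browser = TRUE) to explore the topics.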