Some proposed changes to @noamross's LDAvis post -- http://www.noamross.net/blog/2014/8/22/topicmodeling.html
# Load/install necessary packages
library(devtools)
#install_github("cpsievert/LDAvis")
library(LDAvis)   # interactive topic model visualization
library(mallet)   # R wrapper for MALLET's LDA implementation
library(Rmpfr)    # not actually used below; presumably carried over from the original post
library(plyr)     # alply(), compact()
library(XML)      # for parsing the abstract HTML pages
library(stringi)  # fast fixed-string replacement
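# The scrape below assumes the ESA 2014 program pages have already been
# mirrored locally under eco.confex.com/. A hypothetical invocation (not in
# the original; adjust to taste) would be something like:
#   wget -r --no-parent -A 'Paper*.html' http://eco.confex.com/eco/2014/webprogram/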
ppath <- "eco.confex.com/eco/2014/webprogram"
paper_files <- list.files(ppath, recursive = TRUE, pattern = "Paper\\d+\\.html")
abs <- alply(paper_files, 1, function(paper) {
  paper_xml <- htmlTreeParse(file.path(ppath, paper), useInternalNodes = TRUE, trim = TRUE)
  ab <- try(xmlValue(paper_xml[['//div[@class="abstract"]']]), silent = TRUE)
  if (!inherits(ab, "try-error")) {
    ab <- stringi::stri_replace_all_fixed(ab, "Background/Question/Methods", "")
    ab <- stringi::stri_replace_all_fixed(ab, "Results/Conclusions", "")
    return(ab)
  } else {
    return(NULL)
  }
}, .progress = "time")
abs <- unlist(compact(abs))
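# Optional sanity check (not in the original): how many abstracts survived
# the scrape, and what does one look like?
length(abs)
substr(abs[[1]], 1, 200)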
# grab some common stopwords and import the abstracts to mallet
download.file("http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop", "stopwords.txt")
instance <- mallet.import(names(abs), abs, "stopwords.txt")
model <- MalletLDA(num.topics = 50)
model$loadDocuments(instance)
freqs <- mallet.word.freqs(model)
# add infrequent words to the list of stopwords and re-import
stopwords <- as.character(subset(freqs, term.freq <= 9)$words)
writeLines(c(readLines("stopwords.txt"), stopwords), "stopwords2.txt")
instance2 <- mallet.import(names(abs), abs, "stopwords2.txt")
model2 <- MalletLDA(num.topics = 50)
model2$loadDocuments(instance2)
freqs2 <- mallet.word.freqs(model2)
# sanity check: no remaining terms should have a frequency of 9 or less
subset(freqs2, term.freq <= 9)
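# Optional (not in the original): peek at the most frequent remaining terms
head(freqs2[order(freqs2$term.freq, decreasing = TRUE), ])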
# fit the model...5000 iterations is maybe a bit excessive here
model2$train(5000)
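# Optional (not in the original): eyeball the top words in the first topic
# before building the visualization; mallet.top.words() takes a single row
# of the raw topic-word count matrix
raw.topic.words <- mallet.topic.words(model2, smoothed = FALSE, normalized = FALSE)
mallet.top.words(model2, raw.topic.words[1, ], 10)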
# Here, we compute the estimated topic-term distribution, incorporating the effect
# of the prior using 'smoothed = TRUE'.
phi <- t(mallet.topic.words(model2, smoothed = TRUE, normalized = TRUE))
phi.count <- t(mallet.topic.words(model2, smoothed = TRUE, normalized = FALSE))
# Now compute the overall topic proportions from the smoothed (unnormalized) topic-term counts:
topic.words <- mallet.topic.words(model2, smoothed = TRUE, normalized = FALSE)
# 'count' of the number of tokens per topic (including pseudo-tokens from the priors)
topic.counts <- rowSums(topic.words)
topic.proportions <- topic.counts/sum(topic.counts)
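# Optional sanity checks (not in the original): each column of phi is a
# topic's term distribution and should sum to 1, as should the proportions
stopifnot(max(abs(colSums(phi) - 1)) < 1e-8)
stopifnot(abs(sum(topic.proportions) - 1) < 1e-8)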
vocab <- model2$getVocabulary()
out <- check.inputs(K = 50, W = length(vocab), phi = phi,
                    term.frequency = rowSums(phi.count),
                    vocab = vocab, topic.proportion = topic.proportions)
# Attach numeric labels 1..K to the topics (the visualization displays them
# in decreasing order of frequency)
colnames(out$phi) <- seq_len(out$K)
json <- with(out, createJSON(K = 50, phi, term.frequency,
                             vocab, topic.proportion))
serVis(json, out.dir = 'vis', open.browser = FALSE)
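# To view the result (note not in the original): open vis/index.html through a
# local web server, or re-run serVis() with open.browser = TRUE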