@cpsievert
Last active August 29, 2015 14:05
Some proposed changes to @noamross's LDAvis post -- http://www.noamross.net/blog/2014/8/22/topicmodeling.html
# Load/install necessary packages
library(devtools)
#install_github("cpsievert/LDAvis")
library(LDAvis)
library(mallet)
library(Rmpfr)
library(plyr)
library(XML)
library(stringi)
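# The abstracts are read from a local mirror of the ESA 2014 web program
# (assumed to have been downloaded beforehand, e.g. with wget, so that the
# "eco.confex.com/eco/2014/webprogram" directory exists under the working directory).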
ppath <- "eco.confex.com/eco/2014/webprogram"
paper_files <- list.files(ppath, recursive = TRUE, pattern = "Paper\\d+\\.html")
abs <- alply(paper_files, 1, function(paper) {
  paper_xml <- htmlTreeParse(file.path(ppath, paper), useInternalNodes = TRUE, trim = TRUE)
  ab <- try(xmlValue(paper_xml[['//div[@class="abstract"]']]), silent = TRUE)
  if (!inherits(ab, "try-error")) {
    ab <- stringi::stri_replace_all_fixed(ab, "Background/Question/Methods", "")
    ab <- stringi::stri_replace_all_fixed(ab, "Results/Conclusions", "")
    return(ab)
  } else {
    return(NULL)
  }
}, .progress = "time")
abs <- unlist(compact(abs))
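# optional sanity check: how many abstracts were parsed successfully
length(abs)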
# grab some common stopwords and import the abstracts to mallet
download.file("http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop", "stopwords.txt")
instance <- mallet.import(names(abs), abs, "stopwords.txt")
model <- MalletLDA(num.topics = 50)
model$loadDocuments(instance)
freqs <- mallet.word.freqs(model)
# add infrequent words to the list of stopwords and re-import
stopwords <- as.character(subset(freqs, term.freq <= 9)$words)
writeLines(c(readLines("stopwords.txt"), stopwords), "stopwords2.txt")
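# optional check: how many infrequent terms were added to the stoplist
length(stopwords)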
instance2 <- mallet.import(names(abs), abs, "stopwords2.txt")
model2 <- MalletLDA(num.topics = 50)
model2$loadDocuments(instance2)
freqs2 <- mallet.word.freqs(model2)
# sanity check: make sure no terms with frequency <= 9 (or zero) remain after re-importing
subset(freqs2, term.freq <= 9)
# fit the model (5000 Gibbs sampling iterations is probably more than necessary here)
model2$train(5000)
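# Note: training uses Gibbs sampling, which is stochastic, so repeated runs
# will produce slightly different topics.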
# Here, we compute the estimated topic-term distribution, incorporating the effect
# of the prior using 'smoothed = TRUE'.
phi <- t(mallet.topic.words(model2, smoothed = TRUE, normalized = TRUE))
phi.count <- t(mallet.topic.words(model2, smoothed = TRUE, normalized = FALSE))
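# phi is a vocabulary-by-topic matrix of term probabilities (each column sums to 1);
# phi.count holds the corresponding smoothed counts, whose row sums are used
# below as the overall term frequencies.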
# Now compute the smoothed token counts for each topic, which give the overall topic proportions:
topic.words <- mallet.topic.words(model2, smoothed = TRUE, normalized = FALSE)
# 'count' of the number of tokens per topic (including pseudo-tokens from the priors)
topic.counts <- rowSums(topic.words)
topic.proportions <- topic.counts/sum(topic.counts)
vocab <- model2$getVocabulary()
out <- check.inputs(K = 50, W = length(vocab), phi = phi,
                    term.frequency = apply(phi.count, 1, sum),
                    vocab = vocab, topic.proportion = topic.proportions)
# Relabel topics so they are numbered in decreasing order of frequency.
colnames(out$phi) <- seq_len(out$K)
json <- with(out, createJSON(K = 50, phi, term.frequency,
                             vocab, topic.proportion))
serVis(json, out.dir = 'vis', open.browser = FALSE)
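# serVis() writes the interactive visualization (index.html plus supporting
# js/css/json files) to the 'vis' directory; open vis/index.html in a browser
# (or set open.browser = TRUE) to explore the topics.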