# abelsonlive/lda.R

## lda.R
# After successfully installing hiR, start here.

# Setup ----
library("hiR")

# Read every text file into a single character vector ----

# Directory holding the input documents, relative to the current working
# directory. Files are read by path, so the script never calls setwd() and can
# be re-run in the same session without failing on a second relative setwd().
corpus_dir <- "DIggOutput"
if (!dir.exists(corpus_dir)) {
  stop("corpus directory not found: ", corpus_dir, call. = FALSE)
}

# Paths of the files in corpus_dir whose names contain ".txt".
text_files <- list.files(corpus_dir, pattern = "\\.txt", full.names = TRUE)
if (length(text_files) == 0) {
  stop("no .txt files found in ", corpus_dir, call. = FALSE)
}

# Read each file and flatten the result into one character vector: every line
# of every file becomes one element of `corpus`.
# llply() returns a list, so files with different numbers of lines are fine
# (laply() builds an array and needs equally sized results). `.progress` is an
# argument of the plyr call, not of unlist().
corpus <- unlist(
  llply(text_files, readLines, .progress = "text"),
  use.names = FALSE
)

# Run the topic model ----
# These are the only parameters you need to worry about for now:
#   min_word_count    - raise it if you want to remove noise.
#   n_topics          - adjust based on the results. You may think, for
#                       instance, that a topic is being subsumed by another,
#                       or that two topics should really be one. This is
#                       totally subjective.
#   stop_words_to_add - words that aren't meaningful given the context.
results <- lda(
  corpus,                    # a character vector of text documents
  stem_words = TRUE,         # this usually produces better results
  min_word_count = 2,        # how many times a word must appear to be included
  n_topics = 10,             # number of topics to infer
  stop_words_to_add = c("")
)

# The top 20 words per topic; use these to infer what each topic is "about".
results[[1]]

# A data.frame with stats about each document, including the topic
# assignments ("topic_a" and "topic_b").
results[[2]]
	# NOTE(review): this tab-indented section repeats the script at the top of
	# the file; it looks like a paste artifact -- confirm and consider removing.

	# After successfully installing hiR, start here.

	# Setup ----
	library("hiR")

	# Read every text file into a single character vector ----

	# Directory holding the input documents, relative to the current working
	# directory. Files are read by path instead of calling setwd(): a relative
	# setwd("DIggOutput") fails once the working directory has already been
	# moved into that folder.
	corpus_dir <- "DIggOutput"
	if (!dir.exists(corpus_dir)) {
	  stop("corpus directory not found: ", corpus_dir, call. = FALSE)
	}

	# Paths of the files in corpus_dir whose names contain ".txt".
	text_files <- list.files(corpus_dir, pattern = "\\.txt", full.names = TRUE)
	if (length(text_files) == 0) {
	  stop("no .txt files found in ", corpus_dir, call. = FALSE)
	}

	# Read each file and flatten the result into one character vector: every
	# line of every file becomes one element of `corpus`.
	# llply() returns a list, so files with different numbers of lines are fine
	# (laply() builds an array and needs equally sized results). `.progress` is
	# an argument of the plyr call, not of unlist().
	corpus <- unlist(
	  llply(text_files, readLines, .progress = "text"),
	  use.names = FALSE
	)

	# Run the topic model ----
	# These are the only parameters you need to worry about for now:
	#   min_word_count    - raise it if you want to remove noise.
	#   n_topics          - adjust based on the results. You may think, for
	#                       instance, that a topic is being subsumed by another,
	#                       or that two topics should really be one. This is
	#                       totally subjective.
	#   stop_words_to_add - words that aren't meaningful given the context.
	results <- lda(
	  corpus,                    # a character vector of text documents
	  stem_words = TRUE,         # this usually produces better results
	  min_word_count = 2,        # how many times a word must appear to be included
	  n_topics = 10,             # number of topics to infer
	  stop_words_to_add = c("")
	)

	# The top 20 words per topic; use these to infer what each topic is "about".
	results[[1]]

	# A data.frame with stats about each document, including the topic
	# assignments ("topic_a" and "topic_b").
	results[[2]]