Last active
December 9, 2015 23:59
-
-
Save abelsonlive/4348124 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# after successfully installing hiR, start here:
#
# This script reads every ".txt" file in an input directory into one
# character vector, fits a topic model with hiR's lda(), and shows the results.

# setup ----
library("hiR")

# read in every text file to a single character vector ----

# directory that holds the input text files; file paths are built from it so
# the session's working directory is left untouched
input_dir <- "DIggOutput"

# list all the file paths in this directory
files <- list.files(input_dir, full.names = TRUE)

# select only those file paths that end in ".txt"
text_files <- files[grepl("\\.txt$", files)]

# fail early with a clear message instead of handing an empty corpus to lda()
if (length(text_files) == 0) {
  stop("no .txt files found in '", input_dir, "'", call. = FALSE)
}

# read in individual text files and convert to a character vector...
# lapply() returns a list, so files with different numbers of lines are fine
# NOTE(review): each line of each file becomes its own element of `corpus`;
# if every file is meant to be a single document, collapse its lines first
# -- confirm against what lda() expects.
corpus <- unlist(lapply(text_files, readLines))

# run topic model ----

# these are the only parameters you need to worry about for now:
# adjust "min_word_count" up if you want to remove noise,
# adjust n_topics based off of the results
# you may think, for instance, that a topic is being subsumed by another,
# or two topics should really be one. this is totally subjective
# include words in "stop_words_to_add" that aren't meaningful given the context
results <- lda(
  corpus,                 # this is a character vector of text documents
  stem_words = TRUE,      # this usually produces better results.
  min_word_count = 2,     # how many times a word must appear to be included
  n_topics = 10,          # number of topics to infer
  stop_words_to_add = c("")
)

# these are the top 20 words per topic. use these to infer what each topic is
# "about."
results[[1]]

# this is a data.frame with stats about each document, including the topic
# assignments ("topic_a" and "topic_b")
results[[2]]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment