Created
July 23, 2015 14:13
-
-
Save jwinternheimer/7ef0afd84b29976a5632 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tm);library(SnowballC); library(wordcloud); library(RColorBrewer);library(RWeka) | |
library(ape) | |
## Read Text File of Conversations | |
## Read Text File of Conversations
## Reads one conversation per line and returns a one-column data frame
## named "text" of cleaned strings. Depends on clean.text() defined
## elsewhere in this file.
get_text <- function(filename) {
  # sep = "\n" keeps each line as a single field; read.table's default
  # whitespace separator would split multi-word lines into several
  # columns and break the single-column rename below. quote = ""
  # prevents apostrophes in conversational text from being treated as
  # string delimiters.
  txt <- read.table(filename, header = FALSE, sep = "\n",
                    quote = "", stringsAsFactors = FALSE)
  names(txt) <- "text"
  # Keep cleaned text as character, not factor
  txt <- as.data.frame(clean.text(txt$text), stringsAsFactors = FALSE)
  names(txt) <- "text"
  return(txt)
}
## Convert Text to Corpus and Create Term Document Matrix | |
## Convert Text to Corpus and Create Term Document Matrix
## Builds a tm Corpus from the "text" column of a data frame and
## removes English stopwords. Returns the filtered corpus.
to_corpus <- function(text_df) {
  docs <- Corpus(VectorSource(text_df$text))
  # tm_map applies removeWords across every document in the corpus;
  # the result of the last expression is returned
  tm_map(docs, removeWords, stopwords("english"))
}
## Create Wordcloud | |
## Create Wordcloud
## Renders a word cloud from a tm corpus using the Dark2 palette.
##
## corpus    - a tm Corpus (or term-frequency input wordcloud accepts)
## max_words - cap on the number of words drawn; defaults to Inf (no
##             cap), generalizing the original hard-coded value
make_wordcloud <- function(corpus, max_words = Inf) {
  pal2 <- brewer.pal(8, "Dark2")
  # BUG FIX: the original passed `max.words = inf`; `inf` is not a
  # defined R object and would raise "object 'inf' not found" at call
  # time. R's infinity constant is `Inf`.
  wordcloud(corpus, scale = c(8, .2), min.freq = 3,
            max.words = max_words, random.order = FALSE,
            rot.per = .15, colors = pal2)
}
## Build Document-Term Matrix | |
## Build Document-Term Matrix
## NOTE(review): `hs_corpus` must already exist in the workspace
## (e.g. built via to_corpus()) before this section runs — confirm.
hs.tdm <- TermDocumentMatrix(hs_corpus)
## Identify Terms Used at Least 20 Times
## (the original comment said 10, but lowfreq = 20 is what the code does)
findFreqTerms(hs.tdm, lowfreq = 20)
## Find Terms That Frequently Co-Occur with "cant" (correlation >= 0.25)
findAssocs(hs.tdm, 'cant', 0.25)
## Remove Sparse Terms and Convert to Data Frame
hs2.tdm <- removeSparseTerms(hs.tdm, sparse = 0.90)
# as.matrix() is the conversion function; the original wrapped
# inspect(), which also prints the entire matrix to the console as a
# side effect before returning it
hs2.df <- as.data.frame(as.matrix(hs2.tdm))
## Scale Data and Create Distance Matrix
hs2.df.scale <- scale(hs2.df)
hs2.dis <- dist(hs2.df.scale, method = "euclidean")
## Cluster the Data (hierarchical, Ward's method)
hs.fit <- hclust(hs2.dis, method = "ward.D")
plot(hs.fit, main = "Cluster - Analytics")
## Cut the Dendrogram into Five Clusters and Draw Their Boundaries
groups <- cutree(hs.fit, k = 5)
rect.hclust(hs.fit, k = 5)
## N-gram Identifier (4-grams via RWeka)
# presumably set so parallel tm operations do not conflict with the
# RWeka/rJava tokenizer — TODO confirm this is still required
options(mc.cores = 1)
## Tokenizer producing 4-word n-grams for TermDocumentMatrix
ngramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
## NOTE(review): `hs_text` (a data frame with a "text" column) must
## already exist in the workspace before this runs — confirm.
helpscout.tdm <- TermDocumentMatrix(Corpus(VectorSource(hs_text$text)),
                                    control = list(tokenize = ngramTokenizer))
## Order Terms by Frequency
freq <- rowSums(as.matrix(helpscout.tdm))
length(freq)
# order(-freq) gives indices from most to least frequent
ord <- order(-freq)
top_twenty <- ord[1:20]
freq[top_twenty]
# Subset BEFORE inspect(): the original inspected (printed) the entire
# term-document matrix and only then took the [1:5, 1:5] corner of the
# returned object
inspect(helpscout.tdm[1:5, 1:5])
## Clean Text Function | |
## Clean Text Function
## Strips Twitter-style noise (RT/via retweet markers, @mentions,
## URLs), punctuation, digits, the "&amp;" entity residue, and extra
## whitespace, then lowercases. Empty strings are dropped, so the
## result may be shorter than the input.
##
## some_txt - character vector of raw text
## returns  - unnamed character vector of cleaned, lowercased text
clean.text <- function(some_txt) {
  # Remove retweet markers ("RT"/"via" followed by @mentions)
  some_txt <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  # Remove @mentions
  some_txt <- gsub("@\\w+", "", some_txt)
  # BUG FIX: remove URLs BEFORE stripping punctuation, and match the
  # whole URL with \\S+. The original ran http\\w+ after punctuation
  # removal, leaving URL fragments fused into surrounding text.
  some_txt <- gsub("http\\S+", "", some_txt)
  # Remove punctuation and digits
  some_txt <- gsub("[[:punct:]]", "", some_txt)
  some_txt <- gsub("[[:digit:]]", "", some_txt)
  # BUG FIX: drop the "amp" residue (from "&amp;" after punctuation
  # removal) only as a whole word; the original gsub("amp", "") also
  # mangled words containing "amp" (e.g. "example" -> "exle")
  some_txt <- gsub("\\bamp\\b", "", some_txt)
  # BUG FIX: collapse runs of spaces/tabs to a SINGLE space; the
  # original replaced them with "", gluing adjacent words together
  some_txt <- gsub("[ \t]{2,}", " ", some_txt)
  # Trim leading/trailing whitespace
  some_txt <- gsub("^\\s+|\\s+$", "", some_txt)
  # Lowercase defensively: tolower() can error on invalid multibyte
  # strings, so map failures to NA instead of aborting. (The original
  # called tolower twice — once inside tryCatch and again on success.)
  try.tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }
  some_txt <- vapply(some_txt, try.tolower, character(1))
  # Drop strings that became empty after cleaning
  some_txt <- some_txt[some_txt != ""]
  names(some_txt) <- NULL
  return(some_txt)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment