library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(RWeka)
library(ape)
## Read Text File of Conversations
get_text <- function(filename) {
  ## readLines() keeps each conversation intact; read.table() would split
  ## lines on whitespace into multiple columns
  raw <- readLines(filename)
  txt <- data.frame(text = clean.text(raw), stringsAsFactors = FALSE)
  return(txt)
}
## Convert Text to Corpus and Create Term Document Matrix
to_corpus <- function(text_df) {
  corpus <- Corpus(VectorSource(text_df$text))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  return(corpus)
}
## Create Wordcloud
make_wordcloud <- function(corpus) {
  pal2 <- brewer.pal(8, "Dark2")
  wordcloud(corpus, scale = c(8, .2), min.freq = 3, max.words = Inf,
            random.order = FALSE, rot.per = .15, colors = pal2)
}
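## Example usage (a sketch, not in the original gist): "conversations.txt" is a
## placeholder filename. clean.text() is defined at the bottom of this file, so
## source the whole script before running these lines.
# hs_text <- get_text("conversations.txt")
# hs_corpus <- to_corpus(hs_text)
# make_wordcloud(hs_corpus)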
## Build Document-Term Matrix
hs.tdm <- TermDocumentMatrix(hs_corpus)
## Identify Terms Used at Least 20 Times
findFreqTerms(hs.tdm, lowfreq = 20)
## Find Terms Correlated with "cant" (correlation of at least 0.25)
findAssocs(hs.tdm, "cant", 0.25)
## Remove Sparse Terms and Convert to Data Frame
hs2.tdm <- removeSparseTerms(hs.tdm, sparse = 0.90)
## as.matrix() returns the term counts; inspect() prints rather than returning a plain matrix
hs2.df <- as.data.frame(as.matrix(hs2.tdm))
## Scale Data and Create Distance Matrix
hs2.df.scale <- scale(hs2.df)
hs2.dis <- dist(hs2.df.scale, method="euclidean")
## Cluster the Data
hs.fit <- hclust(hs2.dis, method="ward.D")
plot(hs.fit,main="Cluster - Analytics")
## Five Clusters
groups <- cutree(hs.fit,k=5)
rect.hclust(hs.fit,k=5)
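## Sketch (not in the original gist): inspect cluster membership.
## cutree() returns a named vector, so split() groups term names by cluster id.
table(groups)
split(names(groups), groups)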
## N-gram Identifier (four-grams)
options(mc.cores = 1) # RWeka tokenizers can fail under tm's parallel backend
ngramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
helpscout.tdm <- TermDocumentMatrix(Corpus(VectorSource(hs_text$text)),
                                    control = list(tokenize = ngramTokenizer))
## Order Terms by Frequency
freq <- rowSums(as.matrix(helpscout.tdm))
length(freq)
ord <- order(-freq)
top_twenty <- ord[1:20]
freq[top_twenty]
inspect(helpscout.tdm[1:5, 1:5])
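## Sketch (not in the original gist): bar chart of the top twenty four-grams.
## las = 2 rotates the labels; cex.names shrinks them so long n-grams fit.
barplot(freq[top_twenty], las = 2, cex.names = 0.6, main = "Top 20 Four-Grams")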
## Clean Text Function
clean.text <- function(some_txt) {
  some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt) # retweet markers
  some_txt = gsub("@\\w+", "", some_txt)                       # @-handles
  some_txt = gsub("[[:punct:]]", "", some_txt)                 # punctuation
  some_txt = gsub("[[:digit:]]", "", some_txt)                 # digits
  some_txt = gsub("http\\w+", "", some_txt)                    # URLs (punctuation already stripped)
  some_txt = gsub("[ \t]{2,}", " ", some_txt)                  # collapse runs of whitespace to one space
  some_txt = gsub("^\\s+|\\s+$", "", some_txt)                 # trim leading/trailing whitespace
  some_txt = gsub("\\bamp\\b", "", some_txt)                   # leftover "&amp;"; word-bounded so words like "example" survive
  # lowercase, skipping any strings tolower() can't handle
  try.tolower = function(x) {
    y = NA
    try_error = tryCatch(tolower(x), error = function(e) e)
    if (!inherits(try_error, "error"))
      y = tolower(x)
    return(y)
  }
  some_txt = sapply(some_txt, try.tolower)
  some_txt = some_txt[some_txt != ""]
  names(some_txt) = NULL
  return(some_txt)
}
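## Example (a sketch; the input string is made up for illustration): this drops
## the retweet prefix, handle, URL, digits, punctuation, and the HTML-entity
## leftover "amp", then lowercases what remains.
# clean.text("RT @user: 5 tips at http://example.com &amp; more!")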