Created
July 23, 2015 14:13
-
-
Save jwinternheimer/7ef0afd84b29976a5632 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tm);library(SnowballC); library(wordcloud); library(RColorBrewer);library(RWeka) | |
library(ape) | |
## Read Text File of Conversations | |
## Read Text File of Conversations
## Reads one conversation per line and returns a one-column data frame
## named "text" of cleaned strings. Depends on clean.text() defined
## elsewhere in this file.
get_text <- function(filename) {
  # sep = "\n" keeps each line as a single field; read.table's default
  # whitespace separator would split multi-word lines into several
  # columns and break the single-column rename below. quote = ""
  # prevents apostrophes in conversational text from being treated as
  # string delimiters.
  txt <- read.table(filename, header = FALSE, sep = "\n",
                    quote = "", stringsAsFactors = FALSE)
  names(txt) <- "text"
  # Keep cleaned text as character, not factor
  txt <- as.data.frame(clean.text(txt$text), stringsAsFactors = FALSE)
  names(txt) <- "text"
  return(txt)
}
## Convert Text to Corpus and Create Term Document Matrix | |
## Convert Text to Corpus and Create Term Document Matrix
## Builds a tm Corpus from the "text" column of a data frame and
## removes English stopwords. Returns the filtered corpus.
to_corpus <- function(text_df) {
  docs <- Corpus(VectorSource(text_df$text))
  # tm_map applies removeWords across every document in the corpus;
  # the result of the last expression is returned
  tm_map(docs, removeWords, stopwords("english"))
}
## Create Wordcloud | |
## Create Wordcloud
## Renders a word cloud from a tm corpus using the Dark2 palette.
##
## corpus    - a tm Corpus (or term-frequency input wordcloud accepts)
## max_words - cap on the number of words drawn; defaults to Inf (no
##             cap), generalizing the original hard-coded value
make_wordcloud <- function(corpus, max_words = Inf) {
  pal2 <- brewer.pal(8, "Dark2")
  # BUG FIX: the original passed `max.words = inf`; `inf` is not a
  # defined R object and would raise "object 'inf' not found" at call
  # time. R's infinity constant is `Inf`.
  wordcloud(corpus, scale = c(8, .2), min.freq = 3,
            max.words = max_words, random.order = FALSE,
            rot.per = .15, colors = pal2)
}
## Build Document-Term Matrix | |
## Build Document-Term Matrix
## NOTE(review): `hs_corpus` must already exist in the workspace
## (e.g. built via to_corpus()) before this section runs — confirm.
hs.tdm <- TermDocumentMatrix(hs_corpus)
## Identify Terms Used at Least 20 Times
## (the original comment said 10, but lowfreq = 20 is what the code does)
findFreqTerms(hs.tdm, lowfreq = 20)
## Find Terms That Frequently Co-Occur with "cant" (correlation >= 0.25)
findAssocs(hs.tdm, 'cant', 0.25)
## Remove Sparse Terms and Convert to Data Frame
hs2.tdm <- removeSparseTerms(hs.tdm, sparse = 0.90)
# as.matrix() is the conversion function; the original wrapped
# inspect(), which also prints the entire matrix to the console as a
# side effect before returning it
hs2.df <- as.data.frame(as.matrix(hs2.tdm))
## Scale Data and Create Distance Matrix
hs2.df.scale <- scale(hs2.df)
hs2.dis <- dist(hs2.df.scale, method = "euclidean")
## Cluster the Data (hierarchical, Ward's method)
hs.fit <- hclust(hs2.dis, method = "ward.D")
plot(hs.fit, main = "Cluster - Analytics")
## Cut the Dendrogram into Five Clusters and Draw Their Boundaries
groups <- cutree(hs.fit, k = 5)
rect.hclust(hs.fit, k = 5)
## N-gram Identifier (4-grams via RWeka)
# presumably set so parallel tm operations do not conflict with the
# RWeka/rJava tokenizer — TODO confirm this is still required
options(mc.cores = 1)
## Tokenizer producing 4-word n-grams for TermDocumentMatrix
ngramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
## NOTE(review): `hs_text` (a data frame with a "text" column) must
## already exist in the workspace before this runs — confirm.
helpscout.tdm <- TermDocumentMatrix(Corpus(VectorSource(hs_text$text)),
                                    control = list(tokenize = ngramTokenizer))
## Order Terms by Frequency
freq <- rowSums(as.matrix(helpscout.tdm))
length(freq)
# order(-freq) gives indices from most to least frequent
ord <- order(-freq)
top_twenty <- ord[1:20]
freq[top_twenty]
# Subset BEFORE inspect(): the original inspected (printed) the entire
# term-document matrix and only then took the [1:5, 1:5] corner of the
# returned object
inspect(helpscout.tdm[1:5, 1:5])
## Clean Text Function | |
## Clean Text Function
## Strips Twitter-style noise (RT/via retweet markers, @mentions,
## URLs), punctuation, digits, the "&amp;" entity residue, and extra
## whitespace, then lowercases. Empty strings are dropped, so the
## result may be shorter than the input.
##
## some_txt - character vector of raw text
## returns  - unnamed character vector of cleaned, lowercased text
clean.text <- function(some_txt) {
  # Remove retweet markers ("RT"/"via" followed by @mentions)
  some_txt <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  # Remove @mentions
  some_txt <- gsub("@\\w+", "", some_txt)
  # BUG FIX: remove URLs BEFORE stripping punctuation, and match the
  # whole URL with \\S+. The original ran http\\w+ after punctuation
  # removal, leaving URL fragments fused into surrounding text.
  some_txt <- gsub("http\\S+", "", some_txt)
  # Remove punctuation and digits
  some_txt <- gsub("[[:punct:]]", "", some_txt)
  some_txt <- gsub("[[:digit:]]", "", some_txt)
  # BUG FIX: drop the "amp" residue (from "&amp;" after punctuation
  # removal) only as a whole word; the original gsub("amp", "") also
  # mangled words containing "amp" (e.g. "example" -> "exle")
  some_txt <- gsub("\\bamp\\b", "", some_txt)
  # BUG FIX: collapse runs of spaces/tabs to a SINGLE space; the
  # original replaced them with "", gluing adjacent words together
  some_txt <- gsub("[ \t]{2,}", " ", some_txt)
  # Trim leading/trailing whitespace
  some_txt <- gsub("^\\s+|\\s+$", "", some_txt)
  # Lowercase defensively: tolower() can error on invalid multibyte
  # strings, so map failures to NA instead of aborting. (The original
  # called tolower twice — once inside tryCatch and again on success.)
  try.tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }
  some_txt <- vapply(some_txt, try.tolower, character(1))
  # Drop strings that became empty after cleaning
  some_txt <- some_txt[some_txt != ""]
  names(some_txt) <- NULL
  return(some_txt)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment