Basic Text Analytics for NPS Open-Ended Responses
library("tm")
library("wordcloud")
library("SnowballC")
library("RColorBrewer")
library("tcltk2")
# Choose a working folder and the input CSV of open-ended NPS responses
setwd(tk_choose.dir(getwd(), "Choose a suitable folder"))
db <- file.choose()
data <- read.csv(db, stringsAsFactors = FALSE)
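# The script assumes the chosen CSV holds the responses in two character columns
# named "Promoters" and "Detractors" (one response per row). This check is an
# illustrative addition, not part of the original gist, to flag a mismatched file early.
stopifnot(all(c("Promoters", "Detractors") %in% names(data)))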
#Create text corpora for Promoters and Detractors
PromoCorp <- Corpus(VectorSource(data$Promoters))
DetraCorp <- Corpus(VectorSource(data$Detractors))
set.seed(169)
#Create word cloud for Promoters
PromoCorp <- tm_map(PromoCorp, removePunctuation)
PromoCorp <- tm_map(PromoCorp, removeWords, c(stopwords('english')))
#PromoCorp <- tm_map(PromoCorp, stemDocument)
png("wordcloud_promoter.png", width=1000,height=1000)
wordcloud(PromoCorp, max.words=300, random.order=FALSE, colors=brewer.pal(8,"Dark2"), rot.per=0)
dev.off()
#Create word cloud for Detractors
DetraCorp <- tm_map(DetraCorp, removePunctuation)
DetraCorp <- tm_map(DetraCorp, removeWords, c(stopwords('english'), 'dont'))
#DetraCorp <- tm_map(DetraCorp, stemDocument)
png("wordcloud_detractor.png", width=1000,height=1000)
wordcloud(DetraCorp, max.words=300, random.order=FALSE, colors=brewer.pal(8,"Dark2"), rot.per=0)
dev.off()
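# Illustrative extra (not in the original gist): a single comparison cloud that
# contrasts Promoter and Detractor vocabulary in one image. comparison.cloud()
# comes from the already-loaded wordcloud package; the object and file names
# below ("m_both", "wordcloud_compare.png") are hypothetical.
m_pro <- rowSums(as.matrix(TermDocumentMatrix(PromoCorp)))
m_det <- rowSums(as.matrix(TermDocumentMatrix(DetraCorp)))
terms <- union(names(m_pro), names(m_det))
m_both <- cbind(Promoters = m_pro[terms], Detractors = m_det[terms])
m_both[is.na(m_both)] <- 0
rownames(m_both) <- terms
png("wordcloud_compare.png", width=1000, height=1000)
comparison.cloud(m_both, max.words=200)
dev.off()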
#Create word frequency table for Promoters
tdmpro = TermDocumentMatrix(PromoCorp)
# tdmpro <- removeSparseTerms(tdmpro,0.1)
mpro = as.matrix(tdmpro)
word_freqsp = sort(rowSums(mpro),decreasing = TRUE)
dmpro = data.frame(word=names(word_freqsp),freqp=word_freqsp)
write.csv(file="dmpro.csv",x=dmpro)
#Create word frequency table for Detractors
tdmdet = TermDocumentMatrix(DetraCorp)
# tdmdet <- removeSparseTerms(tdmdet,0.1)
mdet = as.matrix(tdmdet)
word_freqsd = sort(rowSums(mdet),decreasing = TRUE)
dmdet = data.frame(word=names(word_freqsd),freqd=word_freqsd)
write.csv(file="dmdet.csv",x=dmdet)
#Hierarchical clustering for Promoters
dtmpro <- DocumentTermMatrix(PromoCorp)
dtmpro <- removeSparseTerms(dtmpro,0.96)
dtmpro <- as.matrix(dtmpro)
d <- dist(t(dtmpro),method="euclidean")
fitp <- hclust(d=d, method="ward.D")
png("hcluster_promoter.png")
plot(fitp,hang=-1)
groups <- cutree(fitp, k=5)
rect.hclust(fitp, k=5, border="blue")
dev.off()
#Hierarchical clustering for Detractors
dtmdet <- DocumentTermMatrix(DetraCorp)
dtmdet <- removeSparseTerms(dtmdet,0.995)
dtmdet <- as.matrix(dtmdet)
d <- dist(t(dtmdet),method="euclidean")
fitd <- hclust(d=d, method="ward.D")
png("hcluster_detractor.png")
plot(fitd,hang=-1)
groups <- cutree(fitd, k=5)
rect.hclust(fitd, k=5, border="blue")
dev.off()
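# Illustrative extra (not in the original gist): list which terms fall into each
# of the five detractor clusters, using the named vector returned by cutree().
split(names(groups), groups)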
#Bigram tokenizer function for phrase frequency
# (ngrams() and words() come from the NLP package, attached when tm is loaded)
BigramTokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}
#Create bigram TDMs for Promoters and Detractors
# Note: newer versions of tm return a SimpleCorpus from Corpus(), which ignores
# custom tokenizers; if the bigram counts look wrong, rebuild the corpora with
# VCorpus(VectorSource(...)) instead.
tdmb_pro <- TermDocumentMatrix(PromoCorp, control=list(tokenize=BigramTokenizer))
tdmb_det <- TermDocumentMatrix(DetraCorp, control=list(tokenize=BigramTokenizer))
#Create CSV files for Promoters and Detractors Phrase Frequency Tables
bigram_pro = as.matrix(tdmb_pro)
word_freqbp = sort(rowSums(bigram_pro),decreasing = TRUE)
bigram_pro = data.frame(word=names(word_freqbp),freqp=word_freqbp)
write.csv(file="bigram_pro.csv",x=bigram_pro)
bigram_det = as.matrix(tdmb_det)
word_freqbd = sort(rowSums(bigram_det),decreasing = TRUE)
bigram_det = data.frame(word=names(word_freqbd),freqd=word_freqbd)
write.csv(file="bigram_det.csv",x=bigram_det)