Skip to content

Instantly share code, notes, and snippets.

@yabyzq
Created July 10, 2017 10:48
Show Gist options
  • Save yabyzq/6d27a5da39b0e7b9ca8df26604174704 to your computer and use it in GitHub Desktop.
Save yabyzq/6d27a5da39b0e7b9ca8df26604174704 to your computer and use it in GitHub Desktop.
TFIDF examples
library(RODBC)
library(tm)
library(wordcloud)
library(ggplot2)
library(ROracle)
# Load the free-text feedback that every analysis below is built on.
# NOTE(review): the path is machine-specific; adjust before running elsewhere.
a <- read.csv(
  file = 'C:/Users/eye1/Desktop/text.csv',
  # Keep text columns as character; R < 4.0 would otherwise create factors.
  stringsAsFactors = FALSE
)
# Rename only the first column, so a CSV with extra columns does not end up
# with NA column names.
names(a)[1] <- 'feedback'
narrative <- a$feedback
# Quick sanity check of what was loaded.
str(narrative)
head(narrative)
# Build the corpus from the feedback text and normalise it before any
# document-term matrix is created.
# as.character() guards against `narrative` being a factor, in which case the
# corpus should hold the labels rather than the factor's internal codes.
corpus <- Corpus(VectorSource(as.character(narrative)))
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words while apostrophes are still present: the English stop-word
# list contains contractions (e.g. "don't"), which no longer match once
# punctuation has been stripped. The extra terms are domain-specific noise.
corpus <- tm_map(
  corpus,
  removeWords,
  c(stopwords("english"), "australia", "aust", "limited")
)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Collapse whitespace last, so gaps left by the removals above are cleaned up.
corpus <- tm_map(corpus, stripWhitespace)
# Keep the unstemmed corpus; it is the dictionary for the (disabled) stem
# completion step below.
dictCorpus <- corpus
corpus <- tm_map(corpus, stemDocument)
# corpus <- tm_map(corpus, stemCompletion, dictionary = dictCorpus)
#1 Simple Word Count
# Raw term counts per document, restricted to terms of 4 to 20 characters.
# Optional pruning of rare terms that was left disabled in the original:
#   bounds = list(global = c(10, Inf))  ## c(floor(length(corpus) * 0.05))
dtmSimple <- DocumentTermMatrix(corpus, control = list(wordLengths = c(4, 20)))
# NOTE(review): as.matrix() densifies the matrix; this may be memory-hungry on
# a large corpus.
dtmSMatrix <- as.matrix(dtmSimple)
# Total occurrences of each term across all documents, most frequent first.
freqSimple <- sort(colSums(dtmSMatrix), decreasing = TRUE)
head(freqSimple, 15)
tail(freqSimple) ## findFreqTerms(dtm4, lowfreq = 50)
# head() instead of [1:20]: with fewer than 20 terms, indexing past the end
# would pad the word cloud input with NA.
topSimple <- head(freqSimple, 20)
wordcloud(names(topSimple), topSimple, colors = brewer.pal(6, "Dark2"))
# Terms that occur at least 10 times overall.
findFreqTerms(dtmSimple, 10)
# The corpus was stemmed, so the matrix holds "call", not "calls"; stem the
# query term the same way or the lookup can never match.
findAssocs(dtmSimple, stemDocument("calls"), 0.1)
#2 Using tfidf
# Same term filter as the simple count, but cells hold normalised TF-IDF
# weights instead of raw counts.
dtmTfidf <- DocumentTermMatrix(
  corpus,
  control = list(
    wordLengths = c(4, 20),
    # bounds = list(global = c(10, Inf)),
    weighting = function(x) weightTfIdf(x, normalize = TRUE)
  )
)
dtmTMatrix <- as.matrix(dtmTfidf)
#Full Document Frequency
# Sum of each term's TF-IDF weight over all documents, highest first.
freqTfidf <- sort(colSums(dtmTMatrix), decreasing = TRUE)
head(freqTfidf, 15)
tail(freqTfidf) ## findFreqTerms(dtm4, lowfreq = 50)
# dissimilarity(dtmObject, method = "cosine")
#Top Words
# head() instead of [1:15]: with fewer than 15 terms, indexing past the end
# would add NA terms/values to the word cloud and the bar chart.
topTfidf <- head(freqTfidf, 15)
wordcloud(names(topTfidf), topTfidf, colors = brewer.pal(6, "Dark2"))
wf <- data.frame(term = names(topTfidf), occurrences = topTfidf)
# Bar chart of the top terms, ordered by descending weight.
# NOTE(review): 'grey100' is white, so text that inherits this colour may be
# invisible on a white background -- confirm this is intended.
ggplot(wf, aes(reorder(term, -occurrences), occurrences)) +
  geom_bar(stat = "identity", fill = 'lightblue') +
  theme(
    text = element_text(size = 15, colour = 'grey100'),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.background = element_rect(fill = "white")
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment