# Bigram frequency analysis of query texts: clean a corpus with tm,
# tokenize into bigrams with RWeka, and rank bigrams by frequency.
###########################################################################################################
#get the bigrams in queries and getting the frequency / create wordcloud
###########################################################################################################
#text cleaning
#import data
# Import the query texts; one document per row of the `text` column.
# stringsAsFactors = FALSE keeps `text` as character (explicit for pre-R-4.0).
queries <- read.csv("texts.csv", stringsAsFactors = FALSE)
library(tm)
library(stringr)
library(RWeka)

# Build a corpus; the source is the character vector of query texts.
myCorpus <- VCorpus(VectorSource(queries$text))

# Strip invalid multi-byte characters (e.g. emoticons). The transformation
# must be wrapped in content_transformer(): a bare function passed to
# tm_map() returns plain character vectors, destroying the
# PlainTextDocument structure that TermDocumentMatrix() requires.
removeEmoticons <- function(x) iconv(enc2utf8(x), sub = "byte")
myCorpus <- tm_map(myCorpus, content_transformer(removeEmoticons))

# Convert all text to lowercase.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

# Remove URLs *before* stripping punctuation, while the "http..." tokens
# are still intact; the pattern also consumes punctuation inside the URL
# (e.g. "://", ".", "/") so no fragments survive the next steps.
removeURL <- function(x) gsub("http[[:alnum:][:punct:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))

# Remove punctuation and numbers.
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)

# English stopwords plus the bare pronoun "i".
mystopwords <- c(stopwords("english"), "i")
myCorpus <- tm_map(myCorpus, removeWords, mystopwords)

# Unigram term-document matrix. NOTE: the control option is case-sensitive
# `wordLengths` (capital L); the lowercase `wordlengths` spelling is
# silently ignored by tm, leaving the default minimum word length of 3.
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
tdm
# Tokenizer producing 2-grams only; handed to the term-document matrix
# constructor through its `control` list.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm.bigram <- TermDocumentMatrix(myCorpus, control = list(tokenize = BigramTokenizer))

# Total frequency of each bigram across all documents, highest first.
# (`<-` used for assignment, consistent with the rest of the script.)
freq <- sort(rowSums(as.matrix(tdm.bigram)), decreasing = TRUE)
freq.df <- data.frame(word = names(freq), freq = freq, stringsAsFactors = FALSE)
rownames(freq.df) <- NULL

# Show the 30 most frequent bigrams.
head(freq.df, 30)
# End of script.