Bigrams
###########################################################################################################
# get the bigrams in the queries, compute their frequencies, and create a word cloud
###########################################################################################################
#text cleaning
#import data
queries <- read.csv("texts.csv")
library(tm)
library(stringr)
library(RWeka)
#build a corpus and specify the source to be a character vector
myCorpus <- VCorpus(VectorSource(queries$text))
#convert to UTF-8 and replace invalid bytes (e.g. emoticons)
myCorpus <- tm_map(myCorpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
#convert myCorpus into lowercase
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
#remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
#remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
#remove URLs (match everything up to the next whitespace)
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
#add stopwords
mystopwords <- c(stopwords("english"), "i")
#remove stopwords
myCorpus <- tm_map(myCorpus, removeWords, mystopwords)
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
tdm
#bigram tokenizer, passed to the term-document matrix constructor
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm.bigram <- TermDocumentMatrix(myCorpus, control = list(tokenize = BigramTokenizer))
#get the frequency of the bigrams
freq <- sort(rowSums(as.matrix(tdm.bigram)), decreasing = TRUE)
freq.df <- data.frame(word = names(freq), freq = freq)
rownames(freq.df) <- NULL
head(freq.df, 30)
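#the header mentions creating a word cloud; a minimal sketch of that step,
#assuming the wordcloud and RColorBrewer packages are installed
library(wordcloud)
library(RColorBrewer)
set.seed(123)
#plot the 100 most frequent bigrams, most frequent in the centre
wordcloud(words = freq.df$word, freq = freq.df$freq, max.words = 100,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))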