Created
August 6, 2019 09:18
-
-
Save wandabwa2004/5e07d8614b7253a70fa7368949cb3cd2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Text-cleaning pipeline: normalise the corpus `docs`, build a
# document-term matrix, drop sparse terms and label the rows by year.
# NOTE(review): `docs` is assumed to be a tm corpus created earlier in the
# file, holding one document per year (2014-2019) - confirm against the
# code that builds it.

# Domain-specific words found to be repetitive with no semantic value.
wordToRemove <- c('the', 'mister', 'honourable', 'also', 'will', 'speaker')

# Base functions such as tolower() must be wrapped in content_transformer()
# so that tm_map() keeps each element a proper text document; passing
# tolower directly turns documents into bare character vectors and breaks
# DocumentTermMatrix() later on.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
# tm's built-in English stop words, e.g. "is", "the".
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeWords, wordToRemove)
# Collapse the runs of whitespace left behind by the removals above.
docs <- tm_map(docs, stripWhitespace)

# Convert text to term-matrix format for easier computations.
dtm <- DocumentTermMatrix(docs)
dim(dtm)

# Drop terms whose sparsity exceeds 0.75, i.e. keep terms that appear in
# at least roughly a quarter of the documents.
dtm <- removeSparseTerms(dtm, 0.75)
dim(dtm)

# Label each document (row) with its year. Guard against a corpus whose
# size does not match the hard-coded labels.
year_labels <- c("2014", "2015", "2016", "2017", "2018", "2019")
stopifnot(nrow(dtm) == length(year_labels))
rownames(dtm) <- year_labels
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment