Skip to content

Instantly share code, notes, and snippets.

@loiyumba
Created March 12, 2016 09:44
Show Gist options
  • Save loiyumba/f9fc491dc5ada8cf2e4b to your computer and use it in GitHub Desktop.
Save loiyumba/f9fc491dc5ada8cf2e4b to your computer and use it in GitHub Desktop.
### Words count of Bhagavadgita text ###
# Load the necessary packages.
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less obvious error.
library(wordcloud)
library(tm)
# Set working directory.
# The project folder is built with file.path() so the separator is portable,
# and the switch only happens when that folder exists; otherwise the script
# keeps running in the current working directory instead of aborting.
project_dir <- file.path("..", "Words Cloud for Bhagavadgita")
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Project folder not found; staying in ", getwd())
}
# Get the data from www.gutenberg.org, where the text is available for free
gita <- readLines("http://www.gutenberg.org/cache/epub/2388/pg2388.txt")
# How many lines were read?
length(gita) # 3687 lines when this script was written
# Only the lines that make up the text of the Bhagavadgita are kept; the
# surrounding notes are dropped. The start and end lines were found by reading
# through the whole download (print `gita` interactively to re-check them).
# It starts at line 170 and ends at line 3229.
# NOTE(review): these positions depend on the exact file served by the site;
# confirm them if the download length differs from the count above.
first_line <- 170L
last_line <- 3229L
# Indexing past the end would silently append NA lines, so fail early if the
# download is shorter than the hard-coded positions expect.
stopifnot(length(gita) >= last_line)
gita <- gita[first_line:last_line] # keep only the actual text
head(gita) # read the first 6 lines
tail(gita) # read the last 6 lines
#### Not Run ####
# Bring all the texts into one blob of text
#gita_blob <- paste(gita, sep = "", collapse = " ")
#gita_blob
##################
# Replace every punctuation character with a space
gita <- gsub("[[:punct:]]", " ", gita)
# Lower-case all the text
gita <- tolower(gita)
# Save the cleaned lines to disk and read them back.
# writeLines()/readLines() keep `gita` a character vector with one element per
# line; a write.csv()/read.csv() round trip would hand back a one-column data
# frame instead of a character vector.
writeLines(gita, "gita.txt")
# Reading it from local disk
gita <- readLines("gita.txt")
# Create corpus
# VectorSource() expects a vector of texts, so `gita` is first flattened to a
# plain character vector (this works whether it is already a character vector
# or a one-column data frame) and the lines are joined into one document.
# A single document keeps the dense matrix below at one row; the per-word
# totals are the same as summing over one document per line.
gita_text <- paste(
  as.character(unlist(gita, use.names = FALSE)),
  collapse = " "
)
gita_corpus <- Corpus(VectorSource(gita_text))
# Remove the stop words and the word "chapter"
gita_corpus <- tm_map(gita_corpus, removeWords, c(stopwords(), "chapter"))
# Collapse the runs of spaces left behind by the removals
gita_corpus <- tm_map(gita_corpus, stripWhitespace)
# No tm_map(gita_corpus, PlainTextDocument) step: the text was lower-cased
# before the corpus was built, so the documents need no conversion.
# NOTE(review): with recent tm versions that step reportedly warns
# "transformation drops documents" on this kind of corpus - confirm against
# the installed tm version before reintroducing it.
# Convert it to matrix
gita_DTM <- DocumentTermMatrix(gita_corpus)
gita_mat <- as.matrix(gita_DTM)
# Get the count of each word and sort it, most frequent first
gita_word_freq <- sort(colSums(gita_mat), decreasing = TRUE)
# Change it to a data frame; keep `word` as character on every R version
gita_df <- data.frame(
  word = names(gita_word_freq),
  freq = gita_word_freq,
  stringsAsFactors = FALSE
)
#### Not Run ####
#gita_tdm <- TermDocumentMatrix(gita_corpus)
#gita_tdm <- as.matrix(gita_tdm)
#gita_tdm_freq <- sort(rowSums(gita_tdm), decreasing = TRUE)
#gita_tdm_df <- data.frame(word = names(gita_tdm_freq), freq = gita_tdm_freq)
#row.names(gita_tdm_df) <- NULL
####################################
# Wordcloud: most frequent words are placed first (random.order = FALSE) and
# no cap is put on the number of words drawn.
wordcloud(
  words = gita_df$word,
  freq = gita_df$freq,
  random.order = FALSE,
  colors = brewer.pal(6, "Dark2"),
  max.words = Inf
)
# Drop the word row names so the table shows plain row numbers
row.names(gita_df) <- NULL
# Table of the most frequent words.
# head() returns at most 20 rows, so a table with fewer than 20 words is not
# padded with NA rows the way gita_df[1:20, ] would be.
pander::pandoc.table(head(gita_df, 20))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment