# loiyumba/gita_wordcloud.r

## gita_wordcloud.r
### Words count of Bhagavadgita text ###

# Load the necessary packages.
# library() stops right away when a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful error.
library(wordcloud)
library(tm)

# Set working directory.
# file.path() builds the path with a portable separator; the "..\\" form is
# only understood on Windows.
setwd(file.path("..", "Words Cloud for Bhagavadgita"))

# Get the data from www.gutenberg.org, where the texts are available for free
gita <- readLines("http://www.gutenberg.org/cache/epub/2388/pg2388.txt")

# How many lines were read?
length(gita) # 3687 lines

# Keep only the lines that are the actual text of the Bhagavadgita and drop
# the surrounding notes. To do that we need to know on which line the text
# starts and on which line it ends.
gita # print all lines to check the start and end lines
# The text starts at line 170 and ends at line 3229.
# Guard the hard-coded range: with a shorter download the subset below would
# silently pad the result with NA instead of failing.
stopifnot(length(gita) >= 3229)
gita <- gita[170:3229] # subset the lines to extract the actual text
head(gita) # show the first 6 lines
tail(gita) # show the last 6 lines

#### Not Run ####
# Bring all the texts into one blob of text
#gita_blob <- paste(gita, sep = "", collapse = " ")
#gita_blob
##################

# Replace every punctuation character with a space
gita <- gsub(gita, pattern = "[[:punct:]]", replacement = " ")
# Lower-case all the text
gita <- tolower(gita)
# Save the cleaned text so a local copy is available on disk.
# writeLines()/readLines() keep gita a character vector with one element per
# line. The previous write.csv()/read.csv() round trip returned a one-column
# data frame (and skipped blank lines), so VectorSource() received a single
# list element instead of one document per line.
writeLines(gita, "gita.txt")
# Read it back from local disk
gita <- readLines("gita.txt")

# Create the corpus from gita
gita_corpus <- Corpus(VectorSource(gita))
gita_corpus <- tm_map(gita_corpus, removeWords, stopwords()) # remove all the stop words
gita_corpus <- tm_map(gita_corpus, removeWords, c("chapter")) # remove the word "chapter"
# Collapse repeated spaces last, so the gaps left behind by removeWords are
# cleaned up as well.
gita_corpus <- tm_map(gita_corpus, stripWhitespace)
# NOTE: the former tm_map(gita_corpus, PlainTextDocument) step was removed.
# PlainTextDocument() is a document constructor, not a content transformation;
# mapping it over the corpus replaces the corpus content with a single
# document object in current tm versions, and no step here requires it.

# Build the document-term matrix and expand it to an ordinary matrix
gita_DTM <- DocumentTermMatrix(gita_corpus)
gita_mat <- as.matrix(gita_DTM)
# Column sums give one total per term; sort so the largest totals come first
gita_word_freq <- colSums(gita_mat)
gita_word_freq <- sort(gita_word_freq, decreasing = TRUE)
# Put the terms and their totals side by side in a data frame
gita_df <- data.frame(
  word = names(gita_word_freq),
  freq = gita_word_freq
)

#### Not Run ####
#gita_tdm <- TermDocumentMatrix(gita_corpus)
#gita_tdm <- as.matrix(gita_tdm)
#gita_tdm_freq <- sort(rowSums(gita_tdm), decreasing = TRUE)
#gita_tdm_df <- data.frame(word = names(gita_tdm_freq), freq = gita_tdm_freq)
#row.names(gita_tdm_df) <- NULL
####################################

# Draw the word cloud: no cap on the number of words, plotted in decreasing
# order of frequency, coloured with six colours from the "Dark2" palette
wordcloud(
  words = gita_df[["word"]],
  freq = gita_df[["freq"]],
  max.words = Inf,
  random.order = FALSE,
  colors = brewer.pal(n = 6, name = "Dark2")
)
# Drop the row names so the table rows are simply numbered
rownames(gita_df) <- NULL
# Print the first 20 rows as a pandoc table
pander::pandoc.table(gita_df[seq_len(20), ])
	### Words count of Bhagavadgita text ###

	# Load the necessary packages.
	# library() stops right away when a package is missing; require() would only
	# return FALSE and let the script fail later with a less helpful error.
	library(wordcloud)
	library(tm)

	# Set working directory.
	# file.path() builds the path with a portable separator; the "..\\" form is
	# only understood on Windows.
	setwd(file.path("..", "Words Cloud for Bhagavadgita"))

	# Get the data from www.gutenberg.org, where the texts are available for free
	gita <- readLines("http://www.gutenberg.org/cache/epub/2388/pg2388.txt")

	# How many lines were read?
	length(gita) # 3687 lines

	# Keep only the lines that are the actual text of the Bhagavadgita and drop
	# the surrounding notes. To do that we need to know on which line the text
	# starts and on which line it ends.
	gita # print all lines to check the start and end lines
	# The text starts at line 170 and ends at line 3229.
	# Guard the hard-coded range: with a shorter download the subset below would
	# silently pad the result with NA instead of failing.
	stopifnot(length(gita) >= 3229)
	gita <- gita[170:3229] # subset the lines to extract the actual text
	head(gita) # show the first 6 lines
	tail(gita) # show the last 6 lines

	#### Not Run ####
	# Bring all the texts into one blob of text
	#gita_blob <- paste(gita, sep = "", collapse = " ")
	#gita_blob
	##################

	# Replace every punctuation character with a space
	gita <- gsub(gita, pattern = "[[:punct:]]", replacement = " ")
	# Lower-case all the text
	gita <- tolower(gita)
	# Save the cleaned text so a local copy is available on disk.
	# writeLines()/readLines() keep gita a character vector with one element per
	# line. The previous write.csv()/read.csv() round trip returned a one-column
	# data frame (and skipped blank lines), so VectorSource() received a single
	# list element instead of one document per line.
	writeLines(gita, "gita.txt")
	# Read it back from local disk
	gita <- readLines("gita.txt")

	# Create the corpus from gita
	gita_corpus <- Corpus(VectorSource(gita))
	gita_corpus <- tm_map(gita_corpus, removeWords, stopwords()) # remove all the stop words
	gita_corpus <- tm_map(gita_corpus, removeWords, c("chapter")) # remove the word "chapter"
	# Collapse repeated spaces last, so the gaps left behind by removeWords are
	# cleaned up as well.
	gita_corpus <- tm_map(gita_corpus, stripWhitespace)
	# NOTE: the former tm_map(gita_corpus, PlainTextDocument) step was removed.
	# PlainTextDocument() is a document constructor, not a content transformation;
	# mapping it over the corpus replaces the corpus content with a single
	# document object in current tm versions, and no step here requires it.

	# Build the document-term matrix and expand it to an ordinary matrix
	gita_DTM <- DocumentTermMatrix(gita_corpus)
	gita_mat <- as.matrix(gita_DTM)
	# Column sums give one total per term; sort so the largest totals come first
	gita_word_freq <- colSums(gita_mat)
	gita_word_freq <- sort(gita_word_freq, decreasing = TRUE)
	# Put the terms and their totals side by side in a data frame
	gita_df <- data.frame(
	  word = names(gita_word_freq),
	  freq = gita_word_freq
	)

	#### Not Run ####
	#gita_tdm <- TermDocumentMatrix(gita_corpus)
	#gita_tdm <- as.matrix(gita_tdm)
	#gita_tdm_freq <- sort(rowSums(gita_tdm), decreasing = TRUE)
	#gita_tdm_df <- data.frame(word = names(gita_tdm_freq), freq = gita_tdm_freq)
	#row.names(gita_tdm_df) <- NULL
	####################################

	# Draw the word cloud: no cap on the number of words, plotted in decreasing
	# order of frequency, coloured with six colours from the "Dark2" palette
	wordcloud(
	  words = gita_df[["word"]],
	  freq = gita_df[["freq"]],
	  max.words = Inf,
	  random.order = FALSE,
	  colors = brewer.pal(n = 6, name = "Dark2")
	)
	# Drop the row names so the table rows are simply numbered
	rownames(gita_df) <- NULL
	# Print the first 20 rows as a pandoc table
	pander::pandoc.table(gita_df[seq_len(20), ])