# Source: GitHub gist loiyumba/f9fc491dc5ada8cf2e4b (created March 12, 2016 09:44).
# Note: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears. To review, open
# the file in an editor that reveals hidden Unicode characters.
### Word count of the Bhagavadgita text ###

# Load the necessary packages.
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less obvious error.
library(wordcloud)
library(tm)

# Working directory:
# The script no longer calls setwd(). The original call used a hard-coded,
# Windows-style relative path ("..\\Words Cloud for Bhagavadgita") that only
# exists on the author's machine. Start R from the project directory instead;
# files are read and written relative to the current working directory.
# Get the data from www.gutenberg.org, where the texts are available for free
gita <- readLines("http://www.gutenberg.org/cache/epub/2388/pg2388.txt")

# How many lines were read?
length(gita) # 3687 lines when this script was written

# Only the lines that are the actual text of the Bhagavadgita are needed; the
# surrounding notes are dropped. By reading through the downloaded lines, the
# text was found to start at line 170 and end at line 3229.
text_lines <- 170:3229

# Fail loudly if the downloaded file is shorter than expected; indexing past
# the end would otherwise silently fill `gita` with NA entries.
stopifnot(length(gita) >= max(text_lines))

gita <- gita[text_lines] # keep only the actual text
head(gita) # show the first 6 lines
tail(gita) # show the last 6 lines
#### Not Run ####
# Bring all the text into one blob of text
# gita_blob <- paste(gita, sep = "", collapse = " ")
# gita_blob
##################

# Replace every punctuation character with a space
gita <- gsub(pattern = "[[:punct:]]", replacement = " ", x = gita)

# Lower-case all the text
gita <- tolower(gita)

# Keep a local copy of the cleaned text.
# The data is plain text (one line per element), so it is written with
# writeLines() and read back with readLines(); this keeps `gita` a character
# vector. A write.csv()/read.csv() round trip would instead return a
# one-column data frame, which is not a valid input for a vector source.
writeLines(gita, "gita.txt")

# Read it back from local disk
gita <- readLines("gita.txt")
# Create the corpus.
# VectorSource() expects a plain vector with one document per element, so
# flatten `gita` to a character vector first; this also covers the case where
# it was loaded as a one-column data frame (e.g. via read.csv()).
gita_lines <- as.character(unlist(gita, use.names = FALSE))
gita_corpus <- Corpus(VectorSource(gita_lines))

# Clean-up transformations
gita_corpus <- tm_map(gita_corpus, stripWhitespace) # collapse repeated whitespace
gita_corpus <- tm_map(gita_corpus, removeWords, stopwords()) # remove the stop words
gita_corpus <- tm_map(gita_corpus, removeWords, c("chapter")) # remove the word "chapter"

# NOTE: the former `tm_map(gita_corpus, PlainTextDocument)` step was removed.
# Only tm's own transformations are applied above, so the documents need no
# conversion, and on recent tm versions that call replaces the corpus content
# with a single document object and breaks DocumentTermMatrix().

# Convert the corpus to a document-term matrix and count each term
gita_DTM <- DocumentTermMatrix(gita_corpus)
gita_mat <- as.matrix(gita_DTM)

# Total count of each word over all documents, most frequent first
gita_word_freq <- sort(colSums(gita_mat), decreasing = TRUE)

# Put the counts into a data frame; keep `word` as character on every R version
gita_df <- data.frame(
  word = names(gita_word_freq),
  freq = gita_word_freq,
  stringsAsFactors = FALSE
)
#### Not Run ####
# Alternative: the same counts computed from a term-document matrix
# gita_tdm <- TermDocumentMatrix(gita_corpus)
# gita_tdm <- as.matrix(gita_tdm)
# gita_tdm_freq <- sort(rowSums(gita_tdm), decreasing = TRUE)
# gita_tdm_df <- data.frame(word = names(gita_tdm_freq), freq = gita_tdm_freq)
# row.names(gita_tdm_df) <- NULL
####################################

# Word cloud: most frequent words are placed first (random.order = FALSE) and
# no cap is put on the number of words drawn (max.words = Inf).
wordcloud(
  gita_df$word,
  gita_df$freq,
  random.order = FALSE,
  colors = brewer.pal(6, "Dark2"),
  max.words = Inf
)

# Drop the word-valued row names so the table shows plain row numbers
row.names(gita_df) <- NULL

# Table of the 20 most frequent words.
# head() returns at most 20 rows; `gita_df[1:20, ]` would pad the table with
# NA rows if fewer than 20 words were found.
pander::pandoc.table(head(gita_df, 20))
# (End of gist. The GitHub page footer with sign-up / sign-in prompts is not part of the script.)