Last active
August 29, 2015 14:03
-
-
Save not-for-me/d3b9539e8ea81aa434a1 to your computer and use it in GitHub Desktop.
Clustering text (Bible) with R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the text-mining toolkit ----
library(tm)

# Source directories ----
# One directory of text files per corpus (presumably the Old and New
# Testament books, going by the "ot"/"nt" prefixes -- confirm).
otFilePath <- "~/Documents/mining/project/old"
ntFilePath <- "~/Documents/mining/project/new"

# Build one corpus per directory ----
# Both directories are read the same way: plain-text reader, language "en".
plain_english_reader <- list(reader = readPlain, language = "en")

oldTextCorpus <- Corpus(
  DirSource(otFilePath),
  readerControl = plain_english_reader
)
newTextCorpus <- Corpus(
  DirSource(ntFilePath),
  readerControl = plain_english_reader
)

# Quick sanity check of what was loaded.
summary(oldTextCorpus)
summary(newTextCorpus)
# Text Preprocessing ----
# SnowballC is attached up front, ahead of the stemDocument() step below.
library(SnowballC)

# Project-specific stop words removed in addition to tm's English list.
myStopWords <- c(
  "also", "among", "like", "may", "must", "shall", "take", "went", "will"
)

# Apply the cleaning pipeline to a corpus and return the cleaned corpus.
#
# corpus           A tm corpus.
# extra_stop_words Character vector of additional words to drop after the
#                  standard English stop words.
#
# The step order is unchanged from the original script: lower-case, remove
# stop words (standard list, then the extra list), remove numbers, remove
# punctuation, collapse whitespace, stem.
preprocess_corpus <- function(corpus, extra_stop_words) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removeWords, extra_stop_words)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)
}

# Both corpora go through the same pipeline so their vocabularies stay
# comparable.
oldTextCorpus <- preprocess_corpus(oldTextCorpus, myStopWords)
newTextCorpus <- preprocess_corpus(newTextCorpus, myStopWords)
# Combined corpus: the old documents followed by the new ones.
bibleCorpus <- c(oldTextCorpus, newTextCorpus)

# Document-term matrices ----
# One matrix per corpus; dim() reports the size of each as it is built.
old_dtm <- DocumentTermMatrix(oldTextCorpus)
dim(old_dtm)

new_dtm <- DocumentTermMatrix(newTextCorpus)
dim(new_dtm)

bible_dtm <- DocumentTermMatrix(bibleCorpus)
dim(bible_dtm)

# Remove Sparse Terms ----
# The same sparsity threshold is passed to removeSparseTerms() for all
# three matrices.
sparsity_threshold <- 0.8

old_stm <- removeSparseTerms(old_dtm, sparsity_threshold)
dim(old_stm)

new_stm <- removeSparseTerms(new_dtm, sparsity_threshold)
dim(new_stm)

bible_stm <- removeSparseTerms(bible_dtm, sparsity_threshold)
dim(bible_stm)

# Transposed copies of the reduced matrices ----
old_tdm <- t(old_stm)
dim(old_tdm)

new_tdm <- t(new_stm)
dim(new_tdm)

bible_tdm <- t(bible_stm)
dim(bible_tdm)
# Convert stm to dataframe ----
# Each reduced document-term matrix becomes a dense data frame with one row
# per document and one column per retained term.
#
# as.matrix() is used instead of inspect(): inspect() is a display function
# that prints the matrix to the console, and in tm >= 0.7 it returns only a
# small sample of the matrix, which would silently truncate the data that
# the clustering below is computed on.
old_df <- as.data.frame(as.matrix(old_stm))
new_df <- as.data.frame(as.matrix(new_stm))
bible_df <- as.data.frame(as.matrix(bible_stm))
## hierarchical clustering ----
# proxy is attached before dist() is called below; the "cosine" method is
# not offered by stats::dist(), so the proxy version is the one relied on.
library(proxy)
library(ggplot2)

# Every distance metric is paired with the linkage methods to try for it,
# in the order the dendrograms are drawn.
all_linkages <- c(
  "single", "complete", "average", "median",
  "centroid", "ward.D", "ward.D2", "mcquitty"
)
linkages_by_metric <- list(
  cosine    = all_linkages,
  euclidean = all_linkages,
  manhattan = all_linkages,
  canberra  = c("ward.D2", "mcquitty"),
  binary    = c("ward.D2", "mcquitty")
)
# NOTE(review): the hclust() documentation describes "centroid", "median"
# and "ward.D" as intended for squared Euclidean distances; the other
# pairings above are kept exactly as in the original script but should be
# interpreted with that in mind.

# Build a plot title that identifies corpus, metric and linkage, so the
# dendrograms can be told apart (the default title is the same for all).
dendrogram_title <- function(corpus_label, metric, linkage) {
  sprintf("%s corpus: %s distance / %s linkage", corpus_label, metric, linkage)
}

for (metric in names(linkages_by_metric)) {
  # The distance matrices depend only on the metric, so they are computed
  # once per metric rather than once per linkage.
  old_d <- dist(old_df, method = metric)
  new_d <- dist(new_df, method = metric)
  bible_d <- dist(bible_df, method = metric)

  for (linkage in linkages_by_metric[[metric]]) {
    # Plot order per combination is unchanged: old, new, then combined.
    old_hc <- hclust(old_d, method = linkage)
    plot(old_hc, main = dendrogram_title("old", metric, linkage))

    new_hc <- hclust(new_d, method = linkage)
    plot(new_hc, main = dendrogram_title("new", metric, linkage))

    bible_hc <- hclust(bible_d, method = linkage)
    plot(bible_hc, main = dendrogram_title("bible", metric, linkage))
  }
}
# As in the original script, old_d/new_d/bible_d and old_hc/new_hc/bible_hc
# end up holding the results of the last combination (binary / mcquitty).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment