Skip to content

Instantly share code, notes, and snippets.

@not-for-me
Last active August 29, 2015 14:03
Show Gist options
  • Save not-for-me/d3b9539e8ea81aa434a1 to your computer and use it in GitHub Desktop.
Save not-for-me/d3b9539e8ea81aa434a1 to your computer and use it in GitHub Desktop.
Clustering text (Bible) with R
# Library Load
library(tm)
# Directories holding the plain-text source documents for each corpus
otFilePath <- "~/Documents/mining/project/old"
ntFilePath <- "~/Documents/mining/project/new"

# Read every file found in each directory into a tm corpus.
# Both corpora use the same reader settings: plain text, English.
reader_control <- list(reader = readPlain, language = "en")
oldTextCorpus <- Corpus(
  DirSource(otFilePath),
  readerControl = reader_control
)
newTextCorpus <- Corpus(
  DirSource(ntFilePath),
  readerControl = reader_control
)

# Quick overview of what was loaded
summary(oldTextCorpus)
summary(newTextCorpus)
# Text Preprocessing ----
library(SnowballC) # loaded before stemDocument() is applied below

# Extra stop words removed in addition to tm's built-in English list
myStopWords <- c(
  "also", "among", "like", "may", "must", "shall", "take", "went", "will"
)

# Apply the cleaning pipeline to one corpus and return the cleaned corpus.
#
# Steps, in order: lower-case, drop English stop words, drop the extra
# stop words, drop numbers, drop punctuation, collapse whitespace, stem.
# Stop words are removed before punctuation so that entries containing
# apostrophes in the stop word list can still match the text.
#
# corpus           A tm corpus.
# extra_stopwords  Character vector of additional words to remove.
preprocess_corpus <- function(corpus, extra_stopwords = myStopWords) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removeWords, extra_stopwords)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)
}

# Both corpora go through exactly the same pipeline
oldTextCorpus <- preprocess_corpus(oldTextCorpus)
newTextCorpus <- preprocess_corpus(newTextCorpus)

# Combined corpus: the documents of the old corpus followed by the new one
bibleCorpus <- c(oldTextCorpus, newTextCorpus)
# Build one document-term matrix per corpus and report its dimensions
old_dtm <- DocumentTermMatrix(oldTextCorpus)
dim(old_dtm)
new_dtm <- DocumentTermMatrix(newTextCorpus)
dim(new_dtm)
bible_dtm <- DocumentTermMatrix(bibleCorpus)
dim(bible_dtm)

# Remove Sparse Terms
# 0.8 is the maximal sparsity allowed for a term to be kept
old_stm <- removeSparseTerms(old_dtm, 0.8)
dim(old_stm)
new_stm <- removeSparseTerms(new_dtm, 0.8)
dim(new_stm)
bible_stm <- removeSparseTerms(bible_dtm, 0.8)
dim(bible_stm)

# Transposed copies of the reduced matrices
old_tdm <- t(old_stm)
dim(old_tdm)
new_tdm <- t(new_stm)
dim(new_tdm)
bible_tdm <- t(bible_stm)
dim(bible_tdm)

# Convert stm to dataframe
# as.matrix() converts the complete sparse matrix to a dense one.
# inspect() is a console display helper; in current tm releases it only
# shows a sample of the matrix, so it must not be used for conversion.
old_df <- as.data.frame(as.matrix(old_stm))
new_df <- as.data.frame(as.matrix(new_stm))
bible_df <- as.data.frame(as.matrix(bible_stm))
## hierarchical clustering
library(proxy)
library(ggplot2)

# Every combination below produces three dendrograms, in this order:
# old corpus, new corpus, combined (bible) corpus.
#
# proxy::dist is called explicitly: "cosine" is a proxy distance measure,
# and library(proxy) masks the dist() from the stats package anyway.

# Linkage methods tried with the cosine, euclidean and manhattan distances
all_linkages <- c(
  "single", "complete", "average", "median",
  "centroid", "ward.D", "ward.D2", "mcquitty"
)

# Distance metric -> linkage methods to run with it (order is preserved).
# canberra and binary are only explored with two linkages.
linkages_by_distance <- list(
  cosine    = all_linkages,
  euclidean = all_linkages,
  manhattan = all_linkages,
  canberra  = c("ward.D2", "mcquitty"),
  binary    = c("ward.D2", "mcquitty")
)

# NOTE(review): the hclust documentation describes "centroid", "median"
# and "ward.D" in terms of squared Euclidean distances; the dendrograms
# they give on the unsquared distances used here should be read with care.

for (dist_method in names(linkages_by_distance)) {
  # The distance matrices depend only on the metric, so compute them once
  # per metric rather than once per linkage method.
  old_d <- proxy::dist(old_df, method = dist_method)
  new_d <- proxy::dist(new_df, method = dist_method)
  bible_d <- proxy::dist(bible_df, method = dist_method)

  for (linkage in linkages_by_distance[[dist_method]]) {
    # Shown in each plot title so the dendrograms can be told apart
    run_label <- paste0(dist_method, " distance / ", linkage, " method")

    old_hc <- hclust(old_d, method = linkage)
    plot(old_hc, main = paste0("old: ", run_label))

    new_hc <- hclust(new_d, method = linkage)
    plot(new_hc, main = paste0("new: ", run_label))

    bible_hc <- hclust(bible_d, method = linkage)
    plot(bible_hc, main = paste0("bible: ", run_label))
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment