Created
December 13, 2016 13:20
-
-
Save shan4224/be57a36a36fa184af36a5d74469dbe83 to your computer and use it in GitHub Desktop.
Predicting spam messages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Classifying SMS messages as SPAM/NON-SPAM based on their content.

# Directory that holds spam.csv. It defaults to the original author's path;
# set the SMS_SPAM_DATA_DIR environment variable to run the script on another
# machine without editing it.
data_dir <- Sys.getenv(
  "SMS_SPAM_DATA_DIR",
  unset = "E:/SS/AV/OnlineHack/Kaggle/SMS Spam Collection Dataset"
)
if (!dir.exists(data_dir)) {
  stop("Data directory not found: ", data_dir, call. = FALSE)
}
# The input file is read by relative path further down, so the working
# directory is still switched here.
setwd(data_dir)

# Load libraries (order kept as in the original so function masking is unchanged)
library(readr)
library(caTools)
library(e1071)
library(randomForest)
library(rpart)
library(rpart.plot)
library(wordcloud)
library(tm)
library(SnowballC)
library(ROCR)
library(pROC)
library(RColorBrewer)
library(stringr)
library(ggplot2)
library(plotly)
library(lattice)
# Get input
sms <- read.csv("spam.csv", stringsAsFactors = FALSE)
str(sms)
# Discard the empty columns (X, X.1, X.2) in a single step; columns that are
# not present are simply ignored.
sms <- sms[, !(names(sms) %in% c("X", "X.1", "X.2")), drop = FALSE]
names(sms) <- c("label", "message")
levels(as.factor(sms$label))
# Relabel "ham" as "non-spam" ("spam" keeps its name) and store the result
# as a factor.
sms$label <- factor(ifelse(sms$label == "ham", "non-spam", sms$label))
# Text Analysis
# Clean text for analysis
# Create a bag of words from the message text
bag <- Corpus(VectorSource(sms$message))
# tolower() is a plain character function, so it is wrapped in
# content_transformer() to keep each element a tm document. This replaces the
# old `tm_map(bag, PlainTextDocument)` workaround, which can collapse the
# corpus and leave the document-term matrix with the wrong number of rows.
bag <- tm_map(bag, content_transformer(tolower))
bag <- tm_map(bag, removePunctuation)
# Punctuation is stripped before stop words are removed, so contractions such
# as "dont" and "ill" survive as terms (they show up in the frequent-term list).
bag <- tm_map(bag, removeWords, stopwords("english"))
bag <- tm_map(bag, stripWhitespace)
bag <- tm_map(bag, stemDocument)
# Convert bag of words to a data frame
frequencies <- DocumentTermMatrix(bag)
# Look at words that appear at least 150 times
findFreqTerms(frequencies, lowfreq = 150)
# [1] "back" "call" "can" "come" "day" "dont" "free" "get" "good" "got" "home" "ill" "just"
# [14] "know" "like" "lor" "love" "ltgt" "mobil" "need" "now" "one" "repli" "see" "send" "sorri"
# [27] "still" "stop" "take" "tell" "text" "think" "time" "today" "txt" "want" "week" "will"
# Drop terms whose sparsity exceeds 0.995, then densify the remaining matrix
# into a data frame with one column per term.
sparseWords <- as.data.frame(as.matrix(removeSparseTerms(frequencies, 0.995)))
# Make every term a syntactically valid R column name
names(sparseWords) <- make.names(names(sparseWords))
str(sparseWords)
# Attach the response so the models can use the `label ~ .` formula
sparseWords[["label"]] <- sms$label
# Predicting whether SMS is spam/non-spam
# Split the data 75:25 into train and test.
set.seed(256)
split <- sample.split(sparseWords$label, SplitRatio = 0.75)
train <- sparseWords[split, ]
test <- sparseWords[!split, ]
# Baseline model (predicting every message as non-spam)
table(test$label)
##
## non-spam spam
## 1206 187
# The first factor level is "non-spam", so its count over the test size is
# the accuracy of always predicting non-spam.
baseline.hits <- table(test$label)[[1]]
print(paste(
  "Predicting all messages as non-spam gives an accuracy of: ",
  100 * round(baseline.hits / nrow(test), 4),
  "%"
))
## [1] "Predicting all messages as non-spam gives an accuracy of: 86.58 %"
# Logistic Regression Model
glm.model <- glm(label ~ ., data = train, family = "binomial")
# With a two-level factor response, type = "response" gives the fitted
# probability of the second level, i.e. "spam".
glm.predict <- predict(glm.model, test, type = "response")
### ROC curve
# One ROCR prediction object serves both the AUC and the ROC plot. The
# original rebuilt it from abs(glm.predict), which is the same thing because
# the predictions are probabilities.
glm.ROCR <- prediction(glm.predict, test$label)
print(glm.AUC <- as.numeric(performance(glm.ROCR, "auc")@y.values))
## [1] 0.9617931
glm.prediction <- glm.ROCR
glm.performance <- performance(glm.prediction, "tpr", "fpr")
plot(glm.performance)
### selecting threshold = 0.9 for spam filtering
glm.threshold <- 0.9
table(test$label, glm.predict > glm.threshold)
##
## FALSE TRUE
## non-spam 1193 13
## spam 38 149
glm.accuracy.table <- as.data.frame(table(test$label, glm.predict > glm.threshold))
# Count correct predictions directly rather than by position in the flattened
# table: if no (or every) message crosses the threshold the table has a single
# column and the positional lookup would yield NA.
glm.correct <- sum((glm.predict > glm.threshold) == (test$label == "spam"))
print(paste(
  "logistic model accuracy:",
  100 * round(glm.correct / nrow(test), 4),
  "%"
))
## [1] "logistic model accuracy: 96.34 %"
# Support Vector Machine Model
svm.model <- svm(label ~ ., data = train, kernel = "linear", cost = 0.1, gamma = 0.1)
svm.predict <- predict(svm.model, test)
table(test$label, svm.predict)
## svm.predict
## non-spam spam
## non-spam 1190 16
## spam 28 159
svm.accuracy.table <- as.data.frame(table(test$label, svm.predict))
# Correct predictions sit on the diagonal of the 2x2 confusion table
svm.hits <- sum(diag(table(test$label, svm.predict)))
print(paste(
  "SVM accuracy:",
  100 * round(svm.hits / nrow(test), 4),
  "%"
))
## [1] "SVM accuracy: 96.84 %"
# Decision Trees
tree.model <- rpart(label ~ ., data = train, method = "class", minbucket = 15)
# Visualize the decision tree. It tells us about significant words.
prp(tree.model)
tree.predict <- predict(tree.model, test, type = "class")
table(test$label, tree.predict)
## tree.predict
## non-spam spam
## non-spam 1179 27
## spam 94 93
rpart.accuracy.table <- as.data.frame(table(test$label, tree.predict))
# Correct predictions sit on the diagonal of the 2x2 confusion table
tree.hits <- sum(diag(table(test$label, tree.predict)))
print(paste(
  "rpart (decision tree) accuracy:",
  100 * round(tree.hits / nrow(test), 4),
  "%"
))
## [1] "rpart (decision tree) accuracy: 91.31 %"
# SVM is the most accurate model, but rpart is the most interpretable because
# it tells us about the words that play a significant role in detecting
# whether an SMS is SPAM or NON-SPAM.
# Random Forest
set.seed(256)
rf.model <- randomForest(label ~ ., data = train, ntree = 400, mtry = 15, importance = TRUE)
# Importance tells us about significant words.
importance(rf.model)
varimp <- varImpPlot(rf.model, main = "Importance of each variable")
# Plot error
plot(rf.model, main = "Evolution of the error")
# significant predictors based on MeanDecreaseGini
#dotplot(sort(varimp[,2]),
# xlab="Variable Importance in DATA\n(predictors to right of dashed vertical line are significant)",
# panel = function(x,y){
# panel.dotplot(x, y, col='darkblue', pch=16, cex=1.1)
# panel.abline(v=abs(min(varimp)),
# col='red',
# lty='longdash', lwd=2)
# }
#)
rf.predict <- predict(rf.model, test, type = "class")
table(test$label, rf.predict)
## rf.predict
## non-spam spam
## non-spam 1199 7
## spam 37 150
rf.accuracy.table <- as.data.frame(table(test$label, rf.predict))
print(paste(
  "random forest accuracy:",
  100 * round(((rf.accuracy.table$Freq[1] + rf.accuracy.table$Freq[4]) / nrow(test)), 4),
  "%"
))
# "random forest accuracy: 96.84 %"
## Since tagging a non-spam message as spam costs more than the reverse, we can
## require a higher predicted spam probability before flagging a message.
rf.threshold <- 0.7
# From here on rf.predict holds the class-probability matrix (as in the
# original script); the "spam" column is selected by name, not by position.
rf.predict <- predict(rf.model, test, type = "prob")
table(test$label, rf.predict[, "spam"] > rf.threshold)
# FALSE TRUE
#non-spam 1206 0
#spam 53 134
# Accuracy is computed from the predictions themselves instead of counts
# copied by hand from one particular run.
rf.is.spam <- rf.predict[, "spam"] > rf.threshold
accuracy <- sum(rf.is.spam == (test$label == "spam")) / nrow(test)
accuracy
# 0.9619526
# Wordcloud
# The original script contained this section twice and stored every
# intermediate result back into `bag`, so the second copy ran
# TermDocumentMatrix() on a frequency vector instead of the corpus. The cloud
# is now built once and `bag` is left untouched as the cleaned corpus.
term.matrix <- as.matrix(TermDocumentMatrix(bag))
# Total occurrences of each term across all messages, most frequent first
term.freq <- sort(rowSums(term.matrix), decreasing = TRUE)
bag.df <- data.frame(word = names(term.freq), freq = term.freq)
str(term.freq)
## Named num [1:7804] 653 478 447 405 384 366 297 279 276 275 ...
## - attr(*, "names")= chr [1:7804] "call" "now" "get" "can" ...
# Seed set immediately before plotting so the word layout is reproducible
set.seed(154)
# NOTE(review): wordcloud's documented default is scale = c(4, .5), i.e.
# largest size first; c(0.5, 3) looks reversed. Value left as in the original,
# confirm the intent.
wordcloud(
  words = bag.df$word, freq = bag.df$freq, min.freq = 100,
  max.words = 1500, random.order = FALSE, rot.per = 0.25,
  colors = brewer.pal(8, "Dark2"),
  scale = c(0.5, 3)
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment