public
Created

Classifying Emails as Spam or Ham using RTextTools

  • Download Gist
Jurka_03_spam_RTextTools.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
# Load the text-mining dependencies. library() is used rather than require()
# so that a missing package stops the script right here instead of surfacing
# later as a "could not find function" error at the first package call.
suppressPackageStartupMessages(library(RTextTools))
suppressPackageStartupMessages(library(tm))
# Project helper script. RegGetRNonSourceDir(), used on the next line, is not
# defined anywhere in this file, so it is presumably provided by this source()
# call -- verify.
# NOTE(review): absolute, machine-specific path; adjust it to run elsewhere.
source("C:/Users/denbrige/100 FxOption/103 FxOptionVerBack/080 Fx Git/R-source/PlusReg.R", echo=FALSE)
# Root folder of the corpus; the ham/spam sub-folders read further down are
# appended to this string, so it must keep its trailing "/".
spam.dir <- paste0(RegGetRNonSourceDir(), "spamassassin/")
# Read one e-mail file and return its body as a single string.
#
# The body is every line after the first empty line of the file (the
# header/body separator); the body lines are joined with "\n".
#
# Args:
#   path.dir: path of the message file; it is read as latin1 text.
# Returns:
#   A length-one character vector. It is "" when nothing follows the
#   separator line.
# Raises:
#   An error when the file contains no empty line at all.
get.msg <- function(path.dir)
{
  con <- file(path.dir, open="rt", encoding="latin1")
  # Registered right after opening so the connection is released on every
  # exit path, including the error raised below.
  on.exit(close(con), add=TRUE)
  text <- readLines(con)
  sep.idx <- which(text=="")[1]
  if( is.na(sep.idx) )
  {
    stop("no blank line separating header and body in ", path.dir, call.=FALSE)
  }
  # seq_len() gives an empty index when the separator is the last line, a case
  # in which seq(from, to, 1) would fail with a "wrong sign" error.
  msg <- text[sep.idx + seq_len(length(text) - sep.idx)]
  paste(msg, collapse="\n")
}
# Tolerant variant of get.msg: read one e-mail file and return its body as a
# single string, or the sentinel "Error" when the body cannot be extracted.
#
# The body is every line after the first empty line of the file, joined with
# "\n". Extraction fails, for example, when the file has no empty line.
#
# Args:
#   path.dir: path of the message file; it is read as latin1 text.
# Returns:
#   A length-one character vector: the body, or "Error" on extraction failure.
#
# No global option is modified; warnings are left to the caller's settings.
get.msg.try <- function(path.dir)
{
  con <- file(path.dir, open="rt", encoding="latin1")
  # Release the connection on every exit path.
  on.exit(close(con), add=TRUE)
  text <- readLines(con)
  # The handler yields NULL so that failure is recognised by type. Inspecting
  # the text instead would misclassify a body that really starts with "Error"
  # and would feed a multi-element condition to if().
  msg <- tryCatch( text[seq(which(text=="")[1]+1,length(text),1)],
                   error=function(e) { NULL } )
  if( is.null(msg) )
  {
    return("Error")
  }
  paste(msg, collapse="\n")
}
# Read every message file of a directory with get.msg.
#
# Args:
#   path.dir: directory to scan. It is concatenated directly with each file
#             name, so it must end with a path separator.
# Returns:
#   A character vector with one element per file, named by file name. An
#   empty directory yields a zero-length character vector.
get.all <- function(path.dir)
{
  all.file <- dir(path.dir)
  # The entry named "cmds" is skipped -- presumably it is not a message file;
  # verify against the corpus layout.
  all.file <- all.file[all.file!="cmds"]
  # vapply() pins the result to character even when the directory is empty,
  # where sapply() would return an empty list instead.
  vapply(all.file, function(p) get.msg(paste0(path.dir,p)), character(1))
}
# Read every message file of a directory with the tolerant reader get.msg.try.
#
# Args:
#   path.dir: directory to scan. It is concatenated directly with each file
#             name, so it must end with a path separator.
# Returns:
#   A character vector with one element per file, named by file name. An
#   empty directory yields a zero-length character vector.
get.all.try <- function(path.dir)
{
  all.file <- dir(path.dir)
  # The entry named "cmds" is skipped -- presumably it is not a message file;
  # verify against the corpus layout.
  all.file <- all.file[all.file!="cmds"]
  # vapply() pins the result to character even when the directory is empty,
  # where sapply() would return an empty list instead.
  vapply(all.file, function(p) get.msg.try(paste0(path.dir,p)), character(1))
}
# Load the six corpus folders found under spam.dir.
# in.corpus() prefixes a sub-folder name with the corpus root.
in.corpus <- function(sub.dir)
{
  paste0(spam.dir, sub.dir)
}

# Ham folders.
easy_ham.all   <- get.all(in.corpus("easy_ham/"))
easy_ham_2.all <- get.all(in.corpus("easy_ham_2/"))
hard_ham.all   <- get.all(in.corpus("hard_ham/"))
hard_ham_2.all <- get.all(in.corpus("hard_ham_2/"))

# Spam folders. Only "spam/" goes through the tolerant reader get.all.try;
# presumably at least one file there has no header/body separator -- verify.
spam.all   <- get.all.try(in.corpus("spam/"))
spam_2.all <- get.all(in.corpus("spam_2/"))
 
# Turn a vector of message bodies into a two-column data frame:
# "text" holds the bodies, "outcome" holds the given class code.
# Row names (the file names carried by the input vector) are dropped.
label.msg <- function(msg.all, outcome)
{
  dfr <- as.data.frame(msg.all)
  rownames(dfr) <- NULL
  dfr$outcome <- outcome
  names(dfr) <- c("text", "outcome")
  dfr
}

# Class codes used throughout the script: 2 for ham, 4 for spam.
easy_ham.dfr   <- label.msg(easy_ham.all, 2)
easy_ham_2.dfr <- label.msg(easy_ham_2.all, 2)
hard_ham.dfr   <- label.msg(hard_ham.all, 2)
hard_ham_2.dfr <- label.msg(hard_ham_2.all, 2)
spam.dfr       <- label.msg(spam.all, 4)
spam_2.dfr     <- label.msg(spam_2.all, 4)

# The first three sets come first and their row count is remembered in
# train.num; the "_2" sets are appended after them.
train.data <- rbind(easy_ham.dfr, hard_ham.dfr, spam.dfr)
train.num <- nrow(train.data)
train.data <- rbind(train.data, easy_ham_2.dfr, hard_ham_2.dfr, spam_2.dfr)
names(train.data) <- c("text", "outcome")

# Store the assembled data once; an existing file is never overwritten.
spam.str <- paste0(RegGetRNonSourceDir(),"Jurka_03_spam.rda")
if( !file.exists(spam.str) ) save(train.data, train.num, file=spam.str)
 
# Fix the RNG seed before the matrix is built and the models are trained.
set.seed(2012)
# Class codes (2 or 4, as assigned when the data frames were built) and the
# message text, taken from the combined data frame.
train_out.data <- train.data$outcome
train_txt.data <- train.data$text
 
# Build the term matrix from the message text with the options spelled out in
# the call (English, minimum word length 3, numbers and punctuation removed,
# no stemming, weightTfIdf weighting).
# NOTE(review): the variable shares its name with base R's matrix().
matrix <- create_matrix(train_txt.data, language="english", minWordLength=3, removeNumbers=TRUE, stemWords=FALSE, removePunctuation=TRUE, weighting=weightTfIdf)
# Rows 1..train.num (the rows bound first: easy_ham, hard_ham, spam) are passed
# as trainSize; the remaining rows (the "_2" sets) are passed as testSize.
container <- create_container(matrix,t(train_out.data), trainSize=1:train.num, testSize=(train.num+1):nrow(train.data), virgin=FALSE)
# Train one model per algorithm on the same container.
maxent.model <- train_model(container, "MAXENT")
svm.model <- train_model(container, "SVM")
 
# Share of the rows of `doc` whose CONSENSUS_CODE equals `code`.
code.share <- function(doc, code)
{
  nrow(doc[doc$CONSENSUS_CODE==code,]) / nrow(doc)
}

# ---- SVM ----
svm.result <- classify_model(container, svm.model)
svm.analytic <- create_analytics(container, svm.result)
svm.doc <- svm.analytic@document_summary
# Split the per-document summary by the manual code: 4 = spam, 2 = ham.
svm_spam.doc <- svm.doc[svm.doc$MANUAL_CODE==4, ]
svm_ham.doc <- svm.doc[svm.doc$MANUAL_CODE==2, ]
# Rates are relative to the number of rows in each split.
svm.true.pos <- code.share(svm_spam.doc, 4)
svm.false.neg <- code.share(svm_spam.doc, 2)
svm.true.neg <- code.share(svm_ham.doc, 2)
svm.false.pos <- code.share(svm_ham.doc, 4)

# ---- Maximum entropy ----
maxent.result <- classify_model(container, maxent.model)
maxent.analytic <- create_analytics(container, maxent.result)
maxent.doc <- maxent.analytic@document_summary
maxent_spam.doc <- maxent.doc[maxent.doc$MANUAL_CODE==4, ]
maxent_ham.doc <- maxent.doc[maxent.doc$MANUAL_CODE==2, ]
maxent.true.pos <- code.share(maxent_spam.doc, 4)
maxent.false.neg <- code.share(maxent_spam.doc, 2)
maxent.true.neg <- code.share(maxent_ham.doc, 2)
maxent.false.pos <- code.share(maxent_ham.doc, 4)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.