# Source: GitHub gist rterp/BayesScript.R (author @rterp, last active Mar 1, 2017).
# Trains a naive Bayes classifier that predicts a shipment's company
# abbreviation from its short description text.
# Install the packages this script loads, but only those that are not already
# available, so re-running the script does not reinstall everything.
# (The original last line, `install.package""`, was a syntax error and the
# wordcloud package loaded below was never installed.)
required.packages <- c("tm", "e1071", "gmodels", "wordcloud")
is.installed <- vapply(
  required.packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (any(!is.installed)) {
  install.packages(required.packages[!is.installed])
}
library(tm)
library(e1071)
library(gmodels)
library(wordcloud)
# Fix the RNG seed so the random steps in this script are reproducible.
set.seed(123)
# Read the pipe-delimited shipment file, keeping text columns as character.
shipment.data.all <- read.table(
  file = "ShipmentsDescOnly.csv",
  header = TRUE,
  sep = "|",
  stringsAsFactors = FALSE
)
# Randomly permute the rows so the shipments aren't in any particular order.
shuffled.rows <- sample(nrow(shipment.data.all))
shipment.data.all <- shipment.data.all[shuffled.rows, ]
# Store the company abbreviation as a categorical variable.
shipment.data.all$CompanyAbbreviation <- factor(
  shipment.data.all$CompanyAbbreviation
)
# Build a word cloud for each company.
# The train/test split has not been made yet at this point in the script, so
# the subsets are taken from the full data set (the original referenced
# shipment.data.train here, which does not exist yet and stopped the script).
aml <- subset(shipment.data.all, CompanyAbbreviation == "AML")
lint <- subset(shipment.data.all, CompanyAbbreviation == "LINT")
ltia <- subset(shipment.data.all, CompanyAbbreviation == "LTIA")
awe <- subset(shipment.data.all, CompanyAbbreviation == "AWE")
ltii <- subset(shipment.data.all, CompanyAbbreviation == "LTII")
lac <- subset(shipment.data.all, CompanyAbbreviation == "LAC")
# Draw the clouds in the same order as before, with the same settings.
for (company.shipments in list(aml, lint, ltia, awe, ltii, lac)) {
  # Skip companies with no shipments: there is no text to plot.
  if (nrow(company.shipments) > 0) {
    wordcloud(
      company.shipments$ShortDescription,
      max.words = 40,
      scale = c(3, 0.5)
    )
  }
}
# Clean up the description text before building the term matrix: lower-case
# it, then strip numbers, stop words, punctuation and redundant whitespace.
corpus <- Corpus(VectorSource(shipment.data.all$ShortDescription))
cleaning.steps <- list(
  function(docs) tm_map(docs, content_transformer(tolower)),
  function(docs) tm_map(docs, removeNumbers),
  function(docs) tm_map(docs, removeWords, stopwords()),
  function(docs) tm_map(docs, removePunctuation),
  function(docs) tm_map(docs, stripWhitespace)
)
# Apply the steps in order, feeding each step's output into the next one.
corpus.clean <- Reduce(
  function(docs, step) step(docs),
  cleaning.steps,
  corpus
)
document.term.matrix <- DocumentTermMatrix(corpus.clean)
# Split the data: 80% of the rows (rounded down) go to the training set and
# the remaining rows form the test set.
row.count <- nrow(shipment.data.all)
training.set.size <- floor(row.count * 0.80)
# Draw the training row numbers at random, without replacement.
training.index <- sample.int(row.count, size = training.set.size)
shipment.data.test <- shipment.data.all[-training.index, ]
shipment.data.train <- shipment.data.all[training.index, ]
# Slice the cleaned corpus and the term matrix with the same row indices that
# were used for the data frame, so the pieces stay aligned.
corpus.train <- corpus.clean[training.index]
corpus.test <- corpus.clean[-training.index]
dtm.train <- document.term.matrix[training.index, ]
dtm.test <- document.term.matrix[-training.index, ]
# Dictionary of terms: findFreqTerms on the training matrix with a lower
# frequency bound of 5.
shipment.dict <- findFreqTerms(dtm.train, lowfreq = 5)
# Recode a vector of term counts as a presence indicator.
#   x: vector of counts.
# Returns a factor the same length as x with levels "No" (count not above 0)
# and "Yes" (count above 0).
convert_counts <- function(x) {
  factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
# Rebuild the term matrices restricted to the dictionary terms, then recode
# every column from counts to "No"/"Yes" with convert_counts.
dictionary.control <- list(dictionary = shipment.dict)
shipments.train <- apply(
  DocumentTermMatrix(corpus.train, control = dictionary.control),
  MARGIN = 2,
  FUN = convert_counts
)
shipments.test <- apply(
  DocumentTermMatrix(corpus.test, control = dictionary.control),
  MARGIN = 2,
  FUN = convert_counts
)
# Fit the naive Bayes model on the training indicators and company labels.
model <- naiveBayes(
  x = shipments.train,
  y = shipment.data.train$CompanyAbbreviation
)
# Predict the company for each held-out test shipment.
prediction <- predict(model, newdata = shipments.test)
# Cross-tabulate predicted against actual companies and print the table.
CrossTable(
  x = prediction,
  y = shipment.data.test$CompanyAbbreviation,
  prop.chisq = FALSE,
  prop.t = FALSE,
  dnn = c("predicted", "actual")
)
# (End of gist. GitHub page footer: "Sign up for free to join this conversation on GitHub.")