#Austin R Users Group Meetup - FasteR Naive Bayes
library(tm) #Text Mining
library(klaR) #Naive Bayes
library(parallel) #RCore parallel functions
library(data.table) #Improved version of data.frame with optimized helper methods like fread
library(caret) #Help us analyze the results
library(ggplot2) #Grammar of Graphics, very popular package
library(profr) #Profiling helpers; provides parse_rprof() used in section 8
#------------------------------------------------------
# Data: Download the 20 newsgroups dataset here: http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
#------------------------------------------------------
#------------------------------------------------------
# 1: Piece everything together
#------------------------------------------------------
run.nBayes <- function(dir.path) {
set.seed(23452) #Repeatable randomness
options(mc.cores=4) #Sets value globally, important for 3rd party packages
#Create initial dataset
all.data <- create.dataset.parallel(dir.path)
#Split into training vs test
training.indexes <- sample(nrow(all.data), nrow(all.data)*0.9) #Randomly select docs for our training set
training.data <- all.data[training.indexes,] #Subset all columns for matching rows
test.data <- all.data[-training.indexes,] # '-' gives non-matches
#Create TDM from just training data
training.tdm <- get.tdm(training.data$.text)
#Based on TDM, create doc 'term' vars
training.data <- create.doc.vars.parallel(training.data, training.tdm)
test.data <- create.doc.vars.parallel(test.data, training.tdm)
#Train our model
trained.model <- train.klaR.nBayes(training.data)
#Test & Analyze
predictions <- classify.klaR.nBayes.parallel(test.data, trained.model)
predicted <- predictions$class
actual <- test.data$.class
return(list(predicted=predicted, actual=actual))
}
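#Example usage (a sketch, kept commented out so sourcing this file stays side-effect free).
#The path below is an assumption -- point it at wherever you extracted the 20 newsgroups archive.
# results <- run.nBayes("~/data/20_newsgroups")
# analyze.results(results$predicted, results$actual)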
#------------------------------------------------------
# 2: Process a given document to extract information
#------------------------------------------------------
get.post.words <- function(file.path) {
#Returns all the words in the 'subject' and 'body' of a newsgroup post located at file.path and extracts the 'class'
#Extract newsgroup 'class' by taking next to last of the path when split by directory delimiter
#The subdir specifies the 'class', the files represent a single post
parts.list <- strsplit(file.path,"/") #Assumes using "/" delimiter in the path parameter!!!
newsgroup <- parts.list[[1]][length(parts.list[[1]])-1] #This shows how to index a list object
filename <- parts.list[[1]][length(parts.list[[1]])]
#Now get contents from the file
con <- file(file.path, open="rt", encoding="latin1") #Read in text mode
text.list <- readLines(con) #Now we have all the text, one line per row in the text list
close(con)
#Get the subject w/o the 'Subject:' part. This line will always start with 'Subject:'
subject <- sub("Subject:", "", grep("^Subject:", text.list, value=T)[1]) #Anchor with '^' and take the first match so body lines mentioning 'Subject:' are ignored
#Body is everything after the first blank line
blank.index <- min(which(nchar(text.list)==0)) #'which' returns the indices where the statement is TRUE
body.list <- text.list[(blank.index+1):length(text.list)] #Example of subsetting a list. Note the parentheses: ':' binds tighter than '+'
cleansed.body.list <- gsub("\n", "", body.list) #Remove newlines
cleansed.body.list <- cleansed.body.list[!is.na(cleansed.body.list)] #Remove NAs
#Get all the text as single variable
all.text <- paste(subject, cleansed.body.list, collapse=" ") #Collapse to a single 'string'. paste = concatenate
#Save the filename, class and text. filename is useful for debugging, consider dropping in large runs
result <- list(.file=filename, .class=newsgroup, .text=all.text) #A named 'list' is more efficient than a data.frame in many cases. Use '.' so we don't have columns with same name as we add 'term' columns later
return(result) #The return statement is not required, but makes code more readable
}
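#Quick illustration of the strsplit/list indexing used above (the path below is made up):
# parts <- strsplit("20_newsgroups/sci.space/61234", "/")
# parts[[1]][length(parts[[1]])-1] #"sci.space" -> the newsgroup 'class'
# parts[[1]][length(parts[[1]])]   #"61234"     -> the filename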
#------------------------------------------------------
# 3: Process directories
#------------------------------------------------------
create.dataset <- function(dir.path) {
all.files <- list.files(dir.path, recursive=T, full.names=T)
data.list <- lapply(all.files, function(i) get.post.words(i)) #Iterate over file paths in all.files, and process each file
result <- do.call(rbind.data.frame, data.list) #Turn the list of lists into a data.frame
result$.text <- as.character(result$.text) #data.frame by default turns all char to factor
return(result)
}
create.dataset.parallel <- function(dir.path) {
#A parallelized implementation of the above
all.files <- list.files(dir.path, recursive=T, full.names=T)
data.list <- mclapply(all.files, function(i) get.post.words(i)) #mclapply from package 'parallel', assigns files to each available core
result <- rbindlist(data.list) #data.table package optimized implementation of lists -> data.table. data.table inherits from data.frame and provides optimizations
result$.class <- as.factor(result$.class) #rbindlist leaves char as char, but we want .class as.factor
return(result)
}
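#Rough timing sketch to see what mclapply buys us (the path is an assumption):
# options(mc.cores=4)
# system.time(d.serial   <- create.dataset("~/data/20_newsgroups"))          #lapply, one core
# system.time(d.parallel <- create.dataset.parallel("~/data/20_newsgroups")) #mclapply across mc.cores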
#------------------------------------------------------
# 4: Build TermDocumentMatrix for a given Corpus of text
#------------------------------------------------------
get.tdm <- function(doc.vec, minDocFreq=min(10, length(doc.vec)/10)) {
doc.corpus <- Corpus(VectorSource(doc.vec))
#The order of the options in the following list can be changed, but will cause different results. Each option is 'executed' in the order given
control <- list(tolower=TRUE,
removePunctuation=TRUE,
removeNumbers=TRUE,
stopwords=TRUE,
bounds=list(global=c(minDocFreq,Inf)), #Sets the min/max number of docs in which a given term occurs
weighting=weightTf #Other options are available, such as TF-IDF
)
#TermDocumentMatrix uses mclapply under the hood, yay!!!
doc.tdm <- TermDocumentMatrix(doc.corpus, control)
return(doc.tdm)
}
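#A minimal sketch of get.tdm on a toy character vector (the strings are made up):
# toy.tdm <- get.tdm(c("The boy ran fast.", "The girl ran slow."), minDocFreq=1)
# inspect(toy.tdm)        #tm's inspect() prints the term x document counts
# toy.tdm$dimnames$Terms  #The terms we turn into columns in section 5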
#------------------------------------------------------
# 5: Translate the TDMs for a given doc into column variables
#------------------------------------------------------
create.doc.vars.parallel <- function(data, training.tdm) {
#Returns the data with one column per term in the training corpus, marking that term's occurrence ('T'/'F') in each doc
#Create a separate tdm for each of the docs
doc.tdm.list <- mclapply(data$.text, function(i) get.tdm(i, minDocFreq=1))
#Get the indexes of the matching terms, one result per doc
training.terms <- training.tdm$dimnames$Terms
#Matches are the indexes in doc.tdm.list terms that match the training.terms corpus
matches.list <- mclapply(doc.tdm.list, function(i) match(training.terms, i$dimnames$Terms))
#Create variables based on those matches
vars.list <- mclapply(1:length(matches.list), function(i) {
r <- sapply(1:length(matches.list[[i]]), function(j) { #sapply is lapply that is 'simplified' to a vector
val <- doc.tdm.list[[i]]$v[matches.list[[i]][j]]
ifelse(is.na(val), 'F', 'T')
})
return(r)
})
#mcmapply alternative (commented out): iterates over doc.tdm.list and matches.list in parallel pairs
# test.list <- mcmapply(FUN=function(i, j) {
# val <- i$v[j]
# ifelse(is.na(val), 'F', 'T')
# }, doc.tdm.list, matches.list)
#Only use lapply, task is small enough that the overhead of mclapply is not warranted
named.vars <- lapply(vars.list, function(i) setNames(i, training.tdm$dimnames$Terms))
named.vars.df <- data.frame(do.call(rbind, named.vars)) #Create a data.frame we can combine with the original. Note: can't use rbindlist as it expects a list of lists/data.frames/data.tables, and we have character vectors
result <- data.frame(c(data, named.vars.df))
return(result)
}
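#Sketch of what the result looks like (assumes training.data and training.tdm from section 1 exist):
# vars <- create.doc.vars.parallel(training.data, training.tdm)
# colnames(vars)[1:10]  #.file, .class, .text, then one 'T'/'F' column per corpus term
# table(vars$.class)    #Docs per newsgroup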
#------------------------------------------------------
# 6: Train and utilize our Naive Bayes model using klaR directly
#------------------------------------------------------
train.klaR.nBayes <- function(training.data) {
training.data.sub <- subset(training.data, select=-c(.file,.text)) #Remove the unused columns
trained.model <- NaiveBayes(formula=.class ~ .,
data=training.data.sub,
fL=0, #No Laplace correction (smoothing) on the factor frequency tables
usekernel=F) #Gaussian density for numeric vars (ignored here since all predictors are factors)
return(trained.model)
}
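#Peeking inside the fitted model (a sketch; 'dog' is just an assumed term from the corpus):
# trained.model$apriori          #Class priors estimated from the training data
# trained.model$tables[['dog']]  #Conditional table P(dog='T'/'F' | class)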
classify.klaR.nBayes.parallel <- function(test.data, trained.model) {
test.data.sub <- subset(test.data, select=-c(.file,.class,.text)) #Remove unused values
result.list <- mclapply(1:nrow(test.data.sub), function(i) fastpredict.NaiveBayes(trained.model, test.data.sub[i,]))
#Reformat the results to match basic predict.NaiveBayes return which is a named list(class, posterior) with X=nrow(test.data) list items
classes <- sapply(result.list, function(i) i$class)
posteriors <- t(sapply(result.list, function(i) i$posterior))
result.formatted <- list(class=classes, posterior=posteriors)
rownames(result.formatted$posterior) <- 1:nrow(test.data)
colnames(result.formatted$posterior) <- colnames(result.list[[1]]$posterior)
return(result.formatted)
}
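#Usage sketch (assumes test.data and trained.model from section 1):
# predictions <- classify.klaR.nBayes.parallel(test.data, trained.model)
# table(predicted=predictions$class, actual=test.data$.class) #Raw confusion counts before caret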
#------------------------------------------------------
# 7: Use caret to analyze results
#------------------------------------------------------
analyze.results <- function(predicted, actual) {
cm <- confusionMatrix(data=predicted, reference=actual) #Using package 'caret'
values <- cm$byClass[,1] #First column from all rows
classes <- sub('Class: ', '', rownames(cm$byClass))
par(mar=c(12,4,2,2)) #Extra large bottom margin
#This would go to Rplots.pdf in batch mode (ex. with Rscript)
#Use getOption("device") to see default device for the given environment (normally windows() or X11())
barplot(height=values,
names.arg=classes,
las=2,
ylab='Sensitivity',
main='Sensitivity by Newsgroup',
col=rainbow(length(classes))
)
#A ggplot example
#Very popular package, more control over how graphics are built up, good 'gallery' and documentation
plot.data <- data.frame(value=values, class=classes)
plot.ggplot <- ggplot(data=plot.data, aes(x=class, y=value, fill=class, label=value)) +
geom_bar(stat='identity') +
geom_text(size=8) +
labs(title='Sensitivity by Newsgroup', y='Sensitivity', x='Newsgroup')
#Send output to png
png(file="myplot.png") #Opens a png 'device'
print(plot.ggplot) #Need explicit print() in batch mode for ggplot graph objects, interactive mode uses implicit print()
dev.off()
}
#------------------------------------------------------
# 8: Extra Credit - Code profiling
# line.profiling only available in R 3.0.0+
#------------------------------------------------------
run.Rprof <- function(dir.path) {
# Run RProf
Rprof(filename="rprof.out", memory.profiling=F, line.profiling=T) #Be sure to 'source' code for proper line counting
data <- create.dataset(dir.path)
Rprof(NULL)
summaryRprof(filename="rprof.out", lines="show")
plot(parse_rprof("rprof.out"))
}
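#Line profiling only works if the file was sourced with source references kept.
#A sketch; the filename and path are assumptions:
# source("fasteR_nBayes.R", keep.source=TRUE)
# run.Rprof("~/data/20_newsgroups")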
#------------------------------------------------------
# 9: Speedup klaR:::predict.NaiveBayes
#------------------------------------------------------
fastpredict.NaiveBayes <- function (object, newdata, threshold = 0.001, ...)
{
if (missing(newdata))
newdata <- object$x
if ((!any(is.null(colnames(newdata)), is.null(object$varnames))) &&
all(is.element(object$varnames, colnames(newdata))))
newdata <- data.frame(newdata[, object$varnames])
nattribs <- ncol(newdata)
isnumeric <- sapply(newdata, is.numeric)
newdata <- as.matrix(newdata) #Original predict.NaiveBayes uses data.matrix, which converts factors to their integer codes and is slower here; as.matrix keeps the 'T'/'F' level names
Lfoo <- function(i) {
tempfoo <- function(v) {
nd <- ndata[v]
if (is.na(nd))
return(rep(1, length(object$apriori)))
prob <- if (isnumeric[v]) {
msd <- object$tables[[v]]
if (object$usekernel)
sapply(msd, FUN = function(y) dkernel(x = nd,
kernel = y, ...))
else dnorm(nd, msd[, 1], msd[, 2])
}
else {
object$tables[[v]][, nd]
}
prob[prob == 0] <- threshold
return(prob)
}
ndata <- newdata[i, ]
tempres <- log(sapply(1:nattribs, tempfoo))
L <- log(object$apriori) + rowSums(tempres)
if (isTRUE(all.equal(sum(exp(L)), 0)))
warning("Numerical 0 probability for all classes with observation ",
i)
L
}
L <- sapply(1:nrow(newdata), Lfoo)
classdach <- factor(object$levels[apply(L, 2, which.max)],
levels = object$levels)
posterior <- t(apply(exp(L), 2, function(x) x/sum(x)))
colnames(posterior) <- object$levels
rownames(posterior) <- names(classdach) <- rownames(newdata)
return(list(class = classdach, posterior = posterior))
}
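#Rough speed comparison against the stock klaR method (a sketch; assumes trained.model and
#test.data from section 1 exist, and the row counts are arbitrary):
# test.terms <- subset(test.data, select=-c(.file,.class,.text))
# system.time(predict(trained.model, test.terms[1:100,]))  #stock klaR predict.NaiveBayes
# system.time(mclapply(1:100, function(i) fastpredict.NaiveBayes(trained.model, test.terms[i,])))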
#------------------------------------------------------
# 10: A simple data set
#------------------------------------------------------
create.sample.data <- function() {
sample.data <- rbind(list(.file=1, .class='Run', .text='The boy ran fast.', boy=T, girl=F, fast=T, slow=F, ran=T, dog=F, want=F, love=F),
list(.file=2, .class='Run', .text='The girl ran slow.', boy=F, girl=T, fast=F, slow=T, ran=T, dog=F, want=F, love=F),
list(.file=3, .class='Run', .text='He is fast.', boy=F, girl=F, fast=T, slow=F, ran=F, dog=F, want=F, love=F),
list(.file=4, .class='Pet', .text='I love my dog.', boy=F, girl=F, fast=F, slow=F, ran=F, dog=T, want=F, love=T),
list(.file=5, .class='Pet', .text='I want a dog.', boy=F, girl=F, fast=F, slow=F, ran=F, dog=T, want=T, love=F),
list(.file=6, .class='Pet', .text='My dog is slow.', boy=F, girl=F, fast=F, slow=T, ran=F, dog=T, want=F, love=F)
)
sample.data
}
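#The sample data is small enough to eyeball directly (a sketch):
# sample.data <- create.sample.data()
# sample.data[, c('.class', 'dog', 'ran', 'fast')] #rbind of named lists gives a matrix with column names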
#------------------------------------------------------
# 11: A handy function for showing environment object sizes
#------------------------------------------------------
my.ls <- function(pos=1, sorted=F){
.result <- sapply(ls(pos=pos, all.names=TRUE),
function(..x)object.size(eval(as.symbol(..x))) / 1048576.0)
if (sorted){
.result <- rev(sort(.result))
}
.ls <-
as.data.frame(rbind(as.matrix(.result),"**Total"=sum(.result)))
names(.ls) <- "Size (MB)"
.ls[["Size (MB)"]] <- formatC(.ls[["Size (MB)"]], big.mark=',', digits=0, format='f')
.ls$Mode <- c(unlist(lapply(rownames(.ls)[-nrow(.ls)],
function(x)mode(eval(as.symbol(x))))), '-------')
.ls[.ls$Mode != 'function',] #Exclude functions
}
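#Usage sketch: list non-function objects in the global environment by size, largest first
# my.ls(sorted=TRUE)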