joseph-rickert/BIG DATA with RevoScale R

## BIG DATA with RevoScale R

#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# BIG DATA with RevoScaleR
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
# ----------------------------------------------------------------------
# LOOK AT THE MORTGATE DEFAULT DATA
#------------------------------------------------------------------------
dataDir <- "C:/Users/Joseph/Documents/DATA/Mortgage Data/mortDefault"
mdata <- file.path(dataDir,"mortDefault.xdf")
rxGetInfo(mdata,getVarInfo=TRUE)

#-----------------------------------------------------------------------------------
## Create a new data file having a variable with uniform random numbers
# going from 1 to 10. This variable will be used to create the training and test
# data sets.
# A little note on how the random numbers are created:
#		A transform should work on an arbitrary chunk of data.  Typically
#		RevoScaleR functions will test transforms on a small chunk before
#		fully processing. The internal variable (.rxNumRows) gives the size
#		of the chunk.

rxDataStep(inData = mdata, outFile = "mortDefault2",
      transforms=list(urns = as.integer(runif(.rxNumRows,1,11))),
      overwrite=TRUE)
rxGetInfo("mortDefault2",getVarInfo=TRUE,numRows=3)

#
#------------------------------------------------------------
# KMEANS ANALYSIS
#------------------------------------------------------------
rxDataStep(inData="mortDefault2",outFile="mortDefault3",
	       varsToDrop="default",
		   overwrite=TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)

form <- formula(~ creditScore + houseAge + yearsEmploy + ccDebt + year)
md.km <- rxKmeans(formula=form,
						data = "mortDefault3",
						numClusters = 3,
 						outFile = "mortDefault3",
						algorithm = "lloyd",
						overwrite=TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
md.km
# Build a data frame to do a plot
mdDf <- rxXdfToDataFrame(file="mortDefault3",
						 rowSelection=urns == 5,
                         maxRowsByCols = 1000)

plot(mdDf[,1:4],col=mdDf$.rxCluster)
title(main="Clusters in Mortgage Default Data",line=3)

###### SCRIPT TO BUILD LOGISTIC REGRESSION MODEL TO PREDICT MORTGAGE DEFAULTS #####
#---------------------------------------------------------------------------
# Some subsidary functions
#---------------------------------------------------------------------------
# Function to compute a "long form" of the confusion matrix
Cmatrix <- function(df){
	df <- as.data.frame(df)
	df$Result <- c("True Negative","False Negative","False Positive","True Positive")
	df$PCT <- round(df$Counts/sum(df$Counts),2)*100
	df$Rates <- round(c(df$Counts[1]/(df$Counts[1]+df$Counts[3]),
	               df$Counts[2]/(df$Counts[2]+df$Counts[4]),
				   df$Counts[3]/(df$Counts[1]+df$Counts[3]),
	               df$Counts[4]/(df$Counts[2]+df$Counts[4])),2)
	names(df) <- c("Actual","Predicted","Counts","Results","Pct","Rates")
	return(df)
}
#------------------------------------------------------------------------------
##### CREATE TRAINING AND TEST FILES
#-----------------------------------
#info <- rxGetInfo(mdata)
#N <- info$numRows
#

#-------------------------------------------------------------------------------
# BUILD THE TRAINING FILE
#------------------------
rxDataStepXdf(inFile = "mortDefault2",
	          outFile = "mdTrain",
              rowSelection = urns < 9,
 			  transforms=list(CS = creditScore,
				              YR = year,
						      yrE = yearsEmploy,
						      HA = houseAge,
							  ccD = ccDebt),
			  blocksPerRead=20,
			  rowsPerRead=500000,
			  overwrite=TRUE )

rxGetInfo("mdTrain",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTrain")
#-------------------------
# BUILD THE TEST FILE
#-------------------------
rxDataStepXdf(inFile = "mortDefault2",
	          outFile = "mdTest",
              rowSelection = urns > 8,
 			  transforms=list(CS = creditScore,
				              YR = year,
						      yrE = yearsEmploy,
						      HA = houseAge,
							  ccD = ccDebt),
			  blocksPerRead=20,
			  rowsPerRead=500000,
			  overwrite=TRUE )
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTest")
#---------------------------------------------------------------------------
# BUILD A CLASSIFICATION MODEL USING LOGISTIC REGRESSION
#---------------------------------------------------------------------------
system.time(
model <- rxLogit(default ~ F(houseAge) + F(year)+ creditScore + yearsEmploy + ccDebt,
	             data="mdTrain",
				 reportProgress=rxGetOption("reportProgress") )
			)
#
#Elapsed computation time: 21.533 secs.
   #user  system elapsed
  #56.15   12.02   21.55


#Elapsed computation time: 23.149 secs.
   #user  system elapsed
  #56.81   10.58   23.17
#Elapsed computation time: 24.384 secs.
   #user  system elapsed
  #59.29   10.31   24.48

summary(model)

#----------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST DATA USING THE MODEL CREATED ABOVE
#----------------------------------------------------------------------
rxPredict(modelObject=model,data="mdTest",outData="mdTest",overwrite=TRUE,predVarNames="LogitPred")
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
#rxSummary(~default_Pred,data="mdTest")
# Add a new prediction variable
rxDataStep(inData="mdTest",outFile="mdTest",
	            transforms=list(LogitPred.L = as.logical(round(LogitPred))),
				overwrite=TRUE)
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)

#-------------------------------------------------------------------------------
# GENERATE THE CONFUSION MATRIX
#-------------------------------
conMc <- rxCube(~ F(default):F(LogitPred.L),data="mdTest")
Cmatrix(conMc)

# Examine the performance of the model
total.pct.correct <- round(100*(conMc$Counts[1]+conMc$Counts[4]) / sum(conMc$Counts),2)
total.pct.correct
#-----------------------------------------------------------------------------------
# Generate the ROC Curve
#
rxRocCurve(actualVarName="default",predVarName="LogitPred",data="mdTest")
#
#-------------------------------------------------------------------------------------

# BUILD A TREE MODEL
system.time(
model.tree <- rxDTree(default ~ HA + YR + CS + yrE + ccD,
	             data="mdTrain",
				 blocksPerRead = 1,
				 maxDepth=5,
				 reportProgress=rxGetOption("reportProgress") )
			)
##

#Elapsed time for RxDTreeBase: 89.545 secs.
#
   #user  system elapsed
 #245.13   12.50   89.57


#Elapsed time for RxDTreeBase: 403.785 secs.
# This was to fully build out the tree
#user  system elapsed
#1092.37   75.89  403.83

model.tree
#
#----------------------------------------------------------------
# Plot the Tree
plot(rxAddInheritance(model.tree),uniform=TRUE)
text(rxAddInheritance(model.tree),digits=2)
title(main="Classification Tree for Mortgage Data",
    sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#-------------------------------------------------------------------

###### - END DEMO HERE - ###########


## GETTING STARTED
#
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# GETTING STARTED
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html

#---------------------------------------------------------------------
# Execute the following command to install all of the packages needed for the webinar
#install.packages(c( "ada","boot","caret","corrplot","doParallel","ellipse",
			        #"ISwR","partykit","pROC","rattle","RColorBrewer",
					#"rpart","Snowball","ROCR","tm","twitteR","wordcloud"))
#
#----------------------------------------------------------------------
# A First look at R
# A simple regression example from
# Statistics and Computing, Introductory Statistics with R
# Peter Dalgaard, Springer 2002
##
library(ISwR)			# Load a library
data()					# Have a look at what data sets are available
data(thuesen)			# Load thuesen into the environment
thuesen					# Have a look at it
class(thuesen)          # Find out what kind of object thuesen is
sapply(thuesen,class)   # See what kinds of animal the variables are
#
plot(short.velocity ~ blood.glucose, data=thuesen) #plot the data using the formula interface
#
plot(thuesen$blood.glucose,thuesen$short.velocity) # plot the data by indexining into the data frame
#
model <- lm(short.velocity ~ blood.glucose, data=thuesen) # build a linear model
summary(model)			# Look at the results
str(model)				# Look at the structure of the model object
# Build a fancier plot
plot(x=thuesen$blood.glucose,
	 y=thuesen$short.velocity,
	 xlab="blood glucose (mmol / l)",
	 ylab = "circumferential shortening velocity (%/s)",
	 main = "Thuesen Data set",
	 col="blue",
	 pch=19
	 )
abline(model,col="red")
#
par(mfrow=c(2,2))				# Set up for multiple plots
plot(model, col="blue")			# look at some diagnostics

#---------------------------------------------------------------------
#
# A FIRST LOOK AT FUNCTIONS
#
# Let's create a simple function
joe.stats <- function(data){
	           min <- min(data)
			   max <- max(data)
			   q <- quantile(data,probs=seq(0,1,0.25))
			   res <- list(min,max,q)
			   return(res)
		}

attach(thuesen)			 # make the columns of thuesen available
                         # in the global environment as variables
joe.stats(blood.glucose) # Run our function

summary(blood.glucose)   # R does it better


# Set up for later
rm(list=ls())
load("WEBINAR_2-14-13_Intro_R_DM_caret .RData")
#--------------------------------------------------------------------------------
#SOME ADDITIONAL ONLINE RESOURCES
#An Introduction to R
#Notes on R: A Programming Environment for Data Analysis and Graphics
#Version 2.15.2 (2012-10-26)
#http://cran.r-project.org/doc/manuals/R-intro.pdf
#
#Using R for Data Analysis and Graphics
#Introduction, Code and Commentary
#J H Maindonald
#http://cran.r-project.org/doc/contrib/usingR.pdf

## IN THE TREES
#------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
#### BUILD A TREE MODEL WITH RPART AND EVALUATE #####
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#-------------------------------------------------------------------------
# This script divides the data into training, validation and testing data,
# builds two different decision trees (rpart) using the training data and
# evaluates their performance using the test data set
# An ROC curve is produced for the better model
#------------------------------------------------------------------------
library(rattle)
library(rpart)
library(ROCR)
library(caret)
# -----------------------------------------------------------------------
# Read in the data from disk
#	name <- "weather.csv"
#	path <- file.path(getwd(),name)
#	weather <- read.csv(path,header=TRUE)
#	Show weather on the IDE editor
data(weather)
head(weather)
#------------------------------------------------------------------------
# Select variables for the model
weather <- subset(weather,select=c(MinTemp:RainTomorrow))
set.seed(42)											# Set seed
#-------------------------------------------------------------------------
# Determined the observations for the training,validate and test datasets.
N <- nrow(weather)										# 366 observations
train <- sample(N, 0.8*N)								# 292 observations
test <-  setdiff(seq_len(N),train)                      #  74 observations not intrain
#-------------------------------------------------------------------------
# Build the model
M <- ncol(weather)
input  <- names(weather)[1:(M-2)]						# names of input variables
target <- "RainTomorrow"								# name of target variable
form <- formula(RainTomorrow ~ .)						# Describe the model to R
tree.m <- rpart(RainTomorrow ~ .,
			    data=weather[train, c(input,target)],
				method="class",
				parms=list(split="information"),
				control=rpart.control(usesurrogate=0, maxsurrogate=0))
#---------------------------------------------------------------------------
# Look at the textual description of the tree.
tree.m						# print the model
printcp(tree.m)				# print the table of optimal prunings based on the complexity parameter
#----------------------------------------------------------------------------
# Plot the tree
drawTreeNodes(tree.m)
title(main="Weather Data tree.m",
    sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#----------------------------------------------------------------------------
# Evaluate performance
# Run the tree model on the validate set
  pred <- predict(tree.m, weather[test, c(input,target)], type="class")
  levels(pred) <- c("Yes","No")								# change order of levesl to match documentation for confusionMatrix
# Generate the confusion matrix
  actual <- weather[test, c(input,target)]$RainTomorrow
  levels(actual) <- c("Yes","No")							# change order of levels to match documantation for confusion matrix
  AP <- c("Predicted","Actual")								# row names for CM
  CM <- table(pred,actual,dnn=AP)							# CM counts
  confusionMatrix(CM)										# from the caret package
  ?confusionMatrix											# Look at meaning of confusionMatrix outputs

# Notes
# The\no-information rate"shown on the output is the largest proportion of the observed classes
# A one-sided hypothesis test is computed to evaluate whether the overall accuracy rate is greater
# than the rate of the largest class. This is helpful for data sets where there is a large imbalance
# between the classes.
#
# The kappa statistic yields a measure of how well the actual and predicted values agree
# See http://www.chestx-ray.com/statistics/kappa.html or
# http://en.wikipedia.org/wiki/Cohen%27s_kappa
#
# The null hypothesis for McNemar's chi squared test is that the actual and predicted
# probabilities are the same
# See http://en.wikipedia.org/wiki/McNemar%27s_test
#
#--------------------------------------------------------------------------------------------
# Try another model using different variables
form <- formula(RainTomorrow ~ Cloud9am + Pressure9am + WindDir9am + Temp9am + Humidity9am)
tree.m2 <- rpart(form,
			    data=weather[train, c(input,target)],
				method="class",
				parms=list(split="information"),
				control=rpart.control(usesurrogate=2,
									  maxsurrogate=0,
									  minsplit=30,
									  maxdepth=20))
#----------------------------------------------------------------------------------------------
# Plot the new tree
drawTreeNodes(tree.m2)
title(main="Weather Data tree.m2",
    sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))

tree.mod.p <- as.party(tree.m2)		# make the tree.mod object into a party object
plot(tree.mod.p)

#----------------------------------------------------------------------------
# Evaluate performance of the new model on the test set
pred2 <- predict(tree.m2, weather[test, c(input,target)], type="class")
levels(pred2) <- c("Yes","No")
CM2 <- table(pred2,actual,dnn=AP)
confusionMatrix(CM2)
# -----------------------------------------------------------------------------------
#
# GENERATE THE ROC CURVE FOR THE BEST MODEL
prROC <- predict(tree.m, weather[test, c(input,target)])[,2]
#
# Get vector RainTommorrow in test data set
testRT <- weather[test, c(input,target)]$RainTomorrow
pr <- prediction(prROC, testRT)
#------------------------------------------------------------------------------------
# Plot the ROC curve
plot(performance(pr, "tpr", "fpr"), col="#CC0000FF", lty=1, lwd=2,add=FALSE)
	#fpr: False positive rate. P(Yhat = + | Y = -). Estimated as: FP/N.
    #tpr: True positive rate. P(Yhat = + | Y = +). Estimated as: TP/P.
segments(0,0,1,1,col="blue",lwd=2)
# Add a legend to the plot.
legend("bottomright", c("tree.m"), col=rainbow(1, 1, .8), lty=1:1, title="Models", inset=c(0.05, 0.05))
# Add decorations to the plot.
title(main="ROC Curve  weather.csv [test data]",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#


## INTRO to CARET

#------------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# DATA MINING with CARET
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#------------------------------------------------------------------------------
# INTRODUCTION TO THE CARET PACKAGE
# caret is a feature rich package for doing data mining in R.
# This script explores caret's capabilities using data included in the
# package that was described in the paper:
# Hill et al "Impact of image segmentation on high-content
# screening data quality for SK-BR-3 cells"
# BMC fioinformatics (2007) vol 8 (1) pp. 340
#
# Background
# Well-segmented cells are cells for which location and size may be accurrately detremined
# through optical measurements. Cells that are not Well-segmented (WS) are said to be
# Poorly-segmented (PS).
#
# Problem
# Given a set of optical measurements can we predict which cells will be PS?
# This is a classic classification problem
#---------------------------
library(ada)				# Boosting algorithms
library(caret)
library(rpart)				# CART algorithm for decision trees
library(partykit)			# Plotting trees
library(doParallel)			# parallel processing
                            # by default
							#     Multicore functionality on Unix (single machine only)
							#     Snow functionality on Windows (cluster)


library(pROC)				# plot the ROC curve
library(corrplot)			# plot correlations

#---------------------------
# data(package="caret")
data(segmentationData)		# Load the segmentation data set
dim(segmentationData)
head(segmentationData)		# Have a look at the data
#[1] 2019   61
trainIndex <- createDataPartition(segmentationData$Case,p=.5,list=FALSE)
trainData <- segmentationData[trainIndex,]
dim(trainData)
 #1010   61
testData  <- segmentationData[-trainIndex,]
dim(testData)
 #1009   61
#-------------------------------------------------------------------------------------
# VISUALIZE CORRELATIONS
trainV <- trainData[,4:61]
corrplot(cor(trainV),order="hclust",tl.cex=.5,method="ellipse")

#-----------------------------------------------------------------
# BUILD AN ADABOOST MODEL WITH ADA
form <- formula(Class ~ .)
control <- rpart.control(maxdepth=30,   # the maximum depth of any node of the final tree
	                     cp = 0.01,     # complexity parameter. Any split that does not decrease the overall lack of fit by a factor of cp is not attempted.
						 minsplit=20,   # the minimum number of observations that must exist in a node in order for a split to be attempted
						 xval=10)       # number of cross-validations

ada.model <- ada(formula=form,
	             data=trainData,
				 control=control,
				 nu = .01,				# shrinkage parameter for boosting
				 iter=50)

ada.model$model[[1]]					# Look at the trees in the model
ada.model								# look at the model performance
plot(ada.model,TRUE)					# Plot error rate vs. iterations of the model
varplot(ada.model)						# Variable importance plot
#----------------------------------------------------------------------
# FIND THE "BEST" MODEL
#
# This is an interesting model, but how do you select the best values for the
# for the three tuning parameters?
#		nu
#		iter
#		maxdepth
#---------------------------------------------------------------------------------
# Algorithm for training the model:
# for each resampled data set do
#		hold out some samples
#		for each combination of the three tuning parameters
#			do
#				Fit the model on the resampled data set
#				Predict the values of class on the hold out samples
#			end
#		Calculate AUC: the area under the ROC for each sample
#		Select the combination of tuning parmeters that yields the best AUC
#
# caret provides the "train" function to do all of this
#
# The trainControl function to set the training method
# Note the default method of picking the best model is accuracy and Cohen's Kappa
#
#-----------------------------------------------------------------------------------
# Set up the parameters to run the boosting function
ctrl <- trainControl(method="repeatedcv",					# use repeated 10fold cross validation
						number=5,							# the number of folds
						repeats=2,							# do 2 repititions of 5-fold cv
						summaryFunction=twoClassSummary,	# Use AUC to pick the best model
						classProbs=TRUE)
# Use the expand.grid to specify the search space
# Note that the default search grid selects 3 values of each tuning parameter
#
grid <- expand.grid(.nu=c(.1,1), #
					.iter=c(20,50),
					.maxdepth=c(20,30))			#
#
set.seed(1)
#names(trainData)
trainX <-trainData[,4:61]
#-----------------------------------------------------------------
# PARALLEL COMPUTING
# vignette("gettingstartedParallel")

cl <- makeCluster(4)		 # Use this to manually create a cluster
                             # But, since I only have a single Windows machine
							 # all I am doing is passing the number of cores to use to
							 # registerDoParallel()
registerDoParallel(cl)		 # Registrer a parallel backend for train
getDoParWorkers()

system.time(ada.tune <- train(x=trainX,y=trainData$Class,
				method = "ada",
				metric = "ROC",
				trControl = ctrl,
				control=control,
				tuneGrid=grid))
#
stopCluster(cl)

#user  system elapsed
#14.33    0.02  206.25
#-------------------------------------------------------------------------------
# ADA RESULTS
ada.tune						# Look at the results for the training grid
ada.tune$finalModel				# Look at the performance of the final model
plot(ada.tune)					# Plot the performance of the training models
#--------------------------------------------------------------------------------
# ADA PREDICTIONS
testX <- testData[,4:61]
ada.pred <- predict(ada.tune,testX)
#
confusionMatrix(ada.pred,testData$Class)
#-----------------------------------------------------------------
# DRAW THE ROC CURVE
# Use roc function from the pROC package
ada.probs <- predict(ada.tune,testX,type="prob")
ada.ROC <- roc(predictor=ada.probs$PS,
				response=testData$Class,
				levels=rev(levels(testData$Class)))
plot(ada.ROC,col=2)
ada.ROC$auc		# Get the area under the curve
#------------------------------------------------------------------------------------
#
# SUPPORT VECTOR MACHINE MODEL
#
set.seed(1)
registerDoParallel(4,cores=4)
getDoParWorkers()
system.time(
	svm.tune <- train(x=trainX,
					y= trainData$Class,
					method = "svmRadial",
					tuneLength = 5,					# 5 values of the cost function
					preProc = c("center","scale"),
					metric="ROC",
					trControl=ctrl)					# same as for ada above
)

#user  system elapsed
   #2.40    0.14   26.10


#--------------------------------------------------------------
# SVM RESULTS
svm.tune						# Look at the results for the training grid
svm.tune$finalModel				# Look at the performance of the final mode
plot(svm.tune,
	metric="ROC",
	scales=list(x=list(log=2)))
#---------------------------------------------------------------
# SVM PREDICTIONS
svm.pred <- predict(svm.tune,testX)
confusionMatrix(svm.pred,testData$Class)
#
#----------------------------------------------------------------
# COMPARE THE SVM AND ADA MODELS USING RESAMPLING
#
# Because we set the same seed before running the models we can compare the models using resampling
# See Hothorn at al, "The design and analysis of benchmark experiments"
# Journal of Computational and Graphical Statistics (2005) vol 14 (3) pp 675-699
# for comparing models using resampling.
#
# The resamples function in caret collates the resampling results from the two models
rValues <- resamples(list(svm=svm.tune,ada=ada.tune))
rValues$values					# Look at the resample values
summary(rValues)				# Summarize the resamples

#---------------------------------------------
xyplot(rValues,metric="ROC")		# scatter plot
bwplot(rValues,metric="ROC")		# boxplot
parallel(rValues,metric="ROC")		# parallel plot
dotplot(rValues,metric="ROC")		# dotplot
#

## ROLL with RATTLE
##############################################################################
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# ROLL with RATTLE
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html

#################################################################################
#
library(rattle)									# load the rattle package
rattle()										# start the rattle user interface
#
#data()											# see what data sets are available in all of the loaded packages
data(package="rattle")							# see what data sets are availeble in rattle
ls("package:rattle")							# See what functions are in the Rattle package
#lsf.str("package:rattle")						# see what functions are in rattle
#
# THE FOLLOWING INSTRUCTIONS SHOULD BE HELPFUL FOR EXPLORING THE RATTLE GUI
#
# LOAD THE WEATHER DATA SET.
# The weather data set consists of observations made at a weather monitoring station
# in Canberra, Australia. Each ovservation describs weather conditions on a particular day.
# See page 25 of Gram Willians' Data Mining with Rattle, The Art of Excavating Data for
# Knowlwdge Discovery, Springer 2011
#
#		Go to the Data Tab and click on Execute
#       Rattle will ask if you want to use the weather data as default. Click yes.
#
# SUMMARY STATISTICS
#		Go to the Explore Tab
#		Select summary and basics
#       Hit Execute
#
# SCATTER PLOTS
#		Go to the Explore Tab
#		Select Distributions
#		Click on Execute
#
# LOOK AT A SINGLE VARIABLE
#		Go to Explore Tab
#		Select RainTomorrow Bar Plot
#		Hit Execute
#		This produces a bar plot of the target variable RainTomorrow
#		84% of the observations have no rain
#		A model that always predicts no rain should be about 84% accurate
#
# INVESTIGATE MULTIPLE VARIABLES
#		Go to the Explore Tab
#       In the upper panel select Box Plot and Histogram for both
#			MaxTemp
#			Sunshine
#		Click Execute
#
#		Boxplots top left: Temperature generally higher day before it rains
#		Boxplots top right: Less sunshine day before it rains
#
# CORRELATIONS
#		Go to the Explore Tab
#		Un select any variables that may be selected
#		Select Correlation
#		Click on Execute
#
#
# INTERACTIVELY EXPLORE DATA
#	Select Interactive and then Lattiscist
#	In bottom center panel
#		Select MaxTemp for y axis and
#		Select Sunshine for x axis
#		Place crosshair on outlier and right click
#
# BUILD A TREE MODEL
#	Go to Model Tab
#	Select Tree
#	Click Execute
#	Click on Draw button to get the graph
#	Click on Rules button to see rules
#	Select Log Tab to look at R code
#
# EVALUATE THE MODEL
#	Go to the Evaluate tab
#	Select
#		Type = Error Matrix
#		Model = Tree
#		Data = Testing
#	Click on Execute
#
# Error matrix for the Decision Tree model on weather.csv [test] (counts):
#
      #Predicted
#Actual No Yes
   #No  35   6  False positive rate = FP/N =  6/(35+6) = .146   = negatives incorrectly classified / total negatives
   #Yes  5  10  True positive rate  = TP/P = 10/(10+5) = .667   = positives correctly classified / total positives
#                                   = sensitivity  = recall = hit rate
#               True negative rate  = TN / (FP + TN) = 1 - FP rate  = .854
#                                   = specificity
#
#               False positives = 6 = Type  I Error (Test rejects true null hypothesis)
#               False negatives = 5 = Type II Error (Test fails to reject false null hypothesis)

#Error matrix for the Decision Tree model on weather.csv [test] (%):
#
      #Predicted
#Actual No Yes
   #No  62  11    62% (35/56) of cases model predicts it won't rain and it didn't
   #Yes  9  18    18% (10/56) of cases model predicts it would rain and it did
#                 Accracy of test = 62% + 18% = 80%
#
#Overall error: 0.1964286

## WORDCLOUD

#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# JUST FOR FUN - BUILD A WORD CLOUD
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html


# From example at RDataMining
# http://www.rdatamining.com/examples/text-mining
# This page shows an example on text mining of Twitter
#-----------------------------------------------------------------------------
# Load the libraries necesary
library(twitteR)			# twitteR provides access to Twitter data
library(tm)					# tm provides functions for text mining
library(Snowball)           # Wrappers for Weka Java stemming funcitons
library(wordcloud)          # wordcloud visualizes the result with a word cloud
library(RColorBrewer)       # provides the rainbow colors
#------------------------------------------------------------------------------
# retrieve the first 100 tweets (or all tweets if fewer than 100)
# from the user timeline of @rdatammining
#
Tweets <- searchTwitter("#rstats",n=100)
n <- length(Tweets)
# Tweets[1:3]
#
#-------------------------------------------------------------------------------
#Transforming Text
#The tweets are first converted to a data frame and then to a corpus.
df <- do.call("rbind", lapply(Tweets, as.data.frame))
#dim(df)
# Just in case twitter is off-line
#df <-read.csv("UseRTweets.csv",header=TRUE,row.names=1)
#head(df)
#
# Build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))

#After that, the corpus needs a couple of transformations, including
   #changing letters to lower case,
   #removing punctuations/numbers and removing stop words.
#The general English stop-word list is tailored by
#adding "available" and "via" and removing "r".

myCorpus <- tm_map(myCorpus, tolower)					# lower case
myCorpus <- tm_map(myCorpus, removePunctuation)			# remove punctuation
myCorpus <- tm_map(myCorpus, removeNumbers)				# remove numbers
# keep "r" by removing it from stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#----------------------------------------------------------------------------
#Stemming Words
#  In many cases, words need to be stemmed to retrieve their radicals.
#  For instance, "example" and "examples" are both stemmed to "exampl".
#  However, after that, one may want to complete the stems to their original
#  forms, so that the words would look "normal".

dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires packages Snowball, RWeka, rJava, RWekajars
myCorpus <- tm_map(myCorpus, stemDocument)

#inspect(myCorpus[1:3])   # inspect the first three ``documents"
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus) # stem completion
#
#
#inspect(myCorpus[1:3])   #Print the first three documents in the built corpus.
#----------------------------------------------------------------------------------------
#Building a Document-Term Matrix
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
# inspect(myDtm[266:270,31:40])
#
# findFreqTerms(myDtm, lowfreq=10)		#Frequent Terms and Associations
# findAssocs(myDtm, 'analytics', 0.30)    # which words are associated with "analytics"?
#-----------------------------------------------------------------------------------------
#Build the word cloud
#After building a document-term matrix, we can show the importance of
#words with a word cloud (also kown as a tag cloud) .
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
# Plot the word cloud
pal <- brewer.pal(6,"Dark2")
pal <- pal[-(1)]
#random colors
wordcloud(d$word,d$freq,c(4,1),2,,TRUE,TRUE,.15,pal)

	#------------------------------------------------------------
	# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
	# February 14, 2013
	# Joseph B. Rickert
	# Technical Marketing Manager
	#
	# BIG DATA with RevoScaleR
	#
	# Copyright: Revolution Analytics
	# This script is licensed under the GPLv2 license
	# http://www.gnu.org/licenses/gpl-2.0.html
	# ----------------------------------------------------------------------
	# LOOK AT THE MORTGATE DEFAULT DATA
	#------------------------------------------------------------------------
	dataDir <- "C:/Users/Joseph/Documents/DATA/Mortgage Data/mortDefault"
	mdata <- file.path(dataDir,"mortDefault.xdf")
	rxGetInfo(mdata,getVarInfo=TRUE)

	#-----------------------------------------------------------------------------------
	## Create a new data file having a variable with uniform random numbers
	# going from 1 to 10. This variable will be used to create the training and test
	# data sets.
	# A little note on how the random numbers are created:
	# A transform should work on an arbitrary chunk of data. Typically
	# RevoScaleR functions will test transforms on a small chunk before
	# fully processing. The internal variable (.rxNumRows) gives the size
	# of the chunk.

	rxDataStep(inData = mdata, outFile = "mortDefault2",
	transforms=list(urns = as.integer(runif(.rxNumRows,1,11))),
	overwrite=TRUE)
	rxGetInfo("mortDefault2",getVarInfo=TRUE,numRows=3)

	#
	#------------------------------------------------------------
	# KMEANS ANALYSIS
	#------------------------------------------------------------
	rxDataStep(inData="mortDefault2",outFile="mortDefault3",
	varsToDrop="default",
	overwrite=TRUE)
	rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)

	form <- formula(~ creditScore + houseAge + yearsEmploy + ccDebt + year)
	md.km <- rxKmeans(formula=form,
	data = "mortDefault3",
	numClusters = 3,
	outFile = "mortDefault3",
	algorithm = "lloyd",
	overwrite=TRUE)
	rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
	md.km
	# Build a data frame to do a plot
	mdDf <- rxXdfToDataFrame(file="mortDefault3",
	rowSelection=urns == 5,
	maxRowsByCols = 1000)

	plot(mdDf[,1:4],col=mdDf$.rxCluster)
	title(main="Clusters in Mortgage Default Data",line=3)

	###### SCRIPT TO BUILD LOGISTIC REGRESSION MODEL TO PREDICT MORTGAGE DEFAULTS #####
	#---------------------------------------------------------------------------
	# Some subsidary functions
	#---------------------------------------------------------------------------
	# Function to compute a "long form" of the confusion matrix
	Cmatrix <- function(df){
	df <- as.data.frame(df)
	df$Result <- c("True Negative","False Negative","False Positive","True Positive")
	df$PCT <- round(df$Counts/sum(df$Counts),2)*100
	df$Rates <- round(c(df$Counts[1]/(df$Counts[1]+df$Counts[3]),
	df$Counts[2]/(df$Counts[2]+df$Counts[4]),
	df$Counts[3]/(df$Counts[1]+df$Counts[3]),
	df$Counts[4]/(df$Counts[2]+df$Counts[4])),2)
	names(df) <- c("Actual","Predicted","Counts","Results","Pct","Rates")
	return(df)
	}
	#------------------------------------------------------------------------------
	##### CREATE TRAINING AND TEST FILES
	#-----------------------------------
	#info <- rxGetInfo(mdata)
	#N <- info$numRows
	#

	#-------------------------------------------------------------------------------
	# BUILD THE TRAINING FILE
	#------------------------
	rxDataStepXdf(inFile = "mortDefault2",
	outFile = "mdTrain",
	rowSelection = urns < 9,
	transforms=list(CS = creditScore,
	YR = year,
	yrE = yearsEmploy,
	HA = houseAge,
	ccD = ccDebt),
	blocksPerRead=20,
	rowsPerRead=500000,
	overwrite=TRUE )

	rxGetInfo("mdTrain",getVarInfo=TRUE,numRows=5)
	rxHistogram(~default,data="mdTrain")
	#-------------------------
	# BUILD THE TEST FILE
	#-------------------------
	rxDataStepXdf(inFile = "mortDefault2",
	outFile = "mdTest",
	rowSelection = urns > 8,
	transforms=list(CS = creditScore,
	YR = year,
	yrE = yearsEmploy,
	HA = houseAge,
	ccD = ccDebt),
	blocksPerRead=20,
	rowsPerRead=500000,
	overwrite=TRUE )
	#
	rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
	rxHistogram(~default,data="mdTest")
	#---------------------------------------------------------------------------
	# BUILD A CLASSIFICATION MODEL USING LOGISTIC REGRESSION
	#---------------------------------------------------------------------------
	system.time(
	model <- rxLogit(default ~ F(houseAge) + F(year)+ creditScore + yearsEmploy + ccDebt,
	data="mdTrain",
	reportProgress=rxGetOption("reportProgress") )
	)
	#
	#Elapsed computation time: 21.533 secs.
	#user system elapsed
	#56.15 12.02 21.55


	#Elapsed computation time: 23.149 secs.
	#user system elapsed
	#56.81 10.58 23.17
	#Elapsed computation time: 24.384 secs.
	#user system elapsed
	#59.29 10.31 24.48

	summary(model)

	#----------------------------------------------------------------------
	# MAKE PREDICTIONS ON THE TEST DATA USING THE MODEL CREATED ABOVE
	#----------------------------------------------------------------------
	rxPredict(modelObject=model,data="mdTest",outData="mdTest",overwrite=TRUE,predVarNames="LogitPred")
	rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
	#rxSummary(~default_Pred,data="mdTest")
	# Add a new prediction variable
	rxDataStep(inData="mdTest",outFile="mdTest",
	transforms=list(LogitPred.L = as.logical(round(LogitPred))),
	overwrite=TRUE)
	#
	rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)

	#-------------------------------------------------------------------------------
	# GENERATE THE CONFUSION MATRIX
	#-------------------------------
	conMc <- rxCube(~ F(default):F(LogitPred.L),data="mdTest")
	Cmatrix(conMc)

	# Examine the performance of the model
	total.pct.correct <- round(100*(conMc$Counts[1]+conMc$Counts[4]) / sum(conMc$Counts),2)
	total.pct.correct
	#-----------------------------------------------------------------------------------
	# Generate the ROC Curve
	#
	rxRocCurve(actualVarName="default",predVarName="LogitPred",data="mdTest")
	#
	#-------------------------------------------------------------------------------------

	# BUILD A TREE MODEL
	system.time(
	model.tree <- rxDTree(default ~ HA + YR + CS + yrE + ccD,
	data="mdTrain",
	blocksPerRead = 1,
	maxDepth=5,
	reportProgress=rxGetOption("reportProgress") )
	)
	##

	#Elapsed time for RxDTreeBase: 89.545 secs.
	#
	#user system elapsed
	#245.13 12.50 89.57


	#Elapsed time for RxDTreeBase: 403.785 secs.
	# This was to fully build out the tree
	#user system elapsed
	#1092.37 75.89 403.83

	model.tree
	#
	#----------------------------------------------------------------
	# Plot the Tree
	plot(rxAddInheritance(model.tree),uniform=TRUE)
	text(rxAddInheritance(model.tree),digits=2)
	title(main="Classification Tree for Mortgage Data",
	sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
	#-------------------------------------------------------------------

	###### - END DEMO HERE - ###########
	##############################################################################
	# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
	# February 14, 2013
	# Joseph B. Rickert
	# Technical Marketing Manager
	#
	# ROLL with RATTLE
	#
	# Copyright: Revolution Analytics
	# This script is licensed under the GPLv2 license
	# http://www.gnu.org/licenses/gpl-2.0.html

	#################################################################################
	#
	library(rattle) # load the rattle package
	rattle() # start the rattle user interface
	#
	#data() # see what data sets are available in all of the loaded packages
	data(package="rattle") # see what data sets are availeble in rattle
	ls("package:rattle") # See what functions are in the Rattle package
	#lsf.str("package:rattle") # see what functions are in rattle
	#
	# THE FOLLOWING INSTRUCTIONS SHOULD BE HELPFUL FOR EXPLORING THE RATTLE GUI
	#
	# LOAD THE WEATHER DATA SET.
	# The weather data set consists of observations made at a weather monitoring station
	# in Canberra, Australia. Each ovservation describs weather conditions on a particular day.
	# See page 25 of Gram Willians' Data Mining with Rattle, The Art of Excavating Data for
	# Knowlwdge Discovery, Springer 2011
	#
	# Go to the Data Tab and click on Execute
	# Rattle will ask if you want to use the weather data as default. Click yes.
	#
	# SUMMARY STATISTICS
	# Go to the Explore Tab
	# Select summary and basics
	# Hit Execute
	#
	# SCATTER PLOTS
	# Go to the Explore Tab
	# Select Distributions
	# Click on Execute
	#
	# LOOK AT A SINGLE VARIABLE
	# Go to Explore Tab
	# Select RainTomorrow Bar Plot
	# Hit Execute
	# This produces a bar plot of the target variable RainTomorrow
	# 84% of the observations have no rain
	# A model that always predicts no rain should be about 84% accurate
	#
	# INVESTIGATE MULTIPLE VARIABLES
	# Go to the Explore Tab
	# In the upper panel select Box Plot and Histogram for both
	# MaxTemp
	# Sunshine
	# Click Execute
	#
	# Boxplots top left: Temperature generally higher day before it rains
	# Boxplots top right: Less sunshine day before it rains
	#
	# CORRELATIONS
	# Go to the Explore Tab
	# Un select any variables that may be selected
	# Select Correlation
	# Click on Execute
	#
	#
	# INTERACTIVELY EXPLORE DATA
	# Select Interactive and then Lattiscist
	# In bottom center panel
	# Select MaxTemp for y axis and
	# Select Sunshine for x axis
	# Place crosshair on outlier and right click
	#
	# BUILD A TREE MODEL
	# Go to Model Tab
	# Select Tree
	# Click Execute
	# Click on Draw button to get the graph
	# Click on Rules button to see rules
	# Select Log Tab to look at R code
	#
	# EVALUATE THE MODEL
	# Go to the Evaluate tab
	# Select
	# Type = Error Matrix
	# Model = Tree
	# Data = Testing
	# Click on Execute
	#
	# Error matrix for the Decision Tree model on weather.csv [test] (counts):
	#
	#Predicted
	#Actual No Yes
	#No 35 6 False positive rate = FP/N = 6/(35+6) = .146 = negatives incorrectly classified / total negatives
	#Yes 5 10 True positive rate = TP/P = 10/(10+5) = .667 = positives correctly classified / total positives
	# = sensitivity = recall = hit rate
	# True negative rate = TN / (FP + TN) = 1 - FP rate = .854
	# = specificity
	#
	# False positives = 6 = Type I Error (Test rejects true null hypothesis)
	# False negatives = 5 = Type II Error (Test fails to reject false null hypothesis)

	#Error matrix for the Decision Tree model on weather.csv [test] (%):
	#
	#Predicted
	#Actual No Yes
	#No 62 11 62% (35/56) of cases model predicts it won't rain and it didn't
	#Yes 9 18 18% (10/56) of cases model predicts it would rain and it did
	# Accracy of test = 62% + 18% = 80%
	#
	#Overall error: 0.1964286