
Scripts for the 2/14/13 webinar: Introduction to R for Data Mining

BIG DATA with RevoScaleR
 
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# BIG DATA with RevoScaleR
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
# ----------------------------------------------------------------------
# LOOK AT THE MORTGAGE DEFAULT DATA
#------------------------------------------------------------------------
dataDir <- "C:/Users/Joseph/Documents/DATA/Mortgage Data/mortDefault"
mdata <- file.path(dataDir,"mortDefault.xdf")
rxGetInfo(mdata,getVarInfo=TRUE)
 
#-----------------------------------------------------------------------------------
## Create a new data file having a variable with uniform random numbers
# going from 1 to 10. This variable will be used to create the training and test
# data sets.
# A little note on how the random numbers are created:
# A transform should work on an arbitrary chunk of data. Typically
# RevoScaleR functions will test transforms on a small chunk before
# fully processing. The internal variable (.rxNumRows) gives the size
# of the chunk.
 
rxDataStep(inData = mdata, outFile = "mortDefault2",
           transforms = list(urns = as.integer(runif(.rxNumRows, 1, 11))),
           overwrite = TRUE)
rxGetInfo("mortDefault2",getVarInfo=TRUE,numRows=3)
 
#
#------------------------------------------------------------
# KMEANS ANALYSIS
#------------------------------------------------------------
rxDataStep(inData = "mortDefault2", outFile = "mortDefault3",
           varsToDrop = "default",
           overwrite = TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
 
form <- formula(~ creditScore + houseAge + yearsEmploy + ccDebt + year)
md.km <- rxKmeans(formula = form,
                  data = "mortDefault3",
                  numClusters = 3,
                  outFile = "mortDefault3",
                  algorithm = "lloyd",
                  overwrite = TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
md.km
# Build a data frame to do a plot
mdDf <- rxXdfToDataFrame(file = "mortDefault3",
                         rowSelection = urns == 5,
                         maxRowsByCols = 1000)
plot(mdDf[,1:4],col=mdDf$.rxCluster)
title(main="Clusters in Mortgage Default Data",line=3)
 
###### SCRIPT TO BUILD LOGISTIC REGRESSION MODEL TO PREDICT MORTGAGE DEFAULTS #####
#---------------------------------------------------------------------------
# Some subsidiary functions
#---------------------------------------------------------------------------
# Function to compute a "long form" of the confusion matrix
Cmatrix <- function(df){
  df <- as.data.frame(df)
  df$Result <- c("True Negative","False Negative","False Positive","True Positive")
  df$PCT <- round(df$Counts/sum(df$Counts),2)*100
  df$Rates <- round(c(df$Counts[1]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[2]/(df$Counts[2]+df$Counts[4]),
                      df$Counts[3]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[4]/(df$Counts[2]+df$Counts[4])),2)
  names(df) <- c("Actual","Predicted","Counts","Results","Pct","Rates")
  return(df)
}
#------------------------------------------------------------------------------
##### CREATE TRAINING AND TEST FILES
#-----------------------------------
#info <- rxGetInfo(mdata)
#N <- info$numRows
#
 
#-------------------------------------------------------------------------------
# BUILD THE TRAINING FILE
#------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTrain",
              rowSelection = urns < 9,
              transforms = list(CS = creditScore,
                                YR = year,
                                yrE = yearsEmploy,
                                HA = houseAge,
                                ccD = ccDebt),
              blocksPerRead = 20,
              rowsPerRead = 500000,
              overwrite = TRUE)
 
rxGetInfo("mdTrain",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTrain")
#-------------------------
# BUILD THE TEST FILE
#-------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTest",
              rowSelection = urns > 8,
              transforms = list(CS = creditScore,
                                YR = year,
                                yrE = yearsEmploy,
                                HA = houseAge,
                                ccD = ccDebt),
              blocksPerRead = 20,
              rowsPerRead = 500000,
              overwrite = TRUE)
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTest")
#---------------------------------------------------------------------------
# BUILD A CLASSIFICATION MODEL USING LOGISTIC REGRESSION
#---------------------------------------------------------------------------
system.time(
  model <- rxLogit(default ~ F(houseAge) + F(year) + creditScore + yearsEmploy + ccDebt,
                   data = "mdTrain",
                   reportProgress = rxGetOption("reportProgress"))
)
#
#Elapsed computation time: 21.533 secs.
#user system elapsed
#56.15 12.02 21.55
 
#Elapsed computation time: 23.149 secs.
#user system elapsed
#56.81 10.58 23.17
#Elapsed computation time: 24.384 secs.
#user system elapsed
#59.29 10.31 24.48
 
summary(model)
 
#----------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST DATA USING THE MODEL CREATED ABOVE
#----------------------------------------------------------------------
rxPredict(modelObject=model,data="mdTest",outData="mdTest",overwrite=TRUE,predVarNames="LogitPred")
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
#rxSummary(~default_Pred,data="mdTest")
# Add a new prediction variable
rxDataStep(inData = "mdTest", outFile = "mdTest",
           transforms = list(LogitPred.L = as.logical(round(LogitPred))),
           overwrite = TRUE)
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
 
#-------------------------------------------------------------------------------
# GENERATE THE CONFUSION MATRIX
#-------------------------------
conMc <- rxCube(~ F(default):F(LogitPred.L),data="mdTest")
Cmatrix(conMc)
 
# Examine the performance of the model
total.pct.correct <- round(100*(conMc$Counts[1]+conMc$Counts[4]) / sum(conMc$Counts),2)
total.pct.correct
#-----------------------------------------------------------------------------------
# Generate the ROC Curve
#
rxRocCurve(actualVarName="default",predVarName="LogitPred",data="mdTest")
#
#-------------------------------------------------------------------------------------
 
# BUILD A TREE MODEL
system.time(
  model.tree <- rxDTree(default ~ HA + YR + CS + yrE + ccD,
                        data = "mdTrain",
                        blocksPerRead = 1,
                        maxDepth = 5,
                        reportProgress = rxGetOption("reportProgress"))
)
##
 
#Elapsed time for RxDTreeBase: 89.545 secs.
#
#user system elapsed
#245.13 12.50 89.57
 
 
#Elapsed time for RxDTreeBase: 403.785 secs.
# This was to fully build out the tree
#user system elapsed
#1092.37 75.89 403.83
 
model.tree
#
#----------------------------------------------------------------
# Plot the Tree
plot(rxAddInheritance(model.tree),uniform=TRUE)
text(rxAddInheritance(model.tree),digits=2)
title(main="Classification Tree for Mortgage Data",
sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#-------------------------------------------------------------------
 
###### - END DEMO HERE - ###########
GETTING STARTED
#
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# GETTING STARTED
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
 
#---------------------------------------------------------------------
# Execute the following command to install all of the packages needed for the webinar
#install.packages(c( "ada","boot","caret","corrplot","doParallel","ellipse",
#"ISwR","partykit","pROC","rattle","RColorBrewer",
#"rpart","Snowball","ROCR","tm","twitteR","wordcloud"))
#
#----------------------------------------------------------------------
# A First look at R
# A simple regression example from
# Statistics and Computing, Introductory Statistics with R
# Peter Dalgaard, Springer 2002
##
library(ISwR) # Load a library
data() # Have a look at what data sets are available
data(thuesen) # Load thuesen into the environment
thuesen # Have a look at it
class(thuesen) # Find out what kind of object thuesen is
sapply(thuesen,class) # See what kinds of animal the variables are
#
plot(short.velocity ~ blood.glucose, data=thuesen) #plot the data using the formula interface
#
plot(thuesen$blood.glucose,thuesen$short.velocity) # plot the data by indexing into the data frame
#
model <- lm(short.velocity ~ blood.glucose, data=thuesen) # build a linear model
summary(model) # Look at the results
str(model) # Look at the structure of the model object
# Build a fancier plot
plot(x = thuesen$blood.glucose,
     y = thuesen$short.velocity,
     xlab = "blood glucose (mmol / l)",
     ylab = "circumferential shortening velocity (%/s)",
     main = "Thuesen Data set",
     col = "blue",
     pch = 19)
abline(model,col="red")
#
par(mfrow=c(2,2)) # Set up for multiple plots
plot(model, col="blue") # look at some diagnostics
 
#---------------------------------------------------------------------
#
# A FIRST LOOK AT FUNCTIONS
#
# Let's create a simple function
joe.stats <- function(data){
  min <- min(data)
  max <- max(data)
  q <- quantile(data, probs = seq(0, 1, 0.25))
  res <- list(min, max, q)
  return(res)
}
attach(thuesen) # make the columns of thuesen available on the
                # search path as variables
joe.stats(blood.glucose) # Run our function
summary(blood.glucose) # R does it better
 
 
# Set up for later
rm(list=ls())
load("WEBINAR_2-14-13_Intro_R_DM_caret .RData")
#--------------------------------------------------------------------------------
#SOME ADDITIONAL ONLINE RESOURCES
#An Introduction to R
#Notes on R: A Programming Environment for Data Analysis and Graphics
#Version 2.15.2 (2012-10-26)
#http://cran.r-project.org/doc/manuals/R-intro.pdf
#
#Using R for Data Analysis and Graphics
#Introduction, Code and Commentary
#J H Maindonald
#http://cran.r-project.org/doc/contrib/usingR.pdf
IN THE TREES
#------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
#### BUILD A TREE MODEL WITH RPART AND EVALUATE #####
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#-------------------------------------------------------------------------
# This script divides the data into training and test data sets,
# builds two different decision trees (rpart) using the training data, and
# evaluates their performance using the test data set.
# An ROC curve is produced for the better model.
#------------------------------------------------------------------------
library(rattle)
library(rpart)
library(ROCR)
library(caret)
# -----------------------------------------------------------------------
# Read in the data from disk
# name <- "weather.csv"
# path <- file.path(getwd(),name)
# weather <- read.csv(path,header=TRUE)
# Show weather on the IDE editor
data(weather)
head(weather)
#------------------------------------------------------------------------
# Select variables for the model
weather <- subset(weather,select=c(MinTemp:RainTomorrow))
set.seed(42) # Set seed
#-------------------------------------------------------------------------
# Determine the observations for the training and test data sets.
N <- nrow(weather) # 366 observations
train <- sample(N, 0.8*N) # 292 observations
test <- setdiff(seq_len(N),train) # 74 observations not in train
#-------------------------------------------------------------------------
# Build the model
M <- ncol(weather)
input <- names(weather)[1:(M-2)] # names of input variables
target <- "RainTomorrow" # name of target variable
form <- formula(RainTomorrow ~ .) # Describe the model to R
tree.m <- rpart(RainTomorrow ~ .,
                data = weather[train, c(input, target)],
                method = "class",
                parms = list(split = "information"),
                control = rpart.control(usesurrogate = 0, maxsurrogate = 0))
#---------------------------------------------------------------------------
# Look at the textual description of the tree.
tree.m # print the model
printcp(tree.m) # print the table of optimal prunings based on the complexity parameter
#----------------------------------------------------------------------------
# Plot the tree
drawTreeNodes(tree.m)
title(main="Weather Data tree.m",
sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#----------------------------------------------------------------------------
# Evaluate performance
# Run the tree model on the test set
pred <- predict(tree.m, weather[test, c(input,target)], type="class")
pred <- factor(pred, levels = c("Yes","No")) # reorder the levels (without relabeling) to match the confusionMatrix documentation
# Generate the confusion matrix
actual <- weather[test, c(input,target)]$RainTomorrow
actual <- factor(actual, levels = c("Yes","No")) # reorder the levels (without relabeling) to match the confusionMatrix documentation
AP <- c("Predicted","Actual") # row names for CM
CM <- table(pred,actual,dnn=AP) # CM counts
confusionMatrix(CM) # from the caret package
?confusionMatrix # Look at meaning of confusionMatrix outputs
# Notes
# The "no-information rate" shown in the output is the largest proportion of the observed classes.
# A one-sided hypothesis test is computed to evaluate whether the overall accuracy rate is greater
# than the rate of the largest class. This is helpful for data sets where there is a large imbalance
# between the classes.
#
# The kappa statistic yields a measure of how well the actual and predicted values agree
# See http://www.chestx-ray.com/statistics/kappa.html or
# http://en.wikipedia.org/wiki/Cohen%27s_kappa
#
# The null hypothesis for McNemar's chi-squared test is that the marginal probabilities of the
# actual and predicted classifications are the same
# See http://en.wikipedia.org/wiki/McNemar%27s_test
#
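# A small illustrative sketch (not from the webinar): these statistics can be
# computed directly from the 2x2 table of counts CM built above
# (predictions in rows, actuals in columns).
NIR <- max(colSums(CM)) / sum(CM)                      # no-information rate: largest observed class proportion
acc <- sum(diag(CM)) / sum(CM)                         # overall accuracy
exp.acc <- sum(rowSums(CM) * colSums(CM)) / sum(CM)^2  # accuracy expected by chance alone
kappa.stat <- (acc - exp.acc) / (1 - exp.acc)          # Cohen's kappa
mcnemar.test(CM)                                       # McNemar's chi-squared test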
#--------------------------------------------------------------------------------------------
# Try another model using different variables
form <- formula(RainTomorrow ~ Cloud9am + Pressure9am + WindDir9am + Temp9am + Humidity9am)
tree.m2 <- rpart(form,
                 data = weather[train, c(input, target)],
                 method = "class",
                 parms = list(split = "information"),
                 control = rpart.control(usesurrogate = 2,
                                         maxsurrogate = 0,
                                         minsplit = 30,
                                         maxdepth = 20))
#----------------------------------------------------------------------------------------------
# Plot the new tree
drawTreeNodes(tree.m2)
title(main="Weather Data tree.m2",
sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
 
tree.mod.p <- as.party(tree.m2) # convert tree.m2 into a party object for plotting
plot(tree.mod.p)
 
#----------------------------------------------------------------------------
# Evaluate performance of the new model on the test set
pred2 <- predict(tree.m2, weather[test, c(input,target)], type="class")
pred2 <- factor(pred2, levels = c("Yes","No"))
CM2 <- table(pred2,actual,dnn=AP)
confusionMatrix(CM2)
# -----------------------------------------------------------------------------------
#
# GENERATE THE ROC CURVE FOR THE BEST MODEL
prROC <- predict(tree.m, weather[test, c(input,target)])[,2]
#
# Get the vector RainTomorrow in the test data set
testRT <- weather[test, c(input,target)]$RainTomorrow
pr <- prediction(prROC, testRT)
#------------------------------------------------------------------------------------
# Plot the ROC curve
plot(performance(pr, "tpr", "fpr"), col="#CC0000FF", lty=1, lwd=2,add=FALSE)
#fpr: False positive rate. P(Yhat = + | Y = -). Estimated as: FP/N.
#tpr: True positive rate. P(Yhat = + | Y = +). Estimated as: TP/P.
segments(0,0,1,1,col="blue",lwd=2)
# Add a legend to the plot.
legend("bottomright", c("tree.m"), col=rainbow(1, 1, .8), lty=1:1, title="Models", inset=c(0.05, 0.05))
# Add decorations to the plot.
title(main="ROC Curve weather.csv [test data]",
sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#
INTRO to CARET
#------------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# DATA MINING with CARET
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#------------------------------------------------------------------------------
# INTRODUCTION TO THE CARET PACKAGE
# caret is a feature rich package for doing data mining in R.
# This script explores caret's capabilities using data included in the
# package that was described in the paper:
# Hill et al "Impact of image segmentation on high-content
# screening data quality for SK-BR-3 cells"
# BMC Bioinformatics (2007) vol 8 (1) pp. 340
#
# Background
# Well-segmented cells are cells for which location and size may be accurately determined
# through optical measurements. Cells that are not well-segmented (WS) are said to be
# poorly segmented (PS).
#
# Problem
# Given a set of optical measurements, can we predict which cells will be PS?
# This is a classic classification problem.
#---------------------------
library(ada) # Boosting algorithms
library(caret)
library(rpart) # CART algorithm for decision trees
library(partykit) # Plotting trees
library(doParallel)  # parallel processing; by default uses
                     # multicore functionality on Unix (single machine only)
                     # and snow functionality on Windows (cluster)
library(pROC) # plot the ROC curve
library(corrplot) # plot correlations
 
#---------------------------
# data(package="caret")
data(segmentationData) # Load the segmentation data set
dim(segmentationData)
head(segmentationData) # Have a look at the data
#[1] 2019 61
trainIndex <- createDataPartition(segmentationData$Case,p=.5,list=FALSE)
trainData <- segmentationData[trainIndex,]
dim(trainData)
#1010 61
testData <- segmentationData[-trainIndex,]
dim(testData)
#1009 61
#-------------------------------------------------------------------------------------
# VISUALIZE CORRELATIONS
trainV <- trainData[,4:61]
corrplot(cor(trainV),order="hclust",tl.cex=.5,method="ellipse")
 
#-----------------------------------------------------------------
# BUILD AN ADABOOST MODEL WITH ADA
form <- formula(Class ~ .)
control <- rpart.control(maxdepth = 30,  # maximum depth of any node of the final tree
                         cp = 0.01,      # complexity parameter: a split must decrease the overall lack of fit by a factor of cp to be attempted
                         minsplit = 20,  # minimum number of observations in a node for a split to be attempted
                         xval = 10)      # number of cross-validations
ada.model <- ada(formula = form,
                 data = trainData,
                 control = control,
                 nu = .01,               # shrinkage parameter for boosting
                 iter = 50)
 
ada.model$model[[1]] # Look at the first tree in the model
ada.model # look at the model performance
plot(ada.model,TRUE) # Plot error rate vs. iterations of the model
varplot(ada.model) # Variable importance plot
#----------------------------------------------------------------------
# FIND THE "BEST" MODEL
#
# This is an interesting model, but how do you select the best values for
# the three tuning parameters?
# nu
# iter
# maxdepth
#---------------------------------------------------------------------------------
# Algorithm for training the model:
#   for each resampled data set do
#     hold out some samples
#     for each combination of the three tuning parameters do
#       fit the model on the resampled data
#       predict the class of the held-out samples
#     end
#     calculate the AUC (area under the ROC curve) on the held-out samples
#   end
#   select the combination of tuning parameters that yields the best AUC
#
# caret provides the "train" function to do all of this
#
# The trainControl function sets the training method.
# Note that the default metrics for picking the best model are accuracy and Cohen's kappa.
#
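# For intuition only (not part of the original webinar script): a stripped-down,
# commented-out sketch of the loop that train automates. Names such as heldOut,
# aucs and tuneRow are illustrative; 'grid' is the tuning grid defined just below.
# aucs <- matrix(NA, nrow = 2, ncol = 8)                        # 2 repeats x 8 grid rows
# for (rep in 1:2) {
#   heldOut <- sample(nrow(trainData), 0.2 * nrow(trainData))   # hold out some samples
#   for (tuneRow in seq_len(nrow(grid))) {                      # each tuning combination
#     fit  <- ada(Class ~ ., data = trainData[-heldOut, ],
#                 nu = grid$.nu[tuneRow], iter = grid$.iter[tuneRow],
#                 control = rpart.control(maxdepth = grid$.maxdepth[tuneRow]))
#     prob <- predict(fit, trainData[heldOut, ], type = "prob")[, 2]
#     aucs[rep, tuneRow] <- as.numeric(pROC::auc(trainData$Class[heldOut], prob))
#   }
# }
# grid[which.max(colMeans(aucs)), ]   # tuning combination with the best average AUC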
#-----------------------------------------------------------------------------------
# Set up the parameters to run the boosting function
ctrl <- trainControl(method = "repeatedcv",             # repeated k-fold cross-validation
                     number = 5,                        # the number of folds
                     repeats = 2,                       # do 2 repetitions of 5-fold cv
                     summaryFunction = twoClassSummary, # use AUC to pick the best model
                     classProbs = TRUE)
# Use the expand.grid to specify the search space
# Note that the default search grid selects 3 values of each tuning parameter
#
grid <- expand.grid(.nu = c(.1, 1),
                    .iter = c(20, 50),
                    .maxdepth = c(20, 30))
#
set.seed(1)
#names(trainData)
trainX <-trainData[,4:61]
#-----------------------------------------------------------------
# PARALLEL COMPUTING
# vignette("gettingstartedParallel")
 
cl <- makeCluster(4) # Use this to manually create a cluster
# But, since I only have a single Windows machine
# all I am doing is passing the number of cores to use to
# registerDoParallel()
registerDoParallel(cl) # Register a parallel backend for train
getDoParWorkers()
 
system.time(ada.tune <- train(x = trainX, y = trainData$Class,
                              method = "ada",
                              metric = "ROC",
                              trControl = ctrl,
                              control = control,
                              tuneGrid = grid))
#
stopCluster(cl)
#user system elapsed
#14.33 0.02 206.25
#-------------------------------------------------------------------------------
# ADA RESULTS
ada.tune # Look at the results for the training grid
ada.tune$finalModel # Look at the performance of the final model
plot(ada.tune) # Plot the performance of the training models
#--------------------------------------------------------------------------------
# ADA PREDICTIONS
testX <- testData[,4:61]
ada.pred <- predict(ada.tune,testX)
#
confusionMatrix(ada.pred,testData$Class)
#-----------------------------------------------------------------
# DRAW THE ROC CURVE
# Use roc function from the pROC package
ada.probs <- predict(ada.tune,testX,type="prob")
ada.ROC <- roc(predictor = ada.probs$PS,
               response = testData$Class,
               levels = rev(levels(testData$Class)))
plot(ada.ROC,col=2)
ada.ROC$auc # Get the area under the curve
#------------------------------------------------------------------------------------
#
# SUPPORT VECTOR MACHINE MODEL
#
set.seed(1)
registerDoParallel(4,cores=4)
getDoParWorkers()
system.time(
  svm.tune <- train(x = trainX,
                    y = trainData$Class,
                    method = "svmRadial",
                    tuneLength = 5,                 # try 5 values of the cost parameter
                    preProc = c("center", "scale"),
                    metric = "ROC",
                    trControl = ctrl)               # same resampling control as for ada above
)
#user system elapsed
#2.40 0.14 26.10
 
 
#--------------------------------------------------------------
# SVM RESULTS
svm.tune # Look at the results for the training grid
svm.tune$finalModel # Look at the performance of the final model
plot(svm.tune,
metric="ROC",
scales=list(x=list(log=2)))
#---------------------------------------------------------------
# SVM PREDICTIONS
svm.pred <- predict(svm.tune,testX)
confusionMatrix(svm.pred,testData$Class)
#
#----------------------------------------------------------------
# COMPARE THE SVM AND ADA MODELS USING RESAMPLING
#
# Because we set the same seed before running the models we can compare the models using resampling
# See Hothorn et al., "The design and analysis of benchmark experiments",
# Journal of Computational and Graphical Statistics (2005) vol 14 (3) pp 675-699
# for comparing models using resampling.
#
# The resamples function in caret collates the resampling results from the two models
rValues <- resamples(list(svm=svm.tune,ada=ada.tune))
rValues$values # Look at the resample values
summary(rValues) # Summarize the resamples
 
#---------------------------------------------
xyplot(rValues,metric="ROC") # scatter plot
bwplot(rValues,metric="ROC") # boxplot
parallel(rValues,metric="ROC") # parallel plot
dotplot(rValues,metric="ROC") # dotplot
#
ROLL with RATTLE
##############################################################################
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# ROLL with RATTLE
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
 
#################################################################################
#
library(rattle) # load the rattle package
rattle() # start the rattle user interface
#
#data() # see what data sets are available in all of the loaded packages
data(package="rattle") # see what data sets are available in rattle
ls("package:rattle") # See what functions are in the Rattle package
#lsf.str("package:rattle") # see what functions are in rattle
#
# THE FOLLOWING INSTRUCTIONS SHOULD BE HELPFUL FOR EXPLORING THE RATTLE GUI
#
# LOAD THE WEATHER DATA SET.
# The weather data set consists of observations made at a weather monitoring station
# in Canberra, Australia. Each observation describes weather conditions on a particular day.
# See page 25 of Graham Williams' Data Mining with Rattle and R: The Art of Excavating Data
# for Knowledge Discovery, Springer 2011.
#
# Go to the Data Tab and click on Execute
# Rattle will ask if you want to use the weather data as default. Click yes.
#
# SUMMARY STATISTICS
# Go to the Explore Tab
# Select summary and basics
# Hit Execute
#
# SCATTER PLOTS
# Go to the Explore Tab
# Select Distributions
# Click on Execute
#
# LOOK AT A SINGLE VARIABLE
# Go to Explore Tab
# Select RainTomorrow Bar Plot
# Hit Execute
# This produces a bar plot of the target variable RainTomorrow
# 84% of the observations have no rain
# A model that always predicts no rain should be about 84% accurate
#
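# A quick, illustrative check of this claim from the R console (outside the GUI,
# not part of the Rattle walkthrough):
#   data(weather, package = "rattle")
#   round(100 * prop.table(table(weather$RainTomorrow)), 1)
# The "No" proportion is the roughly 84% baseline accuracy quoted above.
#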
# INVESTIGATE MULTIPLE VARIABLES
# Go to the Explore Tab
# In the upper panel select Box Plot and Histogram for both
# MaxTemp
# Sunshine
# Click Execute
#
# Boxplots top left: Temperature generally higher day before it rains
# Boxplots top right: Less sunshine day before it rains
#
# CORRELATIONS
# Go to the Explore Tab
# Deselect any variables that may be selected
# Select Correlation
# Click on Execute
#
#
# INTERACTIVELY EXPLORE DATA
# Select Interactive and then Latticist
# In bottom center panel
# Select MaxTemp for y axis and
# Select Sunshine for x axis
# Place crosshair on outlier and right click
#
# BUILD A TREE MODEL
# Go to Model Tab
# Select Tree
# Click Execute
# Click on Draw button to get the graph
# Click on Rules button to see rules
# Select Log Tab to look at R code
#
# EVALUATE THE MODEL
# Go to the Evaluate tab
# Select
# Type = Error Matrix
# Model = Tree
# Data = Testing
# Click on Execute
#
# Error matrix for the Decision Tree model on weather.csv [test] (counts):
#
#                Predicted
# Actual         No   Yes
# No             35     6    False positive rate = FP/N = 6/(35+6) = 0.146
#                             = negatives incorrectly classified / total negatives
# Yes             5    10    True positive rate = TP/P = 10/(10+5) = 0.667
#                             = positives correctly classified / total positives
#                             = sensitivity = recall = hit rate
#
# True negative rate = TN/(FP+TN) = 1 - FP rate = 35/41 = 0.854 = specificity
#
# False positives = 6 = Type I error  (test rejects a true null hypothesis)
# False negatives = 5 = Type II error (test fails to reject a false null hypothesis)
 
# Error matrix for the Decision Tree model on weather.csv [test] (%):
#
#                Predicted
# Actual         No   Yes
# No             62    11    in 62% (35/56) of cases the model predicted no rain and it didn't rain
# Yes             9    18    in 18% (10/56) of cases the model predicted rain and it did rain
#
# Accuracy on the test set = 62% + 18% = 80%
#
# Overall error: 0.1964286
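#
# A small illustrative sketch (not part of the Rattle log): the rates above can be
# reproduced in R from the counts, hard-coded here from the error matrix printed by Rattle.
cm <- matrix(c(35, 6, 5, 10), nrow = 2, byrow = TRUE,
             dimnames = list(Actual = c("No", "Yes"), Predicted = c("No", "Yes")))
cm["No", "Yes"]  / sum(cm["No", ])   # false positive rate  6/41 = 0.146
cm["Yes", "Yes"] / sum(cm["Yes", ])  # true positive rate  10/15 = 0.667 (sensitivity / recall)
cm["No", "No"]   / sum(cm["No", ])   # true negative rate  35/41 = 0.854 (specificity)
sum(diag(cm)) / sum(cm)              # accuracy            45/56 = 0.804; overall error = 0.196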
WORDCLOUD
 
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# JUST FOR FUN - BUILD A WORD CLOUD
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
 
 
# From example at RDataMining
# http://www.rdatamining.com/examples/text-mining
# This page shows an example on text mining of Twitter
#-----------------------------------------------------------------------------
# Load the necessary libraries
library(twitteR) # twitteR provides access to Twitter data
library(tm) # tm provides functions for text mining
library(Snowball) # wrappers for the Weka Java stemming functions
library(wordcloud) # wordcloud visualizes the result with a word cloud
library(RColorBrewer) # provides the rainbow colors
#------------------------------------------------------------------------------
# retrieve up to 100 recent tweets matching the #rstats hashtag
# (the RDataMining example uses the @rdatamining user timeline instead)
#
Tweets <- searchTwitter("#rstats",n=100)
n <- length(Tweets)
# Tweets[1:3]
#
#-------------------------------------------------------------------------------
#Transforming Text
#The tweets are first converted to a data frame and then to a corpus.
df <- do.call("rbind", lapply(Tweets, as.data.frame))
#dim(df)
# Just in case twitter is off-line
#df <-read.csv("UseRTweets.csv",header=TRUE,row.names=1)
#head(df)
#
# Build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
 
#After that, the corpus needs a couple of transformations, including
#changing letters to lower case,
#removing punctuations/numbers and removing stop words.
#The general English stop-word list is tailored by
#adding "available" and "via" and removing "r".
 
myCorpus <- tm_map(myCorpus, tolower) # lower case
myCorpus <- tm_map(myCorpus, removePunctuation) # remove punctuation
myCorpus <- tm_map(myCorpus, removeNumbers) # remove numbers
# keep "r" by removing it from stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#----------------------------------------------------------------------------
#Stemming Words
# In many cases, words need to be stemmed to retrieve their radicals.
# For instance, "example" and "examples" are both stemmed to "exampl".
# However, after that, one may want to complete the stems to their original
# forms, so that the words would look "normal".
 
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires packages Snowball, RWeka, rJava, RWekajars
myCorpus <- tm_map(myCorpus, stemDocument)
 
#inspect(myCorpus[1:3]) # inspect the first three ``documents"
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus) # stem completion
#
#
#inspect(myCorpus[1:3]) #Print the first three documents in the built corpus.
#----------------------------------------------------------------------------------------
#Building a Document-Term Matrix
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
# inspect(myDtm[266:270,31:40])
#
# findFreqTerms(myDtm, lowfreq=10) #Frequent Terms and Associations
# findAssocs(myDtm, 'analytics', 0.30) # which words are associated with "analytics"?
#-----------------------------------------------------------------------------------------
# Build the word cloud
# After building a document-term matrix, we can show the importance of
# words with a word cloud (also known as a tag cloud).
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
# Plot the word cloud
pal <- brewer.pal(6, "Dark2")
pal <- pal[-1]   # drop the first color
wordcloud(d$word, d$freq, scale = c(4, 1), min.freq = 2,
          random.order = TRUE, random.color = TRUE,  # random colors
          rot.per = .15, colors = pal)
 
 

These scripts are for the Revolution Analytics Webinar: Introduction to R for Data Mining
February 14, 2013

