Wrapper for caret package to train multiple models in one call
#' Basketball: A function to train a lot of different models
#'
#' This function lets you train a lot of different models from the caret package in one call
#' @param method The methods to use, e.g. c("glm", "rf")
#' @param formula The formula to use
#' @param data The data frame to use. Can be left blank if supplying independent training and testing sets
#' @param regression Logical; TRUE if the type of model is regression
#' @param metric Metric to use. For example, "RMSE" for regression and "Accuracy" for classification
#' @param Train Supply a training set to bypass the partitioning
#' @param Test If you bypassed the partitioning, you also need to supply a testing set
#' @param p What proportion to use for the training/testing split if you did not supply training and testing sets
#' @param seed Set the seed
#' @param predictType What type of prediction to use; note "probs" does not work for all classification problems
#' @param na.action What the prediction should do with NAs, i.e. na.omit or NULL
#' @param roundPred Logical; should the predictions be rounded, useful for problems with count data
#' @param verbose Logical; should the output be verbose or not
#' @param plotting Logical; should the function also evaluate plots
#' @param ctrl Specify the ctrl; see the caret package for more details
#' @param ntree Number of trees for random forest
#' @param family The family for a glm model
#' @param familyGBM The family for the gbm model
#' @param familyGLMboost The family for the glmboost model
#' @param glmBoostGrid Grid for glmboost #getModelInfo("glmboost", FALSE)[[1]]$grid
#' @param gbmGrid Grid for gbm
#' @param multinomGrid Grid for multinom
#' @param rfGrid Grid for random forest
#' @param LMTGrid Grid for LMT
#' @param LogitBoostGrid Grid for LogitBoost
#' @keywords caret
#' @keywords machine-learning
#' @export
#' @examples
#' #data(iris)
#' #models <- nbaParallel(method=c('rpart', 'ctree', 'rpart2'), metric="Accuracy", formula= Species~Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data=iris, regression=FALSE)
#' #Could be used to predict with the mean of the models' predictions
#' #predlist <- predict(models$Tune, newdata=models$Test)#,na.action=NULL)
#' #predlist <- lapply(predlist, as.integer)
#' #predlist <- do.call(cbind, predlist)
#' #predagg <- round(apply(predlist, 1, mean),0)
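#' #A hedged regression sketch (not from the original gist; assumes only the
#' #mtcars data shipped with base R):
#' #reg <- nbaParallel(method=c("lm", "rf"), metric="RMSE",
#' #                   formula= mpg ~ wt + hp, data=mtcars, regression=TRUE)
#' #reg$finalSummary #test-set and cross-validated RMSE/R2 for each method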
nbaParallel <-
  function(method,
           formula,
           data,
           regression = NULL, #what type of analysis are you doing
           metric = NULL, #RMSE, Accuracy
           Train = NULL,
           Test = NULL,
           p = 0.8, #how to split the data
           seed = 1, #a general seed to use
           predictType = "raw",
           na.action = na.omit,
           roundPred = FALSE, #if count data or using a non-classification method for classification
           verbose = FALSE,
           plotting = TRUE,
           ctrl = trainControl(method = "cv",
                               number = 10,
                               savePred = TRUE),
           ntree = 500, #for random forest
           family = NULL,
           familyGBM = NULL, #e.g. Binomial(link = c("logit"))
           familyGLMboost = NULL, #e.g. Binomial(link = c("logit"))
           glmBoostGrid = NULL, #e.g. data.frame(mstop = floor((1:10) * 50), prune = "no")
           gbmGrid = NULL, #e.g. expand.grid(.interaction.depth = seq(1, 7, by = 2), .n.trees = seq(100, 1000, by = 50), .shrinkage = c(0.01, 0.1))
           multinomGrid = NULL, #e.g. expand.grid(decay = c(0, 5 ^ seq(-1, -4, length = 60)))
           rfGrid = NULL, #e.g. data.frame(.mtry = floor(seq(10, ncol(Train) - 1, length = 10)))
           LMTGrid = NULL,
           LogitBoostGrid = NULL
  ){
  #**********Begin the function*************************************************************************************#
  library('caret')
  library('foreach')
  library('ggplot2')
  if(is.null(regression)) stop("Please set regression = TRUE or regression = FALSE")
  #separate the outcome from the predictors and subset the data
  y <- all.vars(formula)[1]
  predictors <- all.vars(formula)[all.vars(formula) != y]
  if(!missing(data)) data <- data[, c(y, predictors)]
  set.seed(seed)
  #**********Separate into training and testing**********************************************************************#
  #partition into training and testing sets if they were not supplied; otherwise the supplied sets are used as-is
  if(is.null(Train) & is.null(Test)){
    set.seed(seed)
    inTrainingSet <- createDataPartition(data[, y], p = p, list = FALSE)
    Train <- data[ inTrainingSet, ]
    Test  <- data[-inTrainingSet, ]
  }
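  #Note: createDataPartition() samples within the outcome's classes (or within
  #quantile groups for a numeric outcome), so this split is stratified on y.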
  #**********FUNCTION TO ATTACH FAMILY AND TUNING GRID**********************************************************************#
  addParam <- function(expr, grid=NULL, family=NULL) {
    ll <- as.list(expr)
    if(!is.null(grid))   ll$tuneGrid <- grid
    if(!is.null(family)) ll$family   <- family
    as.call(ll)
  }
  #Example:
  # mycall <- quote(train(form = medv ~ ., data = Boston, method = "glmboost"))
  # call   <- addParam(mycall, grid = quote(glmBoostGrid), family = NULL)
  # eval(call)
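  #Why the quote()/addParam()/eval() pattern (a reading note, not from the original):
  #train() forwards extra arguments straight to the underlying fit function, so an
  #explicit family=NULL would likely be passed through and break e.g. glm(); building
  #the call first and attaching tuneGrid/family only when they are non-NULL avoids that.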
  #********************************************************************************#
  #pre-allocate the result containers
  Tune <- vector("list", length(method))
  names(Tune) <- method
  Predict <- vector("list", length(method))
  names(Predict) <- method
  Confusion <- vector("list", length(method))
  names(Confusion) <- method
  if(regression == FALSE){
    if(ctrl$classProbs == TRUE){ #using twoClassSummary
      testSummary <- data.frame(ROC = numeric(0), Sens = numeric(0), Spec = numeric(0))
    } else { #using defaultSummary
      testSummary <- data.frame(Accuracy = numeric(0), Kappa = numeric(0))
    }
  } else {
    testSummary <- data.frame(RMSE = numeric(0), R2 = numeric(0))
  }
  Plot <- list()
  Plot2 <- list()
  Plot3 <- list()
  Plot4 <- NULL #initialised here so the Results list below works even when it is never filled in
  #********************************************************************************#
  Tune <- foreach(i=1:length(method), .inorder=TRUE) %do% {
    cat("Tuning model", method[i], "\n")
    #*******************#
    #evaluate models
    #*******************#
    set.seed(seed)
    #glm (poisson and quasipoisson are specified below)
    if(method[i]=="glm"){
      set.seed(seed)
      mycall <- quote( train( form = formula,
                              data = Train,
                              method = method[i],
                              metric = metric,
                              trControl = ctrl))
      mycall <- addParam(mycall, grid=NULL, family=family)
      Tune[[i]] <- eval(mycall)
      Tune[[i]]
    #Gradient boosting with no family specified and the default grid
    } else if(method[i]=="gbm"){
      set.seed(seed)
      mycall <- quote( train( form = formula,
                              data = Train,
                              method = method[i],
                              metric = metric,
                              trControl = ctrl,
                              verbose = verbose))
      mycall <- addParam( mycall, grid=gbmGrid, family=NULL) #the family argument is called distribution for gbm
      Tune[[i]] <- eval(mycall)
      Tune[[i]]
    #Other GLM boosts (using the default grid)
    } else if(method[i]=="glmboost"){
      library(mboost)
      set.seed(seed)
      mycall <- quote( train( form = formula,
                              data = Train,
                              method = method[i],
                              metric = metric,
                              trControl = ctrl))
      mycall <- addParam( mycall, grid=glmBoostGrid, family=familyGLMboost)
      Tune[[i]] <- eval(mycall)
      Tune[[i]]
    #lmBoost
    } else if(method[i]=="lmBoost"){
      library(mboost)
      set.seed(seed)
      mycall <- quote( train( form = update(formula, as.numeric(as.character(.))~.),
                              data = Train,
                              method = "glmboost",
                              metric = metric,
                              trControl = ctrl) )
      mycall <- addParam( mycall, grid=glmBoostGrid, family=Gaussian())
      Tune[[i]] <- eval(mycall)
      Tune[[i]]
    #LogitBoost
    } else if(method[i]=="LogitBoost"){
      set.seed(seed)
      mycall <- quote( train( form = formula,
                              data = Train,
                              method = method[i],
                              metric = metric,
                              trControl = ctrl) )
      mycall <- addParam( mycall, grid=LogitBoostGrid)
      Tune[[i]] <- eval(mycall)
      Tune[[i]]
    #LMT
    } else if(method[i]=="LMT"){
      set.seed(seed)
      mycall <- quote( train( form = formula,
                              data = Train,
                              method = method[i],
                              metric = metric,
                              trControl = ctrl) )
      mycall <- addParam( mycall, grid=LMTGrid)
      Tune[[i]] <- eval(mycall)
      Tune[[i]]
    #Poisson
    } else if(method[i]=="poisson"){
      set.seed(seed)
      Tune[[i]] <- train( form = update(formula, as.numeric(as.character(.))~.),
                          data = Train,
                          method = "glm",
                          trControl = ctrl,
                          metric = metric,
                          family = poisson(link = "log") )
    #Quasi-Poisson
    } else if(method[i]=="quasipoisson"){
      set.seed(seed)
      Tune[[i]] <- train( form = update(formula, as.numeric(as.character(.))~.),
                          data = Train,
                          method = "glm",
                          trControl = ctrl,
                          metric = metric,
                          family = quasipoisson(link = "log") )
    #Linear model (no verbose argument; the outcome is converted from factor to numeric)
    } else if(method[i]=="lm"){
      set.seed(seed)
      Tune[[i]] <- train( form = update(formula, as.numeric(as.character(.))~.),
                          data = Train,
                          method = method[i],
                          metric = metric,
                          trControl = ctrl )
    #Multinomial
    } else if(method[i]=="multinom"){
      set.seed(seed)
      mycall <- quote( train( form = formula,
                              data = Train,
                              method = method[i],
                              metric = metric,
                              trControl = ctrl) )
      mycall <- addParam( mycall, grid=multinomGrid, family=NULL)
      Tune[[i]] <- eval(mycall)
      Tune[[i]]
    #Random forest (with an ntree argument passed through to randomForest)
    } else if(method[i]=="rf"){
      set.seed(seed)
      mycall <- quote( train( form = formula,
                              data = Train,
                              method = method[i],
                              metric = metric,
                              trControl = ctrl,
                              ntree = ntree,
                              verbose = verbose ) )
      mycall <- addParam( mycall, grid=rfGrid, family=NULL)
      Tune[[i]] <- eval(mycall)
      Tune[[i]]
    #All other models
    } else {
      set.seed(seed)
      Tune[[i]] <- train( form = formula,
                          data = Train,
                          method = method[i],
                          metric = metric,
                          trControl = ctrl )
    }
  }#end foreach
  names(Tune) <- method
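  #Because the same seed is set before every train() call and ctrl is shared, each
  #model is tuned on identical cross-validation folds, which keeps the resamples()
  #comparison further down apples-to-apples.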
  #********************************************************************************#
  if(!(!is.null(Train) & is.null(Test))) { #skip the prediction step when only a training set (and no test set) was supplied
    #*******************#
    #evaluate predictions
    #*******************#
    Predict <- foreach(i=1:length(method), .inorder=TRUE) %do% {
      cat("Predicting with model", method[i], "\n")
      if(regression == FALSE){
        try({
          pred <- predict.train(Tune[[i]], newdata=Test, type = predictType, na.action = na.action)
          if(roundPred == TRUE & class(pred) != "factor"){
            factor(round(pred, 0))
          } else if(class(pred) != "factor"){
            factor(pred)
          } else {
            pred
          }
        })
      } else if(regression == TRUE | method[i] == "lm"){
        if(roundPred == FALSE){
          predict.train( Tune[[i]], newdata=Test, na.action = na.action)
        } else {
          round( predict.train(Tune[[i]], newdata=Test, na.action = na.action), 0)
        }
      }
    } #end Predict loop
    names(Predict) <- method
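    #Non-factor predictions are coerced to factors above so that defaultSummary() and
    #confusionMatrix() below can treat them as class labels; rounding first supports
    #count-style outcomes fit with regression-type methods.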
    #********************************************************************************#
    testSummary <- foreach(i=1:length(method), .inorder=TRUE, .combine=rbind) %do% {
      if(regression == FALSE){
        cat("Model", method[i],
            "has levels: ", levels(Predict[[i]]),
            "and the observed levels are: ", levels(factor(Test[, y])), "\n" )
        if(ctrl$classProbs==TRUE){
          #twoClassSummary also expects class-probability columns, so without them this falls through to a try-error
          try(rbind( twoClassSummary( data.frame( obs=factor(Test[, y]), pred=Predict[[i]]) ) ))
        } else {
          try(rbind( defaultSummary( data.frame( obs=factor(Test[, y]), pred=Predict[[i]]) ) ))
        }
      } else if(regression == TRUE){
        rbind( defaultSummary( data.frame( obs=Test[, y], pred=Predict[[i]]) ) )
      }
    }#end foreach
    rownames(testSummary) <- method
    if(regression == FALSE){
      Confusion <- foreach(i=1:length(method), .inorder=TRUE) %do% {
        try(confusionMatrix(data = Predict[[i]], reference = factor(Test[,y])))
      }#end foreach
      names(Confusion) <- method
    }#end if
    #**********PLOTTING**********************************************************************#
    if(plotting==TRUE){
      if(regression==TRUE){
        #***********#
        Plot2 <- foreach(i=1:length(method), .inorder=TRUE) %do% {
          print(qplot( Test[,y],
                       Predict[[i]],
                       xlab = "Observed",
                       ylab = "Predicted",
                       main = paste(method[i], "- Test Set - Observed vs. Predicted")
                       ) + geom_abline(intercept = 0, slope = 1, colour = "red", size = 1))
        }#end foreach
        names(Plot2) <- method
        #***********#
        Plot3 <- foreach(i=1:length(method), .inorder=TRUE) %do% {
          print(qplot( Predict[[i]],
                       Test[,y] - Predict[[i]],
                       xlab = "Predicted",
                       ylab = "Error",
                       main = paste(method[i], "- Test Set - Error vs. Predicted")
                       ) + geom_abline(intercept = 0, slope = 0, colour = "red", size = 1))
        }
        names(Plot3) <- method
      }#end if regression==TRUE
    }
    #********************************************************************************#
    if(length(method) > 1){
      cvValues <- resamples(Tune)
      if(regression == TRUE){
        finalSummary <- data.frame(cbind(Methods = cvValues$methods,
                                         testSummary,
                                         cvRMSE = summary(cvValues)$statistics$RMSE[,"Mean"],
                                         cvR2 = summary(cvValues)$statistics$Rsquared[,"Mean"],
                                         Time = round(cvValues$timings$Everything, 2)
                                         ))
        finalSummary <- finalSummary[order(finalSummary[,2], decreasing=FALSE), ]
      } else {
        #if using defaultSummary
        if(ctrl$classProbs==FALSE){
          finalSummary <- data.frame(cbind(Methods = cvValues$methods,
                                           testSummary,
                                           cvAccuracy = summary(cvValues)$statistics$Accuracy[,"Mean"],
                                           cvKappa = summary(cvValues)$statistics$Kappa[,"Mean"],
                                           Time = round(cvValues$timings$Everything, 2) ))
        #if using twoClassSummary
        } else {
          finalSummary <- data.frame(cbind(Methods = cvValues$methods,
                                           testSummary,
                                           cvROC = summary(cvValues)$statistics$ROC[,"Mean"],
                                           cvSens = summary(cvValues)$statistics$Sens[,"Mean"],
                                           cvSpec = summary(cvValues)$statistics$Spec[,"Mean"],
                                           Time = round(cvValues$timings$Everything, 2) ))
        }
        finalSummary <- finalSummary[order(finalSummary[,2], decreasing=TRUE), ]
      }
    } else {
      cvValues <- "Needs more than one model"
      finalSummary <- "Needs more than one model"
    }
    Observations <- data.frame(Train.Obs = nrow(Train),
                               Train.Mean = mean( as.numeric(Train[,y])),
                               Train.SD = sd( as.numeric(Train[,y])),
                               Test.Obs = nrow(Test),
                               Test.Mean = mean( as.numeric(Test[,y])),
                               Test.SD = sd( as.numeric(Test[,y])) )
  #********************************************************************************#
  } else {
    Observations <- data.frame(Train.Obs = nrow(Train),
                               Train.Mean = mean( as.numeric(Train[,y])),
                               Train.SD = sd( as.numeric(Train[,y])))
    finalSummary <- "You did not specify a test set"
    Confusion <- "You did not specify a test set"
  }#end of the if block that skips predictions when only a training set was supplied
  #********TUNE SUMMARY AND PLOTS************************************************************************#
  col.want <- c("Accuracy", "Kappa", "AccuracySD", "KappaSD",
                "ROC", "Spec", "Sens", "ROCSD", "SpecSD", "SensSD",
                "RMSE", "Rsquared", "RMSESD", "RsquaredSD")
  #sapply() would still treat each element as a list, so lapply() plus rbind is used instead
  tuneSummary <- lapply(method, function(x)
    Tune[[x]]$results[rownames(Tune[[x]]$bestTune),
                      colnames(Tune[[x]]$results) %in% col.want])
  tuneSummary <- do.call("rbind", tuneSummary)
  rownames(tuneSummary) <- method
  #*****PLOT TUNE GRID AND PLOT PERFORMANCE******************************************#
  if(plotting==TRUE){
    Plot <- foreach(i=1:length(method), .inorder=TRUE) %do% {
      if(inherits(tryCatch(ggplot(Tune[[i]]), error=function(e) e), "error")) {
        c() #models with no tuning parameters cannot be plotted
      } else {
        print(ggplot(Tune[[i]]) + labs(title=method[i]))
      }
    }#end foreach
    names(Plot) <- method
    if(length(method) > 1){
      (Plot4 <- dotplot(resamples(Tune), metric=metric, main="Resamples Plot"))
    }
  }
  #********************************************************************************#
  index <- Tune[[1]]$control$index
  #********************************************************************************#
  Results <- list( Observations = Observations,
                   index = index,
                   Train = Train,
                   Test = Test,
                   Tune = Tune,
                   Predict = Predict,
                   Plot = Plot,
                   Plot2 = Plot2,
                   Plot3 = Plot3,
                   Plot4 = Plot4,
                   tuneSummary = tuneSummary,
                   testSummary = testSummary,
                   Confusion = Confusion,
                   #cvValues = cvValues,
                   finalSummary = finalSummary
                   )
  return(Results)
} #end function
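#A minimal end-to-end sketch (hypothetical call, not from the original gist): it
#assumes caret, foreach and ggplot2 are installed along with the packages behind
#each method, and reuses the iris setup from the examples above.
#models <- nbaParallel(method = c("rpart", "rf"),
#                      metric = "Accuracy",
#                      formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
#                      data = iris,
#                      regression = FALSE)
#models$finalSummary #test-set metrics, cross-validated means, and timings per method
#models$Confusion$rf #confusion matrix for the random forest on the test set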
I've learned a lot about building functions from this. I'm just a beginner, but this is really good to learn from. Thank you so much!