Wrapper for caret package to train multiple models in one call
Created by @tobigithub, November 2, 2015
#' Basketball: A function to train many different models in one call
#'
#' This function allows you to train multiple models from the caret package with a single call
#' @param method The caret method names to use, e.g. c("glm", "rf")
#' @param formula The formula to use
#' @param data The data frame to use. Can be left blank if supplying independent training and testing sets
#' @param regression Logical; TRUE if the type of model is regression
#' @param metric Metric to use. For example, "RMSE" if regression and "Accuracy" if classification
#' @param Train Optional training set; supplying one bypasses the internal partitioning
#' @param Test Optional testing set; required if you bypassed the partitioning with Train
#' @param p Proportion of the data to put in the training set if you didn't supply training and testing sets
#' @param seed Set the seed
#' @param predictType What type of prediction to use; note "probs" does not work for all classification problems
#' @param na.action What the prediction should do with NAs, e.g. na.omit or NULL
#' @param roundPred Should the predictions be rounded; useful for problems with count data
#' @param verbose Logical; should the fitting be verbose
#' @param plotting Logical; should the function also produce evaluation plots
#' @param ctrl Specify the ctrl, see caret package for more details
#' @param ntree Number of trees for random forest
#' @param family The family for a glm model
#' @param familyGBM The family for the gbm model
#' @param familyGLMboost The family for the glmboost model
#' @param glmBoostGrid Grid for glmboost; see getModelInfo("glmboost", FALSE)[[1]]$grid for the default
#' @param gbmGrid Grid for gbm
#' @param multinomGrid Grid for multinom
#' @param rfGrid Grid for random forest
#' @param LMTGrid Grid for LMT
#' @param LogitBoostGrid Grid for LogitBoost
#' @keywords caret
#' @keywords machine-learning
#' @export
#' @examples
#' # data(iris)
#' # models <- nbaParallel(method = c("rpart", "ctree", "rpart2"),
#' #                       metric = "Accuracy",
#' #                       formula = Species ~ Sepal.Length + Sepal.Width +
#' #                                 Petal.Length + Petal.Width,
#' #                       data = iris, regression = FALSE)
#' # Could use the mean of the models' predictions to predict:
#' # predlist <- predict(models$Tune, newdata = models$Test)
#' # predlist <- lapply(predlist, as.integer)
#' # predlist <- do.call(cbind, predlist)
#' # predagg <- round(apply(predlist, 1, mean), 0)
nbaParallel <-
function(method,
formula,
data,
regression = NULL, #what type of analysis are you doing
metric = NULL, #RMSE, Accuracy
Train = NULL,
Test = NULL,
p=0.8, #how to split the data
seed = 1, #a general seed to use
predictType = "raw",
na.action = na.omit,
roundPred = FALSE, #if count data or using a non-classification method for classification
verbose = FALSE,
plotting = TRUE,
ctrl = trainControl(method = "cv",
number = 10,
savePred = TRUE),
ntree = 500, #for random forest
family = NULL,
familyGBM = NULL,
familyGLMboost = NULL, #Binomial(link = c("logit")),
glmBoostGrid = NULL, # data.frame(mstop = floor((1:10) * 50), prune = "no"),
gbmGrid = NULL, #e.g. expand.grid(interaction.depth = seq(1, 2), n.trees = floor((1:10) * 50), shrinkage = 0.05)
                #or expand.grid(.interaction.depth = seq(1, 7, by = 2), .n.trees = seq(100, 1000, by = 50), .shrinkage = c(0.01, 0.1))
multinomGrid = NULL, #expand.grid(decay = c(0, 5 ^ seq(-1, -4, length = 60)))
rfGrid = NULL, #e.g. mtryVals <- floor(seq(10, ncol(solTrainXtrans), length = 10)); mtryGrid <- data.frame(.mtry = mtryVals)
LMTGrid = NULL,
LogitBoostGrid = NULL
){
#**********Begin the function*************************************************************************************#
library('caret')
library('foreach')
library('ggplot2')
#separate the outcome from the predictors and subset the data accordingly
y <- all.vars(formula)[1]
predictors <- all.vars(formula)[all.vars(formula) != y]
data <- data[,c(y, predictors)]
#use the supplied control object
set.seed(seed)
#**********Separate into training and testing**********************************************************************#
#separate into training and testing sets if not supplied; otherwise use the supplied sets as-is
if(is.null(Train) & is.null(Test)){
set.seed(seed)
inTrainingSet <- createDataPartition( data[,y], p = p, list = FALSE)
Train <- data[ inTrainingSet,]
Test <- data[-inTrainingSet,]
}
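#note: createDataPartition() samples within the levels (or quantile groups) of
#the outcome, so the class balance of data[,y] is roughly preserved in both
#Train and Test; e.g. createDataPartition(iris$Species, p = 0.8, list = FALSE)
#returns row indices covering about 80% of each species.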
#**********FUNCTION TO ALLOW FOR FAMILY AND PARAMETER**********************************************************************#
addParam <- function(expr, grid=NULL, family=NULL) {
if(!is.null(grid) & is.null(family)) {
ll<-as.list(expr)
ll$tuneGrid <- grid
as.call(ll)
} else if(!is.null(family) & is.null(grid)) {
ll<-as.list(expr)
ll$family <- family
as.call(ll)
} else if(!is.null(grid) & !is.null(family)) {
ll<-as.list(expr)
ll$family <- family
ll$tuneGrid <- grid
as.call(ll)
} else {
expr
}
}
#example of addParam() on an unevaluated train() call:
# mycall <- quote(train(form = medv~., data = Boston, method = "glmboost"))
# call <- addParam(mycall, grid = quote(glmBoostGrid), family = NULL)
# eval(call)
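#addParam() edits the unevaluated call, so tuneGrid/family only appear in the
#final train() call when they were actually supplied; this avoids passing
#family = NULL through to the underlying fitting function.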
#********************************************************************************#
#preallocate the result containers
Tune <- vector("list", length(method))
names(Tune) <- method
Predict <- vector("list", length(method))
names(Predict) <- method
Confusion <- vector("list", length(method))
names(Confusion) <- method
if(regression==FALSE){
if(ctrl$classProbs==TRUE){ #using two class summary
testSummary <- data.frame(ROC = numeric(0), Sens=numeric(0), Spec=numeric(0))
} else { #using default summary
testSummary <- data.frame(Accuracy = numeric(0), Kappa=numeric(0))
}
} else {
testSummary <- data.frame(RMSE = numeric(0), R2=numeric(0))
}
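#note: twoClassSummary() reports ROC/Sens/Spec and is only meaningful for
#two-class problems with class probabilities available (classProbs = TRUE in
#trainControl); defaultSummary() reports Accuracy/Kappa otherwise.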
Plot <- list()
Plot2 <- list()
Plot3 <- list()
Plot4 <- list() #pre-initialized so the Results list below never references an undefined object
#********************************************************************************#
Tune <- foreach(i=1:length(method), .inorder=TRUE) %do% {
cat("Tuning model", method[i], "\n")
#*******************#
#evaluate models
#*******************#
set.seed(seed)
#glm but poisson and quasipoisson specified below
if(method[i]=="glm"){
set.seed(seed)
mycall <- quote( train( form = formula,
data = Train,
method = method[i],
metric = metric,
trControl = ctrl))
mycall <- addParam(mycall, grid=NULL, family=family)
Tune[[i]] <- eval(mycall)
Tune[[i]]
#Gradient Boosting with no Family Specified and default grid
} else if(method[i]=="gbm"){
set.seed(seed)
mycall <- quote ( train( form = formula,
data = Train,
method = method[i],
metric = metric,
trControl = ctrl,
verbose = verbose))
mycall <- addParam( mycall, grid=gbmGrid, family=NULL) #it's called distribution for gbm
Tune[[i]] <- eval(mycall)
Tune[[i]]
#Other GLM Boosts (using default Grid)
} else if(method[i]=="glmboost"){
library(mboost)
set.seed(seed)
mycall <- quote( train ( form = formula,
data = Train,
method = method[i],
metric = metric,
trControl = ctrl))
mycall <- addParam( mycall, grid=glmBoostGrid, family=familyGLMboost)
Tune[[i]] <- eval(mycall)
Tune[[i]]
#lmBoost
} else if(method[i]=="lmBoost"){
library(mboost)
set.seed(seed)
mycall <- quote( train ( form = update(formula, as.numeric(as.character(.))~.),
data = Train,
method = "glmboost",
metric = metric,
trControl = ctrl) )
mycall <- addParam( mycall, grid=glmBoostGrid, family=Gaussian())
Tune[[i]] <- eval(mycall)
Tune[[i]]
#LogitBoost
} else if(method[i]=="LogitBoost"){
set.seed(seed)
mycall <- quote( train ( form = formula,
data = Train,
method = method[i],
metric = metric,
trControl = ctrl) )
mycall <- addParam( mycall, grid=LogitBoostGrid)
Tune[[i]] <- eval(mycall)
Tune[[i]]
#LMT
} else if(method[i]=="LMT"){
set.seed(seed)
mycall <- quote( train ( form = formula,
data = Train,
method = method[i],
metric = metric,
trControl = ctrl) )
mycall <- addParam( mycall, grid=LMTGrid)
Tune[[i]] <- eval(mycall)
Tune[[i]]
#POISSON
} else if(method[i]=="poisson"){
set.seed(seed)
Tune[[i]] <- train( form = update(formula, as.numeric(as.character(.))~.),
data = Train,
method = "glm",
trControl = ctrl,
metric = metric,
family = poisson(link = "log") )
#QUASI POISSON
} else if(method[i]=="quasipoisson"){
Tune[[i]] <- train( form = update(formula, as.numeric(as.character(.))~.),
data = Train,
method = "glm",
trControl = ctrl,
metric = metric,
family = quasipoisson(link = "log") )
#Linear Model
} else if(method[i]=="lm"){ #no verbose argument and need to change factor to numeric
set.seed(seed)
Tune[[i]] <- train( form = update(formula, as.numeric(as.character(.)) ~.),
data = Train,
method = method[i],
metric = metric,
trControl = ctrl )
#Multinomial
} else if(method[i]=="multinom"){ #no verbose argument and need to change factor to numeric
set.seed(seed)
mycall <- quote( train( form = formula,
data = Train,
method = method[i],
metric = metric,
trControl = ctrl) )
mycall <- addParam( mycall, grid=multinomGrid, family=NULL)
Tune[[i]] <- eval(mycall)
Tune[[i]]
#Random Forest
} else if(method[i]=="rf"){ #added ntree argument for randomForest
set.seed(seed)
mycall <- quote( train( form = formula,
data = Train,
method = method[i],
metric = metric,
trControl = ctrl,
ntree = ntree,
verbose = verbose ) )
mycall <- addParam( mycall, grid=rfGrid, family=NULL)
Tune[[i]] <- eval(mycall)
Tune[[i]]
#All Other Models
} else {
set.seed(seed)
Tune[[i]] <- train( form = formula,
data = Train,
method = method[i],
metric = metric,
trControl = ctrl )
}
}#end foreach
names(Tune) <- method
#********************************************************************************#
if(!(!is.null(Train) & is.null(Test))) { #skip predictions when a training set was supplied without a test set
#*******************#
#evaluate predictions
#*******************#
Predict <- foreach(i=1:length(method), .inorder=TRUE) %do% {
cat("Predicting with model", method[i], "\n")
if(regression == FALSE){
try(
if(roundPred == TRUE & class(predict.train(Tune[[i]], newdata=Test, type = predictType))!= "factor"){
try( factor( round( predict.train(Tune[[i]], newdata=Test, type = predictType, na.action=na.action), 0) ) )
} else if(class(predict.train(Tune[[i]], newdata=Test, type = predictType))!= "factor"){
try( factor( predict.train( Tune[[i]], newdata=Test, type = predictType, na.action=na.action ) ) )
} else {
try(predict.train(Tune[[i]], newdata=Test, type = predictType, na.action = na.action))
}
)
} else if(regression == TRUE | method[i] == "lm"){
if(roundPred == FALSE){
predict.train( Tune[[i]], newdata=Test, na.action = na.action)
} else {
round( predict.train(Tune[[i]], newdata=Test, na.action = na.action), 0)
}
}
} #end Predict Loop
names(Predict) <- method
#********************************************************************************#
testSummary <- foreach(i=1:length(method), .inorder=TRUE, .combine=rbind) %do% {
if(regression == FALSE){
cat("Model", method[i],
"has levels: ", levels(predict.train(Tune[[i]], newdata=Test, type=predictType)) ,
"and the observed levels are: ", levels(factor(Test[, y])), "\n" )
if(ctrl$classProbs==TRUE){
try(rbind( twoClassSummary( data.frame( obs=factor(Test[, y]), pred=Predict[[i]]) ) ))
} else {
try(rbind( defaultSummary( data.frame( obs=factor(Test[, y]), pred=Predict[[i]]) ) ))
}
} else if(regression == TRUE){
rbind( defaultSummary( data.frame( obs=Test[, y], pred=Predict[[i]]) ) )
}
}#end foreach
rownames(testSummary) <- method
if(regression == FALSE){
Confusion <- foreach(i=1:length(method), .inorder=TRUE) %do% {
try(confusionMatrix(data = Predict[[i]], reference = factor(Test[,y])))
}#end foreach
names(Confusion) <- method
}#end if
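#note: confusionMatrix() requires pred and reference to share compatible factor
#levels, which is why the call is wrapped in try(): a model whose predictions
#dropped a level could otherwise make the whole run fail.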
#**********PLOTTING**********************************************************************#
if(plotting==TRUE){
if(regression==TRUE){
#***********#
Plot2 <- foreach(i=1:length(method), .inorder=TRUE) %do% {
print(qplot( Test[,y],
Predict[[i]],
xlab = "Observed",
ylab = "Predicted",
main = paste(method[i],"- Test Set - Observed vs. Predicted"
)) + geom_abline(intercept = 0, slope = 1, colour = "red", size = 1))
}#end foreach
names(Plot2) <- method
#***********#
Plot3 <- foreach(i=1:length(method), .inorder=TRUE) %do% {
print(qplot( Predict[[i]],
Test[,y] - Predict[[i]],
xlab = "Predicted",
ylab = "Error",
main = paste(method[i], "- Test Set - Error vs. Predicted"
))+ geom_abline(intercept = 0, slope = 0, colour = "red", size = 1))
}
names(Plot3) <- method
}#end if regression==True
}
#********************************************************************************#
if(length(method)>1){
cvValues <- resamples(Tune)
if(regression == TRUE){
finalSummary <- data.frame(cbind(Methods = cvValues$methods,
testSummary,
cvRMSE = summary(cvValues)$statistics$RMSE[,"Mean"],
cvR2 = summary(cvValues)$statistics$Rsquared[,"Mean"],
Time = round(cvValues$timings$Everything, 2)
))
finalSummary <- finalSummary[order(finalSummary[,2], decreasing=FALSE), ]
} else {
#if using default summary
if(ctrl$classProbs==FALSE){
finalSummary <- data.frame(cbind(Methods = cvValues$methods,
testSummary,
cvAccuracy = summary(cvValues)$statistics$Accuracy[,"Mean"],
cvKappa = summary(cvValues)$statistics$Kappa[,"Mean"],
Time = round(cvValues$timings$Everything, 2) ))
#if using twoclass summary
} else {
finalSummary <- data.frame(cbind(Methods = cvValues$methods,
testSummary,
cvROC = summary(cvValues)$statistics$ROC[,"Mean"],
cvSens = summary(cvValues)$statistics$Sens[,"Mean"],
cvSpec = summary(cvValues)$statistics$Spec[,"Mean"],
Time = round(cvValues$timings$Everything, 2) ))
}
finalSummary <- finalSummary[order(finalSummary[,2], decreasing=TRUE), ]
}
} else {
cvValues <- "Needs more than one model"
finalSummary <- "Needs more than one model"
}
Observations <- data.frame(Train.Obs = nrow(Train),
Train.Mean = mean( as.numeric(Train[,y])),
Train.SD = sd( as.numeric(Train[,y])),
Test.Obs = nrow(Test),
Test.Mean = mean( as.numeric(Test[,y])),
Test.SD = sd( as.numeric(Test[,y])) )
#********************************************************************************#
} else {
Observations <- data.frame(Train.Obs = nrow(Train),
Train.Mean = mean( as.numeric(Train[,y])),
Train.SD = sd( as.numeric(Train[,y])))
finalSummary <- "You did not specify a test set"
Confusion <- "You did not specify a test set"
}#end else: a training set was supplied without a test set, so predictions and test summaries were skipped
#********TUNE SUMMARY AND PLOTS************************************************************************#
col.want <- c("Accuracy","Kappa", "AccuracySD", "KappaSD",
"ROC", "Spec","Sens","ROCSD","SpecSD", "SensSD",
"RMSE", "Rsquared", "RMSESD", "RsquaredSD")
# sapply() alternative, kept for reference (it still treats each element as a list):
# tuneSummary <- sapply(method, function(x) Tune[[x]]$results[rownames(Tune[[x]]$bestTune), colnames(Tune[[x]]$results)%in%col.want])
# tuneSummary <- t(tuneSummary)
tuneSummary <- lapply(method, function(x)
Tune[[x]]$results[rownames(Tune[[x]]$bestTune),
colnames(Tune[[x]]$results)%in%col.want])
tuneSummary <- do.call("rbind", tuneSummary)
rownames(tuneSummary) <- method
#*****PLOT TUNE GRID AND PLOT PERFORMANCE******************************************#
if(plotting==TRUE){
Plot <- foreach(i=1:length(method), .inorder=TRUE) %do% {
if(inherits(tryCatch(ggplot(Tune[[i]]), error=function(e) e), "error")) {
c()
} else {
print(ggplot(Tune[[i]]) + labs(title=method[i]))
}
}#end foreach
names(Plot) <- method
if(length(method)>1){
(Plot4 <- dotplot(resamples(Tune), metric=metric, main="Resamples Plot"))
}
}
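#dotplot() on a resamples object draws one row per method showing the
#cross-validation distribution of the chosen metric, which makes it easy to
#rank the candidate models at a glance.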
#********************************************************************************#
index <- Tune[[1]]$control$index
#********************************************************************************#
Results <- list( Observations = Observations,
index = index,
Train = Train,
Test = Test,
Tune = Tune,
Predict = Predict,
Plot = Plot,
Plot2 = Plot2,
Plot3 = Plot3,
Plot4 = Plot4,
tuneSummary = tuneSummary,
testSummary = testSummary,
Confusion = Confusion,
# cvValues = cvValues,
finalSummary = finalSummary
)
return(Results)
} #end function
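
A minimal end-to-end sketch of how the wrapper might be called, assuming nbaParallel() has been sourced and that caret, foreach, ggplot2, rpart, and party are installed. The model list and the averaging step mirror the @examples block above; the final table() comparison is an illustrative addition and assumes every model encodes the class levels in the same order.

data(iris)
models <- nbaParallel(method = c("rpart", "ctree", "rpart2"),
                      metric = "Accuracy",
                      formula = Species ~ Sepal.Length + Sepal.Width +
                                Petal.Length + Petal.Width,
                      data = iris,
                      regression = FALSE)

#held-out and cross-validated performance for all methods in one table
models$finalSummary

#confusion matrix of a single model on the held-out test set
models$Confusion$rpart

#simple ensemble: average the integer-coded class predictions
predlist <- lapply(models$Predict, as.integer)
predmat <- do.call(cbind, predlist)
predagg <- round(apply(predmat, 1, mean), 0)
table(predagg, as.integer(models$Test$Species))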
@Snlz commented Sep 5, 2021

I've learned a lot about building functions from this. I'm just a starter, but this is really good to learn from. Thank you so much!

@tobigithub (Author)

@Snlz, glad it helped, but I am sure I just forked that from somewhere.
