# Bergvca/stratified_sampling.R

## stratified_sampling.R
# Number of rows in each class of `example_dataset$target` (1 = positive,
# 0 = negative). These counts are used later in this script to derive the
# per-class `sampsize` values.
#
# Summing the logical comparison avoids materialising a filtered copy of the
# data just to count its rows, and `na.rm = TRUE` keeps rows with a missing
# target from being counted in either class (logical subsetting of a data
# frame with NA yields all-NA rows, which nrow() would include).
len_pos <- sum(example_dataset$target == 1, na.rm = TRUE)
len_neg <- sum(example_dataset$target == 0, na.rm = TRUE)

# Fit a caret model using repeated cross-validation.
#
# Arguments:
#   training_data  Predictors; passed to train() as `x`.
#   labels         Outcome values; passed to train() as `y`.
#   model_type     caret method name; passed to train() as `method`.
#   ...            Further arguments forwarded unchanged to train().
#
# Returns the value of the train() call.
#
# Resampling is 10-fold cross-validation repeated 2 times, with class
# probabilities enabled. Resamples are scored by `custom_summary_function`,
# which is defined outside this block; because the tuning metric is
# "custom_score", that function is presumably expected to report a value under
# that name -- confirm against its definition.
train_model <- function(training_data, labels, model_type, ...) {
  # TRUE/FALSE are spelled out: T and F are ordinary variables that can be
  # reassigned, whereas TRUE and FALSE are reserved words.
  experiment_control <- trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 2,
    classProbs = TRUE,
    summaryFunction = custom_summary_function
  )

  train(
    x = training_data,
    y = labels,
    method = model_type,
    metric = "custom_score",
    trControl = experiment_control,
    verbose = FALSE,
    ...
  )
}

# strata refers to which feature to do stratified sampling on.
# sampsize refers to the size of the bootstrap samples to be taken from each class. These samples will be taken as input
# for each tree.

# Tune a random forest ("rf") over mtry = 3, 5 and 10 with ntree = 500.
# `tuneGrid`, `ntree`, `strata` and `sampsize` are not named parameters of
# train_model(); they travel through its `...`.
#
# The outcome column is dropped from the predictors. Passing the whole data set
# as `x` would hand the model the very `target` column it is asked to predict.
#
# Labels are rendered as "c<value>" strings before being turned into a factor
# (for a 0/1 target: "c0" / "c1"), presumably because class-probability
# output needs class levels that are valid R names -- confirm in caret's docs.
#
# sampsize asks for 25% of the positive rows and 80% of the negative rows
# (truncated to integers); its names "1" / "0" correspond to the values of
# `target` used to build the `strata` factor.
#
# NOTE(review): `strata` has one entry per row of the full data set, while the
# resampling in train_model() fits on subsets of the rows. Confirm that the
# strata vector lines up with the rows of each resample as intended.
fit_results <- train_model(
  example_dataset[, setdiff(names(example_dataset), "target"), drop = FALSE],
  as.factor(sprintf("c%d", as.numeric(example_dataset$target))),
  "rf",
  tuneGrid = expand.grid(mtry = c(3, 5, 10)),
  ntree = 500,
  strata = as.factor(example_dataset$target),
  sampsize = c(
    "1" = as.integer(len_pos * 0.25),
    "0" = as.integer(len_neg * 0.8)
  )
)
	# Number of rows in each class of `example_dataset$target` (1 = positive,
	# 0 = negative), used below to derive the per-class `sampsize` values.
	#
	# Summing the logical comparison avoids materialising a filtered copy of the
	# data just to count its rows, and `na.rm = TRUE` keeps rows with a missing
	# target from being counted in either class.
	#
	# NOTE(review): this recomputes the same two values assigned near the top of
	# the file; one of the two copies is probably redundant.
	len_pos <- sum(example_dataset$target == 1, na.rm = TRUE)
	len_neg <- sum(example_dataset$target == 0, na.rm = TRUE)

	# Fit a caret model using repeated cross-validation.
	#
	# Arguments:
	#   training_data  Predictors; passed to train() as `x`.
	#   labels         Outcome values; passed to train() as `y`.
	#   model_type     caret method name; passed to train() as `method`.
	#   ...            Further arguments forwarded unchanged to train().
	#
	# Returns the value of the train() call.
	#
	# Resampling is 10-fold cross-validation repeated 2 times, with class
	# probabilities enabled. Resamples are scored by `custom_summary_function`,
	# which is defined outside this block; the tuning metric is "custom_score",
	# so that function is presumably expected to report a value under that
	# name -- confirm against its definition.
	#
	# NOTE(review): this redefines the train_model() already defined earlier in
	# the file with the same arguments; one copy is probably redundant.
	train_model <- function(training_data, labels, model_type, ...) {
	  # TRUE/FALSE are spelled out: T and F are ordinary variables that can be
	  # reassigned, whereas TRUE and FALSE are reserved words.
	  experiment_control <- trainControl(
	    method = "repeatedcv",
	    number = 10,
	    repeats = 2,
	    classProbs = TRUE,
	    summaryFunction = custom_summary_function
	  )

	  train(
	    x = training_data,
	    y = labels,
	    method = model_type,
	    metric = "custom_score",
	    trControl = experiment_control,
	    verbose = FALSE,
	    ...
	  )
	}

	# strata refers to which feature to do stratified sampling on.
	# sampsize refers to the size of the bootstrap samples to be taken from each class. These samples will be taken as input
	# for each tree.

	# Tune a random forest ("rf") over mtry = 3, 5 and 10 with ntree = 500.
	# `tuneGrid`, `ntree`, `strata` and `sampsize` are not named parameters of
	# train_model(); they travel through its `...`.
	#
	# Fixes relative to the previous text of this call:
	#   * `len_pos0.25` / `len_neg0.8` were undefined names (the `*` had been
	#     lost), so the call could never succeed; the multiplications are
	#     restored.
	#   * The outcome column is dropped from the predictors, so the model is not
	#     handed the very `target` column it is asked to predict.
	#
	# sampsize asks for 25% of the positive rows and 80% of the negative rows
	# (truncated to integers); its names "1" / "0" correspond to the values of
	# `target` used to build the `strata` factor.
	#
	# NOTE(review): `strata` has one entry per row of the full data set, while
	# the resampling in train_model() fits on subsets of the rows. Confirm that
	# the strata vector lines up with the rows of each resample as intended.
	# NOTE(review): this repeats the training call made earlier in the file.
	fit_results <- train_model(
	  example_dataset[, setdiff(names(example_dataset), "target"), drop = FALSE],
	  as.factor(sprintf("c%d", as.numeric(example_dataset$target))),
	  "rf",
	  tuneGrid = expand.grid(mtry = c(3, 5, 10)),
	  ntree = 500,
	  strata = as.factor(example_dataset$target),
	  sampsize = c(
	    "1" = as.integer(len_pos * 0.25),
	    "0" = as.integer(len_neg * 0.8)
	  )
	)