Skip to content

Instantly share code, notes, and snippets.

@tjvananne
Last active May 31, 2022 22:37
Show Gist options
  • Save tjvananne/5b6444ad4c44240d2eaba0179b1dfb27 to your computer and use it in GitHub Desktop.
Save tjvananne/5b6444ad4c44240d2eaba0179b1dfb27 to your computer and use it in GitHub Desktop.
Target Shuffling in R - iris data
#' Target Shuffling
#' Author: Taylor Van Anne
#'
#' Note: this is just my interpretation of what target shuffling means
#' to me. I think there are a few different ways to actually conduct
#' the shuffling, but this is a single approach.
#'
#' A different approach than what I did here would be to shuffle the
#' entire target variable before the train/test split. I chose to
#' instead shuffle only within the test label values (after splitting
#' the label values into train/test)
# load libraries
library(randomForest)
library(ggplot2)
# this is the number of iterations of model building
num_iters <- 100
# allocating numeric vector space to store our results as we loop
results <- numeric(num_iters)
results_shuffled <- numeric(num_iters)
# setting a random seed for reproducibility
set.seed(4)
# begin the loop:
for(i in 1:num_iters) {
# replicate the iris data
myiris <- iris
# report out every tenth iteration
if(i %% 10 == 0) {print(paste0("iteration: ", i))}
# capture labels in character vector, remove label from x-data
myiris_labels <- myiris$Species
myiris$Species <- NULL
# identify train/test split
indx_train <- sample(1:nrow(myiris), floor(.7 * nrow(myiris)))
indx_test <- setdiff(1:nrow(myiris), indx_train)
# split features (x) into train and test
myiris_train <- myiris[indx_train, ]
myiris_test <- myiris[indx_test, ]
# split labels (y) into train, test, and store a shuffled version of test y values as well
y_train <- myiris_labels[indx_train]
y_test <- myiris_labels[indx_test]
y_test_shuffled <- y_test[sample(1:length(y_test), length(y_test))]
# build model based on real y values, then one based on shuffled y values
myrf <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test, keep.forest = TRUE)
myrf_shuffled <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test_shuffled, keep.forest = TRUE)
# make predictions based on real y values, then based on the model that saw shuffled y values
myrf_preds <- predict(myrf, myiris_test)
myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test)
rm(myrf, myrf_shuffled)
# determine accuracy of each model
myrf_accuracy <- sum(myrf_preds == y_test, na.rm=T) / length(y_test)
myrf_accuracy_shuffled <- sum(myrf_preds_shuffled == y_test_shuffled, na.rm=T) / length(y_test_shuffled)
# store the accuracy in the pre-allocated numeric vector space
results[i] <- myrf_accuracy
results_shuffled[i] <- myrf_accuracy_shuffled
}
# label which results came from which experiment and combine into one data frame
df_results <- data.frame(accuracy=results, type='y_test')
df_results_shuffled <- data.frame(accuracy=results_shuffled, type='y_shuffled')
df_all <- rbind(df_results, df_results_shuffled)
# plot the density distribution of each group
ggplot(df_all, aes(x=accuracy, fill=type)) +
geom_density(alpha=0.4) +
theme_bw(base_size=16) +
ggtitle("Model vs Target-Shuffled Model")
# input shuffling
# exploring this concept
# based on this video: https://www.youtube.com/watch?v=OlHW7frH3ug
# just found another one: https://www.youtube.com/watch?v=tD8HZuWqIQw
# libraries:
library(randomForest)
library(ggplot2)
# data set:
myiris <- iris
# function definitions:
shuffle_this_col <- function(param_df, param_col) {
# shuffles a column in the dataframe
# example: shuffle_this_col(myiris, 'Sepal.Width')
col_vals <- param_df[, param_col]
shuf_vals <- col_vals[sample(1:length(col_vals), length(col_vals))]
param_df[, param_col] <- shuf_vals
return(param_df)
}
# "config" for experiment
iter_per_var <- 100
target_var <- 'Species'
vars_to_shuf <- setdiff(names(myiris), target_var)
set.seed(1776)
# scores will be model accuracy score, exp_desc is description of the experiment
scores <- numeric(iter_per_var * length(vars_to_shuf) * 2)
exp_desc <- character(iter_per_var * length(vars_to_shuf) * 2)
#' explaining the pre-allocated space above: I want `iter_per_var` number of
#' iterations per shuffled variable, so those are multiplied. I'd also like
#' to keep a "base-case" non-shuffled score in there as well, so that is why
#' I multiply by 2 at the end there.
# for each variable to shuffle, for number of iterations, build models and test
# old school incrementer for "simplicity" (I'm an old soul)
count <- 1
for(i in 1:length(vars_to_shuf)) {
# now we only have to do this subset once
this_var <- vars_to_shuf[i]
for(j in 1:iter_per_var) {
if(j %% 10 == 0) {print(paste0("iteration: ", j, " -- for variable: ", this_var))}
# isolate a copy of the data
myiris <- iris
# capture labels in character vector, remove label from x-data
myiris_labels <- myiris$Species
myiris$Species <- NULL
myiris_shuf <- shuffle_this_col(myiris, this_var)
# identify train/test split
indx_train <- sample(1:nrow(myiris), floor(.7 * nrow(myiris)))
indx_test <- setdiff(1:nrow(myiris), indx_train)
# split features (x) into train and test
myiris_train <- myiris[indx_train, ]
myiris_train_shuf <- myiris_shuf[indx_train, ]
myiris_test <- myiris[indx_test, ]
myiris_test_shuf <- myiris_shuf[indx_test, ] # <----- should we do this? (try it with and without)
# split labels (y) into train, test, and store a shuffled version of test y values as well
y_train <- myiris_labels[indx_train]
y_test <- myiris_labels[indx_test]
# build model based on real y values, then one based on shuffled y values
myrf <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test, keep.forest = TRUE)
myrf_shuffled <- randomForest(x=myiris_train_shuf, y=y_train, xtest=myiris_test_shuf, ytest=y_test, keep.forest = TRUE)
# make predictions based on real y values, then based on the model that saw shuffled y values
myrf_preds <- predict(myrf, myiris_test)
myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test_shuf)
rm(myrf, myrf_shuffled)
# determine accuracy of each model
myrf_accuracy <- sum(myrf_preds == y_test, na.rm=T) / length(y_test)
myrf_accuracy_shuffled <- sum(myrf_preds_shuffled == y_test, na.rm=T) / length(y_test)
# capture the "baseline" scores
scores[count] <- myrf_accuracy
exp_desc[count] <- 'baseline'
# mid loop increment -- it makes sense, trust me.
count <- count + 1
# capture the "shuffled scores"
scores[count] <- myrf_accuracy_shuffled
exp_desc[count] <- this_var
# end of loop increment
count <- count + 1
} # end inner for (j)
} # end outter for (i)
results_all <- data.frame(scores=scores, type=exp_desc)
ggplot(data=results_all, aes(x=scores, fill=type)) +
geom_density(alpha=0.3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment