# tjvananne/aaa_target_shuffling.R

## aaa_target_shuffling.R


#' Target Shuffling
#' Author: Taylor Van Anne
#'
#' Note: this is just my interpretation of what target shuffling means
#' to me. I think there are a few different ways to actually conduct
#' the shuffling, but this is a single approach.
#'
#' A different approach than what I did here would be to shuffle the
#' entire target variable before the train/test split. I chose to
#' instead shuffle only within the test label values (after splitting
#' the label values into train/test)


# load libraries
library(randomForest)
library(ggplot2)

# Number of train/test resamples; one pair of forests is fit per iteration.
num_iters <- 100

# Pre-allocated accuracy vectors: `results` is scored against the true test
# labels, `results_shuffled` against a permuted copy of those labels.
results <- numeric(num_iters)
results_shuffled <- numeric(num_iters)

# Fixed seed so the splits, permutations and forests are reproducible.
set.seed(4)

# The feature frame and the label vector are the same in every iteration, so
# they are built once here instead of being re-copied inside the loop.
myiris <- iris
myiris_labels <- myiris$Species  # class labels (factor)
myiris$Species <- NULL           # keep features only
n_obs <- nrow(myiris)
n_train <- floor(0.7 * n_obs)


# begin the loop:
for (i in seq_len(num_iters)) {

    # report out every tenth iteration
    if (i %% 10 == 0) {
        print(paste0("iteration: ", i))
    }

    # identify the 70/30 train/test split on row indices
    indx_train <- sample(seq_len(n_obs), n_train)
    indx_test <- setdiff(seq_len(n_obs), indx_train)

    # split features (x) into train and test
    myiris_train <- myiris[indx_train, ]
    myiris_test <- myiris[indx_test, ]

    # split labels (y) into train and test; only the TEST labels are permuted,
    # the training labels are left untouched
    y_train <- myiris_labels[indx_train]
    y_test <- myiris_labels[indx_test]
    y_test_shuffled <- y_test[sample.int(length(y_test))]

    # Both forests are trained on the same unshuffled training data. `ytest`
    # is only what randomForest reports its test-set error against, so the two
    # fits differ solely through the forest's own randomness.
    myrf <- randomForest(x = myiris_train, y = y_train,
                         xtest = myiris_test, ytest = y_test,
                         keep.forest = TRUE)
    myrf_shuffled <- randomForest(x = myiris_train, y = y_train,
                                  xtest = myiris_test, ytest = y_test_shuffled,
                                  keep.forest = TRUE)

    # predict the held-out rows with each forest, then free the models
    myrf_preds <- predict(myrf, myiris_test)
    myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test)
    rm(myrf, myrf_shuffled)

    # accuracy against the true test labels vs. against the permuted labels
    myrf_accuracy <- sum(myrf_preds == y_test, na.rm = TRUE) / length(y_test)
    myrf_accuracy_shuffled <-
        sum(myrf_preds_shuffled == y_test_shuffled, na.rm = TRUE) /
        length(y_test_shuffled)

    # store the accuracy in the pre-allocated numeric vector space
    results[i] <- myrf_accuracy
    results_shuffled[i] <- myrf_accuracy_shuffled
}


# label which results came from which experiment and combine into one data frame
df_results <- data.frame(accuracy = results, type = 'y_test')
df_results_shuffled <- data.frame(accuracy = results_shuffled, type = 'y_shuffled')
df_all <- rbind(df_results, df_results_shuffled)


# plot the density distribution of each group
ggplot(df_all, aes(x = accuracy, fill = type)) +
    geom_density(alpha = 0.4) +
    theme_bw(base_size = 16) +
    ggtitle("Model vs Target-Shuffled Model")


## zzz_input_shuffling.R


# input shuffling
# exploring this concept
# based on this video: https://www.youtube.com/watch?v=OlHW7frH3ug
# just found another one: https://www.youtube.com/watch?v=tD8HZuWqIQw


# libraries:
library(randomForest)
library(ggplot2)


# data set: working copy of the built-in iris data frame; its column names
# are read further down to decide which feature columns get shuffled
myiris <- iris


# function definitions:

    #' Shuffle one column of a data frame
    #'
    #' Returns a copy of `param_df` in which the values of a single column have
    #' been randomly permuted; every other column is left as it was.
    #'
    #' @param param_df A data frame.
    #' @param param_col The single column to shuffle (as used in
    #'   `param_df[, param_col]`).
    #' @return `param_df` with the values of `param_col` permuted.
    #' @examples
    #' shuffle_this_col(myiris, 'Sepal.Width')
    shuffle_this_col <- function(param_df, param_col) {
        # With several columns the subset below would be a data frame and the
        # "shuffle" would reorder columns instead of values, so refuse early.
        stopifnot(length(param_col) == 1L)

        # drop = TRUE forces a plain vector even for data-frame subclasses
        # whose `[` keeps a one-column frame by default; without it length()
        # would be 1 and nothing would be permuted.
        col_vals <- param_df[, param_col, drop = TRUE]

        # sample.int(n) draws the same permutation as sample(1:n, n) for the
        # same seed.
        shuf_vals <- col_vals[sample.int(length(col_vals))]
        param_df[, param_col] <- shuf_vals

        param_df
    }


# "config" for experiment
iter_per_var <- 100
target_var <- 'Species'
vars_to_shuf <- setdiff(names(myiris), target_var)
set.seed(1776)


# scores will be model accuracy score, exp_desc is description of the experiment.
# Each of the `iter_per_var` iterations per shuffled variable stores two
# entries: a non-shuffled "baseline" score and the shuffled score, hence the
# trailing factor of 2.
n_scores <- iter_per_var * length(vars_to_shuf) * 2
scores <- numeric(n_scores)
exp_desc <- character(n_scores)


# Features and labels are identical in every iteration, so derive them once
# from a fresh copy of iris. The label column is looked up through
# `target_var` rather than being hard-coded.
myiris <- iris
myiris_labels <- myiris[[target_var]]
myiris[[target_var]] <- NULL
n_obs <- nrow(myiris)
n_train <- floor(0.7 * n_obs)


# for each variable to shuffle, for number of iterations, build models and test

# next free slot in `scores` / `exp_desc`; advances by two per iteration
count <- 1


# seq_along() keeps the loop from running when there is nothing to shuffle
for (i in seq_along(vars_to_shuf)) {

    # now we only have to do this subset once
    this_var <- vars_to_shuf[i]


    for (j in seq_len(iter_per_var)) {

        if (j %% 10 == 0) {
            print(paste0("iteration: ", j, " -- for variable: ", this_var))
        }

        # permute `this_var` across ALL rows (before the split is drawn)
        myiris_shuf <- shuffle_this_col(myiris, this_var)

        # identify train/test split
        indx_train <- sample(seq_len(n_obs), n_train)
        indx_test <- setdiff(seq_len(n_obs), indx_train)

        # split features (x) into train and test, for both the original and
        # the column-shuffled frame
        myiris_train <- myiris[indx_train, ]
        myiris_train_shuf <- myiris_shuf[indx_train, ]
        myiris_test <- myiris[indx_test, ]
        # open question from the author: should the shuffled model also be
        # scored on shuffled test features? (try it with and without)
        myiris_test_shuf <- myiris_shuf[indx_test, ]


        # split labels (y) into train and test; labels are never shuffled here
        y_train <- myiris_labels[indx_train]
        y_test <- myiris_labels[indx_test]

        # build one model on the real features and one on the features with
        # `this_var` shuffled
        myrf <- randomForest(x = myiris_train, y = y_train,
                             xtest = myiris_test, ytest = y_test,
                             keep.forest = TRUE)
        myrf_shuffled <- randomForest(x = myiris_train_shuf, y = y_train,
                                      xtest = myiris_test_shuf, ytest = y_test,
                                      keep.forest = TRUE)

        # each model predicts the test rows it was paired with
        myrf_preds <- predict(myrf, myiris_test)
        myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test_shuf)
        rm(myrf, myrf_shuffled)

        # determine accuracy of each model against the true test labels
        myrf_accuracy <- sum(myrf_preds == y_test, na.rm = TRUE) / length(y_test)
        myrf_accuracy_shuffled <-
            sum(myrf_preds_shuffled == y_test, na.rm = TRUE) / length(y_test)

        # capture the "baseline" score, then the "shuffled" score right after it
        scores[count] <- myrf_accuracy
        exp_desc[count] <- 'baseline'
        scores[count + 1] <- myrf_accuracy_shuffled
        exp_desc[count + 1] <- this_var

        # two slots were filled this iteration
        count <- count + 2


    } # end inner for (j)

} # end outer for (i)

results_all <- data.frame(scores = scores, type = exp_desc)

ggplot(data = results_all, aes(x = scores, fill = type)) +
    geom_density(alpha = 0.3)


	#' Target Shuffling
	#' Author: Taylor Van Anne
	#'
	#' Note: this is just my interpretation of what target shuffling means
	#' to me. I think there are a few different ways to actually conduct
	#' the shuffling, but this is a single approach.
	#'
	#' A different approach than what I did here would be to shuffle the
	#' entire target variable before the train/test split. I chose to
	#' instead shuffle only within the test label values (after splitting
	#' the label values into train/test)


	# load libraries
	library(randomForest)
	library(ggplot2)

	# Number of train/test resamples; one pair of forests is fit per iteration.
	num_iters <- 100

	# Pre-allocated accuracy vectors: `results` is scored against the true test
	# labels, `results_shuffled` against a permuted copy of those labels.
	results <- numeric(num_iters)
	results_shuffled <- numeric(num_iters)

	# Fixed seed so the splits, permutations and forests are reproducible.
	set.seed(4)

	# The feature frame and the label vector are the same in every iteration,
	# so they are built once here instead of being re-copied inside the loop.
	myiris <- iris
	myiris_labels <- myiris$Species  # class labels (factor)
	myiris$Species <- NULL           # keep features only
	n_obs <- nrow(myiris)
	n_train <- floor(0.7 * n_obs)


	# begin the loop:
	for (i in seq_len(num_iters)) {

		# report out every tenth iteration
		if (i %% 10 == 0) {
			print(paste0("iteration: ", i))
		}

		# identify the 70/30 train/test split on row indices
		indx_train <- sample(seq_len(n_obs), n_train)
		indx_test <- setdiff(seq_len(n_obs), indx_train)

		# split features (x) into train and test
		myiris_train <- myiris[indx_train, ]
		myiris_test <- myiris[indx_test, ]

		# split labels (y) into train and test; only the TEST labels are
		# permuted, the training labels are left untouched
		y_train <- myiris_labels[indx_train]
		y_test <- myiris_labels[indx_test]
		y_test_shuffled <- y_test[sample.int(length(y_test))]

		# Both forests are trained on the same unshuffled training data.
		# `ytest` is only what randomForest reports its test-set error
		# against, so the two fits differ solely through the forest's own
		# randomness.
		myrf <- randomForest(x = myiris_train, y = y_train,
			xtest = myiris_test, ytest = y_test,
			keep.forest = TRUE)
		myrf_shuffled <- randomForest(x = myiris_train, y = y_train,
			xtest = myiris_test, ytest = y_test_shuffled,
			keep.forest = TRUE)

		# predict the held-out rows with each forest, then free the models
		myrf_preds <- predict(myrf, myiris_test)
		myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test)
		rm(myrf, myrf_shuffled)

		# accuracy against the true test labels vs. the permuted labels
		myrf_accuracy <- sum(myrf_preds == y_test, na.rm = TRUE) / length(y_test)
		myrf_accuracy_shuffled <-
			sum(myrf_preds_shuffled == y_test_shuffled, na.rm = TRUE) /
			length(y_test_shuffled)

		# store the accuracy in the pre-allocated numeric vector space
		results[i] <- myrf_accuracy
		results_shuffled[i] <- myrf_accuracy_shuffled
	}


	# label which results came from which experiment and combine into one data frame
	df_results <- data.frame(accuracy = results, type = 'y_test')
	df_results_shuffled <- data.frame(accuracy = results_shuffled, type = 'y_shuffled')
	df_all <- rbind(df_results, df_results_shuffled)


	# plot the density distribution of each group
	ggplot(df_all, aes(x = accuracy, fill = type)) +
		geom_density(alpha = 0.4) +
		theme_bw(base_size = 16) +
		ggtitle("Model vs Target-Shuffled Model")


	# input shuffling
	# exploring this concept
	# based on this video: https://www.youtube.com/watch?v=OlHW7frH3ug
	# just found another one: https://www.youtube.com/watch?v=tD8HZuWqIQw


	# libraries:
	library(randomForest)
	library(ggplot2)


	# data set: working copy of the built-in iris data frame; its column names
	# are read further down to decide which feature columns get shuffled
	myiris <- iris


	# function definitions:

	#' Shuffle one column of a data frame
	#'
	#' Returns a copy of `param_df` in which the values of a single column have
	#' been randomly permuted; every other column is left as it was.
	#'
	#' @param param_df A data frame.
	#' @param param_col The single column to shuffle (as used in
	#'   `param_df[, param_col]`).
	#' @return `param_df` with the values of `param_col` permuted.
	#' @examples
	#' shuffle_this_col(myiris, 'Sepal.Width')
	shuffle_this_col <- function(param_df, param_col) {
		# With several columns the subset below would be a data frame and the
		# "shuffle" would reorder columns instead of values, so refuse early.
		stopifnot(length(param_col) == 1L)

		# drop = TRUE forces a plain vector even for data-frame subclasses
		# whose `[` keeps a one-column frame by default; without it length()
		# would be 1 and nothing would be permuted.
		col_vals <- param_df[, param_col, drop = TRUE]

		# sample.int(n) draws the same permutation as sample(1:n, n) for the
		# same seed.
		shuf_vals <- col_vals[sample.int(length(col_vals))]
		param_df[, param_col] <- shuf_vals

		param_df
	}



	# "config" for experiment
	iter_per_var <- 100
	target_var <- 'Species'
	vars_to_shuf <- setdiff(names(myiris), target_var)
	set.seed(1776)


	# scores will be model accuracy score, exp_desc is description of the
	# experiment. Each of the `iter_per_var` iterations per shuffled variable
	# stores two entries: a non-shuffled "baseline" score and the shuffled
	# score, hence the trailing factor of 2.
	n_scores <- iter_per_var * length(vars_to_shuf) * 2
	scores <- numeric(n_scores)
	exp_desc <- character(n_scores)


	# Features and labels are identical in every iteration, so derive them once
	# from a fresh copy of iris. The label column is looked up through
	# `target_var` rather than being hard-coded.
	myiris <- iris
	myiris_labels <- myiris[[target_var]]
	myiris[[target_var]] <- NULL
	n_obs <- nrow(myiris)
	n_train <- floor(0.7 * n_obs)


	# for each variable to shuffle, for number of iterations, build models and test

	# next free slot in `scores` / `exp_desc`; advances by two per iteration
	count <- 1


	# seq_along() keeps the loop from running when there is nothing to shuffle
	for (i in seq_along(vars_to_shuf)) {

		# now we only have to do this subset once
		this_var <- vars_to_shuf[i]


		for (j in seq_len(iter_per_var)) {

			if (j %% 10 == 0) {
				print(paste0("iteration: ", j, " -- for variable: ", this_var))
			}

			# permute `this_var` across ALL rows (before the split is drawn)
			myiris_shuf <- shuffle_this_col(myiris, this_var)

			# identify train/test split
			indx_train <- sample(seq_len(n_obs), n_train)
			indx_test <- setdiff(seq_len(n_obs), indx_train)

			# split features (x) into train and test, for both the original
			# and the column-shuffled frame
			myiris_train <- myiris[indx_train, ]
			myiris_train_shuf <- myiris_shuf[indx_train, ]
			myiris_test <- myiris[indx_test, ]
			# open question from the author: should the shuffled model also
			# be scored on shuffled test features? (try it with and without)
			myiris_test_shuf <- myiris_shuf[indx_test, ]


			# split labels (y) into train and test; labels are never shuffled here
			y_train <- myiris_labels[indx_train]
			y_test <- myiris_labels[indx_test]

			# build one model on the real features and one on the features
			# with `this_var` shuffled
			myrf <- randomForest(x = myiris_train, y = y_train,
				xtest = myiris_test, ytest = y_test,
				keep.forest = TRUE)
			myrf_shuffled <- randomForest(x = myiris_train_shuf, y = y_train,
				xtest = myiris_test_shuf, ytest = y_test,
				keep.forest = TRUE)

			# each model predicts the test rows it was paired with
			myrf_preds <- predict(myrf, myiris_test)
			myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test_shuf)
			rm(myrf, myrf_shuffled)

			# determine accuracy of each model against the true test labels
			myrf_accuracy <- sum(myrf_preds == y_test, na.rm = TRUE) / length(y_test)
			myrf_accuracy_shuffled <-
				sum(myrf_preds_shuffled == y_test, na.rm = TRUE) / length(y_test)

			# capture the "baseline" score, then the "shuffled" score right after it
			scores[count] <- myrf_accuracy
			exp_desc[count] <- 'baseline'
			scores[count + 1] <- myrf_accuracy_shuffled
			exp_desc[count + 1] <- this_var

			# two slots were filled this iteration
			count <- count + 2


		} # end inner for (j)

	} # end outer for (i)

	results_all <- data.frame(scores = scores, type = exp_desc)

	ggplot(data = results_all, aes(x = scores, fill = type)) +
		geom_density(alpha = 0.3)