jrosen48/gist:094eee8627be43b167c7c79388027174

## gistfile1.txt
centering_function <- function(data, method_of_centering, grouping_vector, to_standardize = F){
    center_this <- function(x){
        x - mean(x, na.rm = T)
    }

    scale_this <- function(x) {
        if (stats::sd(x, na.rm = T) == 0){
            x - mean(x, na.rm = T)
        } else {
            (x - mean(x)) / stats::sd(x)
        }
    }
    if (method_of_centering == "grand" & to_standardize == F) {
        out <- sapply(data, function(x) center_this(x))
        out <- as.data.frame(out)
    }
    if (method_of_centering == "group" & to_standardize == F) {
        out <- data %>%
            cbind(grouping_vector) %>%
            dplyr::group_by(grouping_vector) %>%
            dplyr::mutate_each(dplyr::funs(center_this))
        out <- as.data.frame(out[, 1:ncol(data)])
    }
    if (method_of_centering == "grand" & to_standardize == T) {
        out <- sapply(data, function(x) scale_this(x))
        out <- as.data.frame(out)
    }
    if (method_of_centering == "group" & to_standardize == T) {
        out <- data %>%
            cbind(grouping_vector) %>%
            dplyr::group_by(grouping_vector) %>%
            dplyr::mutate_each(dplyr::funs(scale_this))
        out <- as.data.frame(out[, 1:ncol(data)])
    }
    if (method_of_centering == "raw") {
        out <- as.data.frame(data)
    }
    return(out)
}

removed_obs_df_maker <- function(raw_data_matrix, cases_to_keep){
    removed_obs_df <- data.frame(row = row.names(raw_data_matrix), raw_data_matrix, stringsAsFactors = F)
    removed_obs_df$reason_removed <- NA
    removed_obs_df$reason_removed[!cases_to_keep] <- "incomplete_case"
    return(removed_obs_df)
}

uv_outlier_detector <- function(x, na.rm = T, ...) {
    # need to figure out where this came from - from a SO question, can probably re-write
    qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
    H <- 1.5 * IQR(x, na.rm = na.rm)
    y <- x
    y[x < (qnt[1] - H)] <- NA
    y[x > (qnt[2] + H)] <- NA
    return(y)
}

remove_uv_out_func <- function(data){
    x <- sapply(data, uv_outlier_detector)
    return(x)
}

remove_mv_out_func <- function(data){
    mvout <- chemometrics::Moutlier(data, quantile = 0.99, plot = F)
    the_index <- which(mvout$md > mvout$cutoff)
    if (any(the_index) == T){
        return(the_index)
    } else{
        return(data)
    }
}

remove_mv_main_func <- function(data, removed_obs_df, cases_to_keep, found_uv_outlier_bool = F, uv_outliers = NULL, print_status){
    out_tmp <- remove_mv_out_func(data)
    if(print_status == T){
        print(paste0("### Note: ", length(out_tmp), " cases with multivariate outliers out of ", nrow(data), " cases removed, so ", nrow(data) - length(out_tmp), " used in subsequent analysis ###"))
    }
    x <- removed_obs_df[cases_to_keep, ]
    if(found_uv_outlier_bool == T){
        y <- x[-uv_outliers, ]
        z <- as.numeric(y$row[out_tmp])
    } else{
        z <- as.numeric(x$row[out_tmp])
    }
    removed_obs_df$reason_removed[z] <- "multivariate_outlier"
    data_out <- data[-out_tmp, ] # this is the first list item (data with mv outliers removed), second is the cases to be output as an attribute returned from prepare_data()
    data_out <- list(data_out, removed_obs_df)
    return(data_out)
}


prepare_data <- function(raw_data_matrix, method_of_centering = "raw",
                         grouping_vector = NULL, to_standardize = FALSE,
                         remove_uv_outliers = FALSE, remove_mv_outliers = FALSE,
                         print_status = TRUE){
    cases_to_keep <- complete.cases(raw_data_matrix) # to use later for comparing function to index which cases to keep
    removed_obs_df <- removed_obs_df_maker(raw_data_matrix, cases_to_keep)
    data_tmp <- raw_data_matrix[cases_to_keep, ] # removes incomplete cases
    print("### Created the following output ... ")
    print("### 1. Prepared data ###")
    if(print_status == T){
        if(length(table(cases_to_keep)) > 1){
            print(paste0("### Note: ", table(cases_to_keep)[1], " incomplete cases out of ", sum(table(cases_to_keep)), " total cases removed, so ", sum(table(cases_to_keep)) - table(cases_to_keep)[1], " used in subsequent analysis ###"))
        } else{
            print(paste0("### Note: 0 incomplete cases out of ", nrow(data_tmp), " total cases removed, so ", nrow(data_tmp), " used in subsequent analysis ###"))
        }
    }
    if (remove_uv_outliers == T){
        tmp1 <- remove_uv_main_func(data_tmp, removed_obs_df, cases_to_keep, print_status)
        data_tmp <- tmp1[[1]]
        removed_obs_df <- tmp1[[2]]
    }
    if(any(as.character(removed_obs_df$reason_removed) == "univariate_outlier", na.rm = T)){
        found_uv_outlier_bool <- T
    } else{
        found_uv_outlier_bool <- F
    }
    if (remove_mv_outliers == T){
        tmp2 <- remove_mv_main_func(data_tmp, removed_obs_df, cases_to_keep, found_uv_outlier_bool, uv_outliers = tmp1, print_status)
        data_tmp <- tmp2[[1]]
        removed_obs_df <- tmp2[[2]]
    }
    grouping_vector <- grouping_vector[cases_to_keep]
    out <- centering_function(as.data.frame(data_tmp), method_of_centering, grouping_vector, to_standardize)
    cases_to_keep = row.names(raw_data_matrix) %in% removed_obs_df$row[is.na(removed_obs_df$reason_removed)]
    attributes(out) <- list(uncentered_cleaned_data = data_tmp,
                            method_of_centering = method_of_centering, cases_to_keep = cases_to_keep,
                            cases_removed_df = removed_obs_df[, 2:5], variable_names = names(raw_data_matrix))
    if(print_status == T){
        print("### Note. Print the cases_removed_df attribute to view cases removed ###")
    }
    return(out)
}
	centering_function <- function(data, method_of_centering, grouping_vector, to_standardize = F){
	center_this <- function(x){
	x - mean(x, na.rm = T)
	}

	scale_this <- function(x) {
	if (stats::sd(x, na.rm = T) == 0){
	x - mean(x, na.rm = T)
	} else {
	(x - mean(x)) / stats::sd(x)
	}
	}
	if (method_of_centering == "grand" & to_standardize == F) {
	out <- sapply(data, function(x) center_this(x))
	out <- as.data.frame(out)
	}
	if (method_of_centering == "group" & to_standardize == F) {
	out <- data %>%
	cbind(grouping_vector) %>%
	dplyr::group_by(grouping_vector) %>%
	dplyr::mutate_each(dplyr::funs(center_this))
	out <- as.data.frame(out[, 1:ncol(data)])
	}
	if (method_of_centering == "grand" & to_standardize == T) {
	out <- sapply(data, function(x) scale_this(x))
	out <- as.data.frame(out)
	}
	if (method_of_centering == "group" & to_standardize == T) {
	out <- data %>%
	cbind(grouping_vector) %>%
	dplyr::group_by(grouping_vector) %>%
	dplyr::mutate_each(dplyr::funs(scale_this))
	out <- as.data.frame(out[, 1:ncol(data)])
	}
	if (method_of_centering == "raw") {
	out <- as.data.frame(data)
	}
	return(out)
	}

	removed_obs_df_maker <- function(raw_data_matrix, cases_to_keep){
	removed_obs_df <- data.frame(row = row.names(raw_data_matrix), raw_data_matrix, stringsAsFactors = F)
	removed_obs_df$reason_removed <- NA
	removed_obs_df$reason_removed[!cases_to_keep] <- "incomplete_case"
	return(removed_obs_df)
	}

	uv_outlier_detector <- function(x, na.rm = T, ...) {
	# need to figure out where this came from - from a SO question, can probably re-write
	qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
	H <- 1.5 * IQR(x, na.rm = na.rm)
	y <- x
	y[x < (qnt[1] - H)] <- NA
	y[x > (qnt[2] + H)] <- NA
	return(y)
	}

	remove_uv_out_func <- function(data){
	x <- sapply(data, uv_outlier_detector)
	return(x)
	}

	remove_mv_out_func <- function(data){
	mvout <- chemometrics::Moutlier(data, quantile = 0.99, plot = F)
	the_index <- which(mvout$md > mvout$cutoff)
	if (any(the_index) == T){
	return(the_index)
	} else{
	return(data)
	}
	}

	remove_mv_main_func <- function(data, removed_obs_df, cases_to_keep, found_uv_outlier_bool = F, uv_outliers = NULL, print_status){
	out_tmp <- remove_mv_out_func(data)
	if(print_status == T){
	print(paste0("### Note: ", length(out_tmp), " cases with multivariate outliers out of ", nrow(data), " cases removed, so ", nrow(data) - length(out_tmp), " used in subsequent analysis ###"))
	}
	x <- removed_obs_df[cases_to_keep, ]
	if(found_uv_outlier_bool == T){
	y <- x[-uv_outliers, ]
	z <- as.numeric(y$row[out_tmp])
	} else{
	z <- as.numeric(x$row[out_tmp])
	}
	removed_obs_df$reason_removed[z] <- "multivariate_outlier"
	data_out <- data[-out_tmp, ] # this is the first list item (data with mv outliers removed), second is the cases to be output as an attribute returned from prepare_data()
	data_out <- list(data_out, removed_obs_df)
	return(data_out)
	}


	prepare_data <- function(raw_data_matrix, method_of_centering = "raw",
	grouping_vector = NULL, to_standardize = FALSE,
	remove_uv_outliers = FALSE, remove_mv_outliers = FALSE,
	print_status = TRUE){
	cases_to_keep <- complete.cases(raw_data_matrix) # to use later for comparing function to index which cases to keep
	removed_obs_df <- removed_obs_df_maker(raw_data_matrix, cases_to_keep)
	data_tmp <- raw_data_matrix[cases_to_keep, ] # removes incomplete cases
	print("### Created the following output ... ")
	print("### 1. Prepared data ###")
	if(print_status == T){
	if(length(table(cases_to_keep)) > 1){
	print(paste0("### Note: ", table(cases_to_keep)[1], " incomplete cases out of ", sum(table(cases_to_keep)), " total cases removed, so ", sum(table(cases_to_keep)) - table(cases_to_keep)[1], " used in subsequent analysis ###"))
	} else{
	print(paste0("### Note: 0 incomplete cases out of ", nrow(data_tmp), " total cases removed, so ", nrow(data_tmp), " used in subsequent analysis ###"))
	}
	}
	if (remove_uv_outliers == T){
	tmp1 <- remove_uv_main_func(data_tmp, removed_obs_df, cases_to_keep, print_status)
	data_tmp <- tmp1[[1]]
	removed_obs_df <- tmp1[[2]]
	}
	if(any(as.character(removed_obs_df$reason_removed) == "univariate_outlier", na.rm = T)){
	found_uv_outlier_bool <- T
	} else{
	found_uv_outlier_bool <- F
	}
	if (remove_mv_outliers == T){
	tmp2 <- remove_mv_main_func(data_tmp, removed_obs_df, cases_to_keep, found_uv_outlier_bool, uv_outliers = tmp1, print_status)
	data_tmp <- tmp2[[1]]
	removed_obs_df <- tmp2[[2]]
	}
	grouping_vector <- grouping_vector[cases_to_keep]
	out <- centering_function(as.data.frame(data_tmp), method_of_centering, grouping_vector, to_standardize)
	cases_to_keep = row.names(raw_data_matrix) %in% removed_obs_df$row[is.na(removed_obs_df$reason_removed)]
	attributes(out) <- list(uncentered_cleaned_data = data_tmp,
	method_of_centering = method_of_centering, cases_to_keep = cases_to_keep,
	cases_removed_df = removed_obs_df[, 2:5], variable_names = names(raw_data_matrix))
	if(print_status == T){
	print("### Note. Print the cases_removed_df attribute to view cases removed ###")
	}
	return(out)
	}