Created
September 19, 2018 13:10
-
-
Save jrosen48/094eee8627be43b167c7c79388027174 to your computer and use it in GitHub Desktop.
prepare_data() func
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
centering_function <- function(data, method_of_centering, grouping_vector, to_standardize = F){ | |
center_this <- function(x){ | |
x - mean(x, na.rm = T) | |
} | |
scale_this <- function(x) { | |
if (stats::sd(x, na.rm = T) == 0){ | |
x - mean(x, na.rm = T) | |
} else { | |
(x - mean(x)) / stats::sd(x) | |
} | |
} | |
if (method_of_centering == "grand" & to_standardize == F) { | |
out <- sapply(data, function(x) center_this(x)) | |
out <- as.data.frame(out) | |
} | |
if (method_of_centering == "group" & to_standardize == F) { | |
out <- data %>% | |
cbind(grouping_vector) %>% | |
dplyr::group_by(grouping_vector) %>% | |
dplyr::mutate_each(dplyr::funs(center_this)) | |
out <- as.data.frame(out[, 1:ncol(data)]) | |
} | |
if (method_of_centering == "grand" & to_standardize == T) { | |
out <- sapply(data, function(x) scale_this(x)) | |
out <- as.data.frame(out) | |
} | |
if (method_of_centering == "group" & to_standardize == T) { | |
out <- data %>% | |
cbind(grouping_vector) %>% | |
dplyr::group_by(grouping_vector) %>% | |
dplyr::mutate_each(dplyr::funs(scale_this)) | |
out <- as.data.frame(out[, 1:ncol(data)]) | |
} | |
if (method_of_centering == "raw") { | |
out <- as.data.frame(data) | |
} | |
return(out) | |
} | |
removed_obs_df_maker <- function(raw_data_matrix, cases_to_keep){ | |
removed_obs_df <- data.frame(row = row.names(raw_data_matrix), raw_data_matrix, stringsAsFactors = F) | |
removed_obs_df$reason_removed <- NA | |
removed_obs_df$reason_removed[!cases_to_keep] <- "incomplete_case" | |
return(removed_obs_df) | |
} | |
uv_outlier_detector <- function(x, na.rm = T, ...) { | |
# need to figure out where this came from - from a SO question, can probably re-write | |
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...) | |
H <- 1.5 * IQR(x, na.rm = na.rm) | |
y <- x | |
y[x < (qnt[1] - H)] <- NA | |
y[x > (qnt[2] + H)] <- NA | |
return(y) | |
} | |
remove_uv_out_func <- function(data){ | |
x <- sapply(data, uv_outlier_detector) | |
return(x) | |
} | |
remove_mv_out_func <- function(data){ | |
mvout <- chemometrics::Moutlier(data, quantile = 0.99, plot = F) | |
the_index <- which(mvout$md > mvout$cutoff) | |
if (any(the_index) == T){ | |
return(the_index) | |
} else{ | |
return(data) | |
} | |
} | |
remove_mv_main_func <- function(data, removed_obs_df, cases_to_keep, found_uv_outlier_bool = F, uv_outliers = NULL, print_status){ | |
out_tmp <- remove_mv_out_func(data) | |
if(print_status == T){ | |
print(paste0("### Note: ", length(out_tmp), " cases with multivariate outliers out of ", nrow(data), " cases removed, so ", nrow(data) - length(out_tmp), " used in subsequent analysis ###")) | |
} | |
x <- removed_obs_df[cases_to_keep, ] | |
if(found_uv_outlier_bool == T){ | |
y <- x[-uv_outliers, ] | |
z <- as.numeric(y$row[out_tmp]) | |
} else{ | |
z <- as.numeric(x$row[out_tmp]) | |
} | |
removed_obs_df$reason_removed[z] <- "multivariate_outlier" | |
data_out <- data[-out_tmp, ] # this is the first list item (data with mv outliers removed), second is the cases to be output as an attribute returned from prepare_data() | |
data_out <- list(data_out, removed_obs_df) | |
return(data_out) | |
} | |
prepare_data <- function(raw_data_matrix, method_of_centering = "raw", | |
grouping_vector = NULL, to_standardize = FALSE, | |
remove_uv_outliers = FALSE, remove_mv_outliers = FALSE, | |
print_status = TRUE){ | |
cases_to_keep <- complete.cases(raw_data_matrix) # to use later for comparing function to index which cases to keep | |
removed_obs_df <- removed_obs_df_maker(raw_data_matrix, cases_to_keep) | |
data_tmp <- raw_data_matrix[cases_to_keep, ] # removes incomplete cases | |
print("### Created the following output ... ") | |
print("### 1. Prepared data ###") | |
if(print_status == T){ | |
if(length(table(cases_to_keep)) > 1){ | |
print(paste0("### Note: ", table(cases_to_keep)[1], " incomplete cases out of ", sum(table(cases_to_keep)), " total cases removed, so ", sum(table(cases_to_keep)) - table(cases_to_keep)[1], " used in subsequent analysis ###")) | |
} else{ | |
print(paste0("### Note: 0 incomplete cases out of ", nrow(data_tmp), " total cases removed, so ", nrow(data_tmp), " used in subsequent analysis ###")) | |
} | |
} | |
if (remove_uv_outliers == T){ | |
tmp1 <- remove_uv_main_func(data_tmp, removed_obs_df, cases_to_keep, print_status) | |
data_tmp <- tmp1[[1]] | |
removed_obs_df <- tmp1[[2]] | |
} | |
if(any(as.character(removed_obs_df$reason_removed) == "univariate_outlier", na.rm = T)){ | |
found_uv_outlier_bool <- T | |
} else{ | |
found_uv_outlier_bool <- F | |
} | |
if (remove_mv_outliers == T){ | |
tmp2 <- remove_mv_main_func(data_tmp, removed_obs_df, cases_to_keep, found_uv_outlier_bool, uv_outliers = tmp1, print_status) | |
data_tmp <- tmp2[[1]] | |
removed_obs_df <- tmp2[[2]] | |
} | |
grouping_vector <- grouping_vector[cases_to_keep] | |
out <- centering_function(as.data.frame(data_tmp), method_of_centering, grouping_vector, to_standardize) | |
cases_to_keep = row.names(raw_data_matrix) %in% removed_obs_df$row[is.na(removed_obs_df$reason_removed)] | |
attributes(out) <- list(uncentered_cleaned_data = data_tmp, | |
method_of_centering = method_of_centering, cases_to_keep = cases_to_keep, | |
cases_removed_df = removed_obs_df[, 2:5], variable_names = names(raw_data_matrix)) | |
if(print_status == T){ | |
print("### Note. Print the cases_removed_df attribute to view cases removed ###") | |
} | |
return(out) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
dat$var1 %>% uv_outlier_detector()