Skip to content

Instantly share code, notes, and snippets.

@jrosen48
Created September 19, 2018 13:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jrosen48/094eee8627be43b167c7c79388027174 to your computer and use it in GitHub Desktop.
Save jrosen48/094eee8627be43b167c7c79388027174 to your computer and use it in GitHub Desktop.
prepare_data() func
centering_function <- function(data, method_of_centering, grouping_vector, to_standardize = F){
center_this <- function(x){
x - mean(x, na.rm = T)
}
scale_this <- function(x) {
if (stats::sd(x, na.rm = T) == 0){
x - mean(x, na.rm = T)
} else {
(x - mean(x)) / stats::sd(x)
}
}
if (method_of_centering == "grand" & to_standardize == F) {
out <- sapply(data, function(x) center_this(x))
out <- as.data.frame(out)
}
if (method_of_centering == "group" & to_standardize == F) {
out <- data %>%
cbind(grouping_vector) %>%
dplyr::group_by(grouping_vector) %>%
dplyr::mutate_each(dplyr::funs(center_this))
out <- as.data.frame(out[, 1:ncol(data)])
}
if (method_of_centering == "grand" & to_standardize == T) {
out <- sapply(data, function(x) scale_this(x))
out <- as.data.frame(out)
}
if (method_of_centering == "group" & to_standardize == T) {
out <- data %>%
cbind(grouping_vector) %>%
dplyr::group_by(grouping_vector) %>%
dplyr::mutate_each(dplyr::funs(scale_this))
out <- as.data.frame(out[, 1:ncol(data)])
}
if (method_of_centering == "raw") {
out <- as.data.frame(data)
}
return(out)
}
removed_obs_df_maker <- function(raw_data_matrix, cases_to_keep){
removed_obs_df <- data.frame(row = row.names(raw_data_matrix), raw_data_matrix, stringsAsFactors = F)
removed_obs_df$reason_removed <- NA
removed_obs_df$reason_removed[!cases_to_keep] <- "incomplete_case"
return(removed_obs_df)
}
uv_outlier_detector <- function(x, na.rm = T, ...) {
# need to figure out where this came from - from a SO question, can probably re-write
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x, na.rm = na.rm)
y <- x
y[x < (qnt[1] - H)] <- NA
y[x > (qnt[2] + H)] <- NA
return(y)
}
remove_uv_out_func <- function(data){
x <- sapply(data, uv_outlier_detector)
return(x)
}
remove_mv_out_func <- function(data){
mvout <- chemometrics::Moutlier(data, quantile = 0.99, plot = F)
the_index <- which(mvout$md > mvout$cutoff)
if (any(the_index) == T){
return(the_index)
} else{
return(data)
}
}
remove_mv_main_func <- function(data, removed_obs_df, cases_to_keep, found_uv_outlier_bool = F, uv_outliers = NULL, print_status){
out_tmp <- remove_mv_out_func(data)
if(print_status == T){
print(paste0("### Note: ", length(out_tmp), " cases with multivariate outliers out of ", nrow(data), " cases removed, so ", nrow(data) - length(out_tmp), " used in subsequent analysis ###"))
}
x <- removed_obs_df[cases_to_keep, ]
if(found_uv_outlier_bool == T){
y <- x[-uv_outliers, ]
z <- as.numeric(y$row[out_tmp])
} else{
z <- as.numeric(x$row[out_tmp])
}
removed_obs_df$reason_removed[z] <- "multivariate_outlier"
data_out <- data[-out_tmp, ] # this is the first list item (data with mv outliers removed), second is the cases to be output as an attribute returned from prepare_data()
data_out <- list(data_out, removed_obs_df)
return(data_out)
}
prepare_data <- function(raw_data_matrix, method_of_centering = "raw",
grouping_vector = NULL, to_standardize = FALSE,
remove_uv_outliers = FALSE, remove_mv_outliers = FALSE,
print_status = TRUE){
cases_to_keep <- complete.cases(raw_data_matrix) # to use later for comparing function to index which cases to keep
removed_obs_df <- removed_obs_df_maker(raw_data_matrix, cases_to_keep)
data_tmp <- raw_data_matrix[cases_to_keep, ] # removes incomplete cases
print("### Created the following output ... ")
print("### 1. Prepared data ###")
if(print_status == T){
if(length(table(cases_to_keep)) > 1){
print(paste0("### Note: ", table(cases_to_keep)[1], " incomplete cases out of ", sum(table(cases_to_keep)), " total cases removed, so ", sum(table(cases_to_keep)) - table(cases_to_keep)[1], " used in subsequent analysis ###"))
} else{
print(paste0("### Note: 0 incomplete cases out of ", nrow(data_tmp), " total cases removed, so ", nrow(data_tmp), " used in subsequent analysis ###"))
}
}
if (remove_uv_outliers == T){
tmp1 <- remove_uv_main_func(data_tmp, removed_obs_df, cases_to_keep, print_status)
data_tmp <- tmp1[[1]]
removed_obs_df <- tmp1[[2]]
}
if(any(as.character(removed_obs_df$reason_removed) == "univariate_outlier", na.rm = T)){
found_uv_outlier_bool <- T
} else{
found_uv_outlier_bool <- F
}
if (remove_mv_outliers == T){
tmp2 <- remove_mv_main_func(data_tmp, removed_obs_df, cases_to_keep, found_uv_outlier_bool, uv_outliers = tmp1, print_status)
data_tmp <- tmp2[[1]]
removed_obs_df <- tmp2[[2]]
}
grouping_vector <- grouping_vector[cases_to_keep]
out <- centering_function(as.data.frame(data_tmp), method_of_centering, grouping_vector, to_standardize)
cases_to_keep = row.names(raw_data_matrix) %in% removed_obs_df$row[is.na(removed_obs_df$reason_removed)]
attributes(out) <- list(uncentered_cleaned_data = data_tmp,
method_of_centering = method_of_centering, cases_to_keep = cases_to_keep,
cases_removed_df = removed_obs_df[, 2:5], variable_names = names(raw_data_matrix))
if(print_status == T){
print("### Note. Print the cases_removed_df attribute to view cases removed ###")
}
return(out)
}
@jrosen48
Copy link
Author

jrosen48 commented Sep 19, 2018

dat$var1 %>% uv_outlier_detector()

dat  %>% 
     select(var1, var2, var3) %>% 
     remove_mv_out_func()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment