Skip to content

Instantly share code, notes, and snippets.

@BillPetti
Last active February 4, 2019 20:28
Show Gist options
  • Save BillPetti/28649dac8e3d5f6986af872d48d20dd1 to your computer and use it in GitHub Desktop.
Save BillPetti/28649dac8e3d5f6986af872d48d20dd1 to your computer and use it in GitHub Desktop.
A function for calculating the value for which the difference in ecdf is largest for two groups
# df is a dataframe
# group_var is the variable that contains your two groups that you want to compare--think 1 vs. 0
# feature_var is the feature or variable whose values you are interested in exploring
# this function calculates the value with the max difference and returns that only
max_ecdf <- function(df, group_var, feature_var) {
var <- enquo(group_var)
feature_enquo <- enquo(feature_var)
positive_cases <- df %>%
filter((!!var) == 1) %>%
pull(!!feature_enquo)
negative_cases <- df %>%
filter((!!var) == 0) %>%
pull(!!feature_enquo)
values <- data_frame(values = c(positive_cases,
negative_cases)) %>%
drop_na() %>%
arrange(values)
positive_ecdf <- ecdf(positive_cases)
negative_ecdf <- ecdf(negative_cases)
pos_ecdf_values <- map(.x = values$values,
~positive_ecdf(.x)) %>%
set_names(values$values) %>%
bind_rows() %>%
gather(key = values, value = cum_dist) %>%
mutate(type = 1)
neg_ecdf_values <- map(.x = values$values,
~negative_ecdf(.x)) %>%
set_names(values$values) %>%
bind_rows() %>%
gather(key = values, value = cum_dist) %>%
mutate(type = 1)
comb_ecdf <- pos_ecdf_values %>%
left_join(neg_ecdf_values, by = "values") %>%
select(-type.x, -type.y)
names(comb_ecdf) <- c("values", "pos_cum_dist", "neg_cum_dist")
comb_ecdf <- comb_ecdf %>%
mutate(values = as.numeric(values),
diff = abs(pos_cum_dist - neg_cum_dist)) %>%
filter(diff == max(diff))
comb_ecdf
}
# this function calculates the ecdf separately for each value and then returns
# a data frame of the entire cumulative distribution
max_ecdf_dist <- function(df, group_var, feature_var, feature_string) {
var <- enquo(group_var)
feature_enquo <- enquo(feature_var)
positive_cases <- df %>%
filter((!!var) == 1) %>%
pull(!!feature_enquo)
negative_cases <- df %>%
filter((!!var) == 0) %>%
pull(!!feature_enquo)
values <- data_frame(values = c(positive_cases,
negative_cases)) %>%
drop_na() %>%
arrange(values)
positive_ecdf <- ecdf(positive_cases)
negative_ecdf <- ecdf(negative_cases)
pos_ecdf_values <- map(.x = values$values,
~positive_ecdf(.x)) %>%
set_names(values$values) %>%
bind_rows() %>%
gather(key = values, value = cum_dist) %>%
mutate(type = 1)
neg_ecdf_values <- map(.x = values$values,
~negative_ecdf(.x)) %>%
set_names(values$values) %>%
bind_rows() %>%
gather(key = values, value = cum_dist) %>%
mutate(type = 1)
comb_ecdf <- pos_ecdf_values %>%
left_join(neg_ecdf_values, by = "values") %>%
select(-type.x, -type.y)
names(comb_ecdf) <- c("values", "pos_cum_dist", "neg_cum_dist")
comb_ecdf <- comb_ecdf %>%
mutate(values = as.numeric(values),
feature = paste(feature_string)) %>%
select(feature, everything())
comb_ecdf
}
# example to run using the iris data set
iris_2 <- iris %>%
mutate(Species = ifelse(Species == "setosa", 1, 0))
max_ecdf(iris_2,
group_var = Species,
feature_var = Sepal.Width)
max_ecdf_dist(iris_2,
group_var = Species,
feature_var = Sepal.Width,
feature_string = "Sepal.Width")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment