library(tidyverse)
# Simulated data: groups "a" and "c" are drawn from the same normal
# distribution, while "b" comes from a non-central t distribution.
set.seed(12)
data <- tibble(
  group = rep(c("a", "b", "c"), times = c(30, 20, 20)),
  value = c(
    rnorm(n = 30, mean = 8, sd = 5),
    rt(n = 20, df = 11, ncp = 5),
    rnorm(n = 20, mean = 8, sd = 5)
  )
)
# Overlaid density curves, one per group, for a visual comparison.
ggplot(data, aes(value, fill = group)) +
  geom_density(alpha = 0.3)
# Collapse to one row per group, keeping each group's values as a numeric
# vector inside a list-column.
# NOTE: there is probably a more elegant way to build this data frame.
nested_data <- summarise(group_by(data, group), value = list(value))
# Pair every group's sample with every group's sample (full cross product,
# including each group with itself).
# The right-hand copy is renamed up front so the `group1` / `value1` columns
# used downstream are created explicitly. The previous form relied on the
# deprecated `.drop` argument of unnest(), which only produced those names
# because it switches tidyr to its legacy name repair. Requires tidyr >= 1.0.0,
# where all list-columns are preserved by unnest().
nested_data_both <- nested_data %>%
  mutate(
    data2 = list(rename(nested_data, group1 = group, value1 = value))
  ) %>%
  unnest(data2)
# Run a two-sample Kolmogorov-Smirnov test for every (value, value1) pair,
# keep the full test object, and extract its "statistic" element as a double.
nested_data_both %>%
  mutate(
    ks_test = map2(value, value1, stats::ks.test),
    ks_stat = map_dbl(ks_test, "statistic")
  )
#> Warning in .f(.x[[i]], .y[[i]], ...): cannot compute exact p-value with
#> ties
#> Warning in .f(.x[[i]], .y[[i]], ...): cannot compute exact p-value with
#> ties
#> Warning in .f(.x[[i]], .y[[i]], ...): cannot compute exact p-value with
#> ties
#> # A tibble: 9 x 6
#> group value group1 value1 ks_test ks_stat
#> <chr> <list> <chr> <list> <list> <dbl>
#> 1 a <dbl [30]> a <dbl [30]> <htest> 0
#> 2 a <dbl [30]> b <dbl [20]> <htest> 0.567
#> 3 a <dbl [30]> c <dbl [20]> <htest> 0.167
#> 4 b <dbl [20]> a <dbl [30]> <htest> 0.567
#> 5 b <dbl [20]> b <dbl [20]> <htest> 0
#> 6 b <dbl [20]> c <dbl [20]> <htest> 0.55
#> 7 c <dbl [20]> a <dbl [30]> <htest> 0.167
#> 8 c <dbl [20]> b <dbl [20]> <htest> 0.55
#> 9 c <dbl [20]> c <dbl [20]> <htest> 0
# You should add a filter (e.g. `filter(group < group1)`) so that each pair is
# tested only once (a,b but not b,a) and no group is compared with itself.
# Created on 2019-08-23 by the reprex package (v0.3.0)
# The fuzzyjoin package may also have something useful for this:
# https://github.com/dgrtwo/fuzzyjoin