Last active
February 17, 2022 11:21
-
-
Save grayskripko/2b513ead8a077ac4620d2e5599b4fd28 to your computer and use it in GitHub Desktop.
least correlated interactions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Let's say we have a set of features and want to create a bunch of their interactions.
# Which ones should we use? I tried to answer this question on the assumption that
# correlation is a good measure of the novelty and usefulness of the information.
# Correlation table for two features and their pairwise interaction terms.
#
# Adds six derived columns to the two-column input:
#   sum   = first + second        dif   = first - second
#   mult  = first * second        div   = first / second
#   prop1 = first / (first + second)
#   prop2 = second / (first + second)
# then drops every row containing an NA and passes the augmented data to
# explore$cor_tibble().
#
# Args:
#   df: data frame with exactly two numeric (or logical) columns.
#
# Returns:
#   The result of explore$cor_tibble() for the augmented data, with its
#   `abs` column rounded to 2 decimals.
#
# Errors:
#   Stops if `df` is not a data frame with exactly two columns, or if a
#   column is neither numeric nor logical.
#
# NOTE: a zero denominator yields Inf/-Inf (or NaN for 0 / 0). drop_na()
# removes the NaN rows but keeps the infinite ones.
explore$interactions <- function(df) {
  if (!is.data.frame(df) || ncol(df) != 2) {
    stop("`df` must be a data frame with exactly two columns", call. = FALSE)
  }
  is_arithmetic <- vapply(
    df,
    function(col) is.numeric(col) || is.logical(col),
    logical(1)
  )
  if (!all(is_arithmetic)) {
    stop("both columns of `df` must be numeric", call. = FALSE)
  }

  df %>%
    # A single mutate() keeps `.` bound to the untouched input, so .[[1]] and
    # .[[2]] are always the two input columns, even when an input column
    # shares its name with one of the derived terms.
    mutate(
      sum = .[[1]] + .[[2]],
      dif = .[[1]] - .[[2]],
      mult = .[[1]] * .[[2]],
      div = .[[1]] / .[[2]],
      prop1 = .[[1]] / (.[[1]] + .[[2]]),
      prop2 = .[[2]] / (.[[1]] + .[[2]])
    ) %>%
    drop_na() %>%
    explore$cor_tibble(.) %>%
    mutate(abs = round(abs, 2))
}
# Pairwise absolute correlations of the numeric columns of `x`, in long form.
#
# Args:
#   x:      data frame; only its numeric columns are used.
#   pat:    optional pattern for str_detect(). When supplied, rows are kept
#           if `nm` matches it and the rounded absolute correlation is
#           below 1. When NULL, one row per unordered pair of distinct
#           columns is kept.
#   method: correlation method forwarded to cor().
#
# Returns:
#   A tibble with columns `nm`, `name` and `abs` (absolute correlation
#   rounded to 3 decimals), sorted by descending `abs`.
explore$cor_tibble <- function(x, pat = NULL, method = 'spearman') {
  numeric_cols <- select(x, where(is.numeric))
  cor_matrix <- cor(numeric_cols, use = "complete.obs", method = method)

  # One row per (row label, column label) cell of the correlation matrix.
  long_cors <- cor_matrix %>%
    as_tibble(rownames = "nm") %>%
    pivot_longer(-nm) %>%
    transmute(nm, name, abs = abs(value)) %>%
    arrange(desc(abs)) %>%
    mutate(abs = round(abs, 3))

  if (!is.null(pat)) {
    return(filter(long_cors, str_detect(nm, pat) & abs < 1))
  }

  # The factor codes give the labels in `nm` and `name` a common ordering.
  # Keeping only nm > name drops the diagonal and one of each mirrored pair.
  long_cors %>%
    mutate(
      nm_i = as.numeric(factor(nm)),
      name_i = as.numeric(factor(name))
    ) %>%
    filter(nm_i > name_i) %>%
    select(-ends_with("_i"))
}
# Here I treat one specific type of data.
# I have match data, where every feature is split into two parts - home and away team.
# You can change the code a little to adapt it to your needs:
# just compare all columns to each other.
# Build `x`: for every feature that exists as an 'h_<suffix>' / 'a_<suffix>'
# column pair in match_elo, compute the interaction correlation table and
# stack the tables for all suffixes.
x <- match_elo %>%
  select(where(is.numeric)) %>%
  names() %>%
  tibble(value = .) %>%
  # Split each column name at the underscore following a leading 'h' or 'a'.
  # Names without that prefix get NA in `a` and are dropped.
  separate(value, sep = "(?<=^[ha])_", into = letters[1:2], fill = "left") %>%
  drop_na() %>%
  pull(b) %>%
  unique() %>%
  map(function(suffix) {
    one_way <- match_elo %>%
      select(all_of(str_c(c("h_", "a_"), suffix))) %>%
      explore$interactions()
    # Append the mirrored pairs so each correlation is listed in both
    # (nm, name) orders.
    mirrored <- rename(one_way, nm = name, name = nm)
    arrange(bind_rows(one_way, mirrored), desc(abs))
  }) %>%
  bind_rows()
# Violin plot of absolute correlation per interaction type. Keeps rows where
# `nm` is an 'h_' column and `name` has no 'h_' / 'a_' prefix, and excludes
# complete rows whose absolute correlation equals 1.
x %>%
  drop_na() %>%
  filter(
    abs != 1,
    str_starts(nm, "h_"),
    !str_starts(name, "[ha]_")
  ) %>%
  ggplot(mapping = aes(x = name, y = abs)) +
  geom_violin(size = 1, fill = "skyblue")
# In my case, I've found that the best option is to
# use the proportion interaction. If I need more, I can add division, as it is less
# correlated with the original and proportion features.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment