Skip to content

Instantly share code, notes, and snippets.

@grayskripko
Last active February 17, 2022 11:21
Show Gist options
  • Save grayskripko/2b513ead8a077ac4620d2e5599b4fd28 to your computer and use it in GitHub Desktop.
Save grayskripko/2b513ead8a077ac4620d2e5599b4fd28 to your computer and use it in GitHub Desktop.
least correlated interactions
# Suppose we have a set of features and want to create a bunch of their interactions.
# Which ones should we use? I tried to answer this question on the assumption that
# correlation (the lower, the better) is a good measure of the novelty and usefulness of the information.
# Derive arithmetic interactions from a two-column data frame and report the
# absolute correlation between every pair of resulting columns.
#
# df: data frame with exactly two columns (checked with assert()). The
#     columns are addressed by position, so their names do not matter.
# Returns the long table built by explore$cor_tibble() for the original
# columns plus `sum`, `dif`, `mult`, `div`, `prop1` and `prop2`, with `abs`
# rounded to 2 decimals.
explore$interactions <- function(df) {
  assert(ncol(df) == 2)

  # Each step reads the two leading columns of its own input through the
  # magrittr placeholder; new columns are appended after them.
  with_interactions <- df %>%
    mutate(sum = .[[1]] + .[[2]]) %>%
    mutate(dif = .[[1]] - .[[2]]) %>%
    mutate(mult = .[[1]] * .[[2]]) %>%
    mutate(div = .[[1]] / .[[2]]) %>%
    mutate(prop1 = .[[1]] / (.[[1]] + .[[2]])) %>%
    mutate(prop2 = .[[2]] / (.[[1]] + .[[2]])) %>%
    # Rows with a missing value in any column are discarded before the
    # correlations are computed.
    drop_na()

  pairwise <- explore$cor_tibble(with_interactions)
  mutate(pairwise, abs = round(abs, 2))
}
# Absolute pairwise correlations between the numeric columns of `x`, in long
# form.
#
# x:      data frame; non-numeric columns are ignored.
# pat:    optional regular expression. When given, keep the rows whose `nm`
#         matches it and whose rounded `abs` is below 1. When NULL, keep only
#         the rows where `nm` comes after `name` in factor-level order, which
#         removes self-pairs and mirrored duplicates.
# method: correlation method passed to cor(); defaults to 'spearman'.
# Returns a tibble with columns `nm`, `name` and `abs` (rounded to 3
# decimals), ordered by decreasing absolute correlation.
explore$cor_tibble <- function(x, pat = NULL, method = 'spearman') {
  numeric_part <- select(x, where(is.numeric))
  cor_matrix <- cor(numeric_part, use = "complete.obs", method = method)

  long_tbl <- cor_matrix %>%
    as_tibble(rownames = 'nm') %>%
    pivot_longer(-nm) %>%
    # Keep only the magnitude of the correlation; the signed value is dropped.
    mutate(abs = abs(value), value = NULL) %>%
    # Sort on the unrounded magnitude first, then round for display.
    arrange(desc(abs)) %>%
    mutate(abs = round(abs, 3))

  if (is.null(pat)) {
    # Turn both name columns into numeric factor codes so each unordered pair
    # can be kept exactly once.
    long_tbl %>%
      mutate(
        nm_i = as.numeric(factor(nm)),
        name_i = as.numeric(factor(name))
      ) %>%
      filter(nm_i > name_i) %>%
      select(-c(nm_i, name_i))
  } else {
    filter(long_tbl, str_detect(nm, pat) & abs < 1)
  }
}
# Here I handle one specific type of data:
# match data, where every feature is split into two parts - home and away team.
# You can change the code a little to adapt it to your needs,
# for example by simply comparing all columns to each other.
# Build one long correlation table covering every home/away feature pair.
#
# The numeric column names of match_elo are split on the underscore that
# follows a leading `h` or `a`. Names without that prefix get a missing first
# piece (fill = 'left') and are removed by drop_na(). For each remaining
# distinct base name, the `h_`/`a_` columns are passed to
# explore$interactions(), and the result is stacked with a copy in which `nm`
# and `name` are swapped, so every pair is listed in both directions.
x <- match_elo %>%
  select(where(is.numeric)) %>%
  colnames() %>%
  enframe(name = NULL) %>%
  separate(value, sep = '(?<=^[ha])_', into = letters[1:2], fill = 'left') %>%
  drop_na() %>%
  distinct(b) %>%
  pluck('b') %>%
  map(function(feature) {
    pair_cor <- match_elo %>%
      select(str_c(c('h_', 'a_'), feature)) %>%
      explore$interactions()
    mirrored <- rename(pair_cor, nm = name, name = nm)
    arrange(bind_rows(pair_cor, mirrored), desc(abs))
  }) %>%
  bind_rows()
# Violin plot of the absolute correlations, one violin per value of `name`.
# Only rows pairing an `h_` column (in `nm`) with a column that has no
# `h_`/`a_` prefix (in `name`) are shown; rows whose rounded correlation
# equals 1 are left out.
x %>%
  drop_na() %>%
  filter(
    abs != 1,
    str_starts(nm, 'h_'),
    !str_starts(name, '[ha]_')
  ) %>%
  ggplot(aes(x = name, y = abs)) +
  geom_violin(fill = "skyblue", size = 1)
# In my case, I found that the best option is to
# use the proportion interaction. If I need more, I can add division, as it is less
# correlated with the original and proportion features.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment