# grayskripko/least_corr_interact.R

## least_corr_interact.R
# Let's say we have a set of features and want to create a bunch of their interactions.
# Which ones should we use? I tried to answer this question on the assumption that
# correlation is a good measure of the novelty and usefulness of the information:
# the less an interaction correlates with the existing features, the more it adds.

# Build the pairwise interaction features of a two-column table and report how
# strongly every pair of the resulting columns is correlated.
#
# Args:
#   df: a table with exactly two columns (checked with `assert()`); the
#     arithmetic below presumes both columns are numeric.
#
# Returns:
#   The output of `explore$cor_tibble()` computed on the two input columns plus
#   `sum`, `dif`, `mult`, `div`, `prop1` and `prop2`, with `abs` rounded to two
#   decimals.
explore$interactions <- function(df) {
  assert(ncol(df) == 2)

  # `.[[1]]` / `.[[2]]` address the inputs by position: each step appends its
  # new column at the end, so the first two columns stay the original inputs.
  with_features <- df %>%
    mutate(sum = .[[1]] + .[[2]]) %>%
    mutate(dif = .[[1]] - .[[2]]) %>%
    mutate(mult = .[[1]] * .[[2]]) %>%
    mutate(div = .[[1]] / .[[2]]) %>%
    mutate(prop1 = .[[1]] / (.[[1]] + .[[2]])) %>%
    mutate(prop2 = .[[2]] / (.[[1]] + .[[2]])) %>%
    # Disabled alternative feature:
    # mutate(dif_div = (.[[1]] - .[[2]]) / .[[1]]) %>%
    drop_na()

  pair_cors <- explore$cor_tibble(with_features)

  # `cor_tibble()` already rounds to three decimals; coarsen to two here.
  mutate(pair_cors, abs = round(abs, 2))
}

# Turn the correlation matrix of the numeric columns of `x` into a long table
# of absolute correlations.
#
# Args:
#   x: a data frame; only its numeric columns are used.
#   pat: optional pattern handed to `str_detect()`. When given, only rows whose
#     `nm` matches it and whose rounded `abs` is below 1 are returned.
#   method: correlation method passed on to `cor()`.
#
# Returns:
#   A tibble with columns `nm`, `name` and `abs` (absolute correlation, rounded
#   to three decimals), ordered by decreasing `abs`. Without `pat`, only rows
#   where `nm` sorts after `name` are kept, which drops the self-correlations
#   and the mirrored copy of every pair.
explore$cor_tibble <- function(x, pat = NULL, method = 'spearman') {
  numeric_part <- select(x, where(is.numeric))
  cor_mat <- cor(numeric_part, use = "complete.obs", method = method)

  long_tbl <- cor_mat %>%
    as_tibble(rownames = "nm") %>%
    pivot_longer(-nm) %>%
    mutate(abs = abs(value)) %>%
    select(-value) %>%
    # Sort on the unrounded values, round afterwards.
    arrange(desc(abs)) %>%
    mutate(abs = round(abs, 3))

  if (is.null(pat)) {
    # Compare the factor codes of the two name columns to keep a single
    # triangle of the (symmetric) correlation matrix.
    one_triangle <- filter(
      long_tbl,
      as.numeric(factor(nm)) > as.numeric(factor(name))
    )
    return(one_triangle)
  }

  filter(long_tbl, str_detect(nm, pat) & abs < 1)
}

# Here I handle one specific type of data:
# match data, where every feature is split into two parts - home team and away team.
# You can adapt the code to your needs with small changes,
# e.g. by simply comparing all columns to each other.

# Correlation tables for every feature name found behind an `h_` / `a_` prefix
# among the numeric columns of `match_elo`, stacked into one tibble. Both the
# `h_<feature>` and the `a_<feature>` column are expected to exist. Each table
# is stored in both orientations (`nm` and `name` swapped).
x <- local({
  numeric_cols <- colnames(select(match_elo, where(is.numeric)))

  # Split right after the leading "h" / "a". Names without such a prefix give
  # a single piece, which `fill = "left"` puts into `b` with `a` missing, so
  # `drop_na()` discards them.
  name_parts <- separate(
    enframe(numeric_cols, name = NULL),
    value,
    sep = "(?<=^[ha])_",
    into = letters[1:2],
    fill = "left"
  )
  features <- pluck(distinct(drop_na(name_parts), b), "b")

  per_feature <- map(features, function(feature) {
    side_cols <- select(match_elo, str_c(c("h_", "a_"), feature))
    cors <- explore$interactions(side_cols)
    mirrored <- rename(cors, nm = name, name = nm)
    arrange(bind_rows(cors, mirrored), desc(abs))
  })

  bind_rows(per_feature)
})

# Violin plot of the absolute correlation between the home-side (`h_`) input
# columns and each derived column (names that do not start with `h_` / `a_`);
# rows whose rounded correlation equals 1 are left out.
x %>%
  drop_na() %>%
  filter(
    abs != 1,
    str_starts(nm, "h_"),
    !str_starts(name, "[ha]_")
  ) %>%
  ggplot(mapping = aes(x = name, y = abs)) +
  geom_violin(fill = "skyblue", size = 1)

 # In my case, I found that the best option is to
 # use the proportion interaction. If I need more, I can add division, as it is
 # less correlated with the original and proportion features.
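	# NOTE(review): everything from here on appears to repeat the script above
	# (same code, tab-indented) - consider removing the duplicate.
	# Let's say we have a set of features and want to create a bunch of their interactions.
	# Which ones should we use? I tried to answer this question on the assumption that
	# correlation is a good measure of the novelty and usefulness of the information.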

	# Build the pairwise interaction features of a two-column table and report how
	# strongly every pair of the resulting columns is correlated.
	#
	# Args:
	#   df: a table with exactly two columns (checked with `assert()`); the
	#     arithmetic below presumes both columns are numeric.
	#
	# Returns:
	#   The output of `explore$cor_tibble()` computed on the two input columns plus
	#   `sum`, `dif`, `mult`, `div`, `prop1` and `prop2`, with `abs` rounded to two
	#   decimals.
	explore$interactions <- function(df) {
	  assert(ncol(df) == 2)

	  # `.[[1]]` / `.[[2]]` address the inputs by position: each step appends its
	  # new column at the end, so the first two columns stay the original inputs.
	  with_features <- df %>%
	    mutate(sum = .[[1]] + .[[2]]) %>%
	    mutate(dif = .[[1]] - .[[2]]) %>%
	    mutate(mult = .[[1]] * .[[2]]) %>%
	    mutate(div = .[[1]] / .[[2]]) %>%
	    mutate(prop1 = .[[1]] / (.[[1]] + .[[2]])) %>%
	    mutate(prop2 = .[[2]] / (.[[1]] + .[[2]])) %>%
	    # Disabled alternative feature:
	    # mutate(dif_div = (.[[1]] - .[[2]]) / .[[1]]) %>%
	    drop_na()

	  pair_cors <- explore$cor_tibble(with_features)

	  # `cor_tibble()` already rounds to three decimals; coarsen to two here.
	  mutate(pair_cors, abs = round(abs, 2))
	}

	# Turn the correlation matrix of the numeric columns of `x` into a long table
	# of absolute correlations.
	#
	# Args:
	#   x: a data frame; only its numeric columns are used.
	#   pat: optional pattern handed to `str_detect()`. When given, only rows whose
	#     `nm` matches it and whose rounded `abs` is below 1 are returned.
	#   method: correlation method passed on to `cor()`.
	#
	# Returns:
	#   A tibble with columns `nm`, `name` and `abs` (absolute correlation, rounded
	#   to three decimals), ordered by decreasing `abs`. Without `pat`, only rows
	#   where `nm` sorts after `name` are kept, which drops the self-correlations
	#   and the mirrored copy of every pair.
	explore$cor_tibble <- function(x, pat = NULL, method = 'spearman') {
	  numeric_part <- select(x, where(is.numeric))
	  cor_mat <- cor(numeric_part, use = "complete.obs", method = method)

	  long_tbl <- cor_mat %>%
	    as_tibble(rownames = "nm") %>%
	    pivot_longer(-nm) %>%
	    mutate(abs = abs(value)) %>%
	    select(-value) %>%
	    # Sort on the unrounded values, round afterwards.
	    arrange(desc(abs)) %>%
	    mutate(abs = round(abs, 3))

	  if (is.null(pat)) {
	    # Compare the factor codes of the two name columns to keep a single
	    # triangle of the (symmetric) correlation matrix.
	    one_triangle <- filter(
	      long_tbl,
	      as.numeric(factor(nm)) > as.numeric(factor(name))
	    )
	    return(one_triangle)
	  }

	  filter(long_tbl, str_detect(nm, pat) & abs < 1)
	}

	# Here I handle one specific type of data:
	# match data, where every feature is split into two parts - home team and away team.
	# You can adapt the code to your needs with small changes,
	# e.g. by simply comparing all columns to each other.

	# Correlation tables for every feature name found behind an `h_` / `a_` prefix
	# among the numeric columns of `match_elo`, stacked into one tibble. Both the
	# `h_<feature>` and the `a_<feature>` column are expected to exist. Each table
	# is stored in both orientations (`nm` and `name` swapped).
	x <- local({
	  numeric_cols <- colnames(select(match_elo, where(is.numeric)))

	  # Split right after the leading "h" / "a". Names without such a prefix give
	  # a single piece, which `fill = "left"` puts into `b` with `a` missing, so
	  # `drop_na()` discards them.
	  name_parts <- separate(
	    enframe(numeric_cols, name = NULL),
	    value,
	    sep = "(?<=^[ha])_",
	    into = letters[1:2],
	    fill = "left"
	  )
	  features <- pluck(distinct(drop_na(name_parts), b), "b")

	  per_feature <- map(features, function(feature) {
	    side_cols <- select(match_elo, str_c(c("h_", "a_"), feature))
	    cors <- explore$interactions(side_cols)
	    mirrored <- rename(cors, nm = name, name = nm)
	    arrange(bind_rows(cors, mirrored), desc(abs))
	  })

	  bind_rows(per_feature)
	})

	# Violin plot of the absolute correlation between the home-side (`h_`) input
	# columns and each derived column (names that do not start with `h_` / `a_`);
	# rows whose rounded correlation equals 1 are left out.
	x %>%
	  drop_na() %>%
	  filter(
	    abs != 1,
	    str_starts(nm, "h_"),
	    !str_starts(name, "[ha]_")
	  ) %>%
	  ggplot(mapping = aes(x = name, y = abs)) +
	  geom_violin(fill = "skyblue", size = 1)

	# In my case, I found that the best option is to
	# use the proportion interaction. If I need more, I can add division, as it is
	# less correlated with the original and proportion features.