Skip to content

Instantly share code, notes, and snippets.

@evjeny
Created December 4, 2021 23:44
Show Gist options
  • Save evjeny/fdc1094514716c69f5dde50f20c8cebe to your computer and use it in GitHub Desktop.
Save evjeny/fdc1094514716c69f5dde50f20c8cebe to your computer and use it in GitHub Desktop.
Sklearn data transformer for filtering collinear features
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class CollinearFilter(BaseEstimator, TransformerMixin):
    """Drop features that are strongly correlated with a feature that is kept.

    During ``fit`` the pairwise Pearson correlation matrix of the columns of
    ``X`` is computed.  Columns are then visited in order; for every column
    that is still kept, the group of kept columns whose absolute correlation
    with it exceeds ``thresh`` is collapsed to a single representative.

    Parameters
    ----------
    thresh : float, default=0.95
        Two features are treated as collinear when the absolute value of
        their correlation coefficient is strictly greater than this value.
    choose_corr : {"first", "random"}, default="random"
        How the representative of a correlated group is picked: the member
        with the lowest column index, or a member drawn at random.
    random_state : None, int, array-like or numpy.random.RandomState
        Seed (or generator) used when ``choose_corr="random"``.  A seed
        produces the same selection on every call to ``fit``.

    Attributes
    ----------
    keep_indices_ : numpy.ndarray of int
        Sorted column indices retained by ``transform``.  Only exists after
        ``fit`` has been called.
    """

    _CHOOSE_CORR_OPTIONS = ("first", "random")

    def __init__(self, thresh: float = 0.95, choose_corr="random", random_state=None):
        # Store the constructor arguments untouched and nothing else:
        # BaseEstimator.get_params()/clone() read every __init__ parameter
        # back as an attribute of the same name, and fitted attributes
        # (trailing underscore) must not exist before fit().
        self.thresh = thresh
        self.choose_corr = choose_corr
        self.random_state = random_state

    def fit(self, X, y=None):
        """Determine which columns of ``X`` to keep.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : CollinearFilter
            The fitted transformer.

        Raises
        ------
        ValueError
            If ``choose_corr`` is not a supported option or ``X`` is not 2-D.
        """
        # Validate up front so a bad option is reported even when the data
        # happens to contain no correlated group.
        if self.choose_corr not in self._CHOOSE_CORR_OPTIONS:
            raise ValueError(
                "Unknown `choose_corr` param, must be one of: `first`, `random`!"
            )

        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"Expected a 2D array of shape (n_samples, n_features), got {X.ndim}D input."
            )
        n_features = X.shape[1]
        rng = self._make_rng()

        # corrcoef returns a 0-d scalar for a single feature; atleast_2d
        # restores the (n_features, n_features) shape the loop indexes into.
        corr = np.atleast_2d(np.corrcoef(X, rowvar=False))
        # Strong negative correlation is collinearity too, hence abs().
        # NaN entries (e.g. from constant columns) compare False, so such
        # columns are never grouped with anything and are always kept.
        collinear = np.abs(corr) > self.thresh

        keep_mask = np.ones(n_features, dtype=bool)
        indices = np.arange(n_features)
        for feature_num in indices:
            if not keep_mask[feature_num]:
                continue

            # Still-kept features collinear with this one (normally
            # including the feature itself via the diagonal).
            group_mask = keep_mask & collinear[feature_num]
            if group_mask.sum() < 2:
                continue

            group = self._indices(group_mask)
            if self.choose_corr == "first":
                chosen_feature = group[0]
            else:  # "random"
                chosen_feature = rng.choice(group)

            # Drop every member of the group except the representative.
            keep_mask[group_mask & (indices != chosen_feature)] = False

        self.keep_indices_ = self._indices(keep_mask)
        return self

    def _make_rng(self):
        """Return the RandomState to use for one ``fit`` call."""
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def _indices(self, mask: np.ndarray):
        """Return the positions of the truthy entries of a 1-D mask."""
        return np.argwhere(mask).ravel()

    def transform(self, X, y=None):
        """Return ``X`` restricted to the columns selected during ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data with the same column layout as the data passed to ``fit``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray of shape (n_samples, len(keep_indices_))

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        # Local import keeps this block self-contained; sklearn is already a
        # dependency of this module.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "keep_indices_")
        return np.asarray(X)[:, self.keep_indices_]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment