Skip to content

Instantly share code, notes, and snippets.

@kidpixo
Last active February 20, 2019 08:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kidpixo/c6b0fb7347c7d298f565ab85b18aa411 to your computer and use it in GitHub Desktop.
Save kidpixo/c6b0fb7347c7d298f565ab85b18aa411 to your computer and use it in GitHub Desktop.
A scikit-learn transformer for extracting low correlation continuous features.
import numpy as np
import sklearn
class CorrelationThreshold(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Drop continuous features that are highly correlated with an earlier one.

    During ``fit`` the Pearson product-moment correlation matrix of the
    columns of ``X`` is computed. A column ``j`` is marked for deletion when
    ``|C[i, j]| >= threshold`` for at least one column ``i < j``; all other
    columns are retained. ``transform`` returns only the retained columns,
    in their original order.

    After fit, some data is available:

    ``get_statistics()`` : summary of the correlation matrix
        (min, max, mean, median, 0.1 and 0.9 quantiles).
    ``get_support()`` : boolean mask (or integer indices) of retained features.
    ``get_corr_matrix()`` : the correlation matrix, optionally with the
        entries ``>= threshold`` masked.
    """

    def __init__(self, threshold=0.99):
        """Initialize the transformer.

        Args:
            threshold (float): Absolute correlation at or above which the
                later feature of a pair is dropped. Default 0.99.
        """
        self.threshold = threshold
        # Placeholders, overwritten by ``fit``.
        self.ind_to_delete = 0
        self.ind_to_retain = 0
        self.corr_matrix = 0
        self.n_features_ = 0

    def fit(self, X, y=None, **fit_params):
        """Compute the correlation matrix of X and select the features to keep.

        Args:
            X (array-like): 2D data, one column per feature.
            y: Ignored; accepted for pipeline compatibility.
            **fit_params: Ignored.

        Returns:
            self
        """
        # atleast_2d: with a single feature np.corrcoef returns a 0-d value,
        # which has no rows to iterate and no shape[0].
        self.corr_matrix = np.atleast_2d(np.corrcoef(np.asarray(X), rowvar=False))

        # Keep only the strict upper triangle so each pair (i, j) with i < j
        # is considered once and the diagonal (always 1) is ignored; column j
        # is dropped if any earlier column i is correlated with it.
        strong = np.abs(self.corr_matrix) >= self.threshold
        dropped = np.triu(strong, k=1).any(axis=0)

        self.ind_to_delete = set(np.flatnonzero(dropped).tolist())
        self.ind_to_retain = set(np.flatnonzero(~dropped).tolist())
        self.n_features_ = len(self.ind_to_retain)
        return self

    def transform(self, X, **transform_params):
        """Return X restricted to the features retained by ``fit``.

        Args:
            X (array-like): 2D data with the same columns as seen in ``fit``.
            **transform_params: Ignored.

        Returns:
            numpy.ndarray: A new array holding the retained columns of X,
            in ascending column order.
        """
        # Fancy indexing already yields a copy, so X itself is never modified.
        # sorted(): do not rely on set iteration order for the column order.
        return np.asarray(X)[:, sorted(self.ind_to_retain)]

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on X, then return the transformed X."""
        return self.fit(X, y, **fit_params).transform(X)

    def get_support(self, indices=False):
        """Get a mask, or integer index, of the features selected.

        Parameters
        ----------
        indices : boolean (default False)
            If True, the return value will be an array of integers, rather
            than a boolean mask.

        Returns
        -------
        support : array
            An index that selects the retained features from a feature vector.
            If `indices` is False, this is a boolean array of shape
            [# input features], in which an element is True iff its
            corresponding feature is selected for retention. If `indices` is
            True, this is an integer array of shape [# output features] whose
            values are indices into the input feature vector.
        """
        retained = sorted(self.ind_to_retain)
        if indices:
            return np.array(retained, dtype=int)
        # A real boolean dtype, so the result works as a mask (X[:, mask]);
        # an int 0/1 array would be interpreted as column indices instead.
        mask = np.zeros(self.corr_matrix.shape[0], dtype=bool)
        mask[retained] = True
        return mask

    def get_statistics(self):
        """Get summary statistics of the correlation coefficients matrix.

        Returns
        -------
        stats : dict
            Keys 'min', 'quantile_0.1', 'mean', 'median', 'quantile_0.9'
            and 'max', computed over all entries of the matrix
            (diagonal included).
        """
        corr = self.corr_matrix
        return {
            'min': corr.min(),
            'quantile_0.1': np.quantile(corr, 0.1),
            'mean': corr.mean(),
            'median': np.median(corr),
            'quantile_0.9': np.quantile(corr, 0.9),
            'max': corr.max(),
        }

    def get_corr_matrix(self, masked=False):
        """Get correlation matrix as numpy 2D array [features, features].

        Parameters
        ----------
        masked : boolean (default False)
            If True, return a masked array in which the values >= threshold
            are masked. The comparison is on the signed value, so strongly
            negative correlations are not masked.
        """
        if masked:
            return np.ma.masked_greater_equal(self.corr_matrix, self.threshold)
        return self.corr_matrix
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment