# kidpixo/CovarianceThreshold_Transformer.py

## CovarianceThreshold_Transformer.py
import numpy as np
import sklearn
class CorrelationThreshold(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Transformer that drops highly correlated continuous features.

    During ``fit`` the Pearson product-moment correlation matrix of the
    columns of ``X`` is computed. A feature is dropped when its absolute
    correlation with any feature at a lower column index is greater than or
    equal to ``threshold`` (default 0.99); every other feature is retained.

    After fit, some data is available:

    `get_statistics()` : summary of the correlation matrix (min, max, mean, median, quantiles)
    `get_support()` : boolean mask, or integer indices, of the retained features
    `get_corr_matrix()` : the correlation matrix (optionally with values >= threshold masked)

    """

    def __init__(self, threshold=0.99):
        """Initialize the transformer.

        Args:
            threshold (float): Absolute correlation at or above which the
                higher-indexed feature of a pair is dropped.
        """
        self.threshold = threshold
        # Placeholders only; all four are overwritten by fit().
        self.ind_to_delete = 0
        self.ind_to_retain = 0
        self.corr_matrix = 0
        self.n_features_ = 0

    def fit(self, X, y=None, **fit_params):
        """Compute the correlation matrix of X and pick the features to keep.

        Args:
            X (array-like): Data whose columns are the features (it is passed
                to ``np.corrcoef`` with ``rowvar=False``).
            y: Ignored.
            **fit_params: Ignored.

        Returns:
            self, with ``corr_matrix``, ``ind_to_delete``, ``ind_to_retain``
            (both sets of column indices) and ``n_features_`` updated.
        """
        # np.corrcoef returns a 0-d array when X has a single column, so
        # promote the result to 2D to keep the rest of the class uniform.
        self.corr_matrix = np.atleast_2d(np.corrcoef(X, rowvar=False))
        n_features = self.corr_matrix.shape[0]

        # Keep only the strict upper triangle: entry (i, j) with j > i flags
        # feature j as redundant with the earlier feature i. NaN correlations
        # compare False and therefore never cause a deletion.
        above = np.abs(self.corr_matrix) >= self.threshold
        redundant = np.triu(above, k=1).any(axis=0)

        self.ind_to_delete = set(np.flatnonzero(redundant).tolist())
        self.ind_to_retain = set(range(n_features)) - self.ind_to_delete
        self.n_features_ = len(self.ind_to_retain)

        return self

    def transform(self, X, **transform_params):
        """Return X restricted to the features retained by ``fit``.

        Args:
            X (array-like): Dataset with the same column layout as the data
                passed to ``fit``.
            **transform_params: Ignored.

        Returns:
            A new array holding the retained columns of X, in their original
            column order.
        """
        # Sets have no guaranteed iteration order; sort so the output columns
        # always follow the input order. Fancy indexing already copies.
        retained = sorted(self.ind_to_retain)
        return np.asanyarray(X)[:, retained]

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on X, then return the transformed X.

        ``y`` and ``fit_params`` are forwarded to ``fit``.
        """
        return self.fit(X, y, **fit_params).transform(X)

    def get_support(self, indices=False):
        """
        Get a mask, or integer index, of the features selected
        Parameters
        ----------
        indices : boolean (default False)
            If True, the return value will be an array of integers, rather
            than a boolean mask.
        Returns
        -------
        support : array
            An index that selects the retained features from a feature vector.
            If `indices` is False, this is a boolean array of shape
            [# input features], in which an element is True iff its
            corresponding feature is selected for retention. If `indices` is
            True, this is a sorted integer array of shape [# output features]
            whose values are indices into the input feature vector.
        """
        retained = sorted(self.ind_to_retain)
        if indices:
            return np.array(retained, dtype=int)

        # A real boolean dtype, so the mask can be used directly as an index.
        mask = np.zeros(self.corr_matrix.shape[0], dtype=bool)
        mask[retained] = True
        return mask

    def get_statistics(self):
        """
        Get correlation coefficients matrix statistics

        The statistics are taken over the whole matrix, i.e. including the
        diagonal and both triangles.

        Returns
        -------
        stats : dict
            Keys 'min', 'quantile_0.1', 'mean', 'median', 'quantile_0.9', 'max'.
        """
        corr = self.corr_matrix
        return {
            'min': corr.min(),
            'quantile_0.1': np.quantile(corr, 0.1),
            'mean': corr.mean(),
            'median': np.median(corr),
            'quantile_0.9': np.quantile(corr, 0.9),
            'max': corr.max(),
        }

    def get_corr_matrix(self, masked=False):
        """
        Get correlation matrix as numpy 2D array [features,features].

        Parameters
        ----------
        masked : boolean (default False)
            If True, return a masked array in which values >= threshold are
            masked. The comparison uses the signed value, not the absolute
            value used by `fit`, so strongly negative correlations stay
            unmasked.

        """
        if masked:
            return np.ma.masked_greater_equal(self.corr_matrix, self.threshold)
        return self.corr_matrix
	import sklearn
	class CorrelationThreshold(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
	    """Transformer that drops highly correlated continuous features.

	    During ``fit`` the Pearson product-moment correlation matrix of the
	    columns of ``X`` is computed. A feature is dropped when its absolute
	    correlation with any feature at a lower column index is greater than or
	    equal to ``threshold`` (default 0.99); every other feature is retained.

	    After fit, some data is available:

	    `get_statistics()` : summary of the correlation matrix (min, max, mean, median, quantiles)
	    `get_support()` : boolean mask, or integer indices, of the retained features
	    `get_corr_matrix()` : the correlation matrix (optionally with values >= threshold masked)

	    """

	    def __init__(self, threshold=0.99):
	        """Initialize the transformer.

	        Args:
	            threshold (float): Absolute correlation at or above which the
	                higher-indexed feature of a pair is dropped.
	        """
	        self.threshold = threshold
	        # Placeholders only; all four are overwritten by fit().
	        self.ind_to_delete = 0
	        self.ind_to_retain = 0
	        self.corr_matrix = 0
	        self.n_features_ = 0

	    def fit(self, X, y=None, **fit_params):
	        """Compute the correlation matrix of X and pick the features to keep.

	        Args:
	            X (array-like): Data whose columns are the features (it is passed
	                to ``np.corrcoef`` with ``rowvar=False``).
	            y: Ignored.
	            **fit_params: Ignored.

	        Returns:
	            self, with ``corr_matrix``, ``ind_to_delete``, ``ind_to_retain``
	            (both sets of column indices) and ``n_features_`` updated.
	        """
	        # np.corrcoef returns a 0-d array when X has a single column, so
	        # promote the result to 2D to keep the rest of the class uniform.
	        self.corr_matrix = np.atleast_2d(np.corrcoef(X, rowvar=False))
	        n_features = self.corr_matrix.shape[0]

	        # Keep only the strict upper triangle: entry (i, j) with j > i flags
	        # feature j as redundant with the earlier feature i. NaN correlations
	        # compare False and therefore never cause a deletion.
	        above = np.abs(self.corr_matrix) >= self.threshold
	        redundant = np.triu(above, k=1).any(axis=0)

	        self.ind_to_delete = set(np.flatnonzero(redundant).tolist())
	        self.ind_to_retain = set(range(n_features)) - self.ind_to_delete
	        self.n_features_ = len(self.ind_to_retain)

	        return self

	    def transform(self, X, **transform_params):
	        """Return X restricted to the features retained by ``fit``.

	        Args:
	            X (array-like): Dataset with the same column layout as the data
	                passed to ``fit``.
	            **transform_params: Ignored.

	        Returns:
	            A new array holding the retained columns of X, in their original
	            column order.
	        """
	        # Sets have no guaranteed iteration order; sort so the output columns
	        # always follow the input order. Fancy indexing already copies.
	        retained = sorted(self.ind_to_retain)
	        return np.asanyarray(X)[:, retained]

	    def fit_transform(self, X, y=None, **fit_params):
	        """Fit on X, then return the transformed X.

	        ``y`` and ``fit_params`` are forwarded to ``fit``.
	        """
	        return self.fit(X, y, **fit_params).transform(X)

	    def get_support(self, indices=False):
	        """
	        Get a mask, or integer index, of the features selected
	        Parameters
	        ----------
	        indices : boolean (default False)
	            If True, the return value will be an array of integers, rather
	            than a boolean mask.
	        Returns
	        -------
	        support : array
	            An index that selects the retained features from a feature vector.
	            If `indices` is False, this is a boolean array of shape
	            [# input features], in which an element is True iff its
	            corresponding feature is selected for retention. If `indices` is
	            True, this is a sorted integer array of shape [# output features]
	            whose values are indices into the input feature vector.
	        """
	        retained = sorted(self.ind_to_retain)
	        if indices:
	            return np.array(retained, dtype=int)

	        # A real boolean dtype, so the mask can be used directly as an index.
	        mask = np.zeros(self.corr_matrix.shape[0], dtype=bool)
	        mask[retained] = True
	        return mask

	    def get_statistics(self):
	        """
	        Get correlation coefficients matrix statistics

	        The statistics are taken over the whole matrix, i.e. including the
	        diagonal and both triangles.

	        Returns
	        -------
	        stats : dict
	            Keys 'min', 'quantile_0.1', 'mean', 'median', 'quantile_0.9', 'max'.
	        """
	        corr = self.corr_matrix
	        return {
	            'min': corr.min(),
	            'quantile_0.1': np.quantile(corr, 0.1),
	            'mean': corr.mean(),
	            'median': np.median(corr),
	            'quantile_0.9': np.quantile(corr, 0.9),
	            'max': corr.max(),
	        }

	    def get_corr_matrix(self, masked=False):
	        """
	        Get correlation matrix as numpy 2D array [features,features].

	        Parameters
	        ----------
	        masked : boolean (default False)
	            If True, return a masked array in which values >= threshold are
	            masked. The comparison uses the signed value, not the absolute
	            value used by `fit`, so strongly negative correlations stay
	            unmasked.

	        """
	        if masked:
	            return np.ma.masked_greater_equal(self.corr_matrix, self.threshold)
	        return self.corr_matrix