Swarchal/find_correlation.py

## find_correlation.py
import pandas as pd
import numpy as np

def find_correlation(data, threshold=0.9, remove_negative=False):
    """
    Find highly correlated features and return the ones to remove.

    The pairwise correlation matrix of `data` is computed with
    ``data.corr()`` and only the part strictly below the diagonal is
    inspected, so every pair of features is looked at once.

    Parameters
    -----------
    data : pandas DataFrame
        Numeric DataFrame, one feature per column.
    threshold : float
        correlation threshold, will remove one of pairs of features with a
        correlation greater than this value.
    remove_negative: Boolean
        If true then features which are highly negatively correlated will
        also be returned for removal (the absolute value of the correlation
        is compared against `threshold`).

    Returns
    --------
    select_flat : list
        list of column names to be removed. Each name appears once, in the
        order in which it was first selected.

    Notes
    ------
    A column that was already recorded as a partner of an earlier column is
    not used as an anchor itself. For columns A, B, C where only (A, B) and
    (B, C) exceed the threshold, the result is ``[A]`` and the (B, C) pair
    is left in place.
    """
    corr_mat = data.corr()
    if remove_negative:
        corr_mat = np.abs(corr_mat)
    # Zero the diagonal and the upper triangle so that each pair is only
    # seen once, in the column of whichever feature comes first.
    corr_mat.loc[:, :] = np.tril(corr_mat, k=-1)

    already_in = set()   # partners recorded so far; never reused as anchors
    selected = set()     # names already placed in the output list
    select_flat = []
    for col in corr_mat:
        column = corr_mat[col]
        partners = column[column > threshold].index.tolist()
        if not partners or col in already_in:
            continue
        already_in.update(partners)
        # The first partner is kept as the group's representative; the other
        # partners and the anchor column itself are marked for removal.
        for name in partners[1:] + [col]:
            # Two anchors may share partners; report each name only once.
            if name not in selected:
                selected.add(name)
                select_flat.append(name)
    return select_flat
	import pandas as pd
	import numpy as np

	def find_correlation(data, threshold=0.9, remove_negative=False):
	    """
	    Find highly correlated features and return the ones to remove.

	    The pairwise correlation matrix of `data` is computed with
	    ``data.corr()`` and only the part strictly below the diagonal is
	    inspected, so every pair of features is looked at once.

	    Parameters
	    -----------
	    data : pandas DataFrame
	        Numeric DataFrame, one feature per column.
	    threshold : float
	        correlation threshold, will remove one of pairs of features with a
	        correlation greater than this value.
	    remove_negative: Boolean
	        If true then features which are highly negatively correlated will
	        also be returned for removal (the absolute value of the correlation
	        is compared against `threshold`).

	    Returns
	    --------
	    select_flat : list
	        list of column names to be removed. Each name appears once, in the
	        order in which it was first selected.

	    Notes
	    ------
	    A column that was already recorded as a partner of an earlier column is
	    not used as an anchor itself. For columns A, B, C where only (A, B) and
	    (B, C) exceed the threshold, the result is ``[A]`` and the (B, C) pair
	    is left in place.
	    """
	    corr_mat = data.corr()
	    if remove_negative:
	        corr_mat = np.abs(corr_mat)
	    # Zero the diagonal and the upper triangle so that each pair is only
	    # seen once, in the column of whichever feature comes first.
	    corr_mat.loc[:, :] = np.tril(corr_mat, k=-1)

	    already_in = set()   # partners recorded so far; never reused as anchors
	    selected = set()     # names already placed in the output list
	    select_flat = []
	    for col in corr_mat:
	        column = corr_mat[col]
	        partners = column[column > threshold].index.tolist()
	        if not partners or col in already_in:
	            continue
	        already_in.update(partners)
	        # The first partner is kept as the group's representative; the other
	        # partners and the anchor column itself are marked for removal.
	        for name in partners[1:] + [col]:
	            # Two anchors may share partners; report each name only once.
	            if name not in selected:
	                selected.add(name)
	                select_flat.append(name)
	    return select_flat