# cjbayesian/cdf_diff.py

## cdf_diff.py
def _group_color(col, g, position):
    '''Choose the line colour for group label `g`.

    A label that converts to an int which is a valid index into `col` is
    used as that index, so 0/1 labels keep mapping to col[0]/col[1]
    regardless of the order in which the groups appear. Any other label
    (a string, an out-of-range integer, NaN, ...) falls back to the group's
    order of appearance, cycling through `col`.

    Arguments:
        col: list of colours
        g: group label
        position: int, index of the group in order of appearance
    '''
    try:
        index = int(g)
    except (TypeError, ValueError, OverflowError):
        index = None
    if index is not None and -len(col) <= index < len(col):
        return col[index]
    # max(..., 1) keeps an empty colour list an IndexError rather than a
    # ZeroDivisionError.
    return col[position % max(len(col), 1)]


def cdf_diff(df, var, grp='label', col=None, rm_outlier=None, hard_lim=None, ax=None, xlim=None):
    '''Plot cumulative distributions of multiple groups for comparison.

    If `var` takes exactly two distinct values, a bar chart of the per-group
    mean is drawn instead. Otherwise one empirical-CDF step curve is drawn
    per group; each curve stops at the largest observation whose empirical
    CDF is below 0.99, so the extreme upper tail does not stretch the x-axis.

    Arguments:
        df: DataFrame
        var: string, name of column to be plotted
        grp: string, grouping variable
        col: list, colors to use for each group. Defaults to
            ['green', 'red']. Integer labels that are valid indices pick
            col[label]; other labels take colours in order of appearance.
        rm_outlier: None|float, remove datapoints beyond this many sigma.
        hard_lim: None|(low, high), keep only datapoints strictly between
            these two bounds.
        ax: axis on which to plot. Default None creates a new figure.
        xlim: optional x-axis limits, passed to ax.set_xlim when truthy.

    Returns:
        The axis that was drawn on (the newly created one when ax is None).

    Examples:
        cdf_diff(feats_labeled,var='Creatinine' ,rm_outlier=4.0)
        fig, ax = plt.subplots(1, 2)
        psLearn.cdf_diff(feats_labeled,var='Creatinine' ,ax=ax[0],rm_outlier=4.0)
        psLearn.cdf_diff(feats_labeled,var='Sodium Level',ax=ax[1])
    '''
    import numpy as np

    if col is None:
        col = ['green', 'red']
    if ax is None:
        # Imported lazily: callers that pass their own axis never need pyplot.
        import matplotlib.pyplot as plt
        _, ax = plt.subplots(1, 1)

    if len(df[var].unique()) == 2:
        df.groupby(grp)[var].mean().plot(ax=ax, kind='bar', color=col)
        ax.set_title(var)
        return ax

    for position, g in enumerate(df[grp].unique()):
        sample = df[df[grp] == g][var]
        sample = sample[np.isfinite(sample.values)]
        if rm_outlier is not None:
            sigma = sample.std()
            mu = sample.mean()
            sample = sample[(sample > mu - rm_outlier * sigma)
                            & (sample < mu + rm_outlier * sigma)]
        if hard_lim is not None:
            sample = sample[(sample > hard_lim[0]) & (sample < hard_lim[1])]
        if sample.empty:
            # Nothing left to plot for this group (all values filtered out).
            continue

        # Right-continuous empirical CDF: fraction of observations <= t.
        ordered = np.sort(sample.values)
        n_obs = float(ordered.size)
        cum = np.searchsorted(ordered, ordered, side='right') / n_obs

        # Restrict the plotted range to the lower 99% of the distribution.
        # Tiny or constant groups would be trimmed to nothing, so fall back
        # to the full range in that case.
        body = ordered[cum < 0.99]
        if body.size == 0:
            body = ordered

        x = np.linspace(body[0], body[-1], 1000)
        y = np.searchsorted(ordered, x, side='right') / n_obs
        ax.step(x, y, label='%s = %s' % (grp, str(g)),
                c=_group_color(col, g, position))

    ax.set_title(var)
    ax.set_ylim([0, 1])
    if xlim:
        ax.set_xlim(xlim)
    return ax
	def _group_color(col, g, position):
	    '''Choose the line colour for group label `g`.

	    A label that converts to an int which is a valid index into `col` is
	    used as that index, so 0/1 labels keep mapping to col[0]/col[1]
	    regardless of the order in which the groups appear. Any other label
	    (a string, an out-of-range integer, NaN, ...) falls back to the group's
	    order of appearance, cycling through `col`.

	    Arguments:
	        col: list of colours
	        g: group label
	        position: int, index of the group in order of appearance
	    '''
	    try:
	        index = int(g)
	    except (TypeError, ValueError, OverflowError):
	        index = None
	    if index is not None and -len(col) <= index < len(col):
	        return col[index]
	    # max(..., 1) keeps an empty colour list an IndexError rather than a
	    # ZeroDivisionError.
	    return col[position % max(len(col), 1)]


	def cdf_diff(df, var, grp='label', col=None, rm_outlier=None, hard_lim=None, ax=None, xlim=None):
	    '''Plot cumulative distributions of multiple groups for comparison.

	    If `var` takes exactly two distinct values, a bar chart of the per-group
	    mean is drawn instead. Otherwise one empirical-CDF step curve is drawn
	    per group; each curve stops at the largest observation whose empirical
	    CDF is below 0.99, so the extreme upper tail does not stretch the x-axis.

	    Arguments:
	        df: DataFrame
	        var: string, name of column to be plotted
	        grp: string, grouping variable
	        col: list, colors to use for each group. Defaults to
	            ['green', 'red']. Integer labels that are valid indices pick
	            col[label]; other labels take colours in order of appearance.
	        rm_outlier: None|float, remove datapoints beyond this many sigma.
	        hard_lim: None|(low, high), keep only datapoints strictly between
	            these two bounds.
	        ax: axis on which to plot. Default None creates a new figure.
	        xlim: optional x-axis limits, passed to ax.set_xlim when truthy.

	    Returns:
	        The axis that was drawn on (the newly created one when ax is None).

	    Examples:
	        cdf_diff(feats_labeled,var='Creatinine' ,rm_outlier=4.0)
	        fig, ax = plt.subplots(1, 2)
	        psLearn.cdf_diff(feats_labeled,var='Creatinine' ,ax=ax[0],rm_outlier=4.0)
	        psLearn.cdf_diff(feats_labeled,var='Sodium Level',ax=ax[1])
	    '''
	    import numpy as np

	    if col is None:
	        col = ['green', 'red']
	    if ax is None:
	        # Imported lazily: callers that pass their own axis never need pyplot.
	        import matplotlib.pyplot as plt
	        _, ax = plt.subplots(1, 1)

	    if len(df[var].unique()) == 2:
	        df.groupby(grp)[var].mean().plot(ax=ax, kind='bar', color=col)
	        ax.set_title(var)
	        return ax

	    for position, g in enumerate(df[grp].unique()):
	        sample = df[df[grp] == g][var]
	        sample = sample[np.isfinite(sample.values)]
	        if rm_outlier is not None:
	            sigma = sample.std()
	            mu = sample.mean()
	            sample = sample[(sample > mu - rm_outlier * sigma)
	                            & (sample < mu + rm_outlier * sigma)]
	        if hard_lim is not None:
	            sample = sample[(sample > hard_lim[0]) & (sample < hard_lim[1])]
	        if sample.empty:
	            # Nothing left to plot for this group (all values filtered out).
	            continue

	        # Right-continuous empirical CDF: fraction of observations <= t.
	        ordered = np.sort(sample.values)
	        n_obs = float(ordered.size)
	        cum = np.searchsorted(ordered, ordered, side='right') / n_obs

	        # Restrict the plotted range to the lower 99% of the distribution.
	        # Tiny or constant groups would be trimmed to nothing, so fall back
	        # to the full range in that case.
	        body = ordered[cum < 0.99]
	        if body.size == 0:
	            body = ordered

	        x = np.linspace(body[0], body[-1], 1000)
	        y = np.searchsorted(ordered, x, side='right') / n_obs
	        ax.step(x, y, label='%s = %s' % (grp, str(g)),
	                c=_group_color(col, g, position))

	    ax.set_title(var)
	    ax.set_ylim([0, 1])
	    if xlim:
	        ax.set_xlim(xlim)
	    return ax