# massiung/psi_iv.py

## psi_iv.py
import numpy as np
import pandas as pd

def pop_diff(popA, popB, bin_boundaries=None, num_bins=10):
    r"""
    Compute the difference between two populations using the PSI / IV formula.

    $$\Sigma_{i} (p_i^B - p_i^A)*\ln(\frac{p_i^B}{p_i^A})$$

    where $p_i^A$ and $p_i^B$ are the fractions of popA and popB falling into
    bin $i$. Every bin is the half-open interval ``(start, end]``.

    Note:
        Counts missing values in a separate bin to test for information and shift.
        The missing bin is only created when popA contains missing values.
        Raises ValueError if popA has no nans but popB does.
        A bin that is empty in exactly one population contributes an infinite
        term; a bin that is empty in both contributes NaN, which the total skips.

    Arguments:
        popA (pandas.Series): Base population, e.g. goods or train
        popB (pandas.Series): Compared population, e.g. bads or test
        bin_boundaries (sequence or None): Boundaries between bins, excluding bottom and top
            which are always -np.inf and +np.inf. If None, quantiles of popA are used.
        num_bins (int): Number of buckets to use if bin_boundaries is not used

    Returns:
        (float) diff, (pandas.DataFrame) summary

        summary has one row per bin with the columns start, end, countA,
        countB, ratioA, ratioB and diff, followed by a row labelled 'total'
        that holds the column sums.

    Raises:
        ValueError: popB contains missing values although popA does not.

    Examples:
        >>> psi, summary = pop_diff(df_train['age'], df_test['age'], bin_boundaries=[17, 21, 40, 65])  # doctest: +SKIP
        >>> iv_age, summary = pop_diff(df_good['age'], df_bad['age'], num_bins=20)  # doctest: +SKIP
    """
    # Interior bin edges: the caller's boundaries, or evenly spaced quantiles of popA.
    if bin_boundaries is None:
        inner_edges = [
            popA.quantile((i + 1) * (1.0 / num_bins))
            for i in range(num_bins - 1)
        ]
    else:
        # Copy into a list so tuples / arrays are accepted and the caller's
        # object is never modified.
        inner_edges = list(bin_boundaries)
    edges = [-np.inf] + inner_edges + [np.inf]
    bin_starts, bin_ends = edges[:-1], edges[1:]

    def count_per_bin(pop):
        # NaN compares False on both sides, so missing values land in no bin.
        return [
            int(((lo < pop) & (pop <= hi)).sum())
            for lo, hi in zip(bin_starts, bin_ends)
        ]

    counts_a = count_per_bin(popA)
    counts_b = count_per_bin(popB)

    # analyze missing values - if they exist we create a separate bin
    popA_missing = len(popA) - popA.count()
    popB_missing = len(popB) - popB.count()
    if popA_missing:
        bin_starts = bin_starts + [np.nan]
        bin_ends = bin_ends + [np.nan]
        counts_a.append(int(popA_missing))
        counts_b.append(int(popB_missing))
    elif popB_missing:
        raise ValueError('Population B has missing although population A doesnt. This might indicate a quality problem.')

    # Build the histogram table in one go (DataFrame.append no longer exists
    # in pandas >= 2.0, so rows are collected before construction).
    table = pd.DataFrame({
        'start': bin_starts,
        'end': bin_ends,
        'countA': counts_a,
        'countB': counts_b,
    })

    # apply psi / iv formula
    table['ratioA'] = table['countA'] / len(popA)
    table['ratioB'] = table['countB'] / len(popB)
    table['diff'] = (
        (table['ratioB'] - table['ratioA'])
        * np.log(table['ratioB'] / table['ratioA'])
    )

    # compute the totals and attach them as a final row labelled 'total'
    totals = table.sum()
    totals.name = 'total'
    summary = pd.concat([table, totals.to_frame().T])

    return totals['diff'], summary
	def pop_diff(popA, popB, bin_boundaries=None, num_bins=10):
	    r"""
	    Compute the difference between two populations using the PSI / IV formula.

	    $$\Sigma_{i} (p_i^B - p_i^A)*\ln(\frac{p_i^B}{p_i^A})$$

	    where $p_i^A$ and $p_i^B$ are the fractions of popA and popB falling into
	    bin $i$. Every bin is the half-open interval ``(start, end]``.

	    Note:
	        Counts missing values in a separate bin to test for information and shift.
	        The missing bin is only created when popA contains missing values.
	        Raises ValueError if popA has no nans but popB does.
	        A bin that is empty in exactly one population contributes an infinite
	        term; a bin that is empty in both contributes NaN, which the total skips.

	    Arguments:
	        popA (pandas.Series): Base population, e.g. goods or train
	        popB (pandas.Series): Compared population, e.g. bads or test
	        bin_boundaries (sequence or None): Boundaries between bins, excluding bottom and top
	            which are always -np.inf and +np.inf. If None, quantiles of popA are used.
	        num_bins (int): Number of buckets to use if bin_boundaries is not used

	    Returns:
	        (float) diff, (pandas.DataFrame) summary

	        summary has one row per bin with the columns start, end, countA,
	        countB, ratioA, ratioB and diff, followed by a row labelled 'total'
	        that holds the column sums.

	    Raises:
	        ValueError: popB contains missing values although popA does not.

	    Examples:
	        >>> psi, summary = pop_diff(df_train['age'], df_test['age'], bin_boundaries=[17, 21, 40, 65])  # doctest: +SKIP
	        >>> iv_age, summary = pop_diff(df_good['age'], df_bad['age'], num_bins=20)  # doctest: +SKIP
	    """
	    # NOTE(review): this is a second definition of pop_diff in this file; it
	    # rebinds the name defined earlier. Confirm which copy should remain.

	    # Interior bin edges: the caller's boundaries, or evenly spaced quantiles of popA.
	    if bin_boundaries is None:
	        inner_edges = [
	            popA.quantile((i + 1) * (1.0 / num_bins))
	            for i in range(num_bins - 1)
	        ]
	    else:
	        # Copy into a list so tuples / arrays are accepted and the caller's
	        # object is never modified.
	        inner_edges = list(bin_boundaries)
	    edges = [-np.inf] + inner_edges + [np.inf]
	    bin_starts, bin_ends = edges[:-1], edges[1:]

	    def count_per_bin(pop):
	        # NaN compares False on both sides, so missing values land in no bin.
	        return [
	            int(((lo < pop) & (pop <= hi)).sum())
	            for lo, hi in zip(bin_starts, bin_ends)
	        ]

	    counts_a = count_per_bin(popA)
	    counts_b = count_per_bin(popB)

	    # analyze missing values - if they exist we create a separate bin
	    popA_missing = len(popA) - popA.count()
	    popB_missing = len(popB) - popB.count()
	    if popA_missing:
	        bin_starts = bin_starts + [np.nan]
	        bin_ends = bin_ends + [np.nan]
	        counts_a.append(int(popA_missing))
	        counts_b.append(int(popB_missing))
	    elif popB_missing:
	        raise ValueError('Population B has missing although population A doesnt. This might indicate a quality problem.')

	    # Build the histogram table in one go (DataFrame.append no longer exists
	    # in pandas >= 2.0, so rows are collected before construction).
	    table = pd.DataFrame({
	        'start': bin_starts,
	        'end': bin_ends,
	        'countA': counts_a,
	        'countB': counts_b,
	    })

	    # apply psi / iv formula
	    table['ratioA'] = table['countA'] / len(popA)
	    table['ratioB'] = table['countB'] / len(popB)
	    table['diff'] = (
	        (table['ratioB'] - table['ratioA'])
	        * np.log(table['ratioB'] / table['ratioA'])
	    )

	    # compute the totals and attach them as a final row labelled 'total'
	    totals = table.sum()
	    totals.name = 'total'
	    summary = pd.concat([table, totals.to_frame().T])

	    return totals['diff'], summary