Skip to content

Instantly share code, notes, and snippets.

@massiung
Last active July 7, 2022 00:35
Show Gist options
  • Save massiung/5390a13789cd6a8e459c3a8a3bb6d796 to your computer and use it in GitHub Desktop.
Save massiung/5390a13789cd6a8e459c3a8a3bb6d796 to your computer and use it in GitHub Desktop.
Population Stability Index and Information Value function
def pop_diff(popA, popB, bin_boundaries=None, num_bins=10):
    r"""
    Compute the difference between two populations using the PSI / IV formula.

    $$\Sigma_{i} (p_i^B - p_i^A)*\ln(\frac{p_i^B}{p_i^A})$$

    Each bin is the half-open interval ``(start, end]``; the lowest bin starts
    at ``-np.inf`` and the highest ends at ``np.inf``.

    Note:
        Counts missing values in a separate bin to test for information and
        shift. Raises ValueError if popA has no nans but popB does.

        No smoothing is applied to empty bins: a bin that is empty in exactly
        one of the populations contributes ``inf`` to the result, while a bin
        that is empty in both yields ``NaN``, which the pandas sum skips.

    Arguments:
        popA (pandas.Series): Base population, e.g. goods or train
        popB (pandas.Series): Compared population, e.g. bads or test
        bin_boundaries (iterable or None): Boundaries between bins, excluding
            bottom and top which are +-np.inf by default. When None, the
            boundaries are taken from quantiles of popA.
        num_bins (int): Number of buckets to use if bin_boundaries is not used

    Returns:
        (float) diff, (pandas.DataFrame) summary

        ``summary`` has one row per bin (plus one for missing values when
        popA contains any) with columns ``start``, ``end``, ``countA``,
        ``countB``, ``ratioA``, ``ratioB`` and ``diff``, followed by a row
        labelled ``'total'`` holding the column sums.

    Raises:
        ValueError: If popB has missing values although popA has none.

    Examples:
        >>> psi, summary = pop_diff(df_train['age'], df_test['age'], bin_boundaries=[17, 21, 40, 65])
        >>> iv_age, summary = pop_diff(df_good['age'], df_bad['age'], num_bins=20)
    """
    # create binning
    if bin_boundaries is None:
        bin_boundaries = [
            popA.quantile((i + 1) * (1.0 / num_bins))
            for i in range(num_bins - 1)
        ]
    # Unpacking accepts any iterable (list, tuple, numpy array, Series)
    # and never mutates the caller's object.
    edges = [-np.inf, *bin_boundaries, np.inf]
    starts = edges[:-1]
    ends = edges[1:]

    # bin counts (histogram); NaN compares False so missing values fall
    # outside every regular bin
    counts_a = [((lo < popA) & (popA <= hi)).sum() for lo, hi in zip(starts, ends)]
    counts_b = [((lo < popB) & (popB <= hi)).sum() for lo, hi in zip(starts, ends)]

    # analyze missing values - if they exist we create a separate bin
    missing_a = len(popA) - popA.count()
    missing_b = len(popB) - popB.count()
    if missing_a:
        starts.append(np.nan)
        ends.append(np.nan)
        counts_a.append(missing_a)
        counts_b.append(missing_b)
    elif missing_b:
        raise ValueError('Population B has missing although population A doesnt. This might indicate a quality problem.')

    # Build the table in one go; DataFrame.append was removed in pandas 2.0.
    summary = pd.DataFrame({
        'start': starts,
        'end': ends,
        'countA': counts_a,
        'countB': counts_b,
    })

    # apply psi / iv formula
    summary['ratioA'] = summary['countA'] / len(popA)
    summary['ratioB'] = summary['countB'] / len(popB)
    summary['diff'] = (
        (summary['ratioB'] - summary['ratioA'])
        * np.log(summary['ratioB'] / summary['ratioA'])
    )

    # compute the totals and format the result
    totals = summary.sum()
    totals.name = 'total'
    return totals['diff'], pd.concat([summary, totals.to_frame().T])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment