Skip to content

Instantly share code, notes, and snippets.

@marr75
Created January 6, 2023 15:54
Show Gist options
  • Save marr75/9a9e7fa2e542b8313e899807943391cb to your computer and use it in GitHub Desktop.
Save marr75/9a9e7fa2e542b8313e899807943391cb to your computer and use it in GitHub Desktop.
Quick vectorized approach to calculating metrics at every potential breakpoint for a classification problem. Nice way to visualize precision, recall, and f1 in continuous graphs if you're introducing unsupervised machine learning or talking about very shallow decision trees.
def evaluate_threshold_binary_classification(x: pd.Series, y: pd.Series, reverse: bool = False) -> pd.DataFrame:
    """Calculate quality metrics (True/False Positive/Negative counts, Precision,
    Recall, and F1) for every possible threshold of a metric ``x`` used to
    predict a binary class ``y``.

    Row i of the result (in sorted order) corresponds to the rule "predict
    positive for the i+1 most extreme values of x" — largest values first by
    default, smallest first when ``reverse`` is True.

    x: the independent variable used as a predictor, a continuous variable
    y: the dependent variable representing the class being predicted, 0/1 or bool
    reverse: whether the x and y series have an inverse relationship
        (i.e. low x should predict the positive class)
    returns: a DataFrame indexed like x/y, sorted by decreasing (or increasing,
        when reverse) x, with count columns and precision/recall/f1 scores.
    """
    predictor = pd.DataFrame(
        {
            # Negate by default so an ascending sort walks from the most
            # "positive-looking" value downward. `1 if reverse else -1`
            # replaces the fragile `1 and reverse or -1` and-or trick.
            'discriminant': x * (1 if reverse else -1),
            # Coerce to bool so `~` below is logical NOT; on int 0/1 it would
            # be the bitwise complement (~1 == -2) and corrupt the counts.
            'class': y.astype(bool),
        },
        # Index from the input itself; the original referenced a
        # `normalized_gdf` global, breaking the function for other data.
        index=x.index,
    ).sort_values('discriminant')

    total = len(predictor)
    total_positive = predictor['class'].sum()

    break_point_evaluation = predictor.assign(
        # All previous 1/True values (including the current row) are true
        # positives; all previous 0/False values are false positives.
        true_positive=predictor['class'].cumsum(),
        false_positive=(~predictor['class']).cumsum(),
        # Rows at or before the current one are predicted positive, so the
        # prediction counts are inclusive of the current row (i + 1 rows) —
        # the original used the 0-based position and was off by one.
        predicted_positive=lambda df: df.true_positive + df.false_positive,
        predicted_negative=lambda df: total - df.predicted_positive,
        # Positives past the threshold are missed; the balance of the
        # negative predictions is correct.
        false_negative=lambda df: total_positive - df.true_positive,
        true_negative=lambda df: df.predicted_negative - df.false_negative,
        precision=lambda df: df.true_positive / (df.true_positive + df.false_positive),
        recall=lambda df: df.true_positive / (df.true_positive + df.false_negative),
        f1=lambda df: (2 * df.precision * df.recall) / (df.precision + df.recall),
        # Restore the un-negated metric so plots/thresholds read naturally.
        discriminant=x,
    )
    return break_point_evaluation
# Evaluate principal component PC0 as a threshold classifier for the
# binary ERS U/R class.
evaluation = evaluate_threshold_binary_classification(
    principal_components.PC0,
    normalized_gdf['ERS U/R'].astype(bool),
)

# Confusion-matrix counts as continuous curves over the discriminant.
count_columns = ['discriminant', 'true_positive', 'true_negative', 'false_positive', 'false_negative']
evaluation[count_columns].plot('discriminant')

# Precision / recall / f1 over the same axis.
score_columns = ['discriminant', 'precision', 'recall', 'f1']
evaluation[score_columns].plot('discriminant')

# Report the threshold whose f1 score is highest.
best_threshold = principal_components.PC0[evaluation['f1'].idxmax()]
print(
    f"Best performing threshold was: {best_threshold} for {principal_components.PC0.name}."
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment