Created
March 5, 2019 03:00
-
-
Save deaktator/1080eca4c291070d009014f2f2d759ad to your computer and use it in GitHub Desktop.
Using the hypergeometric distribution to model the expected accuracy in cross validation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.stats import hypergeom
# Run the following to print tab-separated (fold1_size, expected accuracy)
# pairs that can be plotted as a graph:
#     for i in range(1, 400):
#         print(f"{i}\t{exp_test_acc(400, 201, i)}")
#
def exp_test_acc(pop_size, pos_in_pop, fold1_size):
    """Return the expected accuracy of 2-fold cross validation with constant models.

    The population of ``pop_size`` items, ``pos_in_pop`` of which are positive,
    is split into a first fold of ``fold1_size`` items and a second fold holding
    the rest.  The number of positives landing in the first fold follows a
    hypergeometric distribution.  For each possible count, a constant model is
    "trained" on each fold (predict positive when at least half of that fold is
    positive, otherwise predict negative) and scored on the other fold.  The two
    fold accuracies are averaged, weighted by relative fold size, and the
    expectation over all counts is returned.

    Args:
        pop_size: Total number of items in the population.
        pos_in_pop: Number of positive items in the population.
        fold1_size: Number of items in the first fold; must satisfy
            ``0 < fold1_size < pop_size`` so that neither fold is empty.

    Returns:
        The expected size-weighted accuracy across the two folds.

    Raises:
        ValueError: If ``fold1_size`` would leave either fold empty, since the
            positive rate of an empty fold is undefined.
    """
    if not 0 < fold1_size < pop_size:
        raise ValueError(
            "fold1_size must satisfy 0 < fold1_size < pop_size so that both "
            f"folds are non-empty; got fold1_size={fold1_size}, "
            f"pop_size={pop_size}"
        )

    fold2_size = pop_size - fold1_size
    # Relative fold sizes do not depend on the split, so compute them once.
    fold1_weight = fold1_size / pop_size
    fold2_weight = fold2_size / pop_size

    E_acc = 0.0
    for pos_in_1 in range(min(pos_in_pop, fold1_size) + 1):
        pr_pos_1 = hypergeom.pmf(pos_in_1, pop_size, pos_in_pop, fold1_size)
        if pr_pos_1 == 0:
            # Splits with zero probability contribute nothing to the expectation.
            continue

        pos_in_2 = pos_in_pop - pos_in_1

        # The priors can be thought of as a constant model.
        f1_prior = pos_in_1 / fold1_size
        f2_prior = pos_in_2 / fold2_size

        # Compute the accuracy on a fold by using the model "trained" on the
        # other fold: predicting positive scores the fold's positive rate,
        # predicting negative scores its complement.
        f2_acc = f2_prior if f1_prior >= 0.5 else (1 - f2_prior)
        f1_acc = f1_prior if f2_prior >= 0.5 else (1 - f1_prior)

        # Update the expectation.  The probability of pos_in_1 positives in the
        # first fold is all that we need because the second fold is constrained
        # by the choice of elements in the first fold.  Here the fold estimates
        # are weighted by the relative fold sizes.
        E_acc += pr_pos_1 * (fold1_weight * f1_acc + fold2_weight * f2_acc)

    return E_acc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment