# deaktator/hypergeom_sklearn_imp_wt.py

## hypergeom_sklearn_imp_wt.py
from scipy.stats import hypergeom

# Run the following to see the graph:
#     for i in range(1, 400):
#         print(f"{i}\t{exp_test_acc(400, 201, i)}")
#
def exp_test_acc(pop_size, pos_in_pop, fold1_size):
    """Return the expected two-fold cross-validation accuracy of a constant model.

    A population of ``pop_size`` items, ``pos_in_pop`` of them positive, is
    split into a first fold of ``fold1_size`` items and a second fold holding
    the remaining ``pop_size - fold1_size`` items.  The number of positives in
    the first fold is hypergeometrically distributed.  For every possible
    count, each fold's positive rate acts as a constant model (predict
    positive when the rate is at least 0.5) that is scored on the other fold;
    the two accuracies are combined weighted by relative fold size and the
    result is averaged over the distribution.

    Args:
        pop_size: Total number of items in the population.
        pos_in_pop: Number of positive items in the population.
        fold1_size: Number of items placed in the first fold.  Both folds
            must be non-empty (``0 < fold1_size < pop_size``) because the
            per-fold positive rates divide by the fold sizes.

    Returns:
        The expected size-weighted accuracy over the two folds.
    """
    fold2_size = pop_size - fold1_size

    # Evaluate the pmf for every candidate count in a single vectorized call
    # rather than paying SciPy's per-call overhead once per loop iteration.
    candidates = range(min(pos_in_pop, fold1_size) + 1)
    probs = hypergeom.pmf(candidates, pop_size, pos_in_pop, fold1_size)

    E_acc = 0.0
    for pos_in_1, pr_pos_1 in zip(candidates, probs):
        # Zero-probability counts are infeasible splits (e.g. more positives
        # left over than the second fold can hold), so they are skipped.
        if pr_pos_1 != 0:
            pos_in_2 = pos_in_pop - pos_in_1

            # The priors can be thought of as a constant model.
            f1_prior = pos_in_1 / fold1_size
            f2_prior = pos_in_2 / fold2_size

            # Compute the accuracy on a fold by using the model "trained" on the other fold.
            f2_acc = f2_prior if f1_prior >= 0.5 else (1 - f2_prior)
            f1_acc = f1_prior if f2_prior >= 0.5 else (1 - f1_prior)

            # Update the expectation.  The probability of pos_in_1 positives in the first
            # fold is all that we need because the second fold is constrained by the choice
            # of elements in the first fold.  Here the fold estimates are weighted by the
            # relative fold sizes.
            E_acc += pr_pos_1 * ((fold1_size / pop_size) * f1_acc + (fold2_size / pop_size) * f2_acc)
    return E_acc
	from scipy.stats import hypergeom

	# Run the following to see the graph:
	#     for i in range(1, 400):
	#         print(f"{i}\t{exp_test_acc(400, 201, i)}")
	#
	def exp_test_acc(pop_size, pos_in_pop, fold1_size):
		"""Return the expected two-fold cross-validation accuracy of a constant model.

		A population of ``pop_size`` items, ``pos_in_pop`` of them positive, is
		split into a first fold of ``fold1_size`` items and a second fold holding
		the remaining ``pop_size - fold1_size`` items.  The number of positives in
		the first fold is hypergeometrically distributed.  For every possible
		count, each fold's positive rate acts as a constant model (predict
		positive when the rate is at least 0.5) that is scored on the other fold;
		the two accuracies are combined weighted by relative fold size and the
		result is averaged over the distribution.

		Args:
			pop_size: Total number of items in the population.
			pos_in_pop: Number of positive items in the population.
			fold1_size: Number of items placed in the first fold.  Both folds
				must be non-empty (``0 < fold1_size < pop_size``) because the
				per-fold positive rates divide by the fold sizes.

		Returns:
			The expected size-weighted accuracy over the two folds.
		"""
		fold2_size = pop_size - fold1_size

		# Evaluate the pmf for every candidate count in a single vectorized call
		# rather than paying SciPy's per-call overhead once per loop iteration.
		candidates = range(min(pos_in_pop, fold1_size) + 1)
		probs = hypergeom.pmf(candidates, pop_size, pos_in_pop, fold1_size)

		E_acc = 0.0
		for pos_in_1, pr_pos_1 in zip(candidates, probs):
			# Zero-probability counts are infeasible splits (e.g. more positives
			# left over than the second fold can hold), so they are skipped.
			if pr_pos_1 != 0:
				pos_in_2 = pos_in_pop - pos_in_1

				# The priors can be thought of as a constant model.
				f1_prior = pos_in_1 / fold1_size
				f2_prior = pos_in_2 / fold2_size

				# Compute the accuracy on a fold by using the model "trained" on the other fold.
				f2_acc = f2_prior if f1_prior >= 0.5 else (1 - f2_prior)
				f1_acc = f1_prior if f2_prior >= 0.5 else (1 - f1_prior)

				# Update the expectation. The probability of pos_in_1 positives in the first
				# fold is all that we need because the second fold is constrained by the choice
				# of elements in the first fold. Here the fold estimates are weighted by the
				# relative fold sizes.
				E_acc += pr_pos_1 * ((fold1_size / pop_size) * f1_acc + (fold2_size / pop_size) * f2_acc)
		return E_acc