@syhw
Created September 1, 2014 09:04
Bandits problem solved with naive algorithms, epsilon-greedy bandit, UCB1, and Bayesian bandits.
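As a rough yardstick for the averages printed at the end: each experiment draws the 10 Bernoulli parameters uniformly from [0, 1] and allows 100 pulls in total, so uniformly random play earns about 50 in expectation (100 pulls at an average p of 0.5), while an oracle that always pulled the best bag would earn about 90.9 (100 times 10/11, the expected maximum of 10 uniform draws).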
import numpy as np
from scipy.stats import bernoulli
N_EXPS = 200 # number of experiments to run (TODO: test 10k or 20k EXPS)
N_BAGS = 10 # number of bags
N_DRAWS = 100 # number of draws
SMOOTHER = 1.E-2 # additive smoothing constant for the count estimates
EPSILON_BANDIT = 0.1 # epsilon-greedy bandit epsilon
EPSILON_NUM = 1.E-9 # numerical epsilon
bags_p = np.random.random((N_EXPS, N_BAGS)) # Bernoulli success probability of each bag, one row of N_BAGS bags per experiment
# total reward accumulated by each strategy over all experiments;
# _m = always exploit the estimated max, _s = sample bags proportionally
mean_s_1 = 0.
mean_s_2_m = 0.
mean_s_2_s = 0.
mean_s_3_m = 0.
mean_s_3_s = 0.
mean_s_4_m = 0.
mean_s_4_s = 0.
mean_s_5 = 0.
mean_s_5_e0 = 0.
mean_s_6 = 0.
mean_s_7 = 0.
for i in range(bags_p.shape[0]):
    # strategy 1 = take 10 draws in each of the 10 bags
    b_1 = [bernoulli.rvs(p, size=N_BAGS) for p in bags_p[i]]
    mean_s_1 += np.sum(b_1)
    # strategy 2 = Laplace law of succession estimation with one sample (stupid)
    b_2 = [bernoulli.rvs(p) for p in bags_p[i]]
    estimator = np.array([e + SMOOTHER for e in b_2])
    estimator /= np.sum(estimator)
    mean_s_2_m += np.sum(b_2)
    mean_s_2_s += np.sum(b_2)
    # still 90 draws:
    # maximal picking (_m)
    mean_s_2_m += np.sum(bernoulli.rvs(bags_p[i, np.argmax(estimator)],
                                       size=N_DRAWS - N_BAGS))
    # sampling (_s)
    mean_s_2_s += np.sum([bernoulli.rvs(np.random.choice(bags_p[i], p=estimator))
                          for _ in range(N_DRAWS - N_BAGS)])
    # strategy 3 = Laplace law of succession estimation with three samples
    b_3 = [bernoulli.rvs(p, size=3) for p in bags_p[i]]
    estimator = np.array([np.sum(e) + SMOOTHER for e in b_3])
    estimator /= np.sum(estimator)
    mean_s_3_m += np.sum(b_3)
    mean_s_3_s += np.sum(b_3)
    # still 70 draws:
    # maximal picking (_m)
    mean_s_3_m += np.sum(bernoulli.rvs(bags_p[i, np.argmax(estimator)],
                                       size=N_DRAWS - 3*N_BAGS))
    # sampling (_s)
    mean_s_3_s += np.sum([bernoulli.rvs(np.random.choice(bags_p[i], p=estimator))
                          for _ in range(N_DRAWS - 3*N_BAGS)])
    # strategy 4 = Laplace law of succession estimation with five samples
    b_4 = [bernoulli.rvs(p, size=5) for p in bags_p[i]]
    estimator = np.array([np.sum(e) + SMOOTHER for e in b_4])
    estimator /= np.sum(estimator)
    mean_s_4_m += np.sum(b_4)
    mean_s_4_s += np.sum(b_4)
    # still 50 draws:
    # maximal picking (_m)
    mean_s_4_m += np.sum(bernoulli.rvs(bags_p[i, np.argmax(estimator)],
                                       size=N_DRAWS - 5*N_BAGS))
    # sampling (_s)
    mean_s_4_s += np.sum([bernoulli.rvs(np.random.choice(bags_p[i], p=estimator))
                          for _ in range(N_DRAWS - 5*N_BAGS)])
    # strategy 5 = epsilon-greedy bandit: explore a random bag with probability
    # EPSILON_BANDIT, otherwise exploit the bag with the best empirical mean
    b_5 = b_2
    mean_s_5 += np.sum(b_5)
    estimator = np.array([e + EPSILON_NUM for e in b_5])  # successes per bag
    counts = np.ones(N_BAGS)  # each bag has been pulled once already
    for _ in range(N_DRAWS - N_BAGS):
        if np.random.random() < EPSILON_BANDIT:
            random_ind = np.random.randint(0, N_BAGS)
            random_sample = bernoulli.rvs(bags_p[i, random_ind])
            estimator[random_ind] += random_sample
            counts[random_ind] += 1
            mean_s_5 += random_sample
        else:
            max_ind = np.argmax(estimator / counts)  # highest empirical mean
            max_sample = bernoulli.rvs(bags_p[i, max_ind])
            estimator[max_ind] += max_sample
            counts[max_ind] += 1
            mean_s_5 += max_sample
    # strategy 5 bis = epsilon-greedy bandit with epsilon = 0 (pure greedy)
    b_5 = b_2
    mean_s_5_e0 += np.sum(b_5)
    estimator = np.array([e + EPSILON_NUM for e in b_5])
    counts = np.ones(N_BAGS)
    for _ in range(N_DRAWS - N_BAGS):
        max_ind = np.argmax(estimator / counts)
        max_sample = bernoulli.rvs(bags_p[i, max_ind])
        estimator[max_ind] += max_sample
        counts[max_ind] += 1
        mean_s_5_e0 += max_sample
    # strategy 6 = UCB1 bandit
    b_6 = b_2
    mean_s_6 += np.sum(b_6)
    estimator = np.array([e + EPSILON_NUM for e in b_6])  # successes per bag
    counts = np.ones(N_BAGS)  # each bag has been pulled once already
    for c in range(N_DRAWS - N_BAGS):
        t = c + N_BAGS + 1.  # total number of pulls made so far
        # UCB1 index: empirical mean + sqrt(2 ln t / n_i)
        best_estimator = [estimator[ii] / counts[ii]
                          + np.sqrt(2. * np.log(t) / counts[ii])
                          for ii in range(N_BAGS)]
        max_ind = np.argmax(best_estimator)
        max_sample = bernoulli.rvs(bags_p[i, max_ind])
        estimator[max_ind] += max_sample
        counts[max_ind] += 1
        mean_s_6 += max_sample
    # strategy 7 = Bayesian bandit: Beta(1, 1) prior on each bag, always pull
    # the bag with the highest posterior mean
    b_7 = b_2
    mean_s_7 += np.sum(b_7)  # we could even start without pulling each once!
    beta_a = np.ones(N_BAGS)
    beta_b = np.ones(N_BAGS)
    for ii, val in enumerate(b_7):
        beta_a[ii] += val
        beta_b[ii] += 1 - val
    for c in range(N_DRAWS - N_BAGS):
        #best_estimator = [np.mean(np.random.beta(beta_a[ii], beta_b[ii],
        #                  size=100)) for ii in range(N_BAGS)]
        best_estimator = [beta_a[ii] / (beta_a[ii] + beta_b[ii])
                          for ii in range(N_BAGS)]
        max_ind = np.argmax(best_estimator)
        max_sample = bernoulli.rvs(bags_p[i, max_ind])
        beta_a[max_ind] += max_sample
        beta_b[max_ind] += 1 - max_sample
        mean_s_7 += max_sample
print("strategy 1 (random/avg):", mean_s_1 / bags_p.shape[0])
print("strategy 2 (estimate (1 sample) and exploit) max:", mean_s_2_m / bags_p.shape[0])
print("strategy 2 (estimate (1 sample) and exploit) sample:", mean_s_2_s / bags_p.shape[0])
print("strategy 3 (estimate (3 samples) and exploit) max:", mean_s_3_m / bags_p.shape[0])
print("strategy 3 (estimate (3 samples) and exploit) sample:", mean_s_3_s / bags_p.shape[0])
print("strategy 4 (estimate (5 samples) and exploit) max:", mean_s_4_m / bags_p.shape[0])
print("strategy 4 (estimate (5 samples) and exploit) sample:", mean_s_4_s / bags_p.shape[0])
print("strategy 5 (bandit epsilon-greedy with epsilon", EPSILON_BANDIT, "):", mean_s_5 / bags_p.shape[0])
print("strategy 5 (bandit epsilon-greedy with epsilon 0):", mean_s_5_e0 / bags_p.shape[0])
print("strategy 6 (bandit UCB1):", mean_s_6 / bags_p.shape[0])
print("strategy 7 (Bayesian bandit):", mean_s_7 / bags_p.shape[0])
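One variant worth sketching: strategy 7 always exploits the posterior mean, and its commented-out lines hint at sampling from the Beta posteriors instead. Drawing a single sample from each bag's posterior every round and pulling the argmax is Thompson sampling. The sketch below is an illustration, not part of the gist; the helper name thompson_run is made up, and it follows the same protocol as the strategies above (one initial pull per bag, then the remaining N_DRAWS - N_BAGS pulls).

import numpy as np
from scipy.stats import bernoulli

def thompson_run(p, n_draws=100):
    # One experiment of Thompson sampling on bags with Bernoulli parameters p,
    # starting with a single pull of each bag (hypothetical extra strategy).
    n_bags = len(p)
    first = bernoulli.rvs(p)                # one pull per bag
    beta_a = np.ones(n_bags) + first        # Beta(1, 1) prior + successes
    beta_b = np.ones(n_bags) + (1 - first)  # ... + failures
    total = np.sum(first)
    for _ in range(n_draws - n_bags):
        draws = np.random.beta(beta_a, beta_b)  # one posterior sample per bag
        ind = np.argmax(draws)
        reward = bernoulli.rvs(p[ind])
        beta_a[ind] += reward
        beta_b[ind] += 1 - reward
        total += reward
    return total

Averaged over experiments, e.g. np.mean([thompson_run(np.random.random(10)) for _ in range(200)]), the result is directly comparable to the strategy 7 line printed above.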