@3Nigma · Created February 12, 2014 21:03
Octave/MATLAB action-value method applied to the n-armed bandit problem, solved with an incremental epsilon-greedy approach
# The optimization consists of eliminating the 'Qsum' variable needed in the naive case and
# updating 'Qavg' directly, using the simple mathematical observation that Rich Sutton
# elegantly demonstrates in section 2.4 of Sutton & Barto's book.
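# (For reference, the observation is the incremental sample-average update
#    Q_{n+1} = (1/(n+1)) * sum_{i=1..n+1} R_i = Q_n + (1/(n+1)) * (R_{n+1} - Q_n)
# which is exactly the form applied to 'Qavg' inside the loop below.)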
function totalRewards = simulate_epsg_n_bandit(n, eps, gamecnt, rollcnt)
  # accumulate the reward earned at each step, summed over all games;
  # divide by 'gamecnt' afterwards to obtain the average learning curve
  totalRewards = zeros(1, rollcnt);
  for k = 1:gamecnt
    # generate the true action values [q_*(a)] from a normal distribution
    # of mean 0 and variance 1
    q = randn(1, n);
    Qavg = zeros(1, n);    # estimated action values [Q_t(a)]
    nPulls = zeros(1, n);  # number of times each arm has been pulled
    for i = 1:rollcnt
      if (unifrnd(0, 1) <= 1 - eps)
        # exploitation step: pick a greedy action, breaking ties at random
        idsQmax = find(Qavg == max(Qavg));
        iQstep = idsQmax(unidrnd(length(idsQmax)));
      else
        # exploration step: pick an arm uniformly at random
        iQstep = unidrnd(n);
      endif
      # reward is the true action value plus unit-variance Gaussian noise
      Rk = q(iQstep) + normrnd(0, 1);
      totalRewards(i) = totalRewards(i) + Rk;
      nPulls(iQstep) = nPulls(iQstep) + 1;
      # incremental sample-average update: Q_{n+1} = Q_n + (R_n - Q_n)/n
      Qavg(iQstep) = Qavg(iQstep) + (Rk - Qavg(iQstep)) / nPulls(iQstep);
    endfor
  endfor
endfunction
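
A minimal usage sketch follows; the parameter values are illustrative assumptions in the spirit of the classic 10-armed testbed, not part of the gist itself:

# run 2000 independent 10-armed bandit games of 1000 pulls each, with eps = 0.1
totalR = simulate_epsg_n_bandit(10, 0.1, 2000, 1000);
# average over games to obtain the familiar learning curve
plot(totalR / 2000);
xlabel("Steps"); ylabel("Average reward");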