Last active
January 3, 2016 20:39
-
-
Save xiaohan2012/8516733 to your computer and use it in GitHub Desktop.
Using the epsilon-greedy strategy to generate the n-armed bandit testbed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [] = n_armed_testbed(nB,nA,nP,sigma) | |
% | |
% Generates the 10-armed bandit testbed. | |
% | |
% Inputs: | |
% nB: the number of bandits | |
% nA: the number of arms | |
% nP: the number of plays (times we will pull a arm) | |
% sigma: the standard deviation of the return from each of the arms | |
% | |
% Written by: | |
% -- | |
% John L. Weatherwax 2007-11-13 | |
% | |
% email: wax@alum.mit.edu | |
% | |
% Please send comments and especially bug reports to the | |
% above email address. | |
% | |
%----- | |
%close all; | |
%clc; | |
%clear; | |
if( nargin<1 ) % the number of bandits: | |
nB = 2000; | |
end | |
if( nargin<2 ) % the number of arms: | |
nA = 10; | |
end | |
if( nargin<3 ) % the number of plays (times we will pull a arm): | |
nP = 10000; | |
end | |
if( nargin<4 ) % the standard deviation of the return from each of the arms: | |
sigma = 1.0; | |
end | |
randn('seed',0); | |
% generate the TRUE reward Q^{\star}: | |
qStarMeans = mvnrnd( zeros(nB,nA), eye(nA) ); | |
% run an experiment for each epsilon: | |
% 0 => fully greedy | |
% 1 => explore on each trial | |
epsArray = [ 0, 0.01, 0.1 ]; %, 1 ]; | |
% assume we have at least ONE draw from each "arm" (initialize with use the qStarMeans matrix): | |
qT0 = mvnrnd( qStarMeans, eye(nA) ); | |
avgReward = zeros(length(epsArray),nP); | |
perOptAction = zeros(length(epsArray),nP); | |
cumReward = zeros(length(epsArray),nP); | |
cumProb = zeros(length(epsArray),nP); | |
for ei=1:length(epsArray), | |
tEps = epsArray(ei); | |
%qT = qT0; % <- initialize to one draw per arm | |
qT = zeros(size(qT0)); % <- initialize to zero draws per arm (no knowledge) | |
qN = ones( nB, nA ); % keep track of the number draws on this arm | |
qS = qT; % keep track of the SUM of the rewards (qT = qS./qN) | |
allRewards = zeros(nB,nP); | |
pickedMaxAction = zeros(nB,nP); | |
for bi=1:nB, % pick a bandit | |
for pi=1:nP, % make a play | |
% determine if this move is exploritory or greedy: | |
if( rand(1) <= tEps ) % pick a RANDOM arm: | |
[dum,arm] = histc(rand(1),linspace(0,1+eps,nA+1)); clear dum; | |
else % pick the GREEDY arm: | |
[dum,arm] = max( qT(bi,:) ); clear dum; | |
end | |
% determine if the arm selected is the best possible: | |
[dum,bestArm] = max( qStarMeans(bi,:) ); | |
if( arm==bestArm ) pickedMaxAction(bi,pi) = 1; end | |
% get the reward from drawing on that arm: | |
reward = qStarMeans(bi,arm) + sigma*randn(1); %introduce some noise, or the pure greedy approach would be the best | |
allRewards(bi,pi) = reward; | |
% update qN,qS,qT: | |
qN(bi,arm) = qN(bi,arm)+1; | |
qS(bi,arm) = qS(bi,arm)+reward; | |
qT(bi,arm) = qS(bi,arm)/qN(bi,arm); | |
end | |
end | |
avgRew = mean(allRewards,1); | |
avgReward(ei,:) = avgRew(:).'; | |
percentOptAction = mean(pickedMaxAction,1); | |
perOptAction(ei,:) = percentOptAction(:).'; | |
csAR = cumsum(allRewards,2); % do a cummulative sum across plays for each bandit | |
csRew = mean(csAR,1); | |
cumReward(ei,:) = csRew(:).'; | |
csPA = cumsum(pickedMaxAction,2)./cumsum(ones(size(pickedMaxAction)),2); | |
csProb = mean(csPA,1); | |
cumProb(ei,:) = csProb(:).'; | |
end | |
% produce the average rewards plot: | |
% | |
figure; hold on; clrStr = 'brkc'; all_hnds = []; | |
for ei=1:length(epsArray), | |
%all_hnds(ei) = plot( [ 0, avgReward(ei,:) ], [clrStr(ei)] ); | |
all_hnds(ei) = plot( 1:nP, avgReward(ei,:), [clrStr(ei),'-'] ); | |
end | |
legend( all_hnds, { '0', '0.01', '0.1' }, 'Location', 'SouthEast' ); | |
axis tight; grid on; | |
xlabel( 'plays' ); ylabel( 'Average Reward' ); | |
% produce the percent optimal action plot: | |
% | |
figure; hold on; clrStr = 'brkc'; all_hnds = []; | |
for ei=1:length(epsArray), | |
%all_hnds(ei) = plot( [ 0, avgReward(ei,:) ], [clrStr(ei)] ); | |
all_hnds(ei) = plot( 1:nP, perOptAction(ei,:), [clrStr(ei),'-'] ); | |
end | |
legend( all_hnds, { '0', '0.01', '0.1' }, 'Location', 'SouthEast' ); | |
axis( [ 0, nP, 0, 1 ] ); axis tight; grid on; | |
xlabel( 'plays' ); ylabel( '% Optimal Action' ); | |
% produce the cummulative average rewards plot: | |
% | |
figure; hold on; clrStr = 'brkc'; all_hnds = []; | |
for ei=1:length(epsArray), | |
all_hnds(ei) = plot( 1:nP, cumReward(ei,:), [clrStr(ei),'-'] ); | |
end | |
legend( all_hnds, { '0', '0.01', '0.1' }, 'Location', 'SouthEast' ); | |
axis tight; grid on; | |
xlabel( 'plays' ); ylabel( 'Cummulative Average Reward' ); | |
% produce the cummulative percent optimal action plot: | |
% | |
figure; hold on; clrStr = 'brkc'; all_hnds = []; | |
for ei=1:length(epsArray), | |
all_hnds(ei) = plot( 1:nP, cumProb(ei,:), [clrStr(ei),'-'] ); | |
end | |
legend( all_hnds, { '0', '0.01', '0.1' }, 'Location', 'SouthEast' ); | |
axis( [ 0, nP, 0, 1 ] ); axis tight; grid on; | |
xlabel( 'plays' ); ylabel( 'Cummulative % Optimal Action' ); |
Some good ways to plot graphs with legends
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thing learned: