
@tuzz
Created December 15, 2019 14:48
Playing with 'bandits' from reinforcement learning
NUMBER_OF_BANDITS = 4
EPSILON = 0.001

# Per-bandit state: [estimated value, number of times pulled]
VALUES_AND_OCCURRENCES = NUMBER_OF_BANDITS.times.map { [0, 0] }

# Pick the bandit with the highest estimated value (ties go to the first)
def greedy_action
  VALUES_AND_OCCURRENCES.map.with_index { |(v, o), i| [v, o, i] }.max_by(&:first).last
end

# Pick a bandit uniformly at random
def random_action
  NUMBER_OF_BANDITS.times.to_a.sample
end

# Noisy reward for pulling a bandit; the payout rule changes after 50,000
# turns, making the problem non-stationary
def bandit(action, turns)
  if turns > 50_000
    Math.sin(action) + rand * 5
  else
    Math.cos(action) + rand * 3
  end
end

total_reward = 0
turns = 0

loop do
  # Epsilon-greedy: explore with probability EPSILON, otherwise exploit
  action = rand < EPSILON ? random_action : greedy_action

  reward = bandit(action, turns)
  total_reward += reward
  turns += 1

  # Incremental sample-average update of the chosen bandit's estimated value
  prev_value, occurrences = VALUES_AND_OCCURRENCES[action]
  occurrences += 1
  new_value = prev_value + (1.0 / occurrences) * (reward - prev_value)
  VALUES_AND_OCCURRENCES[action] = [new_value, occurrences]

  puts total_reward.to_f / turns
  puts VALUES_AND_OCCURRENCES.inspect
end
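As a quick sanity check on the update rule the loop relies on (not part of the original gist): the incremental form `Q += (1/n) * (R - Q)` should produce exactly the arithmetic mean of all rewards seen so far, without storing them. A minimal sketch, with a made-up reward sequence:

```ruby
# Hypothetical reward sequence, chosen so the mean is easy to verify
rewards = [2.0, 4.0, 9.0, 1.0]

# Apply the same incremental update the bandit loop uses
value = 0.0
rewards.each.with_index(1) do |reward, n|
  value += (1.0 / n) * (reward - value)
end

# The incrementally maintained value equals the plain running mean
mean = rewards.sum / rewards.size
puts value # => 4.0
puts mean  # => 4.0
```

This is why the script only needs to keep `[value, occurrences]` per bandit rather than a full reward history.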