# https://www.geeksforgeeks.org/q-learning-in-python/
import itertools
from collections import defaultdict

import numpy as np

import plotting  # EpisodeStats / plotting helpers from the article linked above
def createPolicy(Q, epsilon, num_actions):
    """Return an epsilon-greedy policy function for the given Q table."""
    def policyFunction(state):
        # The policy function can be a neural net instead of a dictionary from
        # state to action, which is helpful when the state space is too big,
        # e.g. in continuous domains.
        action_probs = np.ones(num_actions, dtype=float) * epsilon / num_actions
        best_action = np.argmax(Q[state])
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policyFunction
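

# Illustrative example of the epsilon-greedy distribution produced above
# (assumed numbers, not part of the original gist): with num_actions = 4,
# epsilon = 0.1 and best_action = 2, every action gets epsilon / 4 = 0.025 and
# the greedy action gets an extra 1 - epsilon on top, so
# action_probs = [0.025, 0.025, 0.925, 0.025].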


def qLearning(env, num_episodes, discount_factor=1.0, alpha=0.6, epsilon=0.1):
    """Off-policy TD control: learn the greedy policy while following an
    epsilon-greedy behavior policy."""
    # Q maps each state to an array of action values, lazily initialized to zeros.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Track useful per-episode statistics.
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    policy = createPolicy(Q, epsilon, env.action_space.n)

    for ith_episode in range(num_episodes):
        # Instead of a plain reset(), domain randomization could go here;
        # the environment config would then also need to be logged.
        state = env.reset()

        for t in itertools.count():
            # Epsilon-greedy action selection.
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = t

            # TD update: bootstrap off the greedy action in the next state.
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

    return Q, stats
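

# Minimal usage sketch (not part of the original gist): it assumes the
# windy_gridworld and plotting helper modules used by the article linked at
# the top are importable; the module and function names below are assumptions.
if __name__ == "__main__":
    from windy_gridworld import WindyGridworldEnv  # assumed helper module

    env = WindyGridworldEnv()
    Q, stats = qLearning(env, num_episodes=1000)
    plotting.plot_episode_stats(stats)  # assumed helper from the plotting module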