# https://www.geeksforgeeks.org/q-learning-in-python/
import itertools
from collections import defaultdict

import numpy as np

import plotting  # EpisodeStats / plotting helpers from the article linked above
def createPolicy(Q, epsilon, num_actions):
    """Return an epsilon-greedy policy function for the given Q table."""
    def policyFunction(state):
        # The policy function can be a neural net instead of a dictionary from
        # state to action, which is helpful when the state space is too big,
        # e.g. in continuous domains.
        action_probs = np.ones(num_actions, dtype=float) * epsilon / num_actions
        best_action = np.argmax(Q[state])
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policyFunction
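

# Illustrative example of the epsilon-greedy distribution produced above
# (assumed numbers, not part of the original gist): with num_actions = 4,
# epsilon = 0.1 and best_action = 2, every action gets epsilon / 4 = 0.025 and
# the greedy action gets an extra 1 - epsilon on top, so
# action_probs = [0.025, 0.025, 0.925, 0.025].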


def qLearning(env, num_episodes, discount_factor=1.0, alpha=0.6, epsilon=0.1):
    """Off-policy TD control: learn the greedy policy while following an
    epsilon-greedy behavior policy."""
    # Q maps each state to an array of action values, lazily initialized to zeros.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Track useful per-episode statistics.
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    policy = createPolicy(Q, epsilon, env.action_space.n)

    for ith_episode in range(num_episodes):
        # Instead of a plain reset(), domain randomization could go here;
        # the environment config would then also need to be logged.
        state = env.reset()

        for t in itertools.count():
            # Epsilon-greedy action selection.
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = t

            # TD update: bootstrap off the greedy action in the next state.
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

    return Q, stats
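

# Minimal usage sketch (not part of the original gist): it assumes the
# windy_gridworld and plotting helper modules used by the article linked at
# the top are importable; the module and function names below are assumptions.
if __name__ == "__main__":
    from windy_gridworld import WindyGridworldEnv  # assumed helper module

    env = WindyGridworldEnv()
    Q, stats = qLearning(env, num_episodes=1000)
    plotting.plot_episode_stats(stats)  # assumed helper from the plotting module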