Basic Q-Learning Algorithm

This is a basic implementation of a Q-learning agent with linear function approximation. It performs moderately well, but does not solve the environment.

The hyperparameters are mostly untuned; the agent would likely perform better if some time were put into tuning them.
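The constants referenced in the script below (NUM_EPISODES, EPISODE_LENGTH, DISCOUNT, EXPLORATION_PROB) are not defined anywhere in the gist; the values here are only an assumed starting point, not necessarily the ones originally used:

# Assumed hyperparameter values -- not from the original gist
NUM_EPISODES = 1000
EPISODE_LENGTH = 200      # CartPole-v0 ends an episode after 200 steps
DISCOUNT = 1.0            # episodes are short, so no discounting
EXPLORATION_PROB = 0.2    # matches the QLearningAlgorithm default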

import gym
from gym import wrappers
import qlearning as qla
import collections

def CartPoleFeatureExtractor(state, action):
    # Round so that features can generalize slightly between nearby states
    state = (round(state[0], 3), round(state[1], 2), round(state[2], 3), round(state[3], 2))
    featureVector = []
    featureVector.append((state, 1))  # indicator for the whole rounded state
    featureVector.append((("f0", state[0], action), 1))
    featureVector.append((("f1", state[1], action), 1))
    featureVector.append((("f2", state[2], action), 1))
    featureVector.append((("f3", state[3], action), 1))
    return featureVector

env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, 'C:\\Users\\Tyler\\Downloads\\tmp\\cart-pole', force=True)
ql = qla.QLearningAlgorithm(env.action_space, DISCOUNT, CartPoleFeatureExtractor, EXPLORATION_PROB)
record = collections.deque(100*[0], 100)  # lengths of the last 100 episodes
for i_episode in range(NUM_EPISODES):
    observation = env.reset()
    for t in range(EPISODE_LENGTH):
        action = ql.getAction(tuple(observation))
        oldObs = observation
        observation, reward, done, info = env.step(action)
        # Negative reward if the agent fails before reaching the episode's time limit
        true_reward = -1 if done and t < EPISODE_LENGTH - 1 else 0
        ql.incorporateFeedback(tuple(oldObs), action, true_reward, tuple(observation))
        if done:  # Print relevant info and start a new episode
            record.append(t + 1)
            print "Episode ", i_episode + 1, " finished after {} timesteps".format(t + 1)
            if i_episode % 10 == 0:
                print "\tRolling Ave: ", float(sum(record)) / max(len(record), 1)
            break
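For clarity, here is roughly what CartPoleFeatureExtractor returns for a hypothetical observation; the numbers are illustrative only. Each feature is a (key, value) pair with value 1, so getQ is just a sum of learned weights, and the first key (the whole rounded state) carries no action, so it contributes equally to both actions.

# Illustrative only: a hypothetical observation and its extracted features
obs = (0.0123, -0.456, 0.0211, 0.349)  # cart position, cart velocity, pole angle, pole tip velocity
print CartPoleFeatureExtractor(obs, 1)
# roughly: [((0.012, -0.46, 0.021, 0.35), 1), (('f0', 0.012, 1), 1),
#           (('f1', -0.46, 1), 1), (('f2', 0.021, 1), 1), (('f3', 0.35, 1), 1)]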
# qlearning.py -- the module imported above as qla
import collections
import random
import math


class RLAlgorithm:
    # Produce an action given a state.
    def getAction(self, state): raise NotImplementedError("Override me")

    # When this function is called, it indicates that taking action |action|
    # in state |state| resulted in reward |reward| and a transition to state |newState|.
    # If |state| is a terminal state, it will be called with (s, a, 0, None).
    def incorporateFeedback(self, state, action, reward, newState): raise NotImplementedError("Override me")


class QLearningAlgorithm(RLAlgorithm):
    def __init__(self, possible_actions, discount, featureExtractor, explorationProb=0.2, stepSizeCoef=0.25):
        self.possible_actions = possible_actions
        self.discount = discount
        self.featureExtractor = featureExtractor
        self.explorationProb = explorationProb
        self.weights = collections.defaultdict(lambda: random.uniform(-0.05, 0.05))
        self.numIters = 0
        self.stepSizeCoef = stepSizeCoef

    # Return the Q value given by the current weights and features.
    def getQ(self, state, action):
        score = 0
        for f, v in self.featureExtractor(state, action):
            score += self.weights[f] * v
        return score

    # Produce an action given a state, using the epsilon-greedy strategy:
    # with probability |explorationProb|, take a random action.
    def getAction(self, state):
        self.numIters += 1
        if random.random() < self.explorationProb:
            return self.possible_actions.sample()
        return max(((self.getQ(state, action), action) for action in range(self.possible_actions.n)), key=lambda x: x[0])[1]

    # Return the step size used to update the weights; it decays with the number of iterations.
    def getStepSize(self):
        return 1.0 / (self.numIters ** self.stepSizeCoef)

    # Called with (s, a, r, s') to update |weights|, using self.getStepSize() as the
    # learning rate and self.getQ() as the current estimate.
    # If s is a terminal state, then s' is None and there is nothing to update.
    def incorporateFeedback(self, state, action, reward, newState):
        if newState is None: return
        Vopt = max(self.getQ(newState, a) for a in range(self.possible_actions.n))
        prediction = self.getQ(state, action)
        for f, v in self.featureExtractor(state, action):
            self.weights[f] -= self.getStepSize() * (prediction - (reward + self.discount * Vopt)) * v
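As a quick sanity check (not part of the original gist), the learned policy could be evaluated back in the CartPole script, after the training loop, by switching off exploration; this reuses only attributes and methods already shown above:

# Assumed usage sketch: greedy evaluation after training, appended to the CartPole script
ql.explorationProb = 0.0  # disable epsilon-greedy exploration
observation = env.reset()
done, total_reward = False, 0
while not done:
    action = ql.getAction(tuple(observation))
    observation, reward, done, info = env.step(action)
    total_reward += reward
print "Greedy episode reward:", total_reward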