Implementation of policy gradient from scratch.
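The agent below implements REINFORCE (Monte Carlo policy gradient): after each finished episode, every visited step t contributes an update along the log-likelihood gradient of the action taken, scaled by the discounted return from that step:

    \theta \leftarrow \theta + \alpha R_t \nabla_\theta \log \pi_\theta(a_t | s_t)

For a linear softmax policy the score function has the closed form \nabla_\theta \log \pi_\theta(a | s) = \phi(s, a) - \sum_b \pi_\theta(b | s) \phi(s, b), which is exactly what _gradient below computes.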
import random

import numpy as np


class LinearSoftmaxAgent(object):
    """Act with a softmax policy over linear features.
    phi(s, a) copies the state vector into the block belonging to
    action a and leaves every other block zero."""

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.states = []
        self.actions = []
        self.probs = []
        self.rewards = []
        self.theta = np.random.random(state_size * action_size)
        self.alpha = .01   # learning rate
        self.gamma = .99   # discount factor

    def store(self, state, action, prob, reward):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(prob)
        self.rewards.append(reward)

    def _phi(self, s, a):
        # One block per action; only the chosen action's block carries
        # the state, so theta learns a separate weight vector per action.
        encoded = np.zeros([self.action_size, self.state_size])
        encoded[a] = s
        return encoded.flatten()

    def _softmax(self, s, a):
        # Unnormalized action preference; the /100 acts as a temperature
        # and keeps exp() from overflowing as theta grows.
        return np.exp(self.theta.dot(self._phi(s, a)) / 100)

    def pi(self, s):
        r"""\pi(a | s): softmax distribution over actions."""
        weights = np.empty(self.action_size)
        for a in range(self.action_size):
            weights[a] = self._softmax(s, a)
        return weights / np.sum(weights)

    def act(self, state):
        probs = self.pi(state)
        a = random.choices(range(self.action_size), weights=probs)[0]
        return a, probs[a]

    def _gradient(self, s, a):
        # grad log pi(a|s) = phi(s, a) - sum_b pi(b|s) phi(s, b).
        # The 1/100 temperature factor from _softmax is omitted here;
        # it is effectively absorbed into the learning rate alpha.
        expected = 0
        probs = self.pi(s)
        for b in range(self.action_size):
            expected += probs[b] * self._phi(s, b)
        return self._phi(s, a) - expected

    def _R(self, t):
        """Discounted return from step t to the end of the episode."""
        total = 0
        for tau in range(t, len(self.rewards)):
            total += self.gamma**(tau - t) * self.rewards[tau]
        return total

    def train(self):
        # Standardize rewards to reduce the variance of the update.
        self.rewards = np.array(self.rewards, dtype=float)
        self.rewards -= np.mean(self.rewards)
        self.rewards /= np.std(self.rewards)
        for t in range(len(self.states)):
            s = self.states[t]
            a = self.actions[t]
            r = self._R(t)
            grad = self._gradient(s, a)
            self.theta = self.theta + self.alpha * r * grad
        self.states = []
        self.actions = []
        self.probs = []
        self.rewards = []

    def getName(self):
        return 'LinearSoftmaxAgent'

    def save(self):
        pass
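A quick sanity check, not part of the original gist, is to compare _gradient against a central-difference estimate of log \pi. A minimal sketch (the seed, test state, and tolerance are illustrative assumptions, and it assumes the class above is saved as agent.py):

import numpy as np
from agent import LinearSoftmaxAgent

np.random.seed(0)
ag = LinearSoftmaxAgent(state_size=4, action_size=2)
s = np.random.random(4)
a = 0
eps = 1e-6
numeric = np.empty_like(ag.theta)
for i in range(len(ag.theta)):
    # Central finite difference of log pi(a|s) w.r.t. theta[i].
    ag.theta[i] += eps
    up = np.log(ag.pi(s)[a])
    ag.theta[i] -= 2 * eps
    down = np.log(ag.pi(s)[a])
    ag.theta[i] += eps
    numeric[i] = (up - down) / (2 * eps)

# _softmax divides the preference by 100, so the exact gradient of
# log pi carries a 1/100 factor that _gradient folds into alpha.
assert np.allclose(numeric, ag._gradient(s, a) / 100.0, atol=1e-6)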
import gym

import agent

SAVE_FREQUENCY = 10
MAX_EPISODES = 10000

env = gym.make('CartPole-v1')
state = env.reset()
score = 0
episode = 0
state_size = 4
action_size = env.action_space.n
g = agent.LinearSoftmaxAgent(state_size, action_size)

while episode < MAX_EPISODES:  # episode loop
    # env.render()
    action, prob = g.act(state)
    state, reward, done, info = env.step(action)  # act according to the policy
    if done:
        reward = -10  # penalize the terminal transition
    score += reward
    g.store(state, action, prob, reward)
    if done:
        episode += 1
        g.train()
        print('Episode: {} Score: {}'.format(episode, score))
        score = 0
        state = env.reset()
        if episode % SAVE_FREQUENCY == 0:
            g.save()
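The loop above targets the classic Gym API, where reset() returns only the observation and step() returns a 4-tuple. Under Gymnasium (or gym >= 0.26) the environment calls change signature; a minimal sketch of the affected lines, assuming the gymnasium package is installed:

import gymnasium as gym

env = gym.make('CartPole-v1')
state, info = env.reset()  # reset() now returns (observation, info)

# inside the episode loop:
state, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated  # step() splits done into two flags

Everything else in the loop stays the same.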