@renan-cunha
Forked from edwardyu/agent.py
Created August 30, 2019 13:49
Implementation of policy gradient (REINFORCE) from scratch: a linear softmax agent trained on CartPole-v1.
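In rough terms, the agent below implements the episodic REINFORCE update

    \theta \leftarrow \theta + \alpha \, G_t \, \nabla_\theta \log \pi_\theta(a_t \mid s_t),
    \qquad
    \nabla_\theta \log \pi_\theta(a \mid s) = \phi(s, a) - \sum_b \pi_\theta(b \mid s)\, \phi(s, b),

where G_t is the discounted return from step t onward and phi(s, a) places the state vector into the block of parameters belonging to action a. The second identity is the standard log-softmax gradient for a policy that is linear in its features; _gradient below computes exactly this difference, and train applies one update per visited time step.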
# agent.py
import random

import numpy as np


class LinearSoftmaxAgent(object):
    """REINFORCE agent with a linear softmax policy.
    The feature vector phi(s, a) places the state vector in the block
    belonging to action a (zeros elsewhere), so theta has
    state_size * action_size entries."""

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.states = []
        self.actions = []
        self.probs = []
        self.rewards = []
        self.theta = np.random.random(state_size * action_size)
        self.alpha = .01   # learning rate
        self.gamma = .99   # discount factor

    def store(self, state, action, prob, reward):
        """Record one transition of the current episode."""
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(prob)
        self.rewards.append(reward)

    def _phi(self, s, a):
        """Feature vector: the state copied into the row of action a, flattened."""
        encoded = np.zeros([self.action_size, self.state_size])
        encoded[a] = s
        return encoded.flatten()

    def _softmax(self, s, a):
        """Unnormalized action weight exp(theta . phi(s, a) / T), temperature T = 100."""
        return np.exp(self.theta.dot(self._phi(s, a)) / 100)

    def pi(self, s):
        """pi(a | s): softmax distribution over actions."""
        weights = np.empty(self.action_size)
        for a in range(self.action_size):
            weights[a] = self._softmax(s, a)
        return weights / np.sum(weights)

    def act(self, state):
        """Sample an action from pi(. | state) and return it with its probability."""
        probs = self.pi(state)
        a = random.choices(range(self.action_size), weights=probs)[0]
        return a, probs[a]

    def _gradient(self, s, a):
        """Gradient of log pi(a | s): phi(s, a) - sum_b pi(b | s) phi(s, b)."""
        probs = self.pi(s)
        expected = np.zeros_like(self.theta)
        for b in range(self.action_size):
            expected += probs[b] * self._phi(s, b)
        return self._phi(s, a) - expected

    def _R(self, t):
        """Discounted return from time step t to the end of the episode."""
        total = 0
        for tau in range(t, len(self.rewards)):
            total += self.gamma ** (tau - t) * self.rewards[tau]
        return total

    def train(self):
        """REINFORCE update over the stored episode, then clear the buffers."""
        # Normalize the rewards; convert the list to an array first (the
        # in-place -= / /= on a plain list would raise a TypeError), and add
        # a small epsilon so a single-step episode does not divide by zero.
        rewards = np.array(self.rewards, dtype=float)
        self.rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
        for t in range(len(self.states)):
            s = self.states[t]
            a = self.actions[t]
            r = self._R(t)
            grad = self._gradient(s, a)
            self.theta = self.theta + self.alpha * r * grad
        self.states = []
        self.actions = []
        self.probs = []
        self.rewards = []

    def getName(self):
        return 'LinearSoftmaxAgent'

    def save(self):
        pass
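A quick sanity check of the agent in isolation (not part of the original gist; the state vector and sizes below are arbitrary example values, and the class is assumed importable via from agent import LinearSoftmaxAgent):

    import numpy as np
    from agent import LinearSoftmaxAgent

    check = LinearSoftmaxAgent(state_size=4, action_size=2)
    s = np.array([0.1, -0.2, 0.05, 0.3])

    probs = check.pi(s)
    print(probs, probs.sum())           # a valid distribution: sums to 1

    action, p = check.act(s)
    log_p_before = np.log(check.pi(s)[action])
    # One gradient step as if the return for this step were +1.
    check.theta = check.theta + check.alpha * 1.0 * check._gradient(s, action)
    log_p_after = np.log(check.pi(s)[action])
    print(log_p_before, log_p_after)    # a positive return nudges log pi(a | s) upward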
# Training script: runs the agent above on CartPole-v1.
import gym

import agent

SAVE_FREQUENCY = 10
MAX_EPISODES = 10000

env = gym.make('CartPole-v1')
state = env.reset()
score = 0
episode = 0
state_size = 4                      # CartPole observations are 4-vectors
action_size = env.action_space.n
g = agent.LinearSoftmaxAgent(state_size, action_size)

while episode < MAX_EPISODES:       # episode loop
    # env.render()
    action, prob = g.act(state)
    next_state, reward, done, info = env.step(action)  # step with the sampled action
    if done:
        reward = -10                # penalize the terminal (pole fell / out of bounds) step
    score += reward
    g.store(state, action, prob, reward)  # store the state the action was taken in
    state = next_state
    if done:
        episode += 1
        g.train()                   # REINFORCE update on the finished episode
        print('Episode: {} Score: {}'.format(episode, score))
        score = 0
        state = env.reset()
        if episode % SAVE_FREQUENCY == 0:
            g.save()
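save() is left as a stub in the gist. A minimal sketch of persisting the learned weights, if you want the periodic save to do something (the file name is a hypothetical choice, not from the gist):

    import numpy as np

    def save(self):
        # Dump the policy parameters; reload later with np.load before acting.
        np.save('linear_softmax_theta.npy', self.theta)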