# Skeleton pseudocode for implementation of Evolved Policy Gradients
# Original paper: https://arxiv.org/pdf/1802.04821.pdf
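# High-level structure (summarised from the paper): an outer loop uses evolution
# strategies to adjust the parameters phi of a learned loss function, while inner-loop
# workers train policy-gradient agents against that (perturbed) loss; the returns the
# trained agents achieve serve as the fitness signal for the outer loop.
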
import numpy as np

# Hyperparameters
lr_delta = 0.01        # outer-loop (evolution strategies) learning rate
lr_alpha = 0.01        # inner-loop learning rate for the agent's policy update
noise_stddev = 0.5     # standard deviation of the ES perturbation noise
K = 10                 # inner-loop episodes per worker
discount_factor = 0.5
phi_dim = 10           # dimensionality of the loss-function parameters phi
phi = np.random.normal(0, 1, phi_dim)  # some initialisation of the loss-function parameters
num_epochs = 1000
timesteps = 1000       # timesteps per episode
num_workers = 1000

def main():
    for e in range(0, num_epochs):
        for w in range(0, num_workers):
            # Perturb the loss-function parameters phi with Gaussian noise (ES style)
            e_w = np.random.normal(0, 1, phi_dim)
            inner_loss_function_parameter = phi + (noise_stddev * e_w)
            # Generate random environment
            for j in range(0, K):
                state = get_state(-1)
                rewards = []
                for t in range(0, timesteps):
                    action = sample_action(state)
                    reward = get_reward(action)
                    rewards.append(reward)
                undiscounted_reward = np.sum(rewards)
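                # A hedged alternative (assumption, not in the original gist):
                # discount_factor is otherwise unused in this skeleton; a discounted
                # return could be computed instead:
                discounted_reward = np.sum(
                    [(discount_factor ** t) * r for t, r in enumerate(rewards)]
                )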
                # Update policy parameter theta via perturbed inner loss function
                # (the loss function parametrised by inner_loss_function_parameter)
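                # A hedged sketch (assumption, not in the original gist) of this step:
                # one gradient step on hypothetical policy parameters theta against the
                # perturbed learned loss, where inner_loss_grad is a hypothetical helper
                # that differentiates the loss parameterised by
                # inner_loss_function_parameter with respect to theta:
                #   theta = theta - lr_alpha * inner_loss_grad(
                #       theta, rewards, inner_loss_function_parameter)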
            # Compute final return
        # Update parameter phi for outer loss function
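
# A minimal sketch (not part of the original gist) of one possible outer-loop update
# for phi, using a vanilla evolution-strategies gradient estimate. It assumes the
# per-worker noise vectors (epsilons) and the corresponding final returns (returns)
# have been collected during the epoch above.
def es_update(phi, epsilons, returns, lr_delta, noise_stddev):
    epsilons = np.asarray(epsilons)    # shape: (num_workers, phi_dim)
    returns = np.asarray(returns)      # shape: (num_workers,)
    # grad ~ (1 / (num_workers * sigma)) * sum_w F_w * epsilon_w
    grad_estimate = (returns[:, None] * epsilons).sum(axis=0) / (len(returns) * noise_stddev)
    return phi + lr_delta * grad_estimate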

def get_state(timestep):
    # Return the state object at the given timestep (environment-specific stub)
    return None

def sample_action(state):
    # Sample an action from the policy gradient agent (stub)
    return None

def get_reward(action, environment=None):
    # Play the action in the environment, return the reward (stub)
    return 0

if __name__ == "__main__":
    main()