@gustafsson
Forked from karpathy/pg-pong.py
Created February 23, 2019 20:39
Training a Neural Network ATARI Pong agent with Policy Gradients from raw pixels
# Translation of @karpathy's pg-pong.py from Python to Julia
# https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
# Note that it takes several days to learn a policy that wins most games.
# Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym through Gym.jl and PyCall.jl.
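# At a high level: each frame is preprocessed into an 80x80 difference image, fed through a
# two-layer network to get the probability of taking action 2 (one of the two paddle moves),
# and an action is sampled from that probability. Observations, hidden states, "fake label"
# gradients and rewards are buffered until the episode ends; then discounted, standardized
# rewards modulate the gradients and RMSProp applies an update every batch_size episodes.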
using Gym
using Serialization
using StatsBase: mean, std
# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = false # resume from previous checkpoint?
dorender = false
# model initialization
D = 80 * 80 # input dimensionality: 80x80 grid
if resume
    model = deserialize("save.jls")
else
    model = Dict("W1" => randn(H, D) / sqrt(D), # "Xavier" initialization
                 "W2" => randn(H) / sqrt(H))
end
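# Shapes: model["W1"] is an H×D matrix mapping the flattened 80×80 difference image
# (D = 6400) to H hidden units, and model["W2"] is a length-H vector producing a single logit.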
zeros_like(x) = fill!(similar(x), zero(eltype(x)))
grad_buffer = Dict( k => zeros_like(v) for (k,v) in model ) # update buffers that add up gradients over a batch
rmsprop_cache = Dict( k => zeros_like(v) for (k,v) in model ) # rmsprop memory
sigmoid(x) = 1.0 / (1.0 + exp(-x)) # sigmoid "squashing" function to interval [0,1]
ravel(x) = reshape(x, :)
"""prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector"""
function prepro(I)
    I = I[36:195, :, :]        # crop
    I = I[1:2:end, 1:2:end, 1] # downsample by factor of 2
    I[I .== 144] .= 0          # erase background (background type 1)
    I[I .== 109] .= 0          # erase background (background type 2)
    I[I .!= 0] .= 1            # everything else (paddles, ball) just set to 1
    return convert.(Float64, I) |> ravel
end
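# Rough sanity check (sketch; the random frame below is only a stand-in for a real observation):
#   prepro(rand(UInt8, 210, 160, 3))
# should return a 6400-element Vector{Float64} containing only 0.0 and 1.0.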
"""take 1D float array of rewards and compute discounted reward"""
function discount_rewards(r)
    discounted_r = similar(r)
    running_add = 0.0
    for t in length(r):-1:1
        if r[t] != 0 running_add = 0.0 end # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    end
    return discounted_r
end
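# Worked example (gamma = 0.99): for a rally that ends in a win,
#   discount_rewards([0.0, 0.0, 1.0]) ≈ [0.9801, 0.99, 1.0]
# i.e. every step is credited with the discounted outcome of the rally it belongs to,
# and the running sum resets at each nonzero reward (a game boundary in Pong).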
function policy_forward(x)
    h = model["W1"] * x
    h[h .< 0] .= 0 # ReLU nonlinearity
    logp = model["W2"]' * h
    p = sigmoid(logp)
    return p, h # return probability of taking action 2, and hidden state
end
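# In equation form the policy is p(action = 2 | x) = σ(W2 ⋅ relu(W1 x)):
# a single-hidden-layer network whose sigmoid output is the probability of one
# of the two paddle actions used here (2 versus 3).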
"""backward pass. (eph is array of intermediate hidden states)"""
function policy_backward(epx, eph, epdlogp)
    dW2 = (eph' * epdlogp) |> ravel
    dh = epdlogp * model["W2"]'
    dh[eph .<= 0] .= 0 # backprop ReLU
    dW1 = dh' * epx
    return Dict("W1" => dW1, "W2" => dW2)
end
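# Gradient sketch: with logit z = W2 ⋅ h and dlogp = (y - p) = ∂log p(y|x)/∂z for the sigmoid,
# the per-step gradients are dW2 = h * dlogp and dW1 = (W2 * dlogp, zeroed where h ≤ 0) * xᵀ;
# the two matrix products above compute exactly that, batched over all steps of the episode.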
function train()
    env = GymEnv("Pong-v0")
    observation = reset!(env)
    prev_x = nothing # used in computing the difference frame
    xs, hs, dlogps, drs = [], [], [], []
    running_reward = nothing
    reward_sum = 0
    episode_number = 0
    while true
        if dorender render(env) end

        # preprocess the observation, set input to network to be difference image
        cur_x = prepro(observation)
        x = prev_x !== nothing ? cur_x - prev_x : zeros(Float64, D)
        prev_x = cur_x

        # forward the policy network and sample an action from the returned probability
        aprob, h = policy_forward(x)
        action = rand() < aprob ? 2 : 3 # roll the dice!

        # record various intermediates (needed later for backprop)
        push!(xs, x) # observation
        push!(hs, h) # hidden state
        y = action == 2 ? 1 : 0 # a "fake label"
        push!(dlogps, y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

        # step the environment and get new measurements
        observation, reward, done, info = step!(env, action)
        reward_sum += reward
        push!(drs, reward) # record reward (has to be done after we call step!() to get reward for previous action)

        if done # an episode finished
            episode_number += 1

            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = hcat(xs...)'
            eph = hcat(hs...)'
            epdlogp = hcat(dlogps...)'
            epr = hcat(drs...)'
            xs, hs, dlogps, drs = [], [], [], [] # reset array memory

            # compute the discounted reward backwards through time
            discounted_epr = discount_rewards(epr)
            # standardize the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr .-= mean(discounted_epr)
            discounted_epr ./= std(discounted_epr)

            epdlogp .*= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
            grad = policy_backward(epx, eph, epdlogp)
            for k in keys(model)
                grad_buffer[k] += grad[k] # accumulate grad over batch
            end

            # perform rmsprop parameter update every batch_size episodes
            if episode_number % batch_size == 0
                for (k, v) in model
                    g = grad_buffer[k] # gradient
                    rmsprop_cache[k] = decay_rate .* rmsprop_cache[k] .+ (1 - decay_rate) .* g .^ 2
                    model[k] += learning_rate .* g ./ (sqrt.(rmsprop_cache[k]) .+ 1e-5)
                    fill!(grad_buffer[k], 0) # reset batch gradient buffer
                end
            end

            # boring book-keeping
            running_reward = running_reward === nothing ? reward_sum : running_reward * 0.99 + reward_sum * 0.01
            println("resetting env. episode reward total was ", reward_sum, ". running mean: ", running_reward)
            if episode_number % 100 == 0 serialize("save.jls", model) end
            reward_sum = 0
            observation = reset!(env) # reset env
            prev_x = nothing
        end

        if reward != 0 # Pong has either +1 or -1 reward exactly when game ends.
            println("ep ", episode_number, ": game finished, reward: ", reward, reward == -1 ? "" : " !!!!!!!!")
        end
    end
end
train()