@gustafsson
Forked from karpathy/pg-pong.py
Created February 23, 2019 20:39
Training a Neural Network ATARI Pong agent with Policy Gradients from raw pixels
# Translation of @karpathy's pg-pong.py from Python to Julia
# https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
# Note that it takes several days to learn a policy that wins most games.
# Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym through Gym.jl and PyCall.jl.
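# At a high level: each frame is preprocessed into an 80x80 difference image, fed through a
# two-layer network to get the probability of taking action 2 (one of the two paddle moves),
# and an action is sampled from that probability. Observations, hidden states, "fake label"
# gradients and rewards are buffered until the episode ends; then discounted, standardized
# rewards modulate the gradients and RMSProp applies an update every batch_size episodes.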
using Gym
using Serialization
using StatsBase: mean, std
# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = false # resume from previous checkpoint?
dorender = false
# model initialization
D = 80 * 80 # input dimensionality: 80x80 grid
if resume
    model = deserialize("save.jls")
else
    model = Dict("W1" => randn(H, D) / sqrt(D), # "Xavier" initialization
                 "W2" => randn(H) / sqrt(H))
end
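# Shapes: model["W1"] is an H×D matrix mapping the flattened 80×80 difference image
# (D = 6400) to H hidden units, and model["W2"] is a length-H vector producing a single logit.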
zeros_like(x) = fill!(similar(x), zero(eltype(x)))
grad_buffer = Dict( k => zeros_like(v) for (k,v) in model ) # update buffers that add up gradients over a batch
rmsprop_cache = Dict( k => zeros_like(v) for (k,v) in model ) # rmsprop memory
sigmoid(x) = 1.0 / (1.0 + exp(-x)) # sigmoid "squashing" function to interval [0,1]
ravel(x) = reshape(x, :)
"""prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector"""
function prepro(I)
    I = I[36:195, :, :]        # crop
    I = I[1:2:end, 1:2:end, 1] # downsample by factor of 2
    I[I .== 144] .= 0          # erase background (background type 1)
    I[I .== 109] .= 0          # erase background (background type 2)
    I[I .!= 0] .= 1            # everything else (paddles, ball) just set to 1
    return convert.(Float64, I) |> ravel
end
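# Rough sanity check (sketch; the random frame below is only a stand-in for a real observation):
#   prepro(rand(UInt8, 210, 160, 3))
# should return a 6400-element Vector{Float64} containing only 0.0 and 1.0.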
"""take 1D float array of rewards and compute discounted reward"""
function discount_rewards(r)
    discounted_r = similar(r)
    running_add = 0.0
    for t in length(r):-1:1
        if r[t] != 0 running_add = 0.0 end # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    end
    return discounted_r
end
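# Worked example (gamma = 0.99): for a rally that ends in a win,
#   discount_rewards([0.0, 0.0, 1.0]) ≈ [0.9801, 0.99, 1.0]
# i.e. every step is credited with the discounted outcome of the rally it belongs to,
# and the running sum resets at each nonzero reward (a game boundary in Pong).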
function policy_forward(x)
    h = model["W1"] * x
    h[h .< 0] .= 0 # ReLU nonlinearity
    logp = model["W2"]' * h
    p = sigmoid(logp)
    return p, h # return probability of taking action 2, and hidden state
end
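# In equation form the policy is p(action = 2 | x) = σ(W2 ⋅ relu(W1 x)):
# a single-hidden-layer network whose sigmoid output is the probability of one
# of the two paddle actions used here (2 versus 3).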
"""backward pass. (eph is array of intermediate hidden states)"""
function policy_backward(epx, eph, epdlogp)
    dW2 = (eph' * epdlogp) |> ravel
    dh = epdlogp * model["W2"]'
    dh[eph .<= 0] .= 0 # backprop ReLU
    dW1 = dh' * epx
    return Dict("W1" => dW1, "W2" => dW2)
end
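# Gradient sketch: with logit z = W2 ⋅ h and dlogp = (y - p) = ∂log p(y|x)/∂z for the sigmoid,
# the per-step gradients are dW2 = h * dlogp and dW1 = (W2 * dlogp, zeroed where h ≤ 0) * xᵀ;
# the two matrix products above compute exactly that, batched over all steps of the episode.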
function train()
    env = GymEnv("Pong-v0")
    observation = reset!(env)
    prev_x = nothing # used in computing the difference frame
    xs, hs, dlogps, drs = [], [], [], []
    running_reward = nothing
    reward_sum = 0
    episode_number = 0
    while true
        if dorender render(env) end

        # preprocess the observation, set input to network to be difference image
        cur_x = prepro(observation)
        x = prev_x !== nothing ? cur_x - prev_x : zeros(Float64, D)
        prev_x = cur_x

        # forward the policy network and sample an action from the returned probability
        aprob, h = policy_forward(x)
        action = rand() < aprob ? 2 : 3 # roll the dice!

        # record various intermediates (needed later for backprop)
        push!(xs, x) # observation
        push!(hs, h) # hidden state
        y = action == 2 ? 1 : 0 # a "fake label"
        push!(dlogps, y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

        # step the environment and get new measurements
        observation, reward, done, info = step!(env, action)
        reward_sum += reward
        push!(drs, reward) # record reward (has to be done after we call step!() to get reward for previous action)

        if done # an episode finished
            episode_number += 1

            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = hcat(xs...)'
            eph = hcat(hs...)'
            epdlogp = hcat(dlogps...)'
            epr = hcat(drs...)'
            xs, hs, dlogps, drs = [], [], [], [] # reset array memory

            # compute the discounted reward backwards through time
            discounted_epr = discount_rewards(epr)
            # standardize the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr .-= mean(discounted_epr)
            discounted_epr ./= std(discounted_epr)

            epdlogp .*= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
            grad = policy_backward(epx, eph, epdlogp)
            for k in keys(model)
                grad_buffer[k] += grad[k] # accumulate grad over batch
            end

            # perform rmsprop parameter update every batch_size episodes
            if episode_number % batch_size == 0
                for (k, v) in model
                    g = grad_buffer[k] # gradient
                    rmsprop_cache[k] = decay_rate .* rmsprop_cache[k] .+ (1 - decay_rate) .* g .^ 2
                    model[k] += learning_rate .* g ./ (sqrt.(rmsprop_cache[k]) .+ 1e-5)
                    fill!(grad_buffer[k], 0) # reset batch gradient buffer
                end
            end

            # boring book-keeping
            running_reward = running_reward === nothing ? reward_sum : running_reward * 0.99 + reward_sum * 0.01
            println("resetting env. episode reward total was ", reward_sum, ". running mean: ", running_reward)
            if episode_number % 100 == 0 serialize("save.jls", model) end
            reward_sum = 0
            observation = reset!(env) # reset env
            prev_x = nothing
        end

        if reward != 0 # Pong has either +1 or -1 reward exactly when game ends.
            println("ep ", episode_number, ": game finished, reward: ", reward, reward == -1 ? "" : " !!!!!!!!")
        end
    end
end
train()