import tensorflow as tf
import numpy as np

NUM_STATES = 10
NUM_ACTIONS = 2
GAMMA = 0.5  # discount factor for future rewards


def hot_one_state(index):
    """Return a one-hot vector of length NUM_STATES with a 1 at the given index."""
    array = np.zeros(NUM_STATES)
    array[index] = 1.
    return array
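The snippet above only sets up the constants and the one-hot state encoder; a minimal sketch of how such a one-hot state might feed a small Q-network in placeholder-style (1.x-era) TensorFlow could look like the following. The layer shape and the names (state_input, targets, q_values, train_step) are illustrative assumptions, not taken from the gist.

# Illustrative sketch only: a single linear layer mapping one-hot states to one Q-value per action.
session = tf.Session()
state_input = tf.placeholder(tf.float32, shape=(None, NUM_STATES))
targets = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))
weights = tf.Variable(tf.zeros([NUM_STATES, NUM_ACTIONS]))
q_values = tf.matmul(state_input, weights)
loss = tf.reduce_mean(tf.square(targets - q_values))
train_step = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss)
session.run(tf.initialize_all_variables())

# e.g. the current Q-value estimates for state 3:
print(session.run(q_values, feed_dict={state_input: [hot_one_state(3)]}))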
DanielSlater / pg-pong.py (created August 3, 2016, forked from karpathy/pg-pong.py)
Training a Neural Network ATARI Pong agent with Policy Gradients from raw pixels
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import cPickle as pickle
import gym
# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
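Later in the same gist, gamma is used to turn the raw per-frame rewards into discounted returns via a discount_rewards helper; the computation is roughly as follows (a paraphrased sketch, not copied verbatim from the gist).

def discount_rewards(r):
    """Take a 1D array of per-frame rewards and compute discounted returns, working backwards."""
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(len(r))):
        if r[t] != 0:
            running_add = 0  # Pong-specific: a non-zero reward marks a game boundary, so reset the sum
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r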
import collections
import math
import random

def monte_carlo_sample(board_state, side):
    """Play one random game out from board_state; return the final result and the first move tried."""
    result = has_winner(board_state)  # has_winner, available_moves and apply_move come from the game implementation
    moves = list(available_moves(board_state))
    if result != 0 or not moves:
        return result, None
    # select a random move and recursively play out the rest of the game
    move = random.choice(moves)
    result, next_move = monte_carlo_sample(apply_move(board_state, move, side), -side)
    return result, move

def monte_carlo_tree_search(board_state, side, number_of_samples):
    """Evaluate moves by random sampling and return the best average result together with that move."""
    results_per_move = collections.defaultdict(lambda: [0., 0.])
    for _ in range(number_of_samples):
        result, move = monte_carlo_sample(board_state, side)
        results_per_move[move][0] += result
        results_per_move[move][1] += 1
    # pick the move with the highest average sampled result
    move = max(results_per_move,
               key=lambda x: results_per_move[x][0] / results_per_move[x][1])
    return results_per_move[move][0] / results_per_move[move][1], move
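Both search functions lean on game helpers (available_moves, apply_move, has_winner) that the gist defines elsewhere. A minimal illustrative tic-tac-toe version of that interface, plus a call to monte_carlo_tree_search, might look like the sketch below; this is my own example of the assumed interface, not the gist's code.

def available_moves(board_state):
    """Yield the (x, y) coordinates of every empty square on a 3x3 board."""
    for x in range(3):
        for y in range(3):
            if board_state[x][y] == 0:
                yield (x, y)

def apply_move(board_state, move, side):
    """Return a new board with `side` (1 or -1) played at `move`; boards are tuples of tuples."""
    x, y = move
    new_row = board_state[x][:y] + (side,) + board_state[x][y + 1:]
    return board_state[:x] + (new_row,) + board_state[x + 1:]

def has_winner(board_state):
    """Return 1 or -1 if that side has three in a row, otherwise 0."""
    lines = list(board_state) + list(zip(*board_state))
    lines.append((board_state[0][0], board_state[1][1], board_state[2][2]))
    lines.append((board_state[0][2], board_state[1][1], board_state[2][0]))
    for line in lines:
        if sum(line) == 3:
            return 1
        if sum(line) == -3:
            return -1
    return 0

empty_board = ((0, 0, 0), (0, 0, 0), (0, 0, 0))
expected_result, best_move = monte_carlo_tree_search(empty_board, side=1, number_of_samples=1000)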
def upper_confidence_bounds(payout, samples_for_this_machine, log_total_samples):
    """UCB1 score: average payout plus an exploration bonus that shrinks as this option is sampled more."""
    return payout / samples_for_this_machine + math.sqrt((2 * log_total_samples) / samples_for_this_machine)
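As a quick sanity check with made-up numbers: an option that has paid out 3 over 5 tries, with 100 tries made overall, scores roughly 0.6 of exploitation plus about 1.36 of exploration bonus.

# Illustrative numbers only: UCB1 score for payout 3 over 5 samples, out of 100 total samples
log_total = math.log(100)
print(upper_confidence_bounds(3, 5, log_total))  # ~= 3/5 + sqrt(2 * log(100) / 5) ~= 0.60 + 1.36 = 1.96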
def monte_carlo_tree_search_uct(board_state, side, number_of_rollouts):
    """Monte Carlo tree search that picks moves with UCB1 (UCT), balancing exploration and exploitation."""
    state_results = collections.defaultdict(float)
    state_samples = collections.defaultdict(float)
    for _ in range(number_of_rollouts):
        current_side = side
        current_board_state = board_state
        first_unvisited_node = True
        rollout_path = []
        result = 0
        while result == 0:
            move_states = {move: apply_move(current_board_state, move, current_side)
                           for move in available_moves(current_board_state)}
            if not move_states:
                result = 0  # no moves left, so the game is a draw
                break
            if all(state in state_samples for state in move_states.values()):
                # every child state has been visited at least once: pick the one with the best UCB1 score
                log_total_samples = math.log(sum(state_samples[s] for s in move_states.values()))
                move, state = max(move_states.items(),
                                  key=lambda item: upper_confidence_bounds(state_results[item[1]],
                                                                           state_samples[item[1]],
                                                                           log_total_samples))
            else:
                # some child states are unvisited, so explore one at random
                move = random.choice(list(move_states.keys()))
            current_board_state = move_states[move]
            # record the path only as far as the first state we have never visited before
            if first_unvisited_node:
                rollout_path.append((current_board_state, current_side))
                if current_board_state not in state_samples:
                    first_unvisited_node = False
                    # state_values and value_func are assumed to be defined elsewhere in the gist
                    state_values[current_board_state] = value_func(current_board_state)
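rollout_path collects the newly reached states on the way down; in a full UCT implementation each rollout's final result is then propagated back along that path inside the per-rollout loop, roughly as in the sketch below (an illustrative version of the usual backup step, not the gist's own code).

# Illustrative backup step: `result` is +1/-1 from player 1's point of view, 0 for a draw.
for path_board_state, path_side in rollout_path:
    state_samples[path_board_state] += 1.
    # score 1 when the player who moved into this state went on to win, 0 when they lost,
    # 0.5 for a draw, so values stay in [0, 1] as UCB1 expects
    state_results[path_board_state] += (result * path_side + 1.) / 2.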