# Monte Carlo tree search for a two-player board game: plain sampling and a UCT variant.
import collections
import math
import random

def upper_confidence_bounds(payout, samples_for_this_machine, log_total_samples):
    # UCB1 score: average payout plus an exploration bonus that shrinks as samples accumulate
    return payout / samples_for_this_machine + math.sqrt((2 * log_total_samples) / samples_for_this_machine)

def monte_carlo_tree_search_uct(board_state, side, number_of_rollouts):
    state_results = collections.defaultdict(float)
    state_samples = collections.defaultdict(float)

    for _ in range(number_of_rollouts):
        current_side = side
        current_board_state = board_state
        first_unvisited_node = True
        rollout_path = []
        result = 0

        while result == 0:
            move_states = {move: apply_move(current_board_state, move, current_side)
                           for move in available_moves(current_board_state)}
            if not move_states:
                result = 0  # no legal moves left: the game is a draw
                break

            if all(state in state_samples for state in move_states.values()):
                # every child state has been visited: choose the move with the best UCB score
                log_total_samples = math.log(sum(state_samples[s] for s in move_states.values()))
                move, state = max(move_states.items(),
                                  key=lambda item: upper_confidence_bounds(
                                      state_results[item[1]], state_samples[item[1]], log_total_samples))
            else:
                # at least one child is unvisited: expand with a random move
                move = random.choice(list(move_states.keys()))

            current_board_state = move_states[move]
            if first_unvisited_node:
                rollout_path.append((current_board_state, current_side))
                if current_board_state not in state_samples:
                    first_unvisited_node = False
                    # a value-function variant would also cache an estimate here:
                    # state_values[current_board_state] = value_func(current_board_state)

            current_side = -current_side
            result = has_winner(current_board_state)

        # back the rollout result up the visited path, scoring a win for the
        # moving side as 1, a loss as 0 and a draw as 0.5
        for path_board_state, path_side in rollout_path:
            state_samples[path_board_state] += 1.
            state_results[path_board_state] += (result * path_side) / 2. + .5

    # finally pick the root move whose resulting state has the best average payout
    move_states = {move: apply_move(board_state, move, side) for move in available_moves(board_state)}
    move = max(move_states, key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]])
    return state_results[move_states[move]] / state_samples[move_states[move]], move

def monte_carlo_sample(board_state, side):
    # play a single game to the end by choosing uniformly random moves
    result = has_winner(board_state)
    if result != 0:
        return result, None
    moves = list(available_moves(board_state))
    if not moves:
        return 0, None
    # select a random move
    move = random.choice(moves)
    result, next_move = monte_carlo_sample(apply_move(board_state, move, side), -side)
    return result, move

def monte_carlo_tree_search(board_state, side, number_of_samples):
    results_per_move = collections.defaultdict(lambda: [0, 0])
    for _ in range(number_of_samples):
        result, move = monte_carlo_sample(board_state, side)
        results_per_move[move][0] += result
        results_per_move[move][1] += 1

    # choose the move with the best average result over all samples
    move = max(results_per_move,
               key=lambda x: results_per_move[x][0] / results_per_move[x][1])
    return results_per_move[move][0] / results_per_move[move][1], move
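The search code above calls three game-specific helpers that the snippet never defines: available_moves, apply_move and has_winner. A minimal sketch for 3x3 tic-tac-toe, written to match how they are called above (board states as tuples of tuples, sides as +1/-1), might look like the following; it is an illustrative assumption, not the implementation the original code was paired with.

def available_moves(board_state):
    # every empty square is a legal move
    for x in range(3):
        for y in range(3):
            if board_state[x][y] == 0:
                yield (x, y)

def apply_move(board_state, move, side):
    # return a new board tuple with `side` played at `move`
    x, y = move
    new_row = board_state[x][:y] + (side,) + board_state[x][y + 1:]
    return board_state[:x] + (new_row,) + board_state[x + 1:]

def has_winner(board_state):
    # return +1 or -1 if that side has three in a row, otherwise 0
    lines = [tuple(board_state[i][j] for j in range(3)) for i in range(3)]    # rows
    lines += [tuple(board_state[i][j] for i in range(3)) for j in range(3)]   # columns
    lines += [tuple(board_state[i][i] for i in range(3)),                     # diagonals
              tuple(board_state[i][2 - i] for i in range(3))]
    for line in lines:
        if line == (1, 1, 1):
            return 1
        if line == (-1, -1, -1):
            return -1
    return 0

# example: search from the empty board for the side playing +1
empty_board = ((0, 0, 0), (0, 0, 0), (0, 0, 0))
expected_payout, best_move = monte_carlo_tree_search_uct(empty_board, 1, 1000)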
DanielSlater / pg-pong.py
Created August 3, 2016 22:05 — forked from karpathy/pg-pong.py
Training a Neural Network ATARI Pong agent with Policy Gradients from raw pixels
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import cPickle as pickle
import gym
# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
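To make the role of gamma concrete, here is a sketch of the standard discounted-return computation used in policy-gradient training; the discount_rewards name and the reset at Pong game boundaries are illustrative assumptions rather than lines quoted from the full gist.

def discount_rewards(rewards):
    """Sketch: turn a 1D array of per-step rewards into discounted returns."""
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        if rewards[t] != 0:
            running_add = 0.0  # assumed Pong-specific reset: a nonzero reward ends a point
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add
    return discounted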
import tensorflow as tf
import numpy as np
NUM_STATES = 10
NUM_ACTIONS = 2
GAMMA = 0.5
def hot_one_state(index):
    # one-hot encode a state index as a length-NUM_STATES vector
    array = np.zeros(NUM_STATES)
    array[index] = 1.
    return array
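To show how hot_one_state would be consumed, here is a minimal hypothetical sketch feeding the one-hot state into a tiny TensorFlow graph of per-action value estimates (assuming TensorFlow 1.x; state_input, weights and action_values are illustrative names, not taken from the gist).

state_input = tf.placeholder(tf.float32, [None, NUM_STATES])
weights = tf.Variable(tf.zeros([NUM_STATES, NUM_ACTIONS]))
action_values = tf.matmul(state_input, weights)   # one value estimate per action

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    values = session.run(action_values,
                         feed_dict={state_input: [hot_one_state(3)]})
    print(values)  # shape (1, NUM_ACTIONS); all zeros until the weights are trained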