Created
November 13, 2014 21:25
-
-
Save jean-robert/f3ac65bea741796c3778 to your computer and use it in GitHub Desktop.
Tic-tac-toe Reinforcement Learning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import copy | |
class TTTGame:
    """Tic-tac-toe board shared by two TTTAgent players.

    Cells are indexed 0-8, row-major; each cell holds 'X', 'O' or ' '.
    Single-argument parenthesized print is used so the class is valid in
    both Python 2 and Python 3.
    """

    # All eight winning index triples: rows, columns, diagonals.
    LINES = [(0, 1, 2), (3, 4, 5), (6, 7, 8),
             (0, 3, 6), (1, 4, 7), (2, 5, 8),
             (0, 4, 8), (2, 4, 6)]

    playerX = None  # set via setPlayerX
    playerO = None  # set via setPlayerO

    def __init__(self):
        # Per-instance board. The original class-level list was shared
        # (and mutated in place) by every TTTGame instance.
        self.reset()

    def showBoard(self):
        """Print the current board, one row per line."""
        print('---------------')
        print(self.board[:3])
        print(self.board[3:6])
        print(self.board[6:])
        print('---------------')

    def reset(self):
        """Clear the board for a new game."""
        self.board = [' '] * 9

    def scoreBoard(self):
        """Return (win, lose, draw): X completed a line / O did / board full."""
        win = any(all(self.board[i] == 'X' for i in line) for line in self.LINES)
        lose = any(all(self.board[i] == 'O' for i in line) for line in self.LINES)
        # As in the original: draw just means "no empty cell", so a full
        # winning board reports both win and draw.
        draw = ' ' not in self.board
        return win, lose, draw

    def setPlayerX(self, player):
        """Register *player* as X and point it back at this game."""
        self.playerX = player
        player.game = self

    def setPlayerO(self, player):
        """Register *player* as O and point it back at this game."""
        self.playerO = player
        player.game = self

    def showStrBoard(self, string):
        """Print a board given as a 9-character string (debug helper)."""
        board = list(string)
        print('---------------')
        print(board[:3])
        print(board[3:6])
        print(board[6:])
        print('---------------')

    def simulateLearnPlay(self, verbose=False):
        """Play one self-play game, X moving first; both agents learn."""
        turn = 'X'
        self.reset()
        if verbose:
            self.showBoard()
        while sum(self.scoreBoard()) == 0:
            if turn == 'X':
                self.playerX.learnPlay()
                turn = 'O'
            else:
                self.playerO.learnPlay()
                turn = 'X'
            if verbose:
                self.showBoard()

    def simulateHumanPlay(self, human='O'):
        """Play one game where the *human* side ('X' or 'O') is prompted."""
        turn = 'X'
        self.reset()
        self.showBoard()
        while sum(self.scoreBoard()) == 0:
            if turn == 'X':
                if human == 'X':
                    self.playerX.humanPlay()
                else:
                    self.playerX.learnPlay()
                turn = 'O'
            else:
                if human == 'O':
                    self.playerO.humanPlay()
                else:
                    self.playerO.learnPlay()
                turn = 'X'
        self.showBoard()
        print(self.scoreBoard())
class TTTAgent:
    """Value-learning tic-tac-toe player (TD(0)-style afterstate values).

    Keeps a table ``p`` mapping board strings to an estimated win
    probability, nudged towards the value of each chosen afterstate.
    """

    # All eight winning index triples: rows, columns, diagonals.
    LINES = [(0, 1, 2), (3, 4, 5), (6, 7, 8),
             (0, 3, 6), (1, 4, 7), (2, 5, 8),
             (0, 4, 8), (2, 4, 6)]

    def __init__(self, symbol='X'):
        # Anything other than 'X' plays O, matching the original behavior.
        if symbol != 'X':
            self.symbol, self.oppsymbol = 'O', 'X'
        else:
            self.symbol, self.oppsymbol = 'X', 'O'
        self.p = dict()   # board string -> estimated win probability
        self.game = None  # set by TTTGame.setPlayerX / setPlayerO

    def scoreBoard(self, board):
        """Return (win, lose, draw) for *board* from this agent's viewpoint."""
        win = any(all(board[i] == self.symbol for i in line) for line in self.LINES)
        lose = any(all(board[i] == self.oppsymbol for i in line) for line in self.LINES)
        draw = ' ' not in board
        return win, lose, draw

    def play(self, move):
        """Place this agent's symbol at cell *move* if that cell is free."""
        if self.game.board[move] == ' ':
            self.game.board[move] = self.symbol

    def randomPlay(self):
        """Play a uniformly random legal move."""
        availMoves = [i for i, b in enumerate(self.game.board) if b == ' ']
        self.play(random.choice(availMoves))

    def boardValue(self, board):
        """Estimated win probability of *board*, creating the entry if new.

        New terminal boards get 1 for a win and 0 otherwise; new
        non-terminal boards start at the neutral value 0.5.
        """
        boardH = ''.join(board)
        if boardH not in self.p:
            score = self.scoreBoard(board)  # computed once, not twice
            if sum(score) == 0:
                self.p[boardH] = 0.5
            else:
                self.p[boardH] = int(score[0])
        return self.p[boardH]

    def learnPlay(self):
        """Play one move: greedy on afterstate value, random 1 time in 9.

        Both players' value tables are updated with the chosen afterstate
        before the move is actually placed on the board.
        """
        availMoves = [i for i, b in enumerate(self.game.board) if b == ' ']
        # Collect every move whose afterstate value ties the current max.
        maxValue = 0
        maxMove = []
        for b in availMoves:
            boardtp1 = copy.copy(self.game.board)
            boardtp1[b] = self.symbol
            value = self.boardValue(boardtp1)  # one lookup per candidate
            if value > maxValue:
                maxValue = value
                maxMove = [b]
            elif value == maxValue:
                maxMove.append(b)
        # Exploration: with probability 1/9, choose among all legal moves.
        if random.choice(range(9)) == 1:
            maxMove = availMoves
        move = random.choice(maxMove)
        boardtp1 = copy.copy(self.game.board)
        boardtp1[move] = self.symbol
        self.game.playerX.updateP(self.game.board, boardtp1)
        self.game.playerO.updateP(self.game.board, boardtp1)
        self.play(move)

    def updateP(self, board, boardtp1, alpha=0.2):
        """TD update: move p[board] a step *alpha* towards p[boardtp1]."""
        boardH = ''.join(board)
        boardtp1H = ''.join(boardtp1)
        # boardValue creates a default entry when the state is unseen
        # and is a no-op otherwise, so it can be called unconditionally.
        self.boardValue(board)
        self.boardValue(boardtp1)
        self.p[boardH] += alpha * (self.p[boardtp1H] - self.p[boardH])

    def humanPlay(self):
        """Ask a human for a move and re-prompt until it is legal.

        The original silently skipped the human's turn when an occupied
        cell was entered (play() ignores occupied cells), and crashed on
        out-of-range input; now we validate and loop.
        """
        self.game.showBoard()
        while True:
            move = raw_input('move ? ')  # Python 2 builtin; use input() on Python 3
            try:
                cell = int(move)
            except ValueError:
                continue  # not a number: ask again
            if 0 <= cell <= 8 and self.game.board[cell] == ' ':
                self.play(cell)
                return
if __name__ == '__main__':
    # Train two agents by self-play, then let a human take the O side.
    game = TTTGame()
    agent_x = TTTAgent(symbol='X')
    agent_o = TTTAgent(symbol='O')
    game.setPlayerX(agent_x)
    game.setPlayerO(agent_o)
    for _ in range(10000):
        game.simulateLearnPlay()
    game.simulateHumanPlay()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment