@jean-robert · Created November 13, 2014 21:25
Tic-tac-toe Reinforcement Learning
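The script below is a small Python 2 program that learns tic-tac-toe by self-play. Each TTTAgent keeps a value table p mapping a board position (encoded as a 9-character string) to an estimated probability of winning, picks the highest-valued successor board on its turn (with an occasional random exploratory move), and after every move nudges the current position's value toward the chosen successor's value with the update V(s) <- V(s) + alpha * (V(s') - V(s)), alpha = 0.2 by default. After 10000 training games, a human can play against the learned agent.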
import random
import copy
class TTTGame:
    # The 3x3 board is stored as a flat list of 9 cells ('X', 'O' or ' ').
    board = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
    playerX = None
    playerO = None

    def showBoard(self):
        print '---------------'
        print self.board[:3]
        print self.board[3:6]
        print self.board[6:]
        print '---------------'

    def reset(self):
        self.board = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
    def scoreBoard(self):
        # Returns (win, lose, draw) from X's point of view, checking the three
        # rows, three columns and two diagonals; draw means the board is full.
        win = (self.board[:3] == ['X', 'X', 'X']) | \
              (self.board[3:6] == ['X', 'X', 'X']) | \
              (self.board[6:] == ['X', 'X', 'X']) | \
              ([self.board[0], self.board[3], self.board[6]] == ['X', 'X', 'X']) | \
              ([self.board[1], self.board[4], self.board[7]] == ['X', 'X', 'X']) | \
              ([self.board[2], self.board[5], self.board[8]] == ['X', 'X', 'X']) | \
              ([self.board[0], self.board[4], self.board[8]] == ['X', 'X', 'X']) | \
              ([self.board[2], self.board[4], self.board[6]] == ['X', 'X', 'X'])
        lose = (self.board[:3] == ['O', 'O', 'O']) | \
               (self.board[3:6] == ['O', 'O', 'O']) | \
               (self.board[6:] == ['O', 'O', 'O']) | \
               ([self.board[0], self.board[3], self.board[6]] == ['O', 'O', 'O']) | \
               ([self.board[1], self.board[4], self.board[7]] == ['O', 'O', 'O']) | \
               ([self.board[2], self.board[5], self.board[8]] == ['O', 'O', 'O']) | \
               ([self.board[0], self.board[4], self.board[8]] == ['O', 'O', 'O']) | \
               ([self.board[2], self.board[4], self.board[6]] == ['O', 'O', 'O'])
        draw = sum([b == ' ' for b in self.board]) == 0
        return win, lose, draw
    def setPlayerX(self, player):
        self.playerX = player
        player.game = self

    def setPlayerO(self, player):
        self.playerO = player
        player.game = self

    def showStrBoard(self, string):
        # Same display as showBoard, but for a board given as a 9-character string.
        board = list(string)
        print '---------------'
        print board[:3]
        print board[3:6]
        print board[6:]
        print '---------------'
    def simulateLearnPlay(self, verbose=False):
        # Plays one full game in which both agents pick (and learn from) their moves.
        turn = 'X'
        self.reset()
        if verbose:
            self.showBoard()
        while sum(self.scoreBoard()) == 0:
            if turn == 'X':
                self.playerX.learnPlay()
                turn = 'O'
            elif turn == 'O':
                self.playerO.learnPlay()
                turn = 'X'
            if verbose:
                self.showBoard()
    def simulateHumanPlay(self, human='O'):
        # Plays one full game with a human controlling the side given by `human`.
        turn = 'X'
        self.reset()
        self.showBoard()
        while sum(self.scoreBoard()) == 0:
            if turn == 'X':
                if human == 'X':
                    self.playerX.humanPlay()
                else:
                    self.playerX.learnPlay()
                turn = 'O'
            elif turn == 'O':
                if human == 'O':
                    self.playerO.humanPlay()
                else:
                    self.playerO.learnPlay()
                turn = 'X'
        self.showBoard()
        print self.scoreBoard()

class TTTAgent:
    symbol = 'X'
    oppsymbol = 'O'
    p = None      # state-value table, keyed by the board as a 9-character string
    game = None

    def __init__(self, symbol='X'):
        if symbol != 'X':
            self.symbol = 'O'
            self.oppsymbol = 'X'
        self.p = dict()
    def scoreBoard(self, board):
        # Same check as TTTGame.scoreBoard, but from this agent's point of view
        # and for an arbitrary candidate board.
        win = (board[:3] == [self.symbol, self.symbol, self.symbol]) | \
              (board[3:6] == [self.symbol, self.symbol, self.symbol]) | \
              (board[6:] == [self.symbol, self.symbol, self.symbol]) | \
              ([board[0], board[3], board[6]] == [self.symbol, self.symbol, self.symbol]) | \
              ([board[1], board[4], board[7]] == [self.symbol, self.symbol, self.symbol]) | \
              ([board[2], board[5], board[8]] == [self.symbol, self.symbol, self.symbol]) | \
              ([board[0], board[4], board[8]] == [self.symbol, self.symbol, self.symbol]) | \
              ([board[2], board[4], board[6]] == [self.symbol, self.symbol, self.symbol])
        lose = (board[:3] == [self.oppsymbol, self.oppsymbol, self.oppsymbol]) | \
               (board[3:6] == [self.oppsymbol, self.oppsymbol, self.oppsymbol]) | \
               (board[6:] == [self.oppsymbol, self.oppsymbol, self.oppsymbol]) | \
               ([board[0], board[3], board[6]] == [self.oppsymbol, self.oppsymbol, self.oppsymbol]) | \
               ([board[1], board[4], board[7]] == [self.oppsymbol, self.oppsymbol, self.oppsymbol]) | \
               ([board[2], board[5], board[8]] == [self.oppsymbol, self.oppsymbol, self.oppsymbol]) | \
               ([board[0], board[4], board[8]] == [self.oppsymbol, self.oppsymbol, self.oppsymbol]) | \
               ([board[2], board[4], board[6]] == [self.oppsymbol, self.oppsymbol, self.oppsymbol])
        draw = sum([b == ' ' for b in board]) == 0
        return win, lose, draw
    def play(self, move):
        # Places this agent's symbol; an illegal move (occupied cell) is silently ignored.
        if self.game.board[move] == ' ':
            self.game.board[move] = self.symbol

    def randomPlay(self):
        availMoves = [i for i, b in enumerate(self.game.board) if b == ' ']
        move = random.sample(set(availMoves), 1)
        self.play(move[0])

    def boardValue(self, board):
        # Lazily initialises the value of an unseen state: 0.5 if the game is
        # not over, 1 if it is a win for this agent, 0 for a loss or a draw.
        boardH = ''.join(board)
        if boardH not in self.p:
            if sum(self.scoreBoard(board)) == 0:
                self.p[boardH] = 0.5
            else:
                self.p[boardH] = int(self.scoreBoard(board)[0])
        return self.p[boardH]
    def learnPlay(self):
        # Greedy move selection over the values of the successor boards, with a
        # 1-in-9 chance of exploring a random available move instead.
        availMoves = [i for i, b in enumerate(self.game.board) if b == ' ']
        maxValue = 0
        maxMove = []
        for b in availMoves:
            boardtp1 = copy.copy(self.game.board)
            boardtp1[b] = self.symbol
            if self.boardValue(boardtp1) > maxValue:
                maxValue = self.boardValue(boardtp1)
                maxMove = [b]
            elif self.boardValue(boardtp1) == maxValue:
                maxMove.append(b)
        if random.sample(set(range(9)), 1)[0] == 1:
            # Exploration: pick uniformly among all available moves.
            maxMove = availMoves
            move = random.sample(set(maxMove), 1)[0]
        else:
            move = random.sample(set(maxMove), 1)[0]
        # Both agents update their value tables toward the chosen successor board.
        boardtp1 = copy.copy(self.game.board)
        boardtp1[move] = self.symbol
        self.game.playerX.updateP(self.game.board, boardtp1)
        self.game.playerO.updateP(self.game.board, boardtp1)
        self.play(move)
    def updateP(self, board, boardtp1, alpha=0.2):
        # TD(0)-style update: V(s) <- V(s) + alpha * (V(s') - V(s)).
        boardH = ''.join(board)
        boardtp1H = ''.join(boardtp1)
        if boardH not in self.p:
            self.boardValue(board)
        if boardtp1H not in self.p:
            self.boardValue(boardtp1)
        self.p[boardH] = self.p[boardH] + alpha * (self.p[boardtp1H] - self.p[boardH])

    def humanPlay(self):
        self.game.showBoard()
        move = raw_input('move ? ')
        self.play(int(move))

if __name__ == '__main__':
    # Train both agents against each other over 10000 self-play games,
    # then let a human take the O side against the learned X player.
    g = TTTGame()
    playerX = TTTAgent(symbol='X')
    playerO = TTTAgent(symbol='O')
    g.setPlayerX(playerX)
    g.setPlayerO(playerO)
    for i in range(10000):
        g.simulateLearnPlay()
    g.simulateHumanPlay()
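
As a usage note, the same objects can be exercised in a couple of other ways already supported by the code above; a minimal sketch, assuming the training loop in __main__ has run:

g.simulateLearnPlay(verbose=True)   # watch one self-play game, printing the board after each move
g.simulateHumanPlay(human='X')      # take the X side yourself; the trained agent answers as O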