{{ message }}

Instantly share code, notes, and snippets.

tsu-nera/q_tic_tac_toe.py

Last active Jun 22, 2017
Q学習法 (Q-learning)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 import gym import numpy as np import gym_tic_tac_toe from math import floor env = gym.make('tic_tac_toe-v0') n_states = 3 ** 9 # 状態数 n_actions = 9 # 行動数 eM = 1000 # 評価を行うエピソード数 def q_learning(M, options): # initiate lookup table Q = np.zeros((n_states, n_actions)) results = np.zeros(M) for m in range(M): np.random.seed(np.mod(m, eM)) t = 1 state = env.reset() state3 = state['board'] done = False pstate = 0 paction = 0 while(True): # 状態観測 state10 = encode(state3) # 政策の生成 policy = np.zeros(n_actions) policy = select_policy(options, Q, state10, policy) # 行動の選択および実行 action, state, reward, done = action_train(t, state3, policy) state3 = state['board'] ######################## # Q関数の更新（Q学習） # 1ステップ前の状態，行動のQ値を更新 if t > 1: if reward == None: reward = 0 Q[pstate][paction] = \ Q[pstate][paction]+options["alpha"]*(reward-Q[pstate][paction]+options["gamma"]*max(Q[state10])) # ゲーム終了 if done or env.move_generator() == []: if reward == 1: fin = 2 elif reward == -1: fin = 1 elif reward == 0: fin = 3 else: fin = None results[m] = fin break # 状態と行動の記録 pstate = state10 paction = action t += 1 if np.mod(m, eM)==0: partial_results = results[m-eM+1:m] print('%d) Win=%d/%d, Draw=%d/%d, Lose=%d/%d' % (m, len(partial_results[partial_results == 2]), eM, len(partial_results[partial_results == 3]), eM, len(partial_results[partial_results == 1]), eM)) ############################################################################### ############################################################################### convert = [[0, 1, 2, 3, 4, 5, 6, 7, 8], [2, 1, 0, 5, 4, 3, 8, 7, 6], [6, 3, 0, 7, 4, 1, 8, 5, 2], [0, 3, 8, 1, 4, 7, 2, 5, 8], [8, 7, 6, 5, 4, 3, 2, 1, 0], [6, 7, 8, 3, 4, 5, 0, 1, 2], [2, 5, 8, 1, 4, 7, 0, 3, 6], [8, 5, 2, 7, 4, 1, 6, 3, 0] ] power = np.array([3 ** i for i in range(8, -1, -1)], dtype=np.float64) def encode(state3): return encode2(encode1(state3)) def encode1(state3): ret = np.empty(len(state3)) for n, i in enumerate(state3): if i == -1: ret[n] = 1 elif i == 1: ret[n] = 2 
else: ret[n] = 0 return ret def encode2(state3): cands = [sum(state3[convert[i]] * power) for i in range(len(convert))] return int(min(cands)) + 1 def select_policy(options, Q, state10, policy): if options['pmode'] == 0: q = Q[state10] v = max(q) a = np.where(q == v)[0][0] policy[a] = 1 elif options['pmode'] == 1: q = Q[state10] v = max(q) a = np.where(q == v)[0][0] policy = np.ones(n_actions) * options['epsilon'] / n_actions policy[a] = 1 - options['epsilon'] + options['epsilon'] / n_actions elif options['pmode'] == 2: policy = np.exp(Q[state10] / options['tau']) / \ sum(np.exp(Q[state10] / options['tau'])) return policy def select_npc_action(step, state3, policy): a = None # first step is always select 0 if step == 1: return [1, 0] else: while 1: random = np.random.rand() cprob = 0 for a in range(n_actions): cprob += policy[a] if random < cprob: break if state3[a] == 0: break return [1, a] def select_enemy_action(state3, moves): reach = False pos = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 3, 6], [1, 4, 7], [1, 5, 8], [0, 4, 8], [2, 4, 6]] a = None for i in range(len(pos)): state_i = state3[pos[i]] val = sum(state_i) num = len(state_i[state_i == 0]) if val == 2 and num == 1: idx = int(state_i[state_i == 0][0]) a = pos[i][idx] if [-1, a] in moves: reach = True break if not reach: while 1: a = floor(np.random.rand() * 8) + 1 if state3[a] == 0: break return [-1, a] def action_train(t, state3, policy): # select action npc_action = select_npc_action(t, state3, policy) # action execute state, reward, done, _ = env.step(npc_action) moves = env.move_generator() if done or moves == []: return npc_action, state, reward, done state32 = encode1(state['board']) enemy_action = select_enemy_action(state32, moves) # action execute state, reward, done, _ = env.step(enemy_action) if not done and reward == 0: reward = None return npc_action, state, reward, done if __name__ == '__main__': # ε- greedy options = {'pmode': 1, 'epsilon': 0.1, 'alpha': 1, 'gamma': 0.9}

tsu-nera commented Jun 22, 2017

1. Win=552/1000, Draw=135/1000, Lose=312/1000
2. Win=557/1000, Draw=132/1000, Lose=310/1000
3. Win=553/1000, Draw=127/1000, Lose=319/1000
4. Win=551/1000, Draw=129/1000, Lose=319/1000
5. Win=554/1000, Draw=131/1000, Lose=314/1000
6. Win=549/1000, Draw=130/1000, Lose=320/1000
7. Win=555/1000, Draw=129/1000, Lose=315/1000
8. Win=549/1000, Draw=129/1000, Lose=321/1000
9. Win=555/1000, Draw=131/1000, Lose=313/1000

tsu-nera commented Jun 22, 2017

1. Win=593/1000, Draw=114/1000, Lose=292/1000
2. Win=595/1000, Draw=112/1000, Lose=292/1000
3. Win=570/1000, Draw=98/1000, Lose=331/1000
4. Win=593/1000, Draw=101/1000, Lose=305/1000
5. Win=553/1000, Draw=108/1000, Lose=338/1000
6. Win=582/1000, Draw=89/1000, Lose=328/1000
7. Win=600/1000, Draw=102/1000, Lose=297/1000
8. Win=572/1000, Draw=105/1000, Lose=322/1000
9. Win=597/1000, Draw=101/1000, Lose=301/1000