@tsu-nera
Last active June 22, 2017 21:59
Q-learning
import gym
import numpy as np
import gym_tic_tac_toe
from math import floor

env = gym.make('tic_tac_toe-v0')

n_states = 3 ** 9   # number of states
n_actions = 9       # number of actions
eM = 1000           # number of episodes per evaluation window
def q_learning(M, options):
    # initialise the lookup table
    Q = np.zeros((n_states, n_actions))
    results = np.zeros(M)
    for m in range(M):
        np.random.seed(np.mod(m, eM))
        t = 1
        state = env.reset()
        state3 = state['board']
        done = False
        pstate = 0
        paction = 0
        while True:
            # observe the state
            state10 = encode(state3)
            # build the policy
            policy = np.zeros(n_actions)
            policy = select_policy(options, Q, state10, policy)
            # select and execute the action
            action, state, reward, done = action_train(t, state3, policy)
            state3 = state['board']
            ########################
            # Q-function update (Q-learning):
            # update the Q-value of the state/action from one step earlier,
            #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            if t > 1:
                if reward is None:
                    reward = 0
                Q[pstate][paction] = \
                    Q[pstate][paction] + options["alpha"] * \
                    (reward + options["gamma"] * max(Q[state10]) - Q[pstate][paction])
            # end of the game
            if done or env.move_generator() == []:
                if reward == 1:
                    fin = 2
                elif reward == -1:
                    fin = 1
                elif reward == 0:
                    fin = 3
                else:
                    fin = None
                results[m] = fin
                break
            # remember the state and the action (the chosen cell index)
            pstate = state10
            paction = action[1]
            t += 1
        if np.mod(m, eM) == 0:
            partial_results = results[m - eM + 1:m]
            print('%d) Win=%d/%d, Draw=%d/%d, Lose=%d/%d' % (m,
                  len(partial_results[partial_results == 2]), eM,
                  len(partial_results[partial_results == 3]), eM,
                  len(partial_results[partial_results == 1]), eM))
###############################################################################
###############################################################################
# the 8 symmetries (rotations and reflections) of the 3x3 board
convert = [[0, 1, 2, 3, 4, 5, 6, 7, 8],
           [2, 1, 0, 5, 4, 3, 8, 7, 6],
           [6, 3, 0, 7, 4, 1, 8, 5, 2],
           [0, 3, 6, 1, 4, 7, 2, 5, 8],
           [8, 7, 6, 5, 4, 3, 2, 1, 0],
           [6, 7, 8, 3, 4, 5, 0, 1, 2],
           [2, 5, 8, 1, 4, 7, 0, 3, 6],
           [8, 5, 2, 7, 4, 1, 6, 3, 0]
           ]
power = np.array([3 ** i for i in range(8, -1, -1)], dtype=np.float64)


def encode(state3):
    return encode2(encode1(state3))


def encode1(state3):
    # map board marks to base-3 digits: opponent (-1) -> 1, agent (1) -> 2, empty -> 0
    ret = np.empty(len(state3))
    for n, i in enumerate(state3):
        if i == -1:
            ret[n] = 1
        elif i == 1:
            ret[n] = 2
        else:
            ret[n] = 0
    return ret


def encode2(state3):
    # canonical state index: the smallest base-3 code over the 8 symmetries
    cands = [sum(state3[convert[i]] * power) for i in range(len(convert))]
    return int(min(cands)) + 1
def select_policy(options, Q, state10, policy):
    if options['pmode'] == 0:
        q = Q[state10]
        v = max(q)
        a = np.where(q == v)[0][0]
        policy[a] = 1
    elif options['pmode'] == 1:
        q = Q[state10]
        v = max(q)
        a = np.where(q == v)[0][0]
        policy = np.ones(n_actions) * options['epsilon'] / n_actions
        policy[a] = 1 - options['epsilon'] + options['epsilon'] / n_actions
    elif options['pmode'] == 2:
        policy = np.exp(Q[state10] / options['tau']) / \
            sum(np.exp(Q[state10] / options['tau']))
    return policy
def select_npc_action(step, state3, policy):
    a = None
    # the first step always selects cell 0
    if step == 1:
        return [1, 0]
    else:
        while 1:
            # sample an action from the policy, retrying until the cell is empty
            random = np.random.rand()
            cprob = 0
            for a in range(n_actions):
                cprob += policy[a]
                if random < cprob:
                    break
            if state3[a] == 0:
                break
        return [1, a]
def select_enemy_action(state3, moves):
    reach = False
    # the 8 winning lines of the board
    pos = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 3, 6], [1, 4, 7], [2, 5, 8], [0, 4, 8], [2, 4, 6]]
    a = None
    for i in range(len(pos)):
        state_i = state3[pos[i]]
        val = sum(state_i)
        num = len(state_i[state_i == 0])
        # two of its own marks and one empty cell in this line: take the winning move
        if val == 2 and num == 1:
            idx = int(np.where(state_i == 0)[0][0])
            a = pos[i][idx]
            if [-1, a] in moves:
                reach = True
                break
    if not reach:
        # otherwise play a random empty cell; cells 1-8 suffice because the
        # agent's fixed first move always occupies cell 0
        while 1:
            a = floor(np.random.rand() * 8) + 1
            if state3[a] == 0:
                break
    return [-1, a]
def action_train(t, state3, policy):
    # the agent (player 1) selects and executes its action
    npc_action = select_npc_action(t, state3, policy)
    state, reward, done, _ = env.step(npc_action)
    moves = env.move_generator()
    if done or moves == []:
        return npc_action, state, reward, done
    # the opponent (player -1) selects and executes its action
    state32 = encode1(state['board'])
    enemy_action = select_enemy_action(state32, moves)
    state, reward, done, _ = env.step(enemy_action)
    if not done and reward == 0:
        reward = None
    return npc_action, state, reward, done
if __name__ == '__main__':
    # epsilon-greedy
    options = {'pmode': 1, 'epsilon': 0.1, 'alpha': 1, 'gamma': 0.9}
    # NOTE: the original gist does not show the training call; 10000 episodes
    # is an assumption based on the nine evaluation blocks of 1000 reported below.
    q_learning(10000, options)
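As a quick illustration (not part of the original gist), the snippet below sanity-checks the symmetry encoding: encode takes the minimum base-3 code over the eight board symmetries in convert, so a position and its 90-degree rotation should map to the same state index. The two example boards are made up for this check and assume the definitions above are in scope.

# Illustrative check only: symmetric boards should share one canonical index.
board = [1, 0, 0, 0, -1, 0, 0, 0, 0]      # X in the top-left corner, O in the centre
rotated = [0, 0, 1, 0, -1, 0, 0, 0, 0]    # the same position rotated 90 degrees
assert encode(board) == encode(rotated)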
@tsu-nera (Author)

It's not improving, so there's probably a bug somewhere.

  1. Win=552/1000, Draw=135/1000, Lose=312/1000
  2. Win=557/1000, Draw=132/1000, Lose=310/1000
  3. Win=553/1000, Draw=127/1000, Lose=319/1000
  4. Win=551/1000, Draw=129/1000, Lose=319/1000
  5. Win=554/1000, Draw=131/1000, Lose=314/1000
  6. Win=549/1000, Draw=130/1000, Lose=320/1000
  7. Win=555/1000, Draw=129/1000, Lose=315/1000
  8. Win=549/1000, Draw=129/1000, Lose=321/1000
  9. Win=555/1000, Draw=131/1000, Lose=313/1000

@tsu-nera (Author)

The win rate improved a little.

  1. Win=593/1000, Draw=114/1000, Lose=292/1000
  2. Win=595/1000, Draw=112/1000, Lose=292/1000
  3. Win=570/1000, Draw=98/1000, Lose=331/1000
  4. Win=593/1000, Draw=101/1000, Lose=305/1000
  5. Win=553/1000, Draw=108/1000, Lose=338/1000
  6. Win=582/1000, Draw=89/1000, Lose=328/1000
  7. Win=600/1000, Draw=102/1000, Lose=297/1000
  8. Win=572/1000, Draw=105/1000, Lose=322/1000
  9. Win=597/1000, Draw=101/1000, Lose=301/1000
