@jskDr
Last active March 15, 2020 05:42
TicTacToe game agent using a kind of reinforcement learning algorithm
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pickle

# TicTacToe has nine board positions (states) and nine possible actions. A player can put a
# stone on any position of the board except positions that are already occupied.
def set_state_inplace(S, action, P_no):
    '''S is a numpy array.'''
    assert S[action] == 0, 'position should be empty to put a new stone'
    S[action] = P_no  # Use numpy indexing to place the stone at the chosen position

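# --- Illustrative sketch (added for clarity, not part of the original gist) ---
# set_state_inplace mutates the board array directly, which is why play() stores
# S.copy() in its buffers. A minimal usage example (the _demo_* name is hypothetical):
def _demo_set_state_inplace():
    S = np.zeros((9,), dtype='int16')        # empty 3x3 board, flattened
    set_state_inplace(S, action=4, P_no=1)   # player 1 takes the center cell
    print(S.reshape(3, 3))                   # the center entry is now 1
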
def calc_reward(S):
    mask_l = tf.constant([[1,1,1,0,0,0,0,0,0], [0,0,0,1,1,1,0,0,0], [0,0,0,0,0,0,1,1,1],
                          [1,0,0,1,0,0,1,0,0], [0,1,0,0,1,0,0,1,0], [0,0,1,0,0,1,0,0,1],
                          [1,0,0,0,1,0,0,0,1], [0,0,1,0,1,0,1,0,0]], dtype=tf.int16)
    for mask in mask_l:
        # print(mask)
        mask_S = mask * S
        # print(mask_S)
        for player in [1, 2]:
            abs_err = tf.reduce_sum(tf.abs(mask_S - player * mask))
            # print(abs_err)
            if abs_err == 0:
                # print(f'Player{player} wins')
                return player
    return 0

def calc_reward_tf(S):
    mask_l = tf.constant([[1,1,1,0,0,0,0,0,0], [0,0,0,1,1,1,0,0,0], [0,0,0,0,0,0,1,1,1],
                          [1,0,0,1,0,0,1,0,0], [0,1,0,0,1,0,0,1,0], [0,0,1,0,0,1,0,0,1],
                          [1,0,0,0,1,0,0,0,1], [0,0,1,0,1,0,1,0,0]], dtype=tf.int32)
    S = tf.constant(S, dtype=tf.int32)
    S = tf.reshape(S, shape=(1, -1))
    S_cp = tf.matmul(tf.ones((mask_l.shape[0], 1), dtype=tf.int32), S)
    mask_S = mask_l * S_cp
    for player in [1, 2]:
        if tf.reduce_any(tf.reduce_sum(tf.abs(mask_S - player * mask_l), axis=1) == 0):
            return player
    return 0

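# --- Illustrative sketch (added for clarity, not part of the original gist) ---
# calc_reward_tf checks all eight winning lines at once by broadcasting the board
# against the mask matrix; it returns 1 or 2 for the winning player, else 0.
# The _demo_* helper below is a hypothetical usage example.
def _demo_calc_reward_tf():
    S_win1 = np.array([1,1,1, 2,0,2, 2,0,0], dtype='int16')  # player 1 owns the top row
    S_open = np.zeros((9,), dtype='int16')                   # no winner yet
    print(calc_reward_tf(S_win1))  # expected: 1
    print(calc_reward_tf(S_open))  # expected: 0
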
def one_of_amax(arr, disp_flag=False):
    results = np.where(arr == np.amax(arr))[0]
    if disp_flag:
        print('Equally max actions:', results)
    action = results[np.random.randint(0, len(results), 1)[0]]
    return action

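# --- Illustrative sketch (added for clarity, not part of the original gist) ---
# one_of_amax breaks ties uniformly at random among all indices attaining the
# maximum, so untrained (all-zero) Q rows still spread their greedy choices.
# The _demo_* helper below is a hypothetical usage example.
def _demo_one_of_amax():
    print(one_of_amax(np.array([0.5, 0.1, 0.5])))  # prints 0 or 2, chosen at random
    print(one_of_amax(np.array([0.0, 0.0, 0.0])))  # any of 0, 1, 2
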
class Q_System:
    def __init__(self, N_A=9, N_Symbols=3, epsilon=0.01, disp_flag=False):
        """
        N_A : Number of actions
        N_Symbols : Number of possible symbols at each position: 0, 1, 2,
                    representing empty, player1, player2
        N_S : Number of states
        """
        if N_A is not None:
            self.disp_flag = disp_flag
            N_S = N_Symbols**N_A
            self.Qsa = [np.zeros((N_S, N_A)), np.zeros((N_S, N_A))]
            self.N_A = N_A
            self.N_Symbols = N_Symbols
            self.epsilon = epsilon
        else:
            self.disp_flag = False
            self.Qsa = None
            self.N_A = None
            self.N_Symbols = None
            self.epsilon = None

    def save(self):
        f = open('tictactoe_data.pckl', 'wb')
        obj = [self.N_A, self.N_Symbols, self.epsilon, self.Qsa]
        pickle.dump(obj, f)
        f.close()

    def load(self):
        f = open('tictactoe_data.pckl', 'rb')
        obj = pickle.load(f)
        [self.N_A, self.N_Symbols, self.epsilon, self.Qsa] = obj
        f.close()

    def calc_S_idx(self, S):
        S_idx = 0
        unit = 1
        for s in S:
            S_idx += s * unit
            unit *= self.N_Symbols
        return S_idx

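    # Note (added for clarity): calc_S_idx encodes the board as a base-3 number with
    # position 0 as the least significant digit. For example, the board
    # S = [0,0,2, 0,1,0, 1,0,2] maps to
    #     2*3**2 + 1*3**4 + 1*3**6 + 2*3**8 = 13950,
    # which matches the S_idx values printed by the tests below.
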
    def _policy_random(self, S, a):
        return 1 / self.N_A

    def policy_random(self, P_no, S, action_list):
        action_prob = []
        S_idx = self.calc_S_idx(S)
        for _ in action_list:
            action_prob.append(1 / len(action_list))
        action_idx = tf.squeeze(tf.random.categorical(tf.math.log([action_prob]), 1)).numpy()
        if action_idx == len(action_prob):  # if all entries in action_prob are zero
            action = action_list[tf.squeeze(np.random.randint(0, len(action_list), 1))]
        else:
            action = action_list[action_idx]
        if self.disp_flag:
            print('S_idx', S_idx, 'action', action, 'action_list', action_list, 'action_prob', action_prob)
        return action

    def policy(self, P_no, S, action_list):
        action_prob = []
        S_idx = self.calc_S_idx(S)
        for a in action_list:
            action_prob.append(self.Qsa[P_no-1][S_idx, a])
        # Choose the max-Q action with epsilon-greedy exploration
        if tf.squeeze(tf.random.uniform([1, 1])) > self.epsilon:
            action = action_list[one_of_amax(action_prob)]
        else:
            action = action_list[np.random.randint(0, len(action_list), 1)[0]]
        if self.disp_flag:
            print('S_idx', S_idx, 'action', action,
                  'action_list', action_list, 'action_prob', action_prob)
        return action

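    # Note (added for clarity): policy() is epsilon-greedy over the Q row of the
    # current state. With probability 1 - epsilon it picks (one of) the highest
    # Q-valued legal actions via one_of_amax; with probability epsilon it picks a
    # legal action uniformly at random. learning() anneals epsilon as
    # 1 / (1 + episode/100), so exploration fades as training proceeds.
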
    def _r0_policy(self, P_no, S, action_list):
        action_prob = []
        S_idx = self.calc_S_idx(S)
        for a in action_list:
            action_prob.append(self.Qsa[P_no-1][S_idx, a])
        action_idx = tf.squeeze(tf.random.categorical(tf.math.log([action_prob]), 1)).numpy()
        if action_idx == len(action_prob):  # if all entries in action_prob are zero
            action = action_list[tf.squeeze(np.random.randint(0, len(action_list), 1))]
        else:
            action = action_list[action_idx]
        if self.disp_flag:
            print('S_idx', S_idx, 'action', action, 'action_list', action_list, 'action_prob', action_prob)
        return action

    def find_action_list(self, S):
        action_list = []
        no_occupied = 0
        for a in range(self.N_A):
            if S[a] == 0:
                action_list.append(a)
            else:
                no_occupied += 1
        return action_list, no_occupied

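    # Note (added for clarity): find_action_list returns the indices of empty cells
    # plus the count of occupied cells. For S = [0,2,0, 0,1,0, 1,0,2] it returns
    # ([0, 2, 3, 5, 7], 4), as shown in the Testing class further below.
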
    # Take an action at the given state
    def get_action(self, P_no, S):
        """Return action, done
        """
        action_list, no_occupied = self.find_action_list(S)
        # Since the number of possible actions is reduced,
        # the denominator is updated as well.
        action = self.policy(P_no, S, action_list)
        done = no_occupied == (self.N_A - 1)
        return action, done

    def get_action_with_random(self, P_no, S):
        """Return action, done
        """
        action_list, no_occupied = self.find_action_list(S)
        # Since the number of possible actions is reduced,
        # the denominator is updated as well.
        if P_no == 1:
            action = self.policy(P_no, S, action_list)
        else:
            action = self.policy_random(P_no, S, action_list)
        done = no_occupied == (self.N_A - 1)
        return action, done

    def get_action_with_human(self, P_no, S):
        """
        action, done = get_action_with_human(self, P_no, S)
        - Playing with a human
        [Inputs]
        P_no : Human player index, where 1=first and 2=second player
        """
        action_list, no_occupied = self.find_action_list(S)
        # Since the number of possible actions is reduced,
        # the denominator is updated as well.
        print('The current game state is:')
        print(S.reshape(3, 3))
        print('Action index:')
        print(np.array(range(9)).reshape(3, 3))
        print('Available actions: ', action_list)
        rand_idx = np.random.randint(0, len(action_list))
        random_action = action_list[int(rand_idx)]
        action = None
        while action not in action_list:
            action = input_default(f'Type your action (default={random_action}): ', random_action, int)
            if action not in action_list:
                print('Type your action again, choosing from the available action list:', action_list)
        done = no_occupied == (self.N_A - 1)
        return action, done

    def play(self, P_no):
        """
        Buff = play(self, P_no)
        [Inputs]
        P_no: player number, which is 1 or 2
        [Returns]
        Buff = {'P_no': [], 'S': [], 'a': [], 'r': [], 'S_next': []}: information gathered during learning,
            where S, a, r, S_next are the state, action, reward, and next state
        [Examples]
        1. Buff = self.play(1)
        2. Buff = self.play(2)
        """
        N_A = self.N_A
        Buff = {'P_no': [], 'S': [], 'a': [], 'r': [], 'S_next': []}
        S = np.zeros((N_A,), dtype='int16')  # #states == #actions
        if self.disp_flag:
            print('S:', S)
        done = False
        while not done:
            action, done = self.get_action(P_no, S)
            Buff['P_no'].append(P_no)
            Buff['S'].append(S.copy())
            Buff['a'].append(action)
            set_state_inplace(S, action, P_no)
            Buff['S_next'].append(S.copy())
            if self.disp_flag:
                print('S:', S)
            win_player = calc_reward_tf(S)
            reward = 0 if win_player == 0 else 1
            Buff['r'].append(reward)
            P_no = 1 if P_no == 2 else 2
            if win_player:
                done = True
        if self.disp_flag:
            if win_player:
                print(f'player {win_player} wins')
            else:
                print('Tie game')
        return Buff

    def play_with_random(self, P_no):
        """
        Buff = play_with_random(self, P_no)
        - Player 1 uses the learned policy; player 2 plays at random
        [Inputs]
        P_no: player number, which is 1 or 2
        [Returns]
        Buff = {'P_no': [], 'S': [], 'a': [], 'r': [], 'S_next': []}: information gathered during learning,
            where S, a, r, S_next are the state, action, reward, and next state
        [Examples]
        1. Buff = self.play_with_random(1)
        2. Buff = self.play_with_random(2)
        """
        N_A = self.N_A
        Buff = {'P_no': [], 'S': [], 'a': [], 'r': [], 'S_next': []}
        S = np.zeros((N_A,), dtype='int16')  # #states == #actions
        if self.disp_flag:
            print('S:', S)
        done = False
        while not done:
            action, done = self.get_action_with_random(P_no, S)
            Buff['P_no'].append(P_no)
            Buff['S'].append(S.copy())
            Buff['a'].append(action)
            set_state_inplace(S, action, P_no)
            Buff['S_next'].append(S.copy())
            if self.disp_flag:
                print('S:', S)
            win_player = calc_reward_tf(S)
            reward = 0 if win_player == 0 else 1
            Buff['r'].append(reward)
            P_no = 1 if P_no == 2 else 2
            if win_player:
                done = True
        if self.disp_flag:
            if win_player:
                print(f'player {win_player} wins')
            else:
                print('Tie game')
        return Buff

    def play_with_human(self, player_human=1):
        """
        Buff = play_with_human(self, player_human)
        - Playing against a human
        [Inputs]
        player_human: the human player's number, which is 1 or 2
        [Returns]
        Buff = {'P_no': [], 'S': [], 'a': [], 'r': [], 'S_next': []}: information gathered during learning,
            where S, a, r, S_next are the state, action, reward, and next state
        [Examples]
        1. Buff = self.play_with_human(1)
        2. Buff = self.play_with_human(2)
        """
        N_A = self.N_A
        Buff = {'P_no': [], 'S': [], 'a': [], 'r': [], 'S_next': []}
        S = np.zeros((N_A,), dtype='int16')  # #states == #actions
        if self.disp_flag:
            print('S:', S)
        P_no = 1  # set P_no = 1 while player_human could be 1 or 2
        done = False
        while not done:
            if player_human == P_no:
                action, done = self.get_action_with_human(P_no, S)
            else:
                P_no_trained_agent = 1  # the random agent is 2
                action, done = self.get_action_with_random(P_no_trained_agent, S)
            Buff['P_no'].append(P_no)
            Buff['S'].append(S.copy())
            Buff['a'].append(action)
            set_state_inplace(S, action, P_no)
            Buff['S_next'].append(S.copy())
            if self.disp_flag:
                print('S:', S)
            win_player = calc_reward_tf(S)
            reward = 0 if win_player == 0 else 1
            Buff['r'].append(reward)
            P_no = 1 if P_no == 2 else 2
            if win_player:
                done = True
        print(S.reshape(3, 3))
        if win_player == player_human:
            print('You win')
        elif win_player != 0:
            print('You lose')
        else:
            print('Tie game')
        return Buff

    def update_Qsa_inplace(self, Buff, ff=0.9, lr=0.01):
        def discounted_inplace(Buff_r):
            """discounted_inplace(Buff_r):
            Convert a reward vector to a discounted return vector using ff,
            where ff is the forgetting (discount) factor.
            [Input] Buff_r = Buff['r']: rewards stored during an episode
            """
            g_prev = 0
            for i, r_l in enumerate(reversed(Buff_r)):
                Buff_r[-i-1] = r_l + ff * g_prev
                g_prev = Buff_r[-i-1]

        def updateQsa_inplace(Qsa_player, Buff_player):
            if self.disp_flag:
                print('---------------------------------------')
                print('S, S_idx, a, lr * r, Qsa_player[S_idx,a]')
            for S, a, r in zip(Buff_player['S'], Buff_player['a'], Buff_player['r']):
                S_idx = self.calc_S_idx(S)
                Qsa_player[S_idx, a] += lr * r
                if self.disp_flag:
                    print(S, S_idx, a, lr * r, Qsa_player[S_idx, a])

        # def updateQsa_stages_inplace(player, Qsa_player, Buff_player, disp_flag=True):

        def buff_depart(Buff):
            Buff_dual = [{'S': [], 'a': [], 'r': []}, {'S': [], 'a': [], 'r': []}]
            for i, (p, S, a, r) in enumerate(zip(Buff['P_no'], Buff['S'], Buff['a'], Buff['r'])):
                if i > 0:
                    # the final reward for a player is the reward of the next player
                    prev_p = 2 if p == 1 else 1
                    Buff_dual[prev_p-1]['r'][-1] = -r  # 1 for player#2 --> -1 for player#1, and vice versa
                    if self.disp_flag:
                        print('i, prev_p, Buff_dual[prev_p-1]')
                        print(i, prev_p, Buff_dual[prev_p-1])
                Buff_dual[p-1]['S'].append(S)
                Buff_dual[p-1]['a'].append(a)
                Buff_dual[p-1]['r'].append(r)
                if self.disp_flag:
                    print('i, p, Buff_dual[p-1]')
                    print(i, p, Buff_dual[p-1])
            return Buff_dual

        Buff_dual = buff_depart(Buff)
        # update both players' Q tables
        for player in [1, 2]:
            discounted_inplace(Buff_dual[player-1]['r'])  # for each player
            if self.disp_flag:
                print('player:', player)
                print("Buff_dual[player-1]['r']", Buff_dual[player-1]['r'])
            updateQsa_inplace(self.Qsa[player-1], Buff_dual[player-1])
            # updateQsa_stages_inplace(player, self.Qsa_stages[player-1], Buff_dual[player-1])

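    # Note (added for clarity): update_Qsa_inplace first splits the shared buffer
    # into one trajectory per player (buff_depart flips the loser's final reward to
    # -1), then converts each reward list to discounted returns. For example, with
    # ff = 0.9 a per-player reward list [0, 0, 1] becomes [0.81, 0.9, 1], and each
    # visited (state, action) entry of Qsa is then nudged by lr times its return.
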
    def learning(self, N_episodes=2, ff=0.9, lr=0.01, print_cnt=10):
        """Return:
        cnt_trace = [cnt, ...]: cnt vectors stacked over episodes
        """
        cnt = [0, 0, 0]  # tie, p1, p2
        cnt_trace = [cnt.copy()]
        player = 1
        for episode in range(N_episodes):
            # print('===================================')
            # This data could be saved for play 2 as well
            # Decrease epsilon with respect to the episode index
            self.epsilon = 1 / (1 + episode/100)
            # self.epsilon = 1 / (1 + episode)
            Buff = self.play(player)
            self.update_Qsa_inplace(Buff, ff=ff, lr=lr)
            win_player = 0 if Buff['r'][-1] == 0 else Buff['P_no'][-1]
            cnt[win_player] += 1
            cnt_trace.append(cnt.copy())
            player = 2 if player == 1 else 1
            if episode % print_cnt == 0:
                print(episode, cnt)
                print('S = [0,0,0, 0,0,0, 0,0,0]')
                print('Qsa[0][0,:]', [f'{self.Qsa[0][0,a]:.1e}' for a in range(9)])
                print('Qsa[1][0,:]', [f'{self.Qsa[1][0,a]:.1e}' for a in range(9)])
                S = [1,1,0, 2,1,2, 1,2,2]
                S_idx = self.calc_S_idx(S)
                print('S = ', S)
                print(f'Qsa[0][{S_idx},:]', [f'{self.Qsa[0][S_idx,a]:.1e}' for a in range(9)])
                print(f'Qsa[1][{S_idx},:]', [f'{self.Qsa[1][S_idx,a]:.1e}' for a in range(9)])
                S = [1,1,0, 2,0,0, 2,0,0]
                S_idx = self.calc_S_idx(S)
                print('S = ', S)
                print(f'Qsa[0][{S_idx},:]', [f'{self.Qsa[0][S_idx,a]:.1e}' for a in range(9)])
                print(f'Qsa[1][{S_idx},:]', [f'{self.Qsa[1][S_idx,a]:.1e}' for a in range(9)])
        return cnt_trace

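# --- Illustrative sketch (added for clarity, not part of the original gist) ---
# A minimal end-to-end run: train for a few self-play episodes and inspect the
# tie/win counts. learning_stage() below wraps the same steps with saving and
# plotting; the _demo_* name here is hypothetical.
def _demo_quick_training(N_episodes=100):
    agent = Q_System(N_A=9, N_Symbols=3)
    cnt_trace = agent.learning(N_episodes=N_episodes, ff=0.9, lr=0.01, print_cnt=N_episodes)
    print('[tie, player1 wins, player2 wins] =', cnt_trace[-1])
    return agent
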
class Testing:
    def __init__(self, fn_name):
        '''Usage:
        - Testing('calc_reward_tf')
        '''
        if fn_name == 'calc_reward_tf':
            self.test_calc_reward_tf()
        elif fn_name == 'find_action_list':
            self.test_find_action_list()
        elif fn_name == 'get_action':
            self.test_get_action()
        elif fn_name == 'all':
            self.test_calc_reward_tf()
            self.test_find_action_list()
            self.test_get_action()

    def test_calc_reward_tf(self):
        S_examples = tf.constant([[0,0,0, 0,0,0, 0,0,0],
                                  [1,1,1, 2,0,2, 2,0,0],
                                  [0,0,2, 1,2,1, 2,0,0]])
        print('===================================')
        print('Testing: calc_reward_tf')
        print('[Answer]')
        answer = [0, 1, 2]
        print(answer)
        print('-------------------------------------')
        print('[Test]')
        test = [calc_reward_tf(S) for S in S_examples]
        print(test)
        if test == answer:
            print('Test OK')
        else:
            print('Test fail')

    def test_find_action_list(self):
        print('===================================')
        print('Testing: test_find_action_list')
        print('[Answer]')
        print('''[[0 0 0]
 [0 0 0]
 [0 0 0]] [0, 1, 2, 3, 4, 5, 6, 7, 8] 0
[[0 2 0]
 [0 1 0]
 [1 0 2]] [0, 2, 3, 5, 7] 4''')
        N_A = 9
        N_Symbols = 3
        my_Q_System = Q_System(N_A, N_Symbols)
        print('-------------------------------------')
        print('[Test]')
        S_l = [[0,0,0, 0,0,0, 0,0,0], [0,2,0, 0,1,0, 1,0,2]]
        for S in S_l:
            action_list, no_occupied = my_Q_System.find_action_list(S)
            print(np.reshape(S, (3, 3)), action_list, no_occupied)

    def test_get_action(self):
        print('===================================')
        print('Testing: get_action')
        print('''[Answer]
Equally max actions: [0]
S_idx 0 action 0 action_list [0, 1, 2, 3, 4, 5, 6, 7, 8] action_prob [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[[0 0 0]
 [0 0 0]
 [0 0 0]] 1 0
Equally max actions: [0]
S_idx 13950 action 0 action_list [0, 1, 3, 5, 7] action_prob [1.0, 0.0, 0.0, 0.0, 0.0]
[[0 0 2]
 [0 1 0]
 [1 0 2]] 1 0
Equally max actions: [0]
S_idx 0 action 0 action_list [0, 1, 2, 3, 4, 5, 6, 7, 8] action_prob [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[[0 0 0]
 [0 0 0]
 [0 0 0]] 2 0
Equally max actions: [0]
S_idx 13950 action 0 action_list [0, 1, 3, 5, 7] action_prob [1.0, 0.0, 0.0, 0.0, 0.0]
[[0 0 2]
 [0 1 0]
 [1 0 2]] 2 0''')
        N_A = 9
        N_Symbols = 3
        my_Q_System = Q_System(N_A, N_Symbols)
        print('-------------------------------------')
        print('[Test]')
        S_l = [[0,0,0, 0,0,0, 0,0,0], [0,0,2, 0,1,0, 1,0,2]]
        for P_no in [1, 2]:
            for S in S_l:
                S_idx = my_Q_System.calc_S_idx(S)
                my_Q_System.Qsa[P_no-1][S_idx, :] = np.array([1.0,0.0,0, 0,0,0, 0,0,0])
                action, _ = my_Q_System.get_action(P_no, S)
                print(np.reshape(S, (3, 3)), P_no, action)

def _main():
    ff = 0.9
    lr = 0.01
    N_episodes = 2
    N_Symbols = 3  # 0=empty, 1=player1, 2=player2
    N_A = 9  # (0,0), (0,1), ..., (2,2)
    my_Q_System = Q_System(N_A, N_Symbols)
    cnt = [0, 0, 0]  # tie, p1, p2
    player = 1
    for episode in range(N_episodes):
        # print('===================================')
        # This data could be saved for play 2 as well
        Buff = my_Q_System.play(player)
        my_Q_System.update_Qsa_inplace(Buff, ff=ff, lr=lr)
        win_player = 0 if Buff['r'][-1] == 0 else Buff['P_no'][-1]
        cnt[win_player] += 1
        player = 2 if player == 1 else 1
        if episode % 10 == 0:
            print(episode, cnt)
    print(cnt)

def plot_cnt_trace(cnt_trace):
    N_cnt = len(cnt_trace)
    cnt_d = {'Equal': np.zeros(N_cnt, dtype=int), 'P1': np.zeros(N_cnt, dtype=int), 'P2': np.zeros(N_cnt, dtype=int)}
    for i, cnt in enumerate(cnt_trace):
        cnt_d['Equal'][i] = cnt[0]
        cnt_d['P1'][i] = cnt[1]
        cnt_d['P2'][i] = cnt[2]
    plt.plot(range(N_cnt), cnt_d['Equal'], label='Equal')
    plt.plot(range(N_cnt), cnt_d['P1'], label='Player1 wins')
    plt.plot(range(N_cnt), cnt_d['P2'], label='Player2 wins')
    plt.xlabel('Episode')
    plt.ylabel('Count')
    plt.legend(loc=0)
    plt.title('Learned (P#1) vs. Random (P#2) policies during learning')
    plt.show()

def learning_stage(N_episodes=100, save_flag=True, fig_flag=False):
    ff = 0.9
    lr = 0.01
    N_Symbols = 3  # 0=empty, 1=player1, 2=player2
    N_A = 9  # (0,0), (0,1), ..., (2,2)
    print_cnt = max(1, N_episodes // 10)  # print progress about ten times per run
    my_Q_System = Q_System(N_A, N_Symbols)
    cnt_trace = my_Q_System.learning(N_episodes=N_episodes, ff=ff, lr=lr, print_cnt=print_cnt)
    print('-------------------')
    cnt = cnt_trace[-1]
    print(N_episodes, cnt)
    if save_flag:
        my_Q_System.save()
    if fig_flag:
        plot_cnt_trace(cnt_trace)
    return my_Q_System

def input_default(prompt, default_value, dtype=int):
    answer = input(prompt)
    if answer == '':
        return default_value
    else:
        return dtype(answer)

def main():
    Q1 = input_default('1. Load a trained agent (0) or learn a new agent (1)? (default=0) ', 0, int)
    if Q1 == 0:
        print('Loading the trained agent...')
        Q2 = input_default('2. Do you want to play first? (0=yes, 1=no, default=0) ', 0, int)
        player_human = Q2 + 1
        if player_human == 1:
            print('You=1, Agent=2')
        else:
            print('Agent=1, You=2')
        trained_Q_System = Q_System(None)
        trained_Q_System.load()
        trained_Q_System.play_with_human(player_human)
        # print(len(trained_Q_System.Qsa))
    else:
        print('Starting to learn a new agent...')
        Q2 = input_default('2. How many episodes do you want to learn? (default=10000) ', 10000, int)
        # my_Q_System = learning_stage(N_episodes=Q2, fig_flag=True)
        _ = learning_stage(N_episodes=Q2, fig_flag=True)
        # print(len(my_Q_System.Qsa))


if __name__ == "__main__":
    main()
    # Testing('all')
jskDr commented Mar 15, 2020

The first version of my Tictactoe.
