Dviejopomata/q.py

## q.py
import itertools
import random

import numpy as np


class Game:
    """Three-cell, one-dimensional grid world for a tabular Q-learning demo.

    The board is a list of three integers, one per cell:

    * 0 - empty cell
    * 1 - player
    * 2 - reward

    Two actions exist: 0 moves the player left and 1 moves it right.
    """

    # Q-table row index for each of the six orderings of (0, 1, 2).
    # Keys are tuples so a board can be looked up directly by hashing.
    _STATE_IDS = {
        (1, 0, 2): 0,
        (0, 1, 2): 1,
        (0, 2, 1): 2,
        (1, 2, 0): 3,
        (2, 1, 0): 4,
        (2, 0, 1): 5,
    }

    def __init__(self):
        """Start with a random board and the two available actions."""
        self.state = random.sample([0, 1, 2], 3)
        # 0 - left, 1 - right
        self.actions = [0, 1]

    def step(self, action):
        """Apply one action to the board.

        A move that stays on the board relocates the player (the board list
        is mutated in place). Landing on the reward cell overwrites the
        reward marker and ends the episode; a move off either edge leaves the
        board untouched and is penalised.

        Once an episode is done the board no longer contains a reward, so
        calling ``step`` again without ``reset`` raises ``ValueError``.

        :param action: 1 moves right; any other value moves left
        :return: ``(state, reward, done, info)`` where ``state`` is the live
            board list (not a copy), ``reward`` is 10 on reaching the reward,
            -10 for an off-board move and 0 otherwise, ``done`` tells whether
            the reward was reached, and ``info`` is an empty dict
        """
        distance = 1 if action == 1 else -1
        player_idx = self.state.index(1)
        reward_idx = self.state.index(2)
        new_idx = player_idx + distance

        if not 0 <= new_idx <= 2:
            # Bumped into a wall: penalise and keep the board as it was.
            return self.state, -10, False, {}

        self.state[player_idx] = 0
        self.state[new_idx] = 1
        done = new_idx == reward_idx
        return self.state, (10 if done else 0), done, {}

    def reset(self):
        """Shuffle the board into a fresh random layout.

        :return: the new board list (the same object stored in ``self.state``)
        """
        self.state = random.sample([0, 1, 2], 3)
        return self.state

    def state_to_number(self):
        """Map the current board to its row index in the Q-table.

        :return: an int in ``range(6)``
        :raises ValueError: if the board is not an ordering of (0, 1, 2),
            e.g. the terminal board left behind after the reward was taken
        """
        index = self._STATE_IDS.get(tuple(self.state))
        if index is None:
            raise ValueError(
                "board {!r} has no Q-table index".format(self.state)
            )
        return index


# Tabular Q-learning on the three-cell ``Game``: the table is printed once
# before training and once after NUM_EPISODES episodes.
game = Game()
LR = 0.01  # learning rate used for non-terminal updates
GAMMA = 0.9  # discount applied to the best next-state value
NUM_EPISODES = 1000
epsilon = 0.8  # probability of picking a random action instead of the greedy one

# Number of distinct boards: every ordering of the three cell values (3! = 6).
comb_states = sum(1 for _ in itertools.permutations(range(3)))

# Alternative random initialisation:
# q_table = np.random.uniform(low=-1, high=1, size=(comb_states, len(game.actions)))
q_table = np.array([
    # state 0 [1,0,2]
    [10, -1],
    # state 1 [0,1,2]
    [10, -1],
    # state 2 [0,2,1]
    [-1, 1],
    # state 3 [1,2,0]
    [-1, 1],
    # state 4 [2,1,0]
    [-1, 1],
    # state 5 [2,0,1]
    [-1, 1],
], dtype="float64")


def _state_row(env):
    """Return the Q-table row of ``env``'s current board as a plain int.

    ``state_to_number`` may hand back the index wrapped in a one-element
    list; indexing the table with a list triggers NumPy advanced indexing
    (copies, array-valued deltas), so the value is unwrapped here.
    ``ndarray.item`` raises ``ValueError`` unless exactly one index came back.
    """
    return int(np.asarray(env.state_to_number()).item())


print(q_table)
for i in range(NUM_EPISODES):
    state = game.reset()
    state_id = _state_row(game)
    while True:
        # Exploit with probability 1 - epsilon, explore otherwise.
        if np.random.random() < 1 - epsilon:
            action = int(np.argmax(q_table[state_id]))
        else:
            action = np.random.randint(0, len(game.actions))
        state2, reward, done, info = game.step(action)
        if done:
            # Terminal transition: nothing to bootstrap from, so the entry
            # is set to the reward outright (LR is not applied here).
            q_table[state_id, action] = reward
            break
        state2_id = _state_row(game)
        # Temporal-difference step towards reward + discounted best next value.
        delta = LR * (reward + GAMMA * np.max(q_table[state2_id]) - q_table[state_id, action])
        q_table[state_id, action] += delta
        state_id = state2_id
print(q_table)
	import itertools
	import random

	import numpy as np


	class Game:
	    """Three-cell, one-dimensional grid world for a tabular Q-learning demo.

	    The board is a list of three integers, one per cell:

	    * 0 - empty cell
	    * 1 - player
	    * 2 - reward

	    Two actions exist: 0 moves the player left and 1 moves it right.
	    """

	    # Q-table row index for each of the six orderings of (0, 1, 2).
	    # Keys are tuples so a board can be looked up directly by hashing.
	    _STATE_IDS = {
	        (1, 0, 2): 0,
	        (0, 1, 2): 1,
	        (0, 2, 1): 2,
	        (1, 2, 0): 3,
	        (2, 1, 0): 4,
	        (2, 0, 1): 5,
	    }

	    def __init__(self):
	        """Start with a random board and the two available actions."""
	        self.state = random.sample([0, 1, 2], 3)
	        # 0 - left, 1 - right
	        self.actions = [0, 1]

	    def step(self, action):
	        """Apply one action to the board.

	        A move that stays on the board relocates the player (the board list
	        is mutated in place). Landing on the reward cell overwrites the
	        reward marker and ends the episode; a move off either edge leaves the
	        board untouched and is penalised.

	        Once an episode is done the board no longer contains a reward, so
	        calling ``step`` again without ``reset`` raises ``ValueError``.

	        :param action: 1 moves right; any other value moves left
	        :return: ``(state, reward, done, info)`` where ``state`` is the live
	            board list (not a copy), ``reward`` is 10 on reaching the reward,
	            -10 for an off-board move and 0 otherwise, ``done`` tells whether
	            the reward was reached, and ``info`` is an empty dict
	        """
	        distance = 1 if action == 1 else -1
	        player_idx = self.state.index(1)
	        reward_idx = self.state.index(2)
	        new_idx = player_idx + distance

	        if not 0 <= new_idx <= 2:
	            # Bumped into a wall: penalise and keep the board as it was.
	            return self.state, -10, False, {}

	        self.state[player_idx] = 0
	        self.state[new_idx] = 1
	        done = new_idx == reward_idx
	        return self.state, (10 if done else 0), done, {}

	    def reset(self):
	        """Shuffle the board into a fresh random layout.

	        :return: the new board list (the same object stored in ``self.state``)
	        """
	        self.state = random.sample([0, 1, 2], 3)
	        return self.state

	    def state_to_number(self):
	        """Map the current board to its row index in the Q-table.

	        :return: an int in ``range(6)``
	        :raises ValueError: if the board is not an ordering of (0, 1, 2),
	            e.g. the terminal board left behind after the reward was taken
	        """
	        index = self._STATE_IDS.get(tuple(self.state))
	        if index is None:
	            raise ValueError(
	                "board {!r} has no Q-table index".format(self.state)
	            )
	        return index


	# Tabular Q-learning on the three-cell ``Game``: the table is printed once
	# before training and once after NUM_EPISODES episodes.
	game = Game()
	LR = 0.01  # learning rate used for non-terminal updates
	GAMMA = 0.9  # discount applied to the best next-state value
	NUM_EPISODES = 1000
	epsilon = 0.8  # probability of picking a random action instead of the greedy one

	# Number of distinct boards: every ordering of the three cell values (3! = 6).
	comb_states = sum(1 for _ in itertools.permutations(range(3)))

	# Alternative random initialisation:
	# q_table = np.random.uniform(low=-1, high=1, size=(comb_states, len(game.actions)))
	q_table = np.array([
	    # state 0 [1,0,2]
	    [10, -1],
	    # state 1 [0,1,2]
	    [10, -1],
	    # state 2 [0,2,1]
	    [-1, 1],
	    # state 3 [1,2,0]
	    [-1, 1],
	    # state 4 [2,1,0]
	    [-1, 1],
	    # state 5 [2,0,1]
	    [-1, 1],
	], dtype="float64")


	def _state_row(env):
	    """Return the Q-table row of ``env``'s current board as a plain int.

	    ``state_to_number`` may hand back the index wrapped in a one-element
	    list; indexing the table with a list triggers NumPy advanced indexing
	    (copies, array-valued deltas), so the value is unwrapped here.
	    ``ndarray.item`` raises ``ValueError`` unless exactly one index came back.
	    """
	    return int(np.asarray(env.state_to_number()).item())


	print(q_table)
	for i in range(NUM_EPISODES):
	    state = game.reset()
	    state_id = _state_row(game)
	    while True:
	        # Exploit with probability 1 - epsilon, explore otherwise.
	        if np.random.random() < 1 - epsilon:
	            action = int(np.argmax(q_table[state_id]))
	        else:
	            action = np.random.randint(0, len(game.actions))
	        state2, reward, done, info = game.step(action)
	        if done:
	            # Terminal transition: nothing to bootstrap from, so the entry
	            # is set to the reward outright (LR is not applied here).
	            q_table[state_id, action] = reward
	            break
	        state2_id = _state_row(game)
	        # Temporal-difference step towards reward + discounted best next value.
	        delta = LR * (reward + GAMMA * np.max(q_table[state2_id]) - q_table[state_id, action])
	        q_table[state_id, action] += delta
	        state_id = state2_id
	print(q_table)