Skip to content

Instantly share code, notes, and snippets.

@prakhar21
Last active May 5, 2018 06:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save prakhar21/564201531b9b46702d36fcfff0683655 to your computer and use it in GitHub Desktop.
Save prakhar21/564201531b9b46702d36fcfff0683655 to your computer and use it in GitHub Desktop.
Q-Learning Implementation to solve maze escape problem using Reinforcement Learning
'''
@author: Prakhar
Motivation: http://mnemstudio.org/path-finding-q-learning-tutorial.htm
'''
import random
from termcolor import colored
class QLearning:
    """Tabular Q-learning for the 6-room maze-escape problem.

    States 0-5 are rooms; state 5 is outside the building (the goal).
    ``state_action_mat`` lists the rooms reachable in one move from each
    room, and ``state_action_reward_mat`` gives the immediate reward for
    each legal (state, action) pair: 100 for moves into the goal, 0
    otherwise.  Q-values are stored in a flat dict keyed by
    (state, action).
    """

    def __init__(self):
        # Environment definition — ideally this would live in its own
        # Environment class, but it is kept inline for simplicity.
        # For each state, the list of states reachable in one move.
        self.state_action_mat = {
            0: [4],
            1: [3, 5],
            2: [3],
            3: [1, 2, 4],
            4: [0, 3, 5],
            5: [1, 4, 5],
        }
        # Immediate reward for each legal (state, action) transition;
        # entering the goal state (5) pays 100, every other legal move 0.
        self.state_action_reward_mat = {
            (0, 4): 0,
            (1, 3): 0,
            (1, 5): 100,
            (2, 3): 0,
            (3, 1): 0,
            (3, 2): 0,
            (3, 4): 0,
            (4, 0): 0,
            (4, 4): 0,
            (4, 5): 100,
            (5, 1): 0,
            (5, 4): 0,
            (5, 5): 100,
        }
        # Learned Q-values, keyed by (state, action); filled by populate_qmat.
        self.q_matrix = {}
        self.goal_state = 5
        # Discount factor for the Bellman update.
        self.gamma = 0.5
        # Duplicate of goal_state, kept for backward compatibility with callers.
        self.final_state = 5
        self.episodes = 50
        self.states = 6

    def populate_qmat(self):
        """Initialise Q(s, a) = 0 for every (state, action) pair."""
        # range (not Python-2-only xrange) so this runs on Python 2 and 3.
        for state in range(self.states):
            for action in range(self.states):
                self.q_matrix[(state, action)] = 0

    def random_state(self):
        """Return a uniformly random starting state."""
        # list(...) is required on Python 3, where dict.keys() is a view
        # that random.choice cannot index.
        return random.choice(list(self.state_action_mat.keys()))

    def max_reward_next_state(self, state):
        """Return max_a Q(state, a), the best Q-value available from ``state``."""
        # max() is O(n); the original sorted all candidates just to take
        # the first, and iteritems is Python-2-only.
        return max(val for (s, _a), val in self.q_matrix.items() if s == state)

    def find_action(self, state):
        """Return the greedy action argmax_a Q(state, a) from ``state``."""
        candidates = {sa: val for sa, val in self.q_matrix.items() if sa[0] == state}
        # max with key= picks the first maximal entry in iteration order,
        # matching the original stable sorted(..., reverse=True)[0] tie-break.
        best_pair = max(candidates.items(), key=lambda item: item[1])
        return best_pair[0][1]
# Instance of QLearning
qlearn = QLearning()
# initialize Q-Matrix to 0 (zero) score values
qlearn.populate_qmat()
# loop through multiple episodes
for e in range(qlearn.episodes):
    # start each episode from a random state
    initial_state = qlearn.random_state()
    start = initial_state
    # steps taken to reach the goal this episode
    steps = 0
    # states visited this episode
    path = []
    # walk until the goal state is reached
    while True:
        steps += 1
        # greedy policy: the action with the highest current Q-value
        action = qlearn.find_action(initial_state)
        # in this environment the chosen action *is* the next state
        nextstate = action
        # max_a' Q(s', a') for the Bellman update
        qmax_next_state = qlearn.max_reward_next_state(nextstate)
        # immediate reward; .get() returns None for transitions that
        # do not exist in the environment
        reward = qlearn.state_action_reward_mat.get((initial_state, action))
        # BUG FIX: the original tested `if not reward`, which also
        # matched legal zero-reward moves (0 is falsy) and wrongly
        # penalised them with -1.  Only a missing entry — an invalid
        # transition — should be penalised.
        if reward is None:
            reward = -1
        # Q(s, a) <- r + gamma * max_a' Q(s', a')
        qlearn.q_matrix[(initial_state, action)] = reward + qlearn.gamma * qmax_next_state
        # record the visited state
        path.append(initial_state)
        # traverse to the next state
        initial_state = nextstate
        if initial_state == qlearn.final_state:
            print('Reached the goal in the episode number {} in {}'.
                  format(colored(e, 'red'), colored(steps, 'green')))
            path = []
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment