Q-Learning implementation to solve the maze escape problem using Reinforcement Learning
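The script models the room-escape example from the tutorial linked in the docstring: six states (rooms 0-5), with state 5 as the exit, and any move into state 5 paying a reward of 100. At each step it applies the Q-learning update Q(state, action) = R(state, action) + gamma * max(Q(next_state, a') over all a'), with gamma = 0.5. Note that actions are always chosen greedily (the current max Q-value) rather than with an exploration strategy such as epsilon-greedy; that is workable on a graph this small but would not scale.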
'''
@author: Prakhar
Motivation: http://mnemstudio.org/path-finding-q-learning-tutorial.htm
'''
import random
from termcolor import colored
class QLearning:
    def __init__(self):
        # initializing the environment here
        # (ideally this would live in a separate Environment class)
        # valid transitions between states (rooms); state 5 is the exit
        self.state_action_mat = {
            0: [4],
            1: [3, 5],
            2: [3],
            3: [1, 2, 4],
            4: [0, 3, 5],
            5: [1, 4, 5]
        }
        # immediate reward R(state, action); any move into state 5 pays 100
        self.state_action_reward_mat = {
            (0, 4): 0,
            (1, 3): 0,
            (1, 5): 100,
            (2, 3): 0,
            (3, 1): 0,
            (3, 2): 0,
            (3, 4): 0,
            (4, 0): 0,
            (4, 3): 0,
            (4, 5): 100,
            (5, 1): 0,
            (5, 4): 0,
            (5, 5): 100,
        }
        self.q_matrix = {}
        self.goal_state = 5
        self.gamma = 0.5  # discount factor
        self.episodes = 50
        self.states = 6
    def populate_qmat(self):
        # initialise Q(state, action) to zero for every pair
        for state in range(self.states):
            for action in range(self.states):
                self.q_matrix[(state, action)] = 0

    def random_state(self):
        return random.choice(list(self.state_action_mat.keys()))

    def max_reward_next_state(self, state):
        # highest Q-value obtainable from the given state
        return max(val for (s, a), val in self.q_matrix.items() if s == state)

    def find_action(self, state):
        # greedy action: the one with the highest Q-value from this state
        candidates = {a: val for (s, a), val in self.q_matrix.items() if s == state}
        return max(candidates, key=candidates.get)
# Instance of QLearning
qlearn = QLearning()
# initialize the Q-matrix with zero scores
qlearn.populate_qmat()

# loop through multiple episodes
for e in range(qlearn.episodes):
    # start each episode from a random state
    state = qlearn.random_state()
    # steps taken to reach the goal
    steps = 0
    # states visited in this episode
    path = []
    # run until the goal is reached
    while True:
        steps += 1
        # greedily pick the action with the max Q-value from this state
        action = qlearn.find_action(state)
        # in this setup, the chosen action is the next state
        next_state = action
        # best Q-value obtainable from the next state
        qmax_next_state = qlearn.max_reward_next_state(next_state)
        # immediate reward; None means this transition is not possible
        reward = qlearn.state_action_reward_mat.get((state, action))
        if reward is None:
            reward = -1  # penalise impossible transitions
        # Q-matrix update: Q(s, a) = R(s, a) + gamma * max_a' Q(s', a')
        qlearn.q_matrix[(state, action)] = reward + qlearn.gamma * qmax_next_state
        # record the path and move to the next state
        path.append(state)
        state = next_state
        if state == qlearn.goal_state:
            path.append(state)
            print('Reached the goal in episode {} in {} steps via path {}'.format(
                colored(e, 'red'), colored(steps, 'green'), path))
            break
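For quick verification after training, here is a minimal sketch (not part of the original gist; greedy_path is a hypothetical helper) that walks the greedy policy from a chosen start room using only the qlearn instance defined above:

def greedy_path(qlearn, start, max_steps=10):
    # follow the highest-Q action from each state until the goal (or step cap)
    path = [start]
    state = start
    while state != qlearn.goal_state and len(path) <= max_steps:
        state = qlearn.find_action(state)  # in this setup, action == next state
        path.append(state)
    return path

# e.g. starting inside room 2; a learned escape path might look like [2, 3, 1, 5]
print(greedy_path(qlearn, start=2))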