Skip to content

Instantly share code, notes, and snippets.

@prakhar21
Last active May 5, 2018 06:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save prakhar21/564201531b9b46702d36fcfff0683655 to your computer and use it in GitHub Desktop.
Save prakhar21/564201531b9b46702d36fcfff0683655 to your computer and use it in GitHub Desktop.
Q-Learning Implementation to solve maze escape problem using Reinforcement Learning
'''
@author: Prakhar
Motivation: http://mnemstudio.org/path-finding-q-learning-tutorial.htm
'''
import random
from termcolor import colored
class QLearning:
    """Tabular Q-learning for the 6-room maze-escape problem.

    States 0-5 are rooms; state 5 is outside the building (the goal).
    ``state_action_mat`` lists the rooms reachable in one move from each
    room, and ``state_action_reward_mat`` gives the immediate reward for
    each legal (state, action) pair: 100 for moves into the goal, 0
    otherwise.  Q-values are stored in a flat dict keyed by
    (state, action).
    """

    def __init__(self):
        # Environment definition — ideally this would live in its own
        # Environment class, but it is kept inline for simplicity.
        # For each state, the list of states reachable in one move.
        self.state_action_mat = {
            0: [4],
            1: [3, 5],
            2: [3],
            3: [1, 2, 4],
            4: [0, 3, 5],
            5: [1, 4, 5],
        }
        # Immediate reward for each legal (state, action) transition;
        # entering the goal state (5) pays 100, every other legal move 0.
        self.state_action_reward_mat = {
            (0, 4): 0,
            (1, 3): 0,
            (1, 5): 100,
            (2, 3): 0,
            (3, 1): 0,
            (3, 2): 0,
            (3, 4): 0,
            (4, 0): 0,
            (4, 4): 0,
            (4, 5): 100,
            (5, 1): 0,
            (5, 4): 0,
            (5, 5): 100,
        }
        # Learned Q-values, keyed by (state, action); filled by populate_qmat.
        self.q_matrix = {}
        self.goal_state = 5
        # Discount factor for the Bellman update.
        self.gamma = 0.5
        # Duplicate of goal_state, kept for backward compatibility with callers.
        self.final_state = 5
        self.episodes = 50
        self.states = 6

    def populate_qmat(self):
        """Initialise Q(s, a) = 0 for every (state, action) pair."""
        # range (not Python-2-only xrange) so this runs on Python 2 and 3.
        for state in range(self.states):
            for action in range(self.states):
                self.q_matrix[(state, action)] = 0

    def random_state(self):
        """Return a uniformly random starting state."""
        # list(...) is required on Python 3, where dict.keys() is a view
        # that random.choice cannot index.
        return random.choice(list(self.state_action_mat.keys()))

    def max_reward_next_state(self, state):
        """Return max_a Q(state, a), the best Q-value available from ``state``."""
        # max() is O(n); the original sorted all candidates just to take
        # the first, and iteritems is Python-2-only.
        return max(val for (s, _a), val in self.q_matrix.items() if s == state)

    def find_action(self, state):
        """Return the greedy action argmax_a Q(state, a) from ``state``."""
        candidates = {sa: val for sa, val in self.q_matrix.items() if sa[0] == state}
        # max with key= picks the first maximal entry in iteration order,
        # matching the original stable sorted(..., reverse=True)[0] tie-break.
        best_pair = max(candidates.items(), key=lambda item: item[1])
        return best_pair[0][1]
# Instance of QLearning
qlearn = QLearning()
# initialize Q-Matrix to 0 (zero) score values
qlearn.populate_qmat()
# loop through multiple episodes
for e in range(qlearn.episodes):
    # start each episode from a random state
    initial_state = qlearn.random_state()
    start = initial_state
    # steps taken to reach the goal this episode
    steps = 0
    # states visited this episode
    path = []
    # walk until the goal state is reached
    while True:
        steps += 1
        # greedy policy: the action with the highest current Q-value
        action = qlearn.find_action(initial_state)
        # in this environment the chosen action *is* the next state
        nextstate = action
        # max_a' Q(s', a') for the Bellman update
        qmax_next_state = qlearn.max_reward_next_state(nextstate)
        # immediate reward; .get() returns None for transitions that
        # do not exist in the environment
        reward = qlearn.state_action_reward_mat.get((initial_state, action))
        # BUG FIX: the original tested `if not reward`, which also
        # matched legal zero-reward moves (0 is falsy) and wrongly
        # penalised them with -1.  Only a missing entry — an invalid
        # transition — should be penalised.
        if reward is None:
            reward = -1
        # Q(s, a) <- r + gamma * max_a' Q(s', a')
        qlearn.q_matrix[(initial_state, action)] = reward + qlearn.gamma * qmax_next_state
        # record the visited state
        path.append(initial_state)
        # traverse to the next state
        initial_state = nextstate
        if initial_state == qlearn.final_state:
            print('Reached the goal in the episode number {} in {}'.
                  format(colored(e, 'red'), colored(steps, 'green')))
            path = []
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment