lincerely/q-learning.py

## q-learning.py
#!/usr/bin/env python
#http://mnemstudio.org/path-finding-q-learning-tutorial.htm
#tested on python and python3

import numpy as np
import os

# Reward matrix of the room-navigation example: R[s, a] is the immediate
# reward for moving from room s to room a.  -1 marks a move that is not
# allowed, 0 an allowed move, and 100 a move into the goal room (5).
R = np.array([[-1, -1, -1, -1, 0, -1],
              [-1, -1, -1, 0, -1, 100],
              [-1, -1, -1, 0, -1, -1],
              [-1, 0, 0, -1, 0, -1],
              [0, -1, -1, 0, -1, 100],
              [-1, 0, -1, -1, 0, 100]]).astype("float16")

gamma = 0.8       # discount factor applied to the best next-state value
max_iter = 1000   # number of training episodes
goal_state = 5    # an episode ends as soon as this room is entered


def train(rewards, discount=0.8, episodes=1000, goal=5, show_progress=True):
    """Learn a Q-table by random exploration of the reward matrix.

    Each episode starts in a random state and repeatedly takes a random
    allowed action (any column whose reward is >= 0) until ``goal`` is
    entered.  Taking action ``a`` moves the agent to state ``a``.

    Args:
        rewards: square array; ``rewards[s, a] < 0`` means the move is
            not allowed.
        discount: weight of the best value reachable from the next state.
        episodes: number of training episodes to run.
        goal: state that terminates an episode when entered.
        show_progress: when true, print the rounded table after every
            step and then clear the terminal (slow: one subprocess per
            step).  Pass ``False`` to skip straight to the result.

    Returns:
        The learned Q-table as a float64 array shaped like ``rewards``.
    """
    n_states = rewards.shape[0]
    # The table is float64 rather than the float16 of the reward matrix:
    # half precision quantises values near 500 to steps of 0.25, so the
    # learned values would drift from the exact fixed point.
    q_table = np.zeros(rewards.shape, dtype=np.float64)

    for episode in range(int(episodes)):
        # select a random initial state
        state = np.random.randint(n_states)
        reached_goal = False

        while not reached_goal:
            valid_actions = np.flatnonzero(rewards[state] >= 0)
            # Draw a scalar directly; converting a 1-element array with
            # int() is deprecated in recent NumPy releases.
            action = int(np.random.choice(valid_actions))
            next_state = action

            q_table[state, action] = (rewards[state, action]
                                      + discount * q_table[next_state].max())

            if show_progress:
                print(np.rint(q_table))
                print('current state: ', state)
                print('iterations:', episode)
                os.system('cls' if os.name == 'nt' else 'clear')

            reached_goal = next_state == goal
            state = next_state

    return q_table


def greedy_path(q_table, start, goal, max_steps=1000):
    """Follow the highest-valued action from ``start`` until ``goal``.

    Ties between equally valued actions are broken at random.

    Args:
        q_table: square array of action values.
        start: state the walk begins in.
        goal: state at which the walk stops.
        max_steps: upper bound on the number of moves.

    Returns:
        The visited states, including ``start`` and ``goal``, as a list
        of ints.

    Raises:
        RuntimeError: if ``goal`` is not reached within ``max_steps``
            moves, e.g. because the table is untrained and the greedy
            choice cycles.
    """
    state = int(start)
    path = [state]
    while state != goal:
        if len(path) > max_steps:
            raise RuntimeError(
                'goal %s not reached from %s within %d steps'
                % (goal, start, max_steps))
        row = q_table[state]
        best_actions = np.flatnonzero(row == row.max())
        state = int(np.random.choice(best_actions))
        path.append(state)
    return path


def main():
    """Train on ``R``, print the learned table and a path from room 2."""
    q_table = train(R, discount=gamma, episodes=max_iter, goal=goal_state)

    # display final Q
    print('\nTrained Q: ')
    q_table = np.rint(q_table)
    print(q_table)

    normalized = q_table / np.max(q_table) * 100
    print('\nnormalized Q: ')
    print(np.rint(normalized))

    # testing: walk greedily from room 2 to the goal room
    steps = greedy_path(q_table, start=2, goal=goal_state)
    print('\nFrom 2 to 5')
    print(steps)


if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	#http://mnemstudio.org/path-finding-q-learning-tutorial.htm
	#tested on python and python3

	import numpy as np
	import os

	# Reward matrix of the room-navigation example: R[s, a] is the immediate
	# reward for moving from room s to room a.  -1 marks a move that is not
	# allowed, 0 an allowed move, and 100 a move into the goal room (5).
	R = np.array([[-1, -1, -1, -1, 0, -1],
	              [-1, -1, -1, 0, -1, 100],
	              [-1, -1, -1, 0, -1, -1],
	              [-1, 0, 0, -1, 0, -1],
	              [0, -1, -1, 0, -1, 100],
	              [-1, 0, -1, -1, 0, 100]]).astype("float16")

	gamma = 0.8       # discount factor applied to the best next-state value
	max_iter = 1000   # number of training episodes
	goal_state = 5    # an episode ends as soon as this room is entered


	def train(rewards, discount=0.8, episodes=1000, goal=5, show_progress=True):
	    """Learn a Q-table by random exploration of the reward matrix.

	    Each episode starts in a random state and repeatedly takes a random
	    allowed action (any column whose reward is >= 0) until ``goal`` is
	    entered.  Taking action ``a`` moves the agent to state ``a``.

	    Args:
	        rewards: square array; ``rewards[s, a] < 0`` means the move is
	            not allowed.
	        discount: weight of the best value reachable from the next state.
	        episodes: number of training episodes to run.
	        goal: state that terminates an episode when entered.
	        show_progress: when true, print the rounded table after every
	            step and then clear the terminal (slow: one subprocess per
	            step).  Pass ``False`` to skip straight to the result.

	    Returns:
	        The learned Q-table as a float64 array shaped like ``rewards``.
	    """
	    n_states = rewards.shape[0]
	    # The table is float64 rather than the float16 of the reward matrix:
	    # half precision quantises values near 500 to steps of 0.25, so the
	    # learned values would drift from the exact fixed point.
	    q_table = np.zeros(rewards.shape, dtype=np.float64)

	    for episode in range(int(episodes)):
	        # select a random initial state
	        state = np.random.randint(n_states)
	        reached_goal = False

	        while not reached_goal:
	            valid_actions = np.flatnonzero(rewards[state] >= 0)
	            # Draw a scalar directly; converting a 1-element array with
	            # int() is deprecated in recent NumPy releases.
	            action = int(np.random.choice(valid_actions))
	            next_state = action

	            q_table[state, action] = (rewards[state, action]
	                                      + discount * q_table[next_state].max())

	            if show_progress:
	                print(np.rint(q_table))
	                print('current state: ', state)
	                print('iterations:', episode)
	                os.system('cls' if os.name == 'nt' else 'clear')

	            reached_goal = next_state == goal
	            state = next_state

	    return q_table


	def greedy_path(q_table, start, goal, max_steps=1000):
	    """Follow the highest-valued action from ``start`` until ``goal``.

	    Ties between equally valued actions are broken at random.

	    Args:
	        q_table: square array of action values.
	        start: state the walk begins in.
	        goal: state at which the walk stops.
	        max_steps: upper bound on the number of moves.

	    Returns:
	        The visited states, including ``start`` and ``goal``, as a list
	        of ints.

	    Raises:
	        RuntimeError: if ``goal`` is not reached within ``max_steps``
	            moves, e.g. because the table is untrained and the greedy
	            choice cycles.
	    """
	    state = int(start)
	    path = [state]
	    while state != goal:
	        if len(path) > max_steps:
	            raise RuntimeError(
	                'goal %s not reached from %s within %d steps'
	                % (goal, start, max_steps))
	        row = q_table[state]
	        best_actions = np.flatnonzero(row == row.max())
	        state = int(np.random.choice(best_actions))
	        path.append(state)
	    return path


	def main():
	    """Train on ``R``, print the learned table and a path from room 2."""
	    q_table = train(R, discount=gamma, episodes=max_iter, goal=goal_state)

	    # display final Q
	    print('\nTrained Q: ')
	    q_table = np.rint(q_table)
	    print(q_table)

	    normalized = q_table / np.max(q_table) * 100
	    print('\nnormalized Q: ')
	    print(np.rint(normalized))

	    # testing: walk greedily from room 2 to the goal room
	    steps = greedy_path(q_table, start=2, goal=goal_state)
	    print('\nFrom 2 to 5')
	    print(steps)


	if __name__ == '__main__':
	    main()