@SolClover
Created October 16, 2022 06:30
Define functions to use in training and evaluation
import numpy as np

# Acting policy (epsilon-greedy): selects an action for exploration/exploitation during training.
# Note: `env` is expected to exist in the enclosing scope (e.g., a Gym environment).
def epsilon_greedy(Qtable, state, epsilon):
    # Draw a random number and compare it to epsilon: if lower, explore; otherwise exploit
    randnum = np.random.uniform(0, 1)
    if randnum < epsilon:
        action = env.action_space.sample()    # explore: random action
    else:
        action = np.argmax(Qtable[state, :])  # exploit: best known action
    return action
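As a quick illustration (a sketch, not part of the original gist; the Q-table below is made up), setting epsilon to 0 forces pure exploitation, so the function simply returns the argmax action:

# Hypothetical one-state, four-action Q-table, for illustration only
Qtable_demo = np.array([[0.1, 0.5, 0.2, 0.0]])
epsilon_greedy(Qtable_demo, state=0, epsilon=0.0)  # returns 1, the index of the largest value (0.5)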
# Update the Q-table with the SARSA rule. The update is on-policy: next_action is
# chosen by the same epsilon-greedy policy that is used for acting.
# Note: `alpha` (learning rate) and `gamma` (discount factor) are read from the enclosing scope.
def update_Q(Qtable, state, action, reward, next_state, next_action):
    # Q(S_t, A_t) = Q(S_t, A_t) + alpha * [R_{t+1} + gamma * Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t)]
    Qtable[state][action] = Qtable[state][action] + alpha * (
        reward + gamma * Qtable[next_state][next_action] - Qtable[state][action]
    )
    return Qtable
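To make the update concrete, here is one hand-worked step (a sketch; the alpha, gamma, and Q-values are illustrative assumptions, since update_Q reads the hyperparameters from the enclosing scope):

alpha, gamma = 0.1, 0.9          # assumed hyperparameters, for this example only
Qtable_demo = np.zeros((2, 2))   # hypothetical 2-state, 2-action Q-table
Qtable_demo[1][0] = 1.0          # pretend Q(S_{t+1}, A_{t+1}) is already 1.0
update_Q(Qtable_demo, state=0, action=0, reward=1.0, next_state=1, next_action=0)
# New Q(0, 0) = 0 + 0.1 * (1.0 + 0.9 * 1.0 - 0) = 0.19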
# Greedy policy used during evaluation: always return the highest-valued action from the Q-table
def eval_greedy(Qtable, state):
    action = np.argmax(Qtable[state, :])
    return action
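The three functions fit together as sketched below. This is a minimal sketch, not the author's original training script: the environment, episode count, and hyperparameter values are assumptions for illustration, and the classic Gym reset/step API is assumed (newer Gymnasium versions return extra values).

import gym

env = gym.make("FrozenLake-v1")        # assumed environment, for illustration
alpha, gamma, epsilon = 0.1, 0.9, 0.1  # assumed hyperparameters, read as globals by the functions above
Qtable = np.zeros((env.observation_space.n, env.action_space.n))

for episode in range(1000):            # assumed episode count
    state = env.reset()
    action = epsilon_greedy(Qtable, state, epsilon)
    done = False
    while not done:
        next_state, reward, done, info = env.step(action)
        # SARSA: the next action comes from the same epsilon-greedy policy
        next_action = epsilon_greedy(Qtable, next_state, epsilon)
        Qtable = update_Q(Qtable, state, action, reward, next_state, next_action)
        state, action = next_state, next_action

# Evaluation: act greedily with respect to the learned Q-table
state = env.reset()
done = False
while not done:
    state, reward, done, info = env.step(eval_greedy(Qtable, state))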