Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Tabular Q-learning training loop.
#
# Relies on names defined elsewhere in the file: `enviroment`, `q_table`,
# `epsilon`, `alpha`, `gamma`, `random`, `np` and `clear_output`.
# NOTE(review): the unpacking below assumes `reset()` returns the state itself
# and `step()` returns a 4-tuple -- confirm against the installed environment
# library version.
num_of_episodes = 100000

for episode in range(num_of_episodes):
    # Start every episode from a freshly reset environment.
    state = enviroment.reset()
    terminated = False

    while not terminated:
        # Epsilon-greedy selection: sample a random action when the uniform
        # draw falls below epsilon, otherwise take the action with the
        # highest current Q-value for this state.
        if random.uniform(0, 1) < epsilon:
            action = enviroment.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        # Apply the action and observe the outcome.
        next_state, reward, terminated, info = enviroment.step(action)

        # Blend the old estimate with the new target
        # (reward + gamma * best Q-value of the next state), weighted by alpha.
        q_value = q_table[state, action]
        max_value = np.max(q_table[next_state])
        new_q_value = (1 - alpha) * q_value + alpha * (reward + gamma * max_value)

        # Store the updated estimate and advance to the next state.
        q_table[state, action] = new_q_value
        state = next_state

    # Report progress once every 100 completed episodes.
    if (episode + 1) % 100 == 0:
        clear_output(wait=True)
        print("Episode: {}".format(episode + 1))
        enviroment.render()

print("**********************************")
print("Training is done!\n")
print("**********************************")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment