Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# TRAINING PHASE
# Run n_episodes of epsilon-greedy Q-learning, recording each episode's
# cumulative reward so the learning curve can be plotted afterwards.
# (Indentation was lost in the original paste; structure reconstructed:
# rewards.append must run once per episode so len(rewards) == n_episodes,
# matching the x-axis used by the plotting section.)
rewards = []
for episode in range(n_episodes):
    # Start a fresh episode; map the continuous observation onto the
    # discrete state index used by the Q-table.
    current_state = discretize(env.reset())
    # Learning rate and exploration rate both decay with the episode index
    # (schedules defined by get_alpha / get_epsilon elsewhere in the file).
    alpha = get_alpha(episode)
    epsilon = get_epsilon(episode)
    episode_rewards = 0
    for t in range(n_steps):
        # env.render()  # uncomment to visualize training (slows it down)
        action = epsilon_policy(current_state, epsilon)
        new_state, reward, done, _ = env.step(action)
        new_state = discretize(new_state)
        # presumably the TD/Q-learning table update — see update_q's definition
        update_q(current_state, action, reward, new_state, alpha)
        current_state = new_state
        # increment the cumulative reward
        episode_rewards += reward
        # at the end of the episode
        if done:
            print('Episode:{}/{} finished with a total reward of: {}'.format(episode, n_episodes, episode_rewards))
            break
    # append the episode cumulative reward to the reward list
    rewards.append(episode_rewards)
# PLOT RESULTS
# Plot the per-episode cumulative training reward, save the figure to
# disk at print resolution, then display it.
episode_axis = list(range(n_episodes))
plt.plot(episode_axis, rewards)
plt.xlabel('episode')
plt.ylabel('Training cumulative reward')
plt.savefig('Q_learning_CART.png', dpi=300)
plt.show()
# TEST PHASE
# Run a single evaluation episode with the purely greedy policy,
# rendering each step, and report the total reward.
# FIX: the original also called update_q() here, using the stale `alpha`
# left over from the last training episode — i.e. it kept learning during
# evaluation. The Q-table is frozen here so the test measures the learned
# policy, not a moving target.
current_state = discretize(env.reset())
episode_rewards = 0
for t in range(n_steps):
    env.render()
    action = greedy_policy(current_state)
    new_state, reward, done, _ = env.step(action)
    current_state = discretize(new_state)
    episode_rewards += reward
    # at the end of the episode
    if done:
        print('Test episode finished with a total reward of: {}'.format(episode_rewards))
        break
env.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.