Skip to content

Instantly share code, notes, and snippets.

@MaxHalford
Last active November 14, 2016 16:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MaxHalford/e1fb11803315f6ec64d6a605f17fa183 to your computer and use it in GitHub Desktop.
Save MaxHalford/e1fb11803315f6ec64d6a605f17fa183 to your computer and use it in GitHub Desktop.
import random
# Tabular Q-learning toy example on a six-state graph.  The agent starts
# each episode in a random state, walks at random until it reaches
# GOAL_STATE, and updates a table of action values along the way.

# Every state the agent can occupy.
STATES = (0, 1, 2, 3, 4, 5)

# ACTION_SET[state] lists the actions available in that state.  An action
# is identified by the state it leads to (see `resulting_state` below).
ACTION_SET = {
    0: (0, 4),
    1: (1, 3, 5),
    2: (2, 3),
    3: (1, 2, 3, 4),
    4: (0, 3, 4, 5),
    5: (1, 4, 5),
}

# Reward collected immediately on entering each state.
ENVIRONMENT_REWARDS = {
    0: 0,
    1: 0,
    2: 0,
    3: 0,
    4: 0,
    5: 100,
}

# NOTE: despite its name, this constant is used as the *discount factor*
# (gamma) in the update rule below: it scales the best value reachable
# from the next state.  The update overwrites the old estimate outright,
# i.e. it behaves like Q-learning with a step size of 1.  The name is
# kept so that existing references to it continue to work.
LEARNING_RATE = 0.2

# Number of random walks performed during training.
EPISODES = 10

# Reaching this state ends an episode.
GOAL_STATE = 5

# POLICY[state][action] is the current value estimate for taking `action`
# in `state`; every estimate starts at 0.
POLICY = {
    state: {
        action: 0
        for action in ACTION_SET[state]
    }
    for state in STATES
}

for episode in range(EPISODES):
    # Each episode begins in a uniformly random state (possibly the goal,
    # in which case the episode is empty).
    state = random.choice(STATES)

    while state != GOAL_STATE:
        # Pure exploration: pick any available action at random.
        action = random.choice(ACTION_SET[state])
        # Transitions are deterministic: the action names its destination.
        resulting_state = action

        instant_reward = ENVIRONMENT_REWARDS[resulting_state]
        # Best value currently estimated from the destination state.
        delayed_reward = max(POLICY[resulting_state].values())

        POLICY[state][action] = instant_reward + LEARNING_RATE * delayed_reward

        state = resulting_state

# Show the learned value table once training has finished.
print(POLICY)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment