Danaze / cartpole_1.py
Created August 25, 2020 16:17
gym cartpole environment observations and actions
import numpy as np
import gym
env = gym.make("CartPole-v1")
# there are two actions
print("number of actions: " + np.str(env.action_space.n))
# check the observations bounds for each (position, velocity, angle, angular velocity)
for _ in range(4):
print (env.observation_space.high[_])
print (env.observation_space.low[_])
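# Note: the velocity and angular-velocity bounds printed above are effectively infinite
# (about +/-3.4e38), so they have to be clamped to finite values before the state can be
# discretized. The clamping values below are an illustrative assumption, not part of the
# original gist; the cart-position and pole-angle dimensions keep the environment's own
# finite bounds.
import math
upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]
lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]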
def discretize_state(self, obs):
    # map a continuous observation to a tuple of bucket indices usable as a Q-table key
    discretized = list()
    for i in range(len(obs)):
        # rescale obs[i] into [0, 1] using the per-dimension bounds, then pick a bucket
        scaling = (obs[i] + abs(self.lower_bounds[i])) / (self.upper_bounds[i] - self.lower_bounds[i])
        new_obs = int(round((self.buckets[i] - 1) * scaling))
        # clamp to a valid bucket index in case obs[i] falls outside the bounds
        new_obs = min(self.buckets[i] - 1, max(0, new_obs))
        discretized.append(new_obs)
    return tuple(discretized)
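# discretize_state and the update/training methods below are written as members of an agent
# class whose constructor and action-selection policy are not shown in this excerpt. A minimal
# sketch of those missing pieces, assuming an epsilon-greedy policy; the bucket sizes, epsilon,
# and other defaults here are illustrative assumptions, not values from the original gist.
import random

def __init__(self, env, buckets=(1, 1, 6, 12), learning_rate=0.1, discount=0.99,
             epsilon=0.1, num_episodes=1000):
    self.env = env
    self.buckets = buckets
    self.learning_rate = learning_rate
    self.discount = discount
    self.epsilon = epsilon
    self.num_episodes = num_episodes
    # finite per-dimension bounds used by discretize_state (velocity dimensions clamped, see note above)
    self.upper_bounds = [env.observation_space.high[0], 0.5,
                         env.observation_space.high[2], math.radians(50)]
    self.lower_bounds = [env.observation_space.low[0], -0.5,
                         env.observation_space.low[2], -math.radians(50)]
    # one Q-value per discretized state and per action
    self.Q_table = np.zeros(self.buckets + (env.action_space.n,))

def choose_action(self, state):
    # epsilon-greedy: explore with probability epsilon, otherwise take the greedy action
    if random.random() < self.epsilon:
        return self.env.action_space.sample()
    return int(np.argmax(self.Q_table[state]))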
def QLupdate(self, state, action, reward, new_state):
    # Q-learning update of the visited state-action pair: the target bootstraps from the
    # highest-valued action in the new state, regardless of which action is taken next
    self.Q_table[state][action] += self.learning_rate * (reward + self.discount * np.max(self.Q_table[new_state]) - self.Q_table[state][action])
def SARSAupdate(self, state, action, reward, new_state, next_action):
    # SARSA update of the visited state-action pair: the target bootstraps from the action
    # actually selected in the new state by the behaviour policy
    self.Q_table[state][action] += self.learning_rate * (reward + self.discount * self.Q_table[new_state][next_action] - self.Q_table[state][action])
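# The only difference between the two updates above is the bootstrap target: QLupdate uses the
# greedy value max_a Q(s', a) (off-policy Q-learning), while SARSAupdate uses the value of the
# action the policy actually takes in s' (on-policy SARSA). A self-contained numeric illustration
# with made-up values (the Q-values, reward, learning rate and discount below are arbitrary):
q_next = np.array([0.2, 0.8])   # hypothetical Q(s', a) for the two CartPole actions
q_sa, r, lr, gamma = 0.5, 1.0, 0.1, 0.99
next_action = 0                 # action the behaviour policy happened to pick in s'
q_learning_target = r + gamma * q_next.max()      # bootstraps from the best next action
sarsa_target = r + gamma * q_next[next_action]    # bootstraps from the chosen next action
print(q_sa + lr * (q_learning_target - q_sa))     # ~0.6292
print(q_sa + lr * (sarsa_target - q_sa))          # ~0.5698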
def QLtrain(self):
    cum_reward = np.zeros(self.num_episodes)
    for ep in range(self.num_episodes):
        current_state = self.discretize_state(self.env.reset())
        done = False
        while not done:
            # choose an action according to our exploration-exploitation policy
            action = self.choose_action(current_state)
            obs, reward, done, _ = self.env.step(action)
            cum_reward[ep] += reward