Skip to content

Instantly share code, notes, and snippets.

@NMZivkovic
Created Jul 20, 2019
Embed
What would you like to do?
class Agent(object):
    """Deep Q-Network (DQN) agent with an experience-replay buffer and a
    separate target network.

    The agent keeps two structurally identical convolutional networks:
    ``q_network`` (trained every step) and ``target_network`` (periodically
    synchronized via :meth:`alighn_target_model`) to stabilize the Q-value
    targets used in :meth:`retrain`.

    NOTE(review): several public names are misspelled (``enviroment``,
    ``expirience_replay``, ``alighn_target_model``). They are kept as-is
    for backward compatibility with existing callers.
    """

    def __init__(self, enviroment, optimizer, image_shape,
                 epsilon_decay=0.0, epsilon_min=0.01):
        """Initialize the agent.

        Parameters
        ----------
        enviroment : gym-like environment
            Must expose ``action_space.n`` and ``action_space.sample()``.
        optimizer : keras optimizer
            Optimizer used to compile both networks.
        image_shape : tuple
            Input shape of one observation frame, e.g. ``(84, 84, 4)``.
        epsilon_decay : float, optional
            Amount subtracted from ``epsilon`` on each call to
            ``_update_epsilon``. Defaults to 0.0 (no decay), which matches
            the original fixed exploration rate.
        epsilon_min : float, optional
            Lower bound for ``epsilon`` during decay.
        """
        # Initialize attributes
        self._action_size = enviroment.action_space.n
        self._optimizer = optimizer
        self._image_shape = image_shape
        self.enviroment = enviroment
        # Bounded replay buffer: old transitions fall off the left end.
        self.expirience_replay = deque(maxlen=100000)

        # Discount factor and exploration rate
        self.gamma = 0.6
        self.epsilon = 0.1
        # BUGFIX: these two were never initialized, so _update_epsilon()
        # raised AttributeError on its first call.
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Build the online and target networks and sync their weights.
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.alighn_target_model()

    def store(self, state, action, reward, next_state, terminated):
        """Append one transition tuple to the replay buffer."""
        self.expirience_replay.append(
            (state, action, reward, next_state, terminated))

    def _update_epsilon(self):
        """Linearly decay epsilon, clamped at ``epsilon_min``."""
        self.epsilon -= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)

    def _build_compile_model(self):
        """Build and compile the convolutional Q-network.

        Architecture follows the classic DQN paper: three conv layers
        (8x8/4, 4x4/2, 3x3/1) -> flatten -> dense 512 -> linear head with
        one output per action (Q-values, no activation).
        """
        model = Sequential()
        # input_shape is only meaningful on the first layer; Keras ignores
        # it on subsequent layers, so passing it there is harmless.
        model.add(Conv2D(32, 8, strides=(4, 4), padding="valid",
                         activation="relu", input_shape=self._image_shape))
        model.add(Conv2D(64, 4, strides=(2, 2), padding="valid",
                         activation="relu", input_shape=self._image_shape))
        model.add(Conv2D(64, 3, strides=(1, 1), padding="valid",
                         activation="relu", input_shape=self._image_shape))
        model.add(Flatten())
        model.add(Dense(512, activation="relu"))
        model.add(Dense(self._action_size))

        # Huber loss is the standard choice for DQN: quadratic near zero,
        # linear for large errors, which bounds the gradient magnitude.
        model.compile(loss=Huber(),
                      optimizer=self._optimizer,
                      metrics=["accuracy"])
        return model

    def alighn_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def act(self, frame):
        """Epsilon-greedy action selection for a single observation frame.

        With probability ``epsilon`` a random action is sampled; otherwise
        the action with the highest predicted Q-value is returned.
        """
        if np.random.rand() <= self.epsilon:
            return self.enviroment.action_space.sample()
        # Add a batch dimension and cast to float before prediction.
        frame = np.expand_dims(np.asarray(frame).astype(np.float64), axis=0)
        q_values = self.q_network.predict(frame)
        return np.argmax(q_values[0])

    def retrain(self, batch_size):
        """Sample a minibatch from the replay buffer and fit the Q-network.

        For each transition the TD target is ``reward`` on terminal states,
        otherwise ``reward + gamma * max_a Q_target(next_state, a)``; only
        the taken action's Q-value is overwritten in the target vector.

        NOTE(review): predicting/fitting one sample at a time is slow;
        batching all states into a single predict/fit call would be the
        usual optimization, but is left unchanged to preserve behavior.
        """
        minibatch = random.sample(self.expirience_replay, batch_size)

        for state, action, reward, next_state, terminated in minibatch:
            state = np.expand_dims(
                np.asarray(state).astype(np.float64), axis=0)
            next_state = np.expand_dims(
                np.asarray(next_state).astype(np.float64), axis=0)

            target = self.q_network.predict(state)
            if terminated:
                target[0][action] = reward
            else:
                t = self.target_network.predict(next_state)
                target[0][action] = reward + self.gamma * np.amax(t)

            self.q_network.fit(state, target, epochs=1, verbose=0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment