Skip to content

Instantly share code, notes, and snippets.

@NMZivkovic
Created Jul 20, 2019
Embed
What would you like to do?
class Agent(object):
    """Deep Q-Network (DQN) agent with an experience-replay buffer and a
    separate target network.

    The agent keeps two structurally identical convolutional networks:
    ``q_network`` (trained every step) and ``target_network`` (periodically
    synchronized via :meth:`alighn_target_model`) to stabilize the Q-value
    targets used in :meth:`retrain`.

    NOTE(review): several public names are misspelled (``enviroment``,
    ``expirience_replay``, ``alighn_target_model``). They are kept as-is
    for backward compatibility with existing callers.
    """

    def __init__(self, enviroment, optimizer, image_shape,
                 epsilon_decay=0.0, epsilon_min=0.01):
        """Initialize the agent.

        Parameters
        ----------
        enviroment : gym-like environment
            Must expose ``action_space.n`` and ``action_space.sample()``.
        optimizer : keras optimizer
            Optimizer used to compile both networks.
        image_shape : tuple
            Input shape of one observation frame, e.g. ``(84, 84, 4)``.
        epsilon_decay : float, optional
            Amount subtracted from ``epsilon`` on each call to
            ``_update_epsilon``. Defaults to 0.0 (no decay), which matches
            the original fixed exploration rate.
        epsilon_min : float, optional
            Lower bound for ``epsilon`` during decay.
        """
        # Initialize attributes
        self._action_size = enviroment.action_space.n
        self._optimizer = optimizer
        self._image_shape = image_shape
        self.enviroment = enviroment
        # Bounded replay buffer: old transitions fall off the left end.
        self.expirience_replay = deque(maxlen=100000)

        # Discount factor and exploration rate
        self.gamma = 0.6
        self.epsilon = 0.1
        # BUGFIX: these two were never initialized, so _update_epsilon()
        # raised AttributeError on its first call.
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Build the online and target networks and sync their weights.
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.alighn_target_model()

    def store(self, state, action, reward, next_state, terminated):
        """Append one transition tuple to the replay buffer."""
        self.expirience_replay.append(
            (state, action, reward, next_state, terminated))

    def _update_epsilon(self):
        """Linearly decay epsilon, clamped at ``epsilon_min``."""
        self.epsilon -= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)

    def _build_compile_model(self):
        """Build and compile the convolutional Q-network.

        Architecture follows the classic DQN paper: three conv layers
        (8x8/4, 4x4/2, 3x3/1) -> flatten -> dense 512 -> linear head with
        one output per action (Q-values, no activation).
        """
        model = Sequential()
        # input_shape is only meaningful on the first layer; Keras ignores
        # it on subsequent layers, so passing it there is harmless.
        model.add(Conv2D(32, 8, strides=(4, 4), padding="valid",
                         activation="relu", input_shape=self._image_shape))
        model.add(Conv2D(64, 4, strides=(2, 2), padding="valid",
                         activation="relu", input_shape=self._image_shape))
        model.add(Conv2D(64, 3, strides=(1, 1), padding="valid",
                         activation="relu", input_shape=self._image_shape))
        model.add(Flatten())
        model.add(Dense(512, activation="relu"))
        model.add(Dense(self._action_size))

        # Huber loss is the standard choice for DQN: quadratic near zero,
        # linear for large errors, which bounds the gradient magnitude.
        model.compile(loss=Huber(),
                      optimizer=self._optimizer,
                      metrics=["accuracy"])
        return model

    def alighn_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def act(self, frame):
        """Epsilon-greedy action selection for a single observation frame.

        With probability ``epsilon`` a random action is sampled; otherwise
        the action with the highest predicted Q-value is returned.
        """
        if np.random.rand() <= self.epsilon:
            return self.enviroment.action_space.sample()
        # Add a batch dimension and cast to float before prediction.
        frame = np.expand_dims(np.asarray(frame).astype(np.float64), axis=0)
        q_values = self.q_network.predict(frame)
        return np.argmax(q_values[0])

    def retrain(self, batch_size):
        """Sample a minibatch from the replay buffer and fit the Q-network.

        For each transition the TD target is ``reward`` on terminal states,
        otherwise ``reward + gamma * max_a Q_target(next_state, a)``; only
        the taken action's Q-value is overwritten in the target vector.

        NOTE(review): predicting/fitting one sample at a time is slow;
        batching all states into a single predict/fit call would be the
        usual optimization, but is left unchanged to preserve behavior.
        """
        minibatch = random.sample(self.expirience_replay, batch_size)

        for state, action, reward, next_state, terminated in minibatch:
            state = np.expand_dims(
                np.asarray(state).astype(np.float64), axis=0)
            next_state = np.expand_dims(
                np.asarray(next_state).astype(np.float64), axis=0)

            target = self.q_network.predict(state)
            if terminated:
                target[0][action] = reward
            else:
                t = self.target_network.predict(next_state)
                target[0][action] = reward + self.gamma * np.amax(t)

            self.q_network.fit(state, target, epochs=1, verbose=0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment