def train_single_step(self, state0, state1, a, reward, maximum_discount):
    """Fit the model toward a one-step Q-learning target for one transition.

    The current predictions for ``state0`` are used as the training target,
    with only the entry for the taken action ``a`` overwritten by
    ``reward + maximum_discount * max(Q(state1, .))``.

    Args:
        state0: State in which the action was taken. It is flattened to a
            single row before being passed to ``self.model.fit``.
        state1: State reached after taking the action.
        a: Index of the action taken in ``state0``.
        reward: Reward observed for the transition.
        maximum_discount: Factor applied to the best next-state value.

    Returns:
        None. The only effect is the call to ``self.model.fit``.

    Note:
        Assumes ``self.predict`` returns a mutable 2-D array whose first row
        holds one value per action (it is indexed as ``[0][a]``) -- confirm
        against the ``predict`` implementation.
    """
    target_q = self.predict(state0)
    # Bootstrap from the largest predicted *value* in the next state.
    # np.argmax would yield the action's index, which is not a Q-value and
    # would silently corrupt the training target.
    best_next_q = np.max(self.predict(state1)[0])
    target_q[0][a] = reward + maximum_discount * best_next_q
    features = np.array(state0).reshape(1, -1)
    self.model.fit(features, target_q, epochs=1, verbose=0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment