def train_single_step(self, state0, state1, a, reward, maximum_discount):
    """Run one Q-learning update for a single observed transition.

    The current Q-value estimates for ``state0`` are fetched with
    ``self.predict``, the entry for the action that was taken is replaced
    by the one-step Bellman target
    ``reward + maximum_discount * max(Q(state1, .))``, and the model is
    fitted for one epoch on that corrected row.

    Args:
        state0: State observed before the action was taken.
        state1: State observed after the action was taken.
        a: Index of the action taken in ``state0``.
        reward: Reward received for the transition.
        maximum_discount: Discount factor applied to the best next-state
            Q-value.
    """
    q_current = self.predict(state0)

    # The target needs the *value* of the best next action.  ``np.argmax``
    # would return that action's index, which is not a Q-value.
    best_next_q = np.max(self.predict(state1)[0])

    q_current[0][a] = reward + maximum_discount * best_next_q

    # NOTE(review): the original statement was truncated; only the tail
    # ", -1), Q0, epochs=1, verbose=0)" survived.  It is reconstructed here
    # as a Keras-style ``fit`` on the state reshaped to a single-row batch.
    # Confirm that ``self.model`` is the right attribute and that
    # ``(1, -1)`` is the input shape the network expects.
    self.model.fit(
        np.reshape(state0, (1, -1)),
        q_current,
        epochs=1,
        verbose=0,
    )
