# MountainCar semi-gradient SARSA(0) - with neural network and experience replay
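#
# The action-value function Q(s, a) is approximated by a small Keras network.
# Each step stores a (state, action, reward, next_state, next_action) transition
# in a replay buffer, and the network is trained towards minibatch targets of
# r + gamma * Q(s', a'), with no gradient taken through the target (the
# "semi-gradient" in semi-gradient SARSA(0)).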
# Reduce TensorFlow/Keras log noise on Windows (uses the TF 1.x logging API)
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
import logging
logging.getLogger("tensorflow").setLevel(logging.WARNING)
import numpy as np
import keras as K
from matplotlib import pyplot as plt
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.layers.merge import concatenate
from keras.optimizers import Adam, SGD
from sklearn.preprocessing import OneHotEncoder
import math
import random
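
# Q-network: the 2-dimensional state (position, velocity) and a one-hot encoded
# action are concatenated and passed through a single 1200-unit tanh layer with
# dropout; the output is one scalar estimate of Q(s, a). Trained by SGD on a
# mean-squared-error loss against the SARSA targets.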
def build_model(lr):
    state_input_layer = Input((2,))
    action_input_layer = Input((3,))
    merge = concatenate([state_input_layer, action_input_layer])
    x = Dense(1200, activation="tanh")(merge)
    x = Dropout(0.5)(x)
    output_layer = Dense(1)(x)
    model = Model(inputs=[state_input_layer, action_input_layer], outputs=[output_layer])
    opt = SGD(lr=lr)
    model.compile(opt, "mse")
    model.summary()
    return model
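
# Combined environment and agent: MountainCar dynamics plus a semi-gradient
# SARSA(0) learner that shares one Q-network and one replay buffer across episodes.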
class MountainCar(object):
    def __init__(self, alpha=.001, gamma=1.0, epsilon=0.1, boundaries=(-1.2, 0.5), velocity_boundaries=(-0.07, 0.07), model=None, experience=None):
        self.position = np.random.rand()*0.2-0.6
        self.velocity = 0.0
        self.boundaries = boundaries
        self.velocity_boundaries = velocity_boundaries
        self.actions = {-1: "move left", 0: "idle", 1: "move right"}
        self.action = 0
        self.is_terminal = False
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        if experience is None:
            experience = []
        self.experienced_states = experience
        if model is None:
            model = build_model(lr=alpha)
        self.model = model
        self.action_encoder = OneHotEncoder().fit(np.array([0, 1, 2]).reshape((-1, 1)))
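
    # MountainCar dynamics (the standard Sutton & Barto formulation): position is
    # clipped to [-1.2, 0.5] and velocity to [-0.07, 0.07]; the velocity update is
    # v += 0.001*action - 0.0025*cos(3*position), and velocity is reset to zero
    # when the car hits the left boundary.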
    def process_action(self, action):
        if self.position < self.boundaries[0]:
            self.velocity = 0.0
        self.position = np.clip(self.position+self.velocity, *self.boundaries)
        self.velocity = np.clip(self.velocity + 0.001*action-0.0025*np.cos(3*self.position), *self.velocity_boundaries)
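
    # Epsilon-greedy action selection: with probability epsilon pick a random
    # action from {-1, 0, 1}; otherwise evaluate Q(s, a) for all three actions
    # in one batched predict() call and take the argmax.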
    def choose_action(self):
        if np.random.rand() < self.epsilon:
            action = np.random.randint(-1, 2)
        else:
            state = np.array([self.position, self.velocity])
            actions = np.array([-1, 0, 1]).reshape(-1, 1)
            actions = self.encode_action(actions)
            q_prediction_input_state = np.tile(state, len(actions)).reshape((len(actions), -1))
            q_predictions = self.model.predict([q_prediction_input_state, actions])
            action = np.argmax(actions[np.argmax(q_predictions)])-1
        return action
    def encode_action(self, action):
        # Shift actions from {-1, 0, 1} to {0, 1, 2} and one-hot encode them
        return self.action_encoder.transform(np.array([action]).reshape((-1, 1))+1).A
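
    # Sample a minibatch of stored (s, a, r, s', a') transitions and build the
    # SARSA(0) targets r + gamma * Q(s', a'); transitions whose next state is
    # terminal (position at or past the right boundary) get a target of 0.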
    def batch_samples(self, nsamples):
        batch = random.sample(self.experienced_states, nsamples)
        state_inputs = np.array([ x[0][0] for x in batch ])
        action_inputs = np.array([ x[1][0] for x in batch ])
        predictions = np.array( [ (x[2] + self.gamma*self.model.predict(x[3:5])[0]) for x in batch ] )
        for i, x in enumerate(batch):
            if x[3][0][0] >= self.boundaries[1]:
                predictions[i,0] = 0
        # print([state_inputs, action_inputs], predictions )
        return ( [state_inputs, action_inputs], predictions)
    def train_from_experience(self, nsamples):
        inputs, predictions = self.batch_samples( nsamples )
        fit_res = self.model.train_on_batch(inputs, predictions)
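
    # One environment step: apply the current action, store the resulting
    # (s, a, r, s', a') transition (reward -1 per step, 0 on reaching the goal),
    # train on a small replay minibatch once the buffer holds more than 1000
    # transitions, then adopt the next action on-policy.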
    def move(self):
        old_state = np.array([self.position, self.velocity]).reshape((-1, 2))
        self.process_action(self.action)
        new_state = np.array([self.position, self.velocity]).reshape((-1, 2))
        if self.position >= self.boundaries[1]:
            reward = 0
            prediction = [reward]
            self.is_terminal = True
            new_action = self.action
            new_action_encoded = self.encode_action(self.action)
            self.experienced_states.append([old_state, new_action_encoded, 0, new_state, new_action_encoded])
        else:
            new_action = self.choose_action()
            reward = -1
            new_action_encoded = self.encode_action(new_action)
            self.experienced_states.append([old_state, self.encode_action(self.action), -1, new_state, new_action_encoded])
        # Don't bother training until at least 1000 transitions have been collected
        if len(self.experienced_states) > 1000:
            self.train_from_experience( 20 )
        self.action = new_action
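
# Training loop: run 5000 episodes with epsilon decayed as 0.5/(n+1) per episode.
# The Q-network and the replay buffer are carried over between episodes, and the
# buffer is randomly downsampled once it grows beyond 100,000 transitions.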
def main():
    nb_episodes = 5000
    model = None
    experience = None
    steps = []
    for n in range(nb_episodes):
        car = MountainCar(model=model, experience=experience, epsilon=0.5/(n+1), alpha=0.001)
        t = 0
        positions = []
        while (not car.is_terminal):
            car.move()
            t += 1
            positions += [car.position]
        print('Episode', n, 'finished in', t, 'steps')
        # plot the position curve
        #plt.close()
        #plt.plot(positions)
        #plt.show(block=False)
        steps += [t]
        model = car.model
        experience = car.experienced_states
        # Don't fill up memory with too much experience...
        if len(experience) > 100000:
            experience = random.sample(experience, 75000)

if __name__ == '__main__':
    main()