## Evolutionary resolution of lunar lander v2
import gym
from gym import wrappers
import numpy as np
import concurrent.futures
import logging
import threading

gym.undo_logger_setup()

EXPERIMENTPATH = '/tmp/LunarLander-experiment-1'

main_env = gym.make('LunarLander-v2')
main_env = wrappers.Monitor(main_env, EXPERIMENTPATH, force=True)

print(main_env.observation_space.shape[0])  # 8 state values
print(main_env.action_space.n)              # 4 discrete actions

MAX_EPISODES = 1000
MAX_STEPS = 250
DO_NOTHING_ACTION = 0
POPULATION = 100
LEARNING_RATE_EXPLORING = 0.0002  # learning rate while exploring
LEARNING_RATE_MATURE = 0.0002     # learning rate once an episode scores above 200
SIGMA = 0.1                       # standard deviation of the weight perturbations

# one environment per mutation so each episode can be evaluated independently
mutation_environments = []
for i in range(POPULATION):
    mutation_environments.append(gym.make('LunarLander-v2'))
class EvolutionaryNetWork:
    def __init__(self, sigma=0.01, state_size=8,
                 action_size=4, population_size=100):
        self.population_size = population_size
        self.action_size = action_size
        self.state_size = state_size
        self.sigma = sigma
        # linear policy: one weight per (state value, action) pair
        self.weight = np.random.rand(state_size, action_size)

    def generate_mutations(self):
        # sample one Gaussian perturbation of the weights per population member
        mutations = []
        noise = np.random.randn(self.population_size, self.state_size, self.action_size)
        for i in range(self.population_size):
            mutations.append(self.weight + self.sigma * noise[i])
        np_mutations = np.array(mutations)
        return np_mutations.reshape(self.population_size, self.state_size, self.action_size), noise

    def update_genes(self, total_rewards, noise, learning_rate):
        # move the weights along the reward-weighted sum of the noise samples
        weighted_noise = np.matmul(noise.T, total_rewards).T
        self.weight = self.weight + learning_rate / (self.population_size * self.sigma) * weighted_noise
def run_episode(weight, env, show=False):
    state = env.reset()
    total_reward = 0
    done = False
    step = 0
    while not done:
        if show:
            env.render()
        if step < MAX_STEPS:
            # pick the action with the highest score under the linear policy
            action = np.matmul(weight.T, state)
            move = np.argmax(action)
        else:
            # past the step limit, stop firing the engines
            move = DO_NOTHING_ACTION
        state, reward, done, _ = env.step(move)
        step += 1
        total_reward += reward
    return total_reward
genes = EvolutionaryNetWork(population_size=POPULATION, sigma=SIGMA)

# run episodes
for ep in range(MAX_EPISODES):
    show = False
    if ep % 100 == 0:
        show = True

    # run an episode with the current genes
    current_gen_eval = run_episode(genes.weight, main_env, show)

    mutations, noise = genes.generate_mutations()

    # run the mutations in parallel: submit all episodes first, then collect the results
    total_rewards = np.zeros(POPULATION)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(run_episode, mutations[i], mutation_environments[i], False)
                   for i in range(POPULATION)]
        for i, future in enumerate(futures):
            total_rewards[i] = future.result()

    # select the learning rate
    learning_rate = LEARNING_RATE_EXPLORING
    if current_gen_eval > 200:
        learning_rate = LEARNING_RATE_MATURE

    # update genes
    genes.update_genes(total_rewards, noise, learning_rate)

    gen_mean = np.mean(total_rewards)
    if ep % 1 == 0:
        # print(genes.weight)
        print(ep, ': ', current_gen_eval, ' ', gen_mean)

main_env.close()
for i in range(POPULATION):
    mutation_environments[i].close()
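
A note on the update step: update_genes shifts the weights by the reward-weighted sum of the noise samples, scaled by learning_rate / (POPULATION * SIGMA), i.e. roughly w ← w + α/(N·σ) · Σᵢ rᵢ·εᵢ, where N is POPULATION, σ is SIGMA, rᵢ is the total reward earned by mutation i and εᵢ is its noise sample. This is the usual evolution-strategies gradient estimate, here applied to the raw episode rewards rather than normalized ones.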
@zannos commented Jun 3, 2017

Hi Pablo!
I am trying to implement an evolutionary learning strategy to solve Lunar Lander. I see you have done something similar. Can you comment on what state_size and population_size are?
Thanks!

@pablocastilla (Author)

Hi!

State size is the size of the state the environment gives you at each step (position, speed, etc.). In this case it has 8 values.

Population size is the number of mutations evaluated at each step.
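
For reference, both values can be read directly from the environment, just as the gist does at the top; a minimal check (assuming the same gym version used above):

import gym

env = gym.make('LunarLander-v2')
print(env.observation_space.shape[0])  # 8: length of each state vector (state_size)
print(env.action_space.n)              # 4: number of discrete actions
env.close()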

Any time!
