# Gist by @guillefix, created August 4, 2017 12:59
import numpy as np
import gym
import matplotlib.pyplot as plt
from math import floor
import tensorflow as tf
from keras.layers import Input, Dense, concatenate
from keras.models import Model
from keras import backend as K
env = gym.make('CartPole-v1')
obs = env.reset()  # initial observation: [cart position, cart velocity, pole angle, pole angular velocity]
state_dim = 4          # dimensionality of the CartPole observation
hidden_layer_dim = 50  # width of every hidden Dense layer
action_dim = 1         # one binary action (push left / push right) per timestep
plan_size = 16         # number of future actions in a plan
latent_dim = 2         # size of each autoencoder's latent code
# PLAN AUTOENCODER: encode/decode a plan_size-step action sequence, conditioned on the current state (context)
context = Input(shape=(state_dim,))
input_plan = Input(shape=(plan_size*action_dim,))
plan_context = concatenate([input_plan, context])
pe_layer_1 = Dense(hidden_layer_dim, activation='relu')(plan_context)
pe_layer_2 = Dense(hidden_layer_dim, activation='relu')(pe_layer_1)
pe_layer_3 = Dense(hidden_layer_dim, activation='relu')(pe_layer_2)
encoded_plan = Dense(latent_dim, activation='tanh')(pe_layer_3)
encoded_plan_context = concatenate([encoded_plan, context])
pd_layer_1 = Dense(hidden_layer_dim, activation='relu')(encoded_plan_context)
pd_layer_2 = Dense(hidden_layer_dim, activation='relu')(pd_layer_1)
pd_layer_3 = Dense(hidden_layer_dim, activation='relu')(pd_layer_2)
decoded_plan = Dense(plan_size*action_dim, activation='sigmoid')(pd_layer_3)
plan_autoencoder = Model([input_plan, context], decoded_plan)
# retrieve the decoder layers of the plan autoencoder so they can be reused below
plan_decoder_l1 = plan_autoencoder.layers[-4]
plan_decoder_l2 = plan_autoencoder.layers[-3]
plan_decoder_l3 = plan_autoencoder.layers[-2]
plan_decoder = plan_autoencoder.layers[-1]
# FUTURE AUTOENCODER: encode/decode the state reached plan_size steps ahead, conditioned on the context
input_future = Input(shape=(state_dim,))
future_context = concatenate([input_future, context])
fe_layer_1 = Dense(hidden_layer_dim, activation='relu')(future_context)
fe_layer_2 = Dense(hidden_layer_dim, activation='relu')(fe_layer_1)
fe_layer_3 = Dense(hidden_layer_dim, activation='relu')(fe_layer_2)
encoded_future = Dense(latent_dim, activation='tanh')(fe_layer_3)
encoded_future_context = concatenate([encoded_future, context])
fd_layer_1 = Dense(hidden_layer_dim, activation='relu')(encoded_future_context)
fd_layer_2 = Dense(hidden_layer_dim, activation='relu')(fd_layer_1)
fd_layer_3 = Dense(hidden_layer_dim, activation='relu')(fd_layer_2)
decoded_future = Dense(state_dim, activation='linear')(fd_layer_3)
future_autoencoder = Model([input_future, context], decoded_future)
# retrieve the decoder layers of the future autoencoder
future_decoder_l1 = future_autoencoder.layers[-4]
future_decoder_l2 = future_autoencoder.layers[-3]
future_decoder_l3 = future_autoencoder.layers[-2]
future_decoder = future_autoencoder.layers[-1]
future_encoder = Model([input_future,context], encoded_future)
plan_inference = Model([input_future,context],plan_decoder(plan_decoder_l3(plan_decoder_l2(plan_decoder_l1(encoded_future_context)))))
future_prediction = Model([input_plan,context],future_decoder(future_decoder_l3(future_decoder_l2(future_decoder_l1(encoded_plan_context)))))
plan_super_autoencoder = Model([input_plan, context], plan_inference([future_prediction([input_plan, context]), context]))
future_super_autoencoder = Model([input_future, context], future_prediction([plan_inference([input_future, context]), context]))
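# Summary of the models defined above (as wired in this script):
#   plan_autoencoder:         (plan, context)   -> reconstructed plan
#   future_autoencoder:       (future, context) -> reconstructed future state
#   plan_inference:           (future, context) -> plan; the future encoder's latent is
#                             fed through the plan decoder, tying the two 2-dim latents together
#   future_prediction:        (plan, context)   -> predicted future state, via the plan
#                             encoder feeding the future decoder
#   plan_super_autoencoder:   plan -> predicted future -> inferred plan
#   future_super_autoencoder: future -> inferred plan -> predicted future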
# MSE for state targets, binary cross-entropy for the 0/1 action targets
future_autoencoder.compile(optimizer='adam', loss='mean_squared_error')
future_super_autoencoder.compile(optimizer='adam', loss='mean_squared_error')
plan_autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
plan_super_autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
plan_inference.compile(optimizer='adadelta', loss='binary_crossentropy')
future_prediction.compile(optimizer='adam', loss='mean_squared_error')
def getPolicy(obs, targ):
    # Infer an action plan expected to take the agent from `obs` to the target
    # state `targ`, and return it together with the latent code of that target.
    plan = plan_inference.predict([np.array([targ]), np.array([obs])])
    latent = future_encoder.predict([np.array([targ]), np.array([obs])])
    return plan[0], latent[0]
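# Hypothetical usage sketch (not part of the original training loop): ask for a plan
# that steers the current observation toward the zero (centred, upright, at-rest)
# target state, then sample the first action from it as run_agent() does:
#   plan, z = getPolicy(obs, np.zeros(state_dim))
#   first_action = int(np.random.rand() < plan[0])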
def trainNet():
    # Sample random plan_size-step windows from the collected episodes and run
    # one training epoch for each model/objective.
    BS = 1000
    futures = []
    action_sequences = []
    contexts = []
    meanlen = np.mean(np.array([x.shape[0] for x in states_data]))  # (unused)
    for i in range(BS*10):
        # j = floor(len(states_data)*0.75) + np.random.randint(floor(len(states_data)*0.25))
        j = np.random.randint(len(states_data))
        if states_data[j].shape[0] > plan_size+1:
            k = np.random.randint(states_data[j].shape[0]-plan_size-1)
            contexts.append(states_data[j][k])                           # starting state
            action_sequences.append(actions_data[j][k+1:k+1+plan_size])  # action sequence
            futures.append(states_data[j][k+plan_size])                  # state plan_size steps later
    futures = np.array(futures)
    action_sequences = np.array(action_sequences)
    contexts = np.array(contexts)
    future_super_autoencoder.fit([futures, contexts], futures, epochs=1, batch_size=BS, verbose=0)
    future_autoencoder.fit([futures, contexts], futures, epochs=1, batch_size=BS, verbose=0)
    plan_autoencoder.fit([action_sequences, contexts], action_sequences, epochs=1, batch_size=BS, verbose=0)
    plan_super_autoencoder.fit([action_sequences, contexts], action_sequences, epochs=1, batch_size=BS, verbose=0)
    plan_inference.fit([futures, contexts], action_sequences, epochs=1, batch_size=BS, verbose=0)
    future_prediction.fit([action_sequences, contexts], futures, epochs=1, batch_size=BS, verbose=0)
    # return d_err
def run_agent():
    # Run 5 episodes with the current models and append the results to the
    # global lists (actions_data, states_data, dlatents, rewards).
    for sub in range(5):
        obs = env.reset()
        obs[0] *= 10   # scale up cart position ...
        obs[2] *= 10   # ... and pole angle
        targ = np.zeros(4)  # target future state: cart centred, pole upright, at rest
        policy, latent = getPolicy(np.array(obs), targ)
        done = False
        run_obs = []
        run_act = []
        run_preds = []
        run_latents = []
        step = 0
        j = 0
        run_obs.append(obs)
        while (not done) and (step < 500):
            # act = (np.random.rand()<(0.5*(policy[j]+1)))*1
            act = int(np.random.rand() < policy[j])  # Bernoulli sample from the decoded plan
            obs, reward, done, info = env.step(act)
            obs[0] *= 10
            obs[2] *= 10
            # run_act.append(2*act-1)
            run_act.append(act)
            run_obs.append(obs)
            # err = np.mean( (obs-policy[j*5:j*5+4])**2 )
            j += 1
            if j > 1:  # or err>0.05:
                # replan every two steps
                policy, latent = getPolicy(np.array(obs), targ)
                j = 0
            run_latents.append(latent)
            # frames.append(env.render(mode = 'rgb_array'))
            step += 1
        run_act = np.array(run_act, dtype='int')
        run_obs = np.array(run_obs)
        actions_data.append(run_act)
        states_data.append(run_obs)
        dlatents.append(np.array(run_latents))
        rewards.append(run_obs.shape[0])  # episode length, used as the score
        # f = open("runs/%.6d.txt" % trial,"a")
        # f.write("%d\n" % run_obs.shape[0])
        # f.close()
    # env.render(close=True)
# Replay buffers filled by run_agent()
actions_data = []
states_data = []
preds = []
rewards = []
dlatents = []
discerr = []
# Use Keras' own TF session so the Saver below checkpoints the trained weights
session = K.get_session()
session.run(tf.global_variables_initializer())
for cycle in range(100):
    # rate = 1e-4
    run_agent()                  # collect 5 new episodes with the current models
    print(cycle, rewards[-5:])   # lengths of the episodes just collected
    for epoch in range(100):
        trainNet()
plt.plot(rewards)
plt.xlabel("episode")
plt.ylabel("reward")
plt.show()
# plt.savefig("learning.png")
saver = tf.train.Saver()  # checkpoint all TF variables (the Keras model weights)
saver.save(session, "action_inference_cart_pole_plan16_5000episode.ckpt")
# saver.restore(sess, "/tmp/model.ckpt")
# # foo=tf.global_variables()[1657]
# foo=tf.global_variables()[1070]
# foo.name
# # bar=session.graph.get_tensor_by_name('Variable_896:0')
# # bar==foo
# # session.run(tf.variables_initializer([tf.global_variables()[1552], tf.global_variables()[1657]]))
# # session.run(tf.variables_initializer([tf.global_variables()[1553], tf.global_variables()[1070]]))
# bad_vars = []
# for var in tf.global_variables():
#     # if var.name[0] == "V":
#     if var.name == "dense_241":
#         bad_vars.append(var)
#
# session.run(tf.variables_initializer(bad_vars))