Skip to content

Instantly share code, notes, and snippets.

@crazyleg crazyleg/SimpleRL.py
Created Jun 29, 2018

Embed
What would you like to do?
Playground for developing neural-network-based landing page optimisations.
'''
states = bandit number = user environment
action = bandit arm = type of landing page
'''
import numpy as np
import random
from tqdm import tqdm
import tensorflow as tf
import tensorflow.contrib.slim as slim
#TODO IMBALANCE IN CLASSES
# One entry per bandit; each value is the number of arms that bandit has.
# Read by WebsiteClass (arm values), agent (output head widths) and the
# training script (random action sampling).
size=[3,2,3,3]
class WebsiteClass():
    """Simulated website built from several independent multi-armed bandits.

    One bandit is created per entry of the module-level ``size`` list, and
    that entry is the bandit's number of arms. Every arm holds a value drawn
    uniformly from [0.95, 0.97).

    NOTE(review): ``pullArm`` pays 50 when a uniform draw *exceeds* the mean
    arm value, so lower arm values are rewarded more often, while the comment
    below (and the training script) talk about the argmax. Confirm which
    direction is intended.
    """

    def __init__(self):
        self.state = 0
        # We want to find the argmax of this complex system, or approach it.
        # A random distribution doesn't represent a realistic distribution.
        # Earlier variant: self.bandits = np.random.rand(*size)/100+0.95
        self.bandits = [np.random.random(x) / 50 + 0.95 for x in size]
        self.num_actions = len(size)

    def pullArm(self, action):
        """Pull one arm on every bandit and return a single reward.

        Args:
            action: indexable of ``num_actions`` arm indices; ``action[i]``
                selects the arm of bandit ``i``.

        Returns:
            50 if a uniform [0, 1) draw is greater than the mean value of the
            selected arms, otherwise -1.
        """
        total = sum(self.bandits[i][action[i]] for i in range(self.num_actions))
        prob = total / self.num_actions
        return 50 if random.random() > prob else -1
class agent():
    """Policy network with one sigmoid output head per bandit.

    The graph feeds a fixed 1x1 variable through a shared fully connected
    layer and then through one head per entry of the module-level ``size``
    list. Training minimises ``-log(output of the played arm) * reward``
    averaged over the heads.

    NOTE(review): ``DummyState`` is a ``tf.Variable`` and the optimiser is
    given ``tf.trainable_variables()``, so it is presumably updated along with
    the layer weights. Confirm this is intended.
    """

    def __init__(self, lr, a_size):
        """Build the graph.

        Args:
            lr: learning rate passed to ``GradientDescentOptimizer``.
            a_size: number of units in the shared fully connected layer.
        """
        # Feed-forward part: the agent takes a (dummy) state and produces
        # one score vector per bandit.
        self.DummyState = tf.Variable([[0.1]])
        shared = slim.fully_connected(
            self.DummyState,
            a_size,
            biases_initializer=None,
            weights_initializer=tf.ones_initializer(),
        )

        heads = []
        for idx, n_arms in enumerate(size):
            head = slim.fully_connected(
                shared,
                n_arms,
                biases_initializer=None,
                activation_fn=tf.nn.sigmoid,
                weights_initializer=tf.ones_initializer(),
                scope='results' + str(idx),
            )
            heads.append(tf.squeeze(head))
        self.res = heads

        # Greedy arm for each bandit.
        self.chosen_actions = [
            tf.argmax(scores, output_type=tf.int32) for scores in self.res
        ]

        # Training inputs: the observed reward and the arm played per bandit.
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[len(size)], dtype=tf.int32)

        # Output of each head at the arm that was actually played.
        picked_outputs = []
        for idx in range(len(size)):
            picked = tf.slice(self.res[idx], [self.action_holder[idx]], [1])
            picked_outputs.append(tf.cast(picked, dtype=tf.float32))
        self.responsible_weights = picked_outputs

        self.losses = [
            -tf.log(weight) * self.reward_holder
            for weight in self.responsible_weights
        ]
        self.loss = tf.reduce_mean(self.losses)

        sgd = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = sgd.minimize(self.loss, var_list=tf.trainable_variables())
if __name__ == "__main__":
    # Train the agent with an epsilon-greedy policy, then report the arms it
    # picks greedily and how they compare with each bandit's argmax arm.
    #
    # NOTE(review): pullArm pays 50 when random.random() exceeds the mean arm
    # value, so lower arm values pay more often, yet this script scores the
    # argmax arm. Confirm which direction is intended.
    tf.reset_default_graph()
    total_episodes = 20000
    e = 0.5  # chance of playing a uniformly random action instead of the greedy one

    with tf.Session() as sess:
        cBandit = WebsiteClass()  # Load the bandits.
        myAgent = agent(lr=0.001, a_size=10)
        sess.run(tf.global_variables_initializer())

        # optimal[k] counts the episodes in which bandit k's argmax arm was played.
        optimal = np.zeros(cBandit.num_actions)
        # The bandits never change, so their argmax arms are computed once.
        best_arms = [np.argmax(arms) for arms in cBandit.bandits]

        for _ in tqdm(range(total_episodes)):
            if np.random.rand() < e:
                action = [random.randint(0, x - 1) for x in size]
            else:
                action = sess.run(myAgent.chosen_actions)
            reward = cBandit.pullArm(action)

            for k, best in enumerate(best_arms):
                if best == action[k]:
                    optimal[k] += 1

            # Only the update op is run; the other fetches were never used.
            sess.run(
                myAgent.update,
                feed_dict={
                    myAgent.reward_holder: [reward],
                    myAgent.action_holder: action,
                },
            )

        # Final greedy choice. The original unpacked a second fetch into `e`,
        # clobbering the exploration rate; that fetch was unused and is dropped.
        action = sess.run(myAgent.chosen_actions)
        prob = 0
        for i in range(cBandit.num_actions):
            print(f'I triggered {i} bandit with {action[i]} action, leading to {cBandit.bandits[i][action[i]]:5.6f} result')
            print(f'This guess for {i} bandit was {action[i]==np.argmax(cBandit.bandits[i])}')
            prob = prob + cBandit.bandits[i][action[i]]
        prob = prob / cBandit.num_actions

        # These two statistics were computed but never shown in the original.
        print(f'Mean bandit value of the final greedy actions: {prob:5.6f}')
        print(f'Share of episodes playing each argmax arm: {optimal / total_episodes}')
    print(cBandit.bandits)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.