import random

import numpy as np
import tensorflow as tf

def train(mnist, max_layers):
    # Controller setup: an RMSProp optimizer with an exponentially decaying
    # learning rate drives the policy-gradient updates.
    sess = tf.Session()
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    # Note: the schedule below decays from 0.99 as written; starter_learning_rate is unused.
    learning_rate = tf.train.exponential_decay(0.99, global_step,
                                               500, 0.96, staircase=True)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    reinforce = Reinforce(sess, optimizer, policy_network, max_layers, global_step)

# Inside NetManager.get_reward(): train the freshly built CNN for a fixed
# number of steps on MNIST batches.
with tf.Session() as train_sess:
    init = tf.global_variables_initializer()
    train_sess.run(init)
    for step in range(self.max_step_per_action):
        batch_x, batch_y = self.mnist.train.next_batch(self.bathc_size)
        feed = {model.X: batch_x,
                model.Y: batch_y,
                model.dropout_keep_prob: self.dropout_rate,
                model.cnn_dropout_rates: cnn_drop_rate}
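These snippets omit the outer search loop that ties the controller to the child-network trainer. A minimal sketch of how it could look inside train(), assuming a NetManager instance named net_manager and an episode budget MAX_EPISODES (both names, and the initial state, are assumptions, not taken from the snippets):

# Hypothetical outer loop (MAX_EPISODES, net_manager and the initial state are assumptions).
MAX_EPISODES = 2500
state = np.array([[10.0, 128.0, 1.0, 1.0] * max_layers], dtype=np.float32)
pre_acc = 0.0
step = 0
for i_episode in range(MAX_EPISODES):
    action = reinforce.get_action(state)                 # sample an architecture
    reward, pre_acc = net_manager.get_reward(action, step, pre_acc)
    state = action[0]                                    # the sampled action becomes the next state
    reinforce.store_rollout(state, reward)
    step += 1
    ls = reinforce.train_step(1)                         # one policy update per episode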
def get_reward(self, action, step, pre_acc):
    # Split the flat action vector into groups of four hyperparameters, one
    # group per layer; the fourth entry of each group is that layer's dropout rate.
    action = [action[0][0][x:x+4] for x in range(0, len(action[0][0]), 4)]
    cnn_drop_rate = [c[3] for c in action]
We then split "action" into groups of four hyperparameters, one group per layer, and built cnn_drop_rate, a list with one dropout rate per layer.
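For illustration, the slicing behaves like this on a made-up two-layer action (the meaning of the first three values in each group is defined by the CNN class; only the fourth, the dropout rate, is fixed by the code above):

action = [[[3, 32, 2, 8, 5, 64, 2, 9]]]                        # shape [1][1][4*max_layers], max_layers = 2
layers = [action[0][0][x:x+4] for x in range(0, len(action[0][0]), 4)]
# layers        == [[3, 32, 2, 8], [5, 64, 2, 9]]              -> four hyperparameters per layer
cnn_drop_rate = [c[3] for c in layers]
# cnn_drop_rate == [8, 9]                                      -> the fourth entry of each group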
Now let's create a new CNN with the new architecture:
with tf.Graph().as_default() as g:
    with g.container('experiment'+str(step)):
        # Build a fresh CNN from the sampled hyperparameters in its own graph
        # container, so every candidate architecture is trained from scratch.
        model = CNN(self.num_input, self.num_classes, action)
        loss_op = tf.reduce_mean(model.loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        train_op = optimizer.minimize(loss_op)
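The tail of get_reward is not shown in these snippets. One hedged way it could turn the trained child network into a reward, assuming the CNN class exposes an accuracy op (model.accuracy and the shaping rule are assumptions), is to compare test accuracy against the previous episode:

# Hypothetical tail of get_reward() (model.accuracy and the shaping rule are assumptions).
batch_x, batch_y = self.mnist.test.next_batch(10000)
feed = {model.X: batch_x,
        model.Y: batch_y,
        model.dropout_keep_prob: 1.0,
        model.cnn_dropout_rates: [1.0] * len(cnn_drop_rate)}
acc = train_sess.run(model.accuracy, feed_dict=feed)
reward = acc if acc - pre_acc > 0.01 else 0.01   # reward only clear improvements
return reward, acc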
class NetManager():
    def __init__(self, num_input, num_classes, learning_rate, mnist,
                 max_step_per_action=5500,
                 bathc_size=100,
                 dropout_rate=0.85):
        self.num_input = num_input
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.mnist = mnist
        # The remaining arguments are stored as-is; get_reward() reads them.
        self.max_step_per_action = max_step_per_action
        self.bathc_size = bathc_size
        self.dropout_rate = dropout_rate
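For reference, here is how train() might construct the manager; the MNIST input size of 784 and the 10 classes are standard, while the learning-rate value is just a placeholder:

# Hypothetical instantiation (the learning-rate value is a placeholder).
net_manager = NetManager(num_input=784,
                         num_classes=10,
                         learning_rate=0.001,
                         mnist=mnist)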
def store_rollout(self, state, reward):
    # Reinforce: remember the (state, reward) pair for the next policy update.
    self.reward_buffer.append(reward)
    self.state_buffer.append(state[0])
def train_step(self, steps_count):
    # Take the most recent rollouts, rescale the states, and run one
    # policy-gradient update on the controller.
    states = np.array(self.state_buffer[-steps_count:]) / self.division_rate
    rewards = self.reward_buffer[-steps_count:]
    _, ls = self.sess.run([self.train_op, self.loss],
                          {self.states: states,
                           self.discounted_rewards: rewards})
    return ls
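Note that train_step feeds the raw recent rewards straight into the discounted_rewards placeholder. If you want an explicit discount over longer rollouts, a small hypothetical helper (not part of the snippets above) could preprocess the buffer first, reusing the discount_factor passed to Reinforce:

# Hypothetical helper: standard backward pass to discount a reward sequence.
def discount(rewards, discount_factor=0.99):
    discounted, running = [], 0.0
    for r in reversed(rewards):
        running = r + discount_factor * running
        discounted.append(running)
    return list(reversed(discounted))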
def get_action(self, state):
    # Epsilon-greedy sampling: with probability `exploration` return a random
    # architecture, otherwise ask the policy network for its prediction.
    if random.random() < self.exploration:
        return np.array([[random.sample(range(1, 35), 4*self.max_layers)]])
    else:
        return self.sess.run(self.predicted_action, {self.states: state})
# Tail of Reinforce.__init__(): build the policy graph and initialize its variables.
self.create_variables()
var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
self.sess.run(tf.variables_initializer(var_lists))
def create_variables(self):
    with tf.name_scope("model_inputs"):
        # Raw state representation: one row of 4*max_layers hyperparameter values.
        self.states = tf.placeholder(tf.float32, [None, self.max_layers*4], name="states")

    with tf.name_scope("predict_actions"):
        # Initialize the policy network that maps a state to the next action.
        with tf.variable_scope("policy_network"):
            self.policy_outputs = self.policy_network(self.states, self.max_layers)
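create_variables is truncated here: self.predicted_action, self.discounted_rewards, self.loss and self.train_op, which get_action and train_step rely on, are not shown. A hedged sketch of how they could be wired up, consistent with the names used in the other snippets (the surrogate loss and the gradient scaling are assumptions, not the author's exact code):

    # Hypothetical continuation of create_variables() (an illustrative sketch).
    # Scale the policy outputs back up to integer hyperparameter values.
    self.predicted_action = tf.cast(
        tf.scalar_mul(self.division_rate, self.policy_outputs),
        tf.int32, name="predicted_action")

    with tf.name_scope("train_policy_network"):
        self.discounted_rewards = tf.placeholder(tf.float32, (None,), name="discounted_rewards")
        # One possible surrogate objective: cross-entropy between the policy
        # outputs and the normalized states, plus L2 regularization.
        logits = self.policy_outputs[:, -1, :]
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.nn.softmax(self.states))
        policy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy_network")
        reg_loss = tf.add_n([tf.nn.l2_loss(v) for v in policy_vars])
        self.loss = tf.reduce_mean(cross_entropy) + self.reg_param * reg_loss

        # REINFORCE: scale the policy gradients by the (mean) discounted reward.
        grads_and_vars = self.optimizer.compute_gradients(self.loss, var_list=policy_vars)
        grads_and_vars = [(g * tf.reduce_mean(self.discounted_rewards), v)
                          for g, v in grads_and_vars if g is not None]
        self.train_op = self.optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)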
class Reinforce():
    def __init__(self, sess, optimizer, policy_network, max_layers, global_step,
                 division_rate=100.0,
                 reg_param=0.001,
                 discount_factor=0.99,
                 exploration=0.3):
        self.sess = sess
        self.optimizer = optimizer
        self.policy_network = policy_network
        self.division_rate = division_rate
        self.reg_param = reg_param
        self.discount_factor = discount_factor
        self.exploration = exploration
        self.max_layers = max_layers
        self.global_step = global_step
        # Rollout buffers consumed by store_rollout() and train_step().
        self.reward_buffer = []
        self.state_buffer = []
def policy_network(state, max_layers):
    with tf.name_scope("policy_network"):
        # An NASCell RNN emits 4*max_layers outputs: four hyperparameters
        # for each of the max_layers layers.
        nas_cell = tf.contrib.rnn.NASCell(4*max_layers)
        outputs, state = tf.nn.dynamic_rnn(
            nas_cell,
            tf.expand_dims(state, -1),
            dtype=tf.float32
        )
        bias = tf.Variable([0.05]*4*max_layers)
        outputs = tf.nn.bias_add(outputs, bias)
        # Keep only the last time step: shape [batch_size, 1, 4*max_layers].
        return outputs[:, -1:, :]
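A quick standalone shape check can confirm the controller's output matches what get_reward expects (max_layers = 2 here is arbitrary):

# Hypothetical shape check (max_layers = 2 is arbitrary).
with tf.Graph().as_default():
    states = tf.placeholder(tf.float32, [None, 4*2], name="states")
    outputs = policy_network(states, 2)
    print(outputs.get_shape().as_list())   # -> [None, 1, 8]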