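// Policy-gradient (REINFORCE-style) agent for the OpenAI Gym CartPole-v0
// environment, built on mlpack's feed-forward network (FFN) and a gym client
// binding (environment.hpp).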
#include <cmath>
#include <time.h>
#include <stdlib.h>
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <mlpack/methods/ann/layer/leaky_relu.hpp>
#include <mlpack/methods/ann/visitor/reset_visitor.hpp>
#include <mlpack/methods/ann/visitor/forward_visitor.hpp>
#include <mlpack/methods/ann/visitor/backward_visitor.hpp>
#include <mlpack/methods/ann/visitor/parameters_visitor.hpp>
#include <mlpack/methods/ann/visitor/weight_set_visitor.hpp>
#include "environment.hpp"
#define _print(x) for(auto i: x) std::cout << i << std::endl;
using namespace mlpack;
using namespace mlpack::ann;
using namespace mlpack::optimization;
using namespace gym;
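// Stack a vector of column vectors into a matrix, one column per time step.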
arma::mat convert(std::vector<arma::vec>& v)
{
  arma::mat temp;
  for (size_t i = 0; i < v.size(); i++)
    temp.insert_cols(i, v[i]);
  return temp;
}
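// Compute the discounted return at every time step (gamma-weighted running
// sum, walking backwards through the episode) and normalise the result to
// zero mean and unit standard deviation.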
std::vector<double> discountReward(std::vector<double>& reward, double gamma)
{
  std::vector<double> discountReward(reward.size());
  double runningAdd = 0;
  for (int i = reward.size() - 1; i >= 0; i--)
  {
    // Discounted running sum: G_t = r_t + gamma * G_{t+1}.
    runningAdd = runningAdd * gamma + reward[i];
    discountReward[i] = runningAdd;
  }
  double mean = 0;
  double E = 0;
  for (auto i : discountReward)
    mean += i;
  mean = mean / static_cast<double>(reward.size());
  for (auto i : discountReward)
    E += pow((i - mean), 2);
  double stdev = sqrt(E / static_cast<double>(reward.size()));
  // Normalise to zero mean and unit standard deviation.
  for (size_t i = 0; i < discountReward.size(); i++)
  {
    discountReward[i] -= mean;
    discountReward[i] /= stdev;
  }
  return discountReward;
}
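// Manual backward pass for the two Linear layers: given the reward-weighted
// output gradients, the logged hidden activations and the logged inputs,
// return the gradients dw1 and dw2 of the first and second layer weights.
// The ReLU-style mask zeroes the hidden gradient wherever the activation was
// non-positive.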
template<typename T>
std::vector<arma::mat>
backwardPolicy(T& model, std::vector<double>& discountReward,
               std::vector<arma::vec>& epsHidden,
               std::vector<arma::vec>& epsInput)
{
  arma::mat dw2;
  arma::mat dh;
  arma::mat dw1;
  arma::mat weights;
  // Append a constant 1 to every logged input and hidden vector so that the
  // bias parameters receive a gradient as well.
  for (size_t i = 0; i < epsInput.size(); i++)
    epsInput[i].insert_rows(epsInput[i].n_rows, arma::ones<arma::vec>(1));
  for (size_t i = 0; i < epsHidden.size(); i++)
    epsHidden[i].insert_rows(epsHidden[i].n_rows, arma::ones<arma::vec>(1));
  arma::mat epsHiddenMat = convert(epsHidden).t();
  arma::mat epsInputMat = convert(epsInput).t();
  arma::mat discountRewardMat = arma::conv_to<arma::mat>::from(discountReward);
  dw2 = epsHiddenMat.t() * discountRewardMat;
  // Check if this is a matrix if we have multiple outputs.
  // Add support for names to layer types and create a visitor to access it.
  boost::apply_visitor(ParametersVisitor(std::move(weights)),
      model.Model()[model.Model().size() - 1]);
  dh = discountRewardMat * weights.t();
  for (size_t i = 0; i < epsHiddenMat.n_rows; i++)
    for (size_t j = 0; j < epsHiddenMat.n_cols; j++)
      if (epsHiddenMat(i, j) <= 0)
        dh(i, j) = 0;
  dw1 = epsInputMat.t() * dh;
  std::vector<arma::mat> grad;
  grad.push_back(dw1);
  grad.push_back(dw2);
  return grad;
}
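// Run the training loop: play episodes against the environment, log the
// inputs, hidden activations, pseudo-labels and rewards, and after every
// batch of episodes apply the accumulated policy gradients to the network
// weights with a plain SGD step.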
template<typename T>
void GetAction(Environment& env,
               T& model,
               arma::mat& data,
               std::vector<arma::vec>& epsHidden,
               std::vector<arma::vec>& epsInput,
               std::vector<double>& epsLabel,
               std::vector<double>& epsReward,
               std::vector<double>& epsGrad)
{
  double rewardSum = 0;
  const double learning_rate = 0.8;
  size_t eps = 1;
  double running_reward = 0;
  const size_t batch_size = 10;
  arma::mat action, predictionTemp, outputParameter, w;
  double yLabel, temp, r, action_double, gradLabel;
  std::vector<arma::mat> gradBuff, grad, weights;
  std::vector<double> discountReward1;
  std::vector<int> idx;
  // Seed the random number generator once, outside the episode loop.
  srand(time(NULL));
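  // Main loop: each iteration is a single environment step; eps counts
  // completed episodes.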
  while (eps < 200)
  {
    // Log the input state for this step.
    epsInput.push_back(data);
    // Forward pass through the network.
    model.Predict(data, predictionTemp);
    // temp is treated as the probability of taking action 0.
    temp = 1 / (1 + std::exp(arma::as_scalar(predictionTemp)));
    // Log the hidden values (assuming one hidden layer).
    boost::apply_visitor(ForwardVisitor(std::move(data),
        std::move(outputParameter)), model.Model()[0]);
    epsHidden.push_back(outputParameter);
    // Sample the action to perform.
    r = ((double) rand() / (RAND_MAX));
    action_double = r > temp ? 1 : 0;
    if (action_double == 0)
      action = arma::zeros(1);
    else
      action = arma::ones(1);
    // Take the action and note the new observation.
    env.step(action);
    data = env.observation;
    // Fake label: the indicator of action 0.
    yLabel = arma::as_scalar(action) == 0 ? 1 : 0;
    // Log all the output values.
    epsLabel.push_back(yLabel);
    // Log all the action gradients,
    // assuming the loss function is mean squared error.
    // Todo: change this to use a Gradient() visitor when implemented.
    gradLabel = (yLabel - temp);
    double rew = arma::as_scalar(env.reward);
    rewardSum = rewardSum + rew;
    epsGrad.push_back(gradLabel);
    epsReward.push_back(rew);
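    // At the end of an episode, convert the logged trajectory into policy
    // gradients and accumulate them over the batch.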
    if (env.done)
    {
      std::cout << "episode # " << eps << std::endl;
      std::cout << "reward # " << rewardSum << std::endl;
      eps++;
      // Discount and normalise the episode rewards, then weight the logged
      // action gradients by the discounted returns.
      discountReward1 = discountReward(epsReward, 0.99);
      std::vector<double> tempGrad(epsGrad.size());
      for (size_t i = 0; i < epsGrad.size(); i++)
        tempGrad[i] = epsGrad[i] * discountReward1[i];
      grad = backwardPolicy(model, tempGrad, epsHidden, epsInput);
      // Accumulate the gradients over the batch.
      // Todo: change this to is_empty.
      for (size_t i = 0; i < grad.size(); i++)
      {
        if (gradBuff.size() == grad.size())
          gradBuff[i] += grad[i];
        else
          gradBuff.push_back(grad[i]);
      }
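      // Every batch_size episodes, apply the accumulated gradients to the
      // layer weights.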
      if (eps % batch_size == 0)
      {
        // Collect the weights of each layer that has parameters.
        for (size_t i = 0; i < model.Model().size(); i++)
        {
          boost::apply_visitor(ParametersVisitor(std::move(w)), model.Model()[i]);
          if (w.n_rows != 0 && w.n_cols != 0)
          {
            weights.push_back(w);
            idx.push_back(i);
          }
        }
        // Drop one weight set (the ANN reports three parameter sets; one is a
        // duplicate).
        weights.erase(weights.begin());
        // Convert the first-layer weight vector to a matrix.
        for (size_t i = 0; i < weights.size() - 1; i++)
        {
          if (weights[i].n_rows == 50)
          {
            // This is a hack. Change the ParametersVisitor to return a matrix.
            weights[i].reshape(5, 10);
          }
          // Bias addition.
          weights[i].insert_cols(weights[i].n_cols, arma::ones(weights[i].n_rows));
        }
        // Update the weights using SGD.
        for (size_t i = 0; i < gradBuff.size(); i++)
        {
          w = weights[i] - (learning_rate * gradBuff[i]);
          boost::apply_visitor(WeightSetVisitor(std::move(w), 0), model.Model()[idx[i]]);
          std::cout << weights[i] << std::endl;
          arma::mat weight_temp;
          boost::apply_visitor(ParametersVisitor(std::move(weight_temp)), model.Model()[idx[i]]);
          std::cout << weight_temp << std::endl;
        }
        if (running_reward == 0)
          running_reward = rewardSum;
        else
          running_reward = running_reward * 0.99 + rewardSum * 0.01;
        std::cout << "Average reward for episode " << rewardSum / batch_size
            << " Total average reward " << running_reward / batch_size << std::endl;
        if (rewardSum / batch_size > 200)
        {
          std::cout << "Task solved" << std::endl;
          break;
        }
        rewardSum = 0;
        // Clear the batch buffers so the next batch starts from scratch.
        gradBuff.clear();
        weights.clear();
        idx.clear();
      }
      // Clear all episode variables.
      epsGrad.clear();
      epsLabel.clear();
      epsHidden.clear();
      epsInput.clear();
      epsReward.clear();
      data = env.reset();
    }
  }
}
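// Connect to the gym server, build a small policy network for CartPole-v0 and
// hand it to the training loop above.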
int main(int argc, char* argv[])
{
  const std::string environment = "CartPole-v0";
  const std::string host = "127.0.0.1";
  const std::string port = "4040";
  FFN<MeanSquaredError<>, RandomInitialization> model;
  std::vector<arma::vec> epsHidden, epsInput;
  std::vector<double> epsReward, epsLabel, epsGrad;
  size_t hiddenLayerSize = 10;
  size_t numActions = 2;
  double totalReward = 0;
  size_t totalSteps = 0;
  double learning_rate = 0.8;
  double discount_rate = 0.1;
  Environment env(host, port, environment);
  arma::mat observation = env.reset();
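  // Policy network: observation -> 10 hidden units (LeakyReLU with alpha = 0,
  // i.e. a plain ReLU) -> 1 output, squashed to an action probability inside
  // GetAction().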
  model.Add<Linear<> >(observation.n_rows, hiddenLayerSize);
  model.Add<LeakyReLU<> >(0);
  model.Add<Linear<> >(hiddenLayerSize, 1);
  GetAction<FFN<MeanSquaredError<>, RandomInitialization>>(env, model, observation,
      epsHidden,
      epsInput,
      epsLabel,
      epsReward,
      epsGrad);
}