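// Policy-gradient (REINFORCE-style) agent for the CartPole-v0 task, built on
// mlpack's FFN class. The environment is reached through "environment.hpp",
// which is assumed to be a gym TCP client (e.g. the gym_tcp_api project)
// exposing reset()/step()/observation/reward/done. The forward pass uses
// mlpack; the policy gradient and SGD update are computed by hand below.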
#include <cmath>
#include <time.h>
#include <stdlib.h>
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <mlpack/methods/ann/layer/leaky_relu.hpp>
#include <mlpack/methods/ann/visitor/reset_visitor.hpp>
#include <mlpack/methods/ann/visitor/forward_visitor.hpp>
#include <mlpack/methods/ann/visitor/backward_visitor.hpp>
#include <mlpack/methods/ann/visitor/parameters_visitor.hpp>
#include <mlpack/methods/ann/visitor/weight_set_visitor.hpp>
#include "environment.hpp"

#define _print(x) for (auto i : x) std::cout << i << std::endl;

using namespace mlpack;
using namespace mlpack::ann;
using namespace mlpack::optimization;
using namespace gym;
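// Helper: stack a vector of per-step column vectors into a single matrix,
// one column per time step.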
arma::mat convert(std::vector<arma::vec>& v)
{
  arma::mat temp;
  for (size_t i = 0; i < v.size(); i++)
    temp.insert_cols(i, v[i]);
  return temp;
}
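// Compute discounted returns and normalise them to zero mean / unit variance.
// Illustrative example (values only for intuition): with rewards {1, 1, 1}
// and gamma = 0.99 the raw returns are {2.9701, 1.99, 1}, which are then
// shifted by their mean and divided by their standard deviation.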
std::vector<double> discountReward(std::vector<double>& reward, double gamma)
{
  std::vector<double> discountReward(reward.size());
  double runningAdd = 0;
  for (int i = reward.size() - 1; i >= 0; i--)
  {
    // Running discounted sum, accumulated from the end of the episode.
    runningAdd = runningAdd * gamma + reward[i];
    discountReward[i] = runningAdd;
  }
  double mean = 0;
  double E = 0;
  for (auto i : discountReward)
    mean += i;
  mean = mean / static_cast<double>(reward.size());
  for (auto i : discountReward)
    E += pow((i - mean), 2);
  double stdev = sqrt(E / static_cast<double>(reward.size()));
  // Normalise to zero mean and unit variance.
  for (size_t i = 0; i < discountReward.size(); i++)
    discountReward[i] -= mean;
  for (size_t i = 0; i < discountReward.size(); i++)
    discountReward[i] /= stdev;
  return discountReward;
}
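// Manual backward pass through the two Linear layers (bias columns are
// appended by hand). With h the hidden activations, x the inputs and g the
// per-step policy gradients: dW2 = h^T g, dh = g W2^T (zeroed where the
// hidden activation was <= 0, i.e. the ReLU derivative), dW1 = x^T dh.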
template<typename T>
std::vector<arma::mat>
backwardPolicy(T& model, std::vector<double>& discountReward,
               std::vector<arma::vec>& epsHidden,
               std::vector<arma::vec>& epsInput)
{
  arma::mat dw2;
  arma::mat dh;
  arma::mat dw1;
  arma::mat weights;
  // Append a constant 1 to every input and hidden vector so the bias is
  // handled as an extra column of the weight matrices.
  for (size_t i = 0; i < epsInput.size(); i++)
    epsInput[i].insert_rows(epsInput[i].n_rows, arma::ones<arma::vec>(1));
  for (size_t i = 0; i < epsHidden.size(); i++)
    epsHidden[i].insert_rows(epsHidden[i].n_rows, arma::ones<arma::vec>(1));
  arma::mat epsHiddenMat = convert(epsHidden).t();
  arma::mat epsInputMat = convert(epsInput).t();
  arma::mat discountRewardMat = arma::conv_to<arma::mat>::from(discountReward);
  dw2 = epsHiddenMat.t() * discountRewardMat;
  // Check whether this is a matrix if we have multiple outputs.
  // Add support for names to layer types and create a visitor to access them.
  boost::apply_visitor(ParametersVisitor(std::move(weights)),
      model.Model()[model.Model().size() - 1]);
  dh = discountRewardMat * weights.t();
  // Zero the hidden gradient wherever the activation was <= 0 (ReLU derivative).
  for (size_t i = 0; i < epsHiddenMat.n_rows; i++)
    for (size_t j = 0; j < epsHiddenMat.n_cols; j++)
      if (epsHiddenMat(i, j) <= 0)
        dh(i, j) = 0;
  dw1 = epsInputMat.t() * dh;
  std::vector<arma::mat> grad;
  grad.push_back(dw1);
  grad.push_back(dw2);
  return grad;
}
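// Main training loop. Each step: run the network forward, squash the single
// output to a probability with a sigmoid, sample an action, and record the
// input, hidden activation, surrogate gradient (fake label - probability) and
// reward. When an episode ends, the rewards are discounted and normalised, a
// manual backward pass produces per-layer gradients, and every batch_size
// episodes the accumulated gradients are applied to the weights with plain SGD.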
template<typename T>
void GetAction(Environment& env,
               T& model,
               arma::mat& data,
               std::vector<arma::vec>& epsHidden,
               std::vector<arma::vec>& epsInput,
               std::vector<double>& epsLabel,
               std::vector<double>& epsReward,
               std::vector<double>& epsGrad)
{
  double rewardSum = 0;
  const double learning_rate = 0.8;
  size_t eps = 1;
  double running_reward = 0;
  const size_t batch_size = 10;
  arma::mat action, predictionTemp, outputParameter, w;
  double yLabel, temp, r, action_double, gradLabel;
  std::vector<arma::mat> gradBuff, grad, weights;
  std::vector<double> discountReward1;
  std::vector<int> idx;
  // Seed the random number generator once, outside the episode loop.
  srand(time(NULL));
  while (eps < 200)
  {
    // For each step, log the input state.
    epsInput.push_back(data);
    // Forward pass.
    model.Predict(data, predictionTemp);
    // Squash the single output to a probability with the logistic sigmoid.
    temp = 1 / (1 + std::exp(-arma::as_scalar(predictionTemp)));
    // Log all the hidden values (assuming one hidden layer).
    boost::apply_visitor(ForwardVisitor(std::move(data),
        std::move(outputParameter)), model.Model()[0]);
    epsHidden.push_back(outputParameter);
    // Select the action to perform by sampling against the probability.
    r = ((double) rand() / RAND_MAX);
    action_double = r > temp ? 1 : 0;
    if (action_double == 0)
      action = arma::zeros(1);
    else
      action = arma::ones(1);
    // Take the action and note the new observation.
    env.step(action);
    data = env.observation;
    // Fake label, as if the sampled action had been the correct one.
    yLabel = arma::as_scalar(action) == 0 ? 1 : 0;
    // Log all the output values.
    epsLabel.push_back(yLabel);
    // Log all the action gradients.
    // Assuming the loss function is mean squared error.
    // TODO: change this to a Gradient() visitor when implemented.
    gradLabel = (yLabel - temp);
    double rew = arma::as_scalar(env.reward);
    rewardSum = rewardSum + rew;
    epsGrad.push_back(gradLabel);
    epsReward.push_back(rew);
    if (env.done)
    {
      std::cout << "episode # " << eps << std::endl;
      std::cout << "reward # " << rewardSum << std::endl;
      eps++;
      // Discount and normalise the episode rewards.
      discountReward1 = discountReward(epsReward, 0.99);
      std::vector<double> tempGrad(epsGrad.size());
      // Modulate each step's gradient by its (normalised) discounted return.
      for (size_t i = 0; i < epsGrad.size(); i++)
      {
        tempGrad[i] = epsGrad[i] * discountReward1[i];
      }
      grad = backwardPolicy(model, tempGrad, epsHidden, epsInput);
      // Accumulate the gradients over a batch.
      // TODO: change this to is_empty().
      for (size_t i = 0; i < grad.size(); i++)
      {
        if (gradBuff.size() == grad.size())
          gradBuff[i] += grad[i];
        else
          gradBuff.push_back(grad[i]);
      }
      if (eps % batch_size == 0)
      {
        // Collect the weights of every layer that has parameters.
        for (size_t i = 0; i < model.Model().size(); i++)
        {
          boost::apply_visitor(ParametersVisitor(std::move(w)), model.Model()[i]);
          if (w.n_rows != 0 && w.n_cols != 0)
          {
            weights.push_back(w);
            idx.push_back(i);
          }
        }
        // Delete one weight (the network reports three parameter sets; one is
        // a duplicate).
        weights.erase(weights.begin());
        // Convert each weight vector to a matrix.
        for (size_t i = 0; i < weights.size() - 1; i++)
        {
          if (weights[i].n_rows == 50)
          {
            // This is a hack; change the ParametersVisitor to return a matrix.
            weights[i].reshape(5, 10);
          }
          // Bias addition.
          weights[i].insert_cols(weights[i].n_cols, arma::ones(weights[i].n_rows));
        }
        // Update the weights using SGD.
        for (size_t i = 0; i < gradBuff.size(); i++)
        {
          w = weights[i] - (learning_rate * gradBuff[i]);
          boost::apply_visitor(WeightSetVisitor(std::move(w), 0), model.Model()[idx[i]]);
          std::cout << weights[i] << std::endl;
          arma::mat weight_temp;
          boost::apply_visitor(ParametersVisitor(std::move(weight_temp)),
              model.Model()[idx[i]]);
          std::cout << weight_temp << std::endl;
        }
        if (running_reward == 0)
          running_reward = rewardSum;
        else
          running_reward = running_reward * 0.99 + rewardSum * 0.01;
        std::cout << "Average reward for episode: " << rewardSum / batch_size
                  << ", total average reward: " << running_reward / batch_size
                  << std::endl;
        if (rewardSum / batch_size > 200)
        {
          std::cout << "Task solved" << std::endl;
          break;
        }
        rewardSum = 0;
        gradBuff.clear();
        weights.clear();
        idx.clear();
      }
      // Clear all per-episode variables.
      epsGrad.clear();
      epsLabel.clear();
      epsHidden.clear();
      epsInput.clear();
      epsReward.clear();
      data = env.reset();
    }
  }
}
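// Build a small policy network (Linear -> LeakyReLU with alpha = 0, i.e. a
// plain ReLU -> Linear producing a single logit) and start training against
// a gym server expected to be listening on 127.0.0.1:4040.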
int main(int argc, char* argv[])
{
  const std::string environment = "CartPole-v0";
  const std::string host = "127.0.0.1";
  const std::string port = "4040";
  FFN<MeanSquaredError<>, RandomInitialization> model;
  std::vector<arma::vec> epsHidden, epsInput;
  std::vector<double> epsReward, epsLabel, epsGrad;
  size_t hiddenLayerSize = 10;
  size_t numActions = 2;
  double totalReward = 0;
  size_t totalSteps = 0;
  double learning_rate = 0.8;
  double discount_rate = 0.1;
  Environment env(host, port, environment);
  arma::mat observation = env.reset();
  model.Add<Linear<> >(observation.n_rows, hiddenLayerSize);
  model.Add<LeakyReLU<> >(0);
  model.Add<Linear<> >(hiddenLayerSize, 1);
  GetAction<FFN<MeanSquaredError<>, RandomInitialization>>(env, model, observation,
      epsHidden,
      epsInput,
      epsLabel,
      epsReward,
      epsGrad);
}
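// Illustrative build command (an assumption, not part of the original gist);
// the exact flags depend on how mlpack, Armadillo and Boost are installed and
// on where environment.hpp and its gym TCP client sources live:
//
//   g++ -std=c++11 policy_gradient.cpp -o policy_gradient \
//       -lmlpack -larmadillo -lboost_serialization
//
// A gym TCP server for CartPole-v0 must be running on 127.0.0.1:4040 before
// the binary is started.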