Last active
September 4, 2018 14:17
-
-
Save bbbales2/a1689764f0fda6df561e858026f4e8d9 to your computer and use it in GitHub Desktop.
adj_jac_apply addition benchmarks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stan/math/rev/core.hpp> | |
#include <test/unit/math/rev/mat/util.hpp> | |
#include <gtest/gtest.h> | |
#include <algorithm> | |
#include <sstream> | |
#include <tuple> | |
#include <vector> | |
#include <chrono> | |
namespace stan { | |
namespace math { | |
/* | |
* This is the simplest adj_jac functor in town | |
*/ | |
struct AddFunctor { | |
template <std::size_t size> | |
Eigen::VectorXd operator()(const std::array<bool, size>& needs_adj, const Eigen::VectorXd& x1, const Eigen::VectorXd& x2) { | |
check_size_match("AddFunctor::operator()", "x1", x1.size(), "x2", x2.size()); | |
return x1 + x2; | |
} | |
template <std::size_t size> | |
auto multiply_adjoint_jacobian(const std::array<bool, size>& needs_adj, | |
const Eigen::VectorXd& adj) { | |
return std::make_tuple(adj, adj); | |
} | |
}; | |
/* | |
* This is the simplest adj_jac functor in town, inefficiently written | |
*/ | |
struct AddFunctorInefficient { | |
template <std::size_t size> | |
Eigen::VectorXd operator()(const std::array<bool, size>& needs_adj, const Eigen::VectorXd& x1, const Eigen::VectorXd& x2) { | |
check_size_match("AddFunctorInefficient::operator()", "x1", x1.size(), "x2", x2.size()); | |
return x1 + x2; | |
} | |
template <std::size_t size> | |
auto multiply_adjoint_jacobian(const std::array<bool, size>& needs_adj, | |
const Eigen::VectorXd& adj) { | |
Eigen::MatrixXd jac(adj.size(), adj.size()); | |
for (int i = 0; i < adj.size(); ++i) { | |
for (int j = 0; j < adj.size(); ++j) { | |
jac(i, j) = 0.0; | |
} | |
jac(i, i) = 1.0; | |
} | |
return std::make_tuple((adj.transpose() * jac).eval(), (adj.transpose() * jac).eval()); | |
} | |
}; | |
// Reference implementation: let plain reverse-mode autodiff handle x1 + x2.
auto AddFunctorAutodiffed = [](auto& x1, auto& x2) {
  check_size_match("AddFunctorAutodiffed::operator()", "x1", x1.size(),
                   "x2", x2.size());
  auto result = (x1 + x2).eval();
  return result;
};
} | |
} | |
template<typename F> | |
double run_benchmark(F f, int N, int R) { | |
stan::math::recover_memory(); | |
EXPECT_EQ(0, stan::math::ChainableStack::instance().var_stack_.size()); | |
EXPECT_EQ(0, stan::math::ChainableStack::instance().var_nochain_stack_.size()); | |
Eigen::Matrix<stan::math::var, Eigen::Dynamic, 1> x1(N); | |
Eigen::Matrix<stan::math::var, Eigen::Dynamic, 1> x2(N); | |
for (int i = 0; i < N; ++i) { | |
x1(i) = i / static_cast<double>(N); | |
x2(i) = i / static_cast<double>(N); | |
} | |
stan::math::start_nested(); | |
/** | |
* By nesting y, we'll only be calling chain on the output variables. | |
* | |
* If you put the forward calculation of y inside the loop | |
* then you have to make sure you nest/unnest each iteration so | |
* you aren't dirtying up the autodiff stack. | |
* | |
* The results are about the same, and this focuses the benchmark | |
* on the reverse mode. | |
*/ | |
stan::math::var y = stan::math::sum(f(x1, x2)); | |
auto start = std::chrono::high_resolution_clock::now(); | |
for (int i = 0; i < R; ++i) { | |
stan::math::grad(y.vi_); | |
stan::math::set_zero_all_adjoints_nested(); | |
} | |
double time = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - start).count() / R; | |
stan::math::recover_memory_nested(); | |
return time; | |
} | |
// Print CSV timing rows for the three addition implementations, 5 reps each.
TEST(AgradRev, benchmarks) {
  std::cout << "method, N, time" << std::endl;
  auto inefficient = [](auto x1, auto x2) {
    return stan::math::adj_jac_apply<stan::math::AddFunctorInefficient>(x1,
                                                                        x2);
  };
  auto efficient = [](auto x1, auto x2) {
    return stan::math::adj_jac_apply<stan::math::AddFunctor>(x1, x2);
  };
  for (int N = 100; N <= 500; N += 100) {
    for (int rep = 0; rep < 5; ++rep) {
      std::cout << "autodiffed, " << N << ", "
                << run_benchmark(stan::math::AddFunctorAutodiffed, N, 500)
                << std::endl;
      std::cout << "inefficient, " << N << ", "
                << run_benchmark(inefficient, N, 500) << std::endl;
      std::cout << "efficient, " << N << ", "
                << run_benchmark(efficient, N, 500) << std::endl;
    }
  }
}
// Check that all three addition implementations produce an identity
// Jacobian with respect to both inputs.
TEST(AgradRev, test_grads_match) {
  using stan::math::var;
  stan::math::recover_memory();
  Eigen::Matrix<var, Eigen::Dynamic, 1> x1(5);
  Eigen::Matrix<var, Eigen::Dynamic, 1> x2(5);
  x1 << 1.0, 2.0, 3.0, 4.0, 5.0;
  x2 << 6.0, 7.0, 8.0, 9.0, 10.0;

  // For y = x1 + x2 the Jacobian w.r.t. each input is the identity:
  // after a reverse sweep from y(j), the adjoint of x1(i)/x2(i) must be
  // 1 when i == j and 0 otherwise. This check was previously copy-pasted
  // three times; it is factored out here so all implementations are
  // verified identically.
  auto expect_identity_jacobian
      = [&](Eigen::Matrix<var, Eigen::Dynamic, 1>& y) {
          for (int j = 0; j < y.size(); ++j) {
            stan::math::set_zero_all_adjoints();
            y(j).grad();
            for (int i = 0; i < x1.size(); ++i) {
              double expected = (i == j) ? 1.0 : 0.0;
              EXPECT_FLOAT_EQ(expected, x1(i).adj());
              EXPECT_FLOAT_EQ(expected, x2(i).adj());
            }
          }
        };

  Eigen::Matrix<var, Eigen::Dynamic, 1> y1
      = stan::math::adj_jac_apply<stan::math::AddFunctor>(x1, x2);
  expect_identity_jacobian(y1);

  Eigen::Matrix<var, Eigen::Dynamic, 1> y2
      = stan::math::adj_jac_apply<stan::math::AddFunctorInefficient>(x1, x2);
  expect_identity_jacobian(y2);

  Eigen::Matrix<var, Eigen::Dynamic, 1> y3
      = stan::math::AddFunctorAutodiffed(x1, x2);
  expect_identity_jacobian(y3);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stan/math/rev/core.hpp> | |
#include <test/unit/math/rev/mat/util.hpp> | |
#include <gtest/gtest.h> | |
#include <algorithm> | |
#include <sstream> | |
#include <tuple> | |
#include <vector> | |
#include <chrono> | |
namespace stan { | |
namespace math { | |
struct SimplexConstrainAutodiffed { | |
template <typename T> | |
Eigen::Matrix<T, Eigen::Dynamic, 1> operator()( | |
const Eigen::Matrix<T, Eigen::Dynamic, 1>& y) { | |
// cut & paste simplex_constrain(Eigen::Matrix, T) w/o Jacobian | |
using Eigen::Dynamic; | |
using Eigen::Matrix; | |
using std::log; | |
typedef typename index_type<Matrix<T, Dynamic, 1> >::type size_type; | |
int Km1 = y.size(); | |
Matrix<T, Dynamic, 1> x(Km1 + 1); | |
T stick_len(1.0); | |
for (size_type k = 0; k < Km1; ++k) { | |
T z_k(stan::math::inv_logit(y(k) - log(Km1 - k))); | |
x(k) = stick_len * z_k; | |
stick_len -= x(k); | |
} | |
x(Km1) = stick_len; | |
return x; | |
} | |
}; | |
} // namespace math | |
} // namespace stan | |
template <typename F> | |
double run_benchmark(F f, int N, int R) { | |
stan::math::recover_memory(); | |
EXPECT_EQ(0, stan::math::ChainableStack::instance().var_stack_.size()); | |
EXPECT_EQ(0, | |
stan::math::ChainableStack::instance().var_nochain_stack_.size()); | |
Eigen::Matrix<stan::math::var, Eigen::Dynamic, 1> x(N); | |
for (int i = 0; i < N; ++i) { | |
x(i) = i / static_cast<double>(N); | |
} | |
stan::math::start_nested(); | |
/** | |
* By nesting y, we'll only be calling chain on the output variables. | |
* | |
* If you put the forward calculation of y inside the loop | |
* then you have to make sure you nest/unnest each iteration so | |
* you aren't dirtying up the autodiff stack. | |
* | |
* The results are about the same, and this focuses the benchmark | |
* on the reverse mode. | |
*/ | |
stan::math::var y = stan::math::sum(f(x)); | |
auto start = std::chrono::high_resolution_clock::now(); | |
for (int i = 0; i < R; ++i) { | |
stan::math::grad(y.vi_); | |
stan::math::set_zero_all_adjoints_nested(); | |
} | |
double time = std::chrono::duration<double>( | |
std::chrono::high_resolution_clock::now() - start) | |
.count() | |
/ R; | |
stan::math::recover_memory_nested(); | |
return time; | |
} | |
// Print CSV timing rows comparing autodiffed vs adj_jac_apply
// simplex_constrain, 5 reps per size.
TEST(AgradRev, benchmarks) {
  std::cout << "method, N, time" << std::endl;
  auto efficient = [](auto x) {
    return stan::math::adj_jac_apply<stan::math::simplex_constrain_op>(x);
  };
  for (int rep = 0; rep < 5; ++rep) {
    for (int N = 10; N <= 200; N += 10) {
      std::cout << "autodiffed, " << N << ", "
                << run_benchmark(stan::math::SimplexConstrainAutodiffed{}, N,
                                 500)
                << std::endl;
      std::cout << "efficient, " << N << ", "
                << run_benchmark(efficient, N, 500) << std::endl;
    }
  }
}
// Verify the adj_jac_apply simplex_constrain_op matches the plain
// autodiff baseline, both in values and in gradients.
TEST(AgradRev, test_grads_match) {
  using stan::math::var;
  stan::math::recover_memory();

  Eigen::Matrix<var, Eigen::Dynamic, 1> x1(5);
  x1 << 1.0, 2.0, 3.0, 4.0, 5.0;

  Eigen::Matrix<var, Eigen::Dynamic, 1> y1
      = stan::math::adj_jac_apply<stan::math::simplex_constrain_op>(x1);
  Eigen::Matrix<var, Eigen::Dynamic, 1> y2
      = stan::math::SimplexConstrainAutodiffed{}(x1);

  // Run one reverse sweep from `out` and collect d out / d x1(j).
  auto gradient_of = [&](var& out) {
    stan::math::set_zero_all_adjoints();
    out.grad();
    Eigen::VectorXd g(x1.size());
    for (int j = 0; j < x1.size(); ++j) {
      g(j) = x1(j).adj();
    }
    return g;
  };

  // Forward values must agree element-wise.
  for (int i = 0; i < y1.size(); ++i) {
    EXPECT_FLOAT_EQ(y1(i).val(), y2(i).val());
  }

  // Gradients of every output element must agree element-wise.
  for (int i = 0; i < y1.size(); ++i) {
    Eigen::VectorXd g1 = gradient_of(y1(i));
    Eigen::VectorXd g2 = gradient_of(y2(i));
    for (int j = 0; j < x1.size(); ++j) {
      EXPECT_FLOAT_EQ(g1(j), g2(j));
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment