SteveBronder/multiply_mem.cpp

## multiply_mem.cpp
#include <benchmark/benchmark.h>
#include <stan/math.hpp>
#include <utility>

// True until toss_me() has performed its one-time allocator warm-up.
static bool needs_done = true;

/**
 * Warm-up "benchmark": just to fill up the var stack allocator.
 *
 * The first invocation builds a max_alloc_r x max_alloc_c matrix of `var` and
 * a max_alloc_c vector of `var`, repeatedly takes the gradient of
 * sum(multiply(x, y)) inside the benchmark loop, and then calls
 * stan::math::recover_memory(). Every later invocation only drains the
 * benchmark loop without doing any work.
 *
 * @tparam max_alloc_r number of rows of the throwaway matrix
 * @tparam max_alloc_c number of columns of the throwaway matrix, and the
 *   length of the throwaway vector
 * @param state Google Benchmark state object driving the loop
 */
template <int max_alloc_r, int max_alloc_c>
static void toss_me(benchmark::State& state) {
  using stan::math::var;
  if (!needs_done) {
    // Google Benchmark expects every invocation to run the state loop to
    // completion; returning early would leave the requested iterations
    // unconsumed, so drain them with an empty body.
    for (auto _ : state) {
    }
    return;
  }
  needs_done = false;
  Eigen::Matrix<var, -1, -1> x(
      Eigen::MatrixXd::Random(max_alloc_r, max_alloc_c));
  Eigen::Matrix<var, -1, 1> y(Eigen::VectorXd::Random(max_alloc_c));
  var lp = stan::math::sum(stan::math::multiply(x, y));
  // Keep the compiler from discarding the expression graph built above.
  benchmark::DoNotOptimize(lp.vi_);
  for (auto _ : state) {
    lp.grad();
    // Reset the adjoints so each pass starts from the same state.
    stan::math::set_zero_all_adjoints();
  }
  stan::math::recover_memory();
}


constexpr int extra_alloc_r = 65536;
constexpr int extra_alloc_c = 8192;
static stan::math::stack_alloc arena_mem(extra_alloc_r * extra_alloc_c);
inline auto make_x() {
  stan::arena_t<Eigen::MatrixXd>::Base x_val(arena_mem.alloc_array<double>(extra_alloc_r * extra_alloc_c), extra_alloc_r, extra_alloc_c);
  x_val = Eigen::MatrixXd::Random(extra_alloc_r, extra_alloc_c).eval();
  return x_val;
}

// Matrix operand shared by all iterations; built once during static
// initialization.
auto x_val = make_x();

/**
 * Times sum(multiply(x_val, y)) followed by the reverse pass, where `x_val`
 * is the file-level matrix and `y` is a fresh
 * `var_value<Eigen::VectorXd>` of length extra_alloc_c.
 *
 * Only the multiply/sum/grad section is timed; building `y` and calling
 * stan::math::recover_memory() happen outside the measured interval. The
 * measured seconds are handed to the library via SetIterationTime().
 *
 * @param state Google Benchmark state object driving the loop
 */
static void multiply_matrix_vector_var_double(benchmark::State& state) {
  using stan::math::var;
  // steady_clock is monotonic, so the measured interval cannot be distorted
  // by wall-clock adjustments (high_resolution_clock gives no such guarantee).
  using timer = std::chrono::steady_clock;
  for (auto _ : state) {
    stan::math::var_value<Eigen::VectorXd> y(
        Eigen::VectorXd::Random(extra_alloc_c));
    const auto start = timer::now();
    var lp = stan::math::sum(stan::math::multiply(x_val, y));
    lp.grad();
    // Force pending writes to be visible before the clock is read again.
    benchmark::ClobberMemory();
    const auto end = timer::now();
    const std::chrono::duration<double> elapsed_seconds = end - start;
    state.SetIterationTime(elapsed_seconds.count());
    stan::math::recover_memory();
    benchmark::ClobberMemory();
  }
}

// Benchmark registration. toss_me is listed before the timed benchmark so
// that its allocator warm-up comes first.
// NOTE(review): this relies on benchmarks running in registration order;
// confirm for the library version in use.
// prealloc a bunch of mem like we are in the middle of iterations
BENCHMARK_TEMPLATE(toss_me, extra_alloc_r, extra_alloc_c);
// Manual timing: the reported time is the value the benchmark passes to
// state.SetIterationTime().
BENCHMARK(multiply_matrix_vector_var_double)->UseManualTime();
// Provides the program entry point that runs the registered benchmarks.
BENCHMARK_MAIN();

## multiply_normal.cpp
#include <benchmark/benchmark.h>
#include <stan/math.hpp>
#include <utility>

// True until toss_me() has performed its one-time allocator warm-up.
static bool needs_done = true;

/**
 * Warm-up "benchmark": just to fill up the var stack allocator.
 *
 * The first invocation builds a max_alloc_r x max_alloc_c matrix of `var` and
 * a max_alloc_c vector of `var`, repeatedly takes the gradient of
 * sum(multiply(x, y)) inside the benchmark loop, and then calls
 * stan::math::recover_memory(). Every later invocation only drains the
 * benchmark loop without doing any work.
 *
 * @tparam max_alloc_r number of rows of the throwaway matrix
 * @tparam max_alloc_c number of columns of the throwaway matrix, and the
 *   length of the throwaway vector
 * @param state Google Benchmark state object driving the loop
 */
template <int max_alloc_r, int max_alloc_c>
static void toss_me(benchmark::State& state) {
  using stan::math::var;
  if (!needs_done) {
    // Google Benchmark expects every invocation to run the state loop to
    // completion; returning early would leave the requested iterations
    // unconsumed, so drain them with an empty body.
    for (auto _ : state) {
    }
    return;
  }
  needs_done = false;
  Eigen::Matrix<var, -1, -1> x(
      Eigen::MatrixXd::Random(max_alloc_r, max_alloc_c));
  Eigen::Matrix<var, -1, 1> y(Eigen::VectorXd::Random(max_alloc_c));
  var lp = stan::math::sum(stan::math::multiply(x, y));
  // Keep the compiler from discarding the expression graph built above.
  benchmark::DoNotOptimize(lp.vi_);
  for (auto _ : state) {
    lp.grad();
    // Reset the adjoints so each pass starts from the same state.
    stan::math::set_zero_all_adjoints();
  }
  stan::math::recover_memory();
}


constexpr int extra_alloc_r = 65536;
constexpr int extra_alloc_c = 8192;
inline auto make_x() {
  return Eigen::MatrixXd::Random(extra_alloc_r, extra_alloc_c).eval();
}

// Matrix operand shared by all iterations; built once during static
// initialization.
auto x_val = make_x();

/**
 * Times sum(multiply(x_val, y)) followed by the reverse pass, where `x_val`
 * is the file-level matrix and `y` is a fresh
 * `var_value<Eigen::VectorXd>` of length extra_alloc_c.
 *
 * Only the multiply/sum/grad section is timed; building `y` and calling
 * stan::math::recover_memory() happen outside the measured interval. The
 * measured seconds are handed to the library via SetIterationTime().
 *
 * @param state Google Benchmark state object driving the loop
 */
static void multiply_matrix_vector_var_double(benchmark::State& state) {
  using stan::math::var;
  // steady_clock is monotonic, so the measured interval cannot be distorted
  // by wall-clock adjustments (high_resolution_clock gives no such guarantee).
  using timer = std::chrono::steady_clock;
  for (auto _ : state) {
    stan::math::var_value<Eigen::VectorXd> y(
        Eigen::VectorXd::Random(extra_alloc_c));
    const auto start = timer::now();
    var lp = stan::math::sum(stan::math::multiply(x_val, y));
    lp.grad();
    // Force pending writes to be visible before the clock is read again.
    benchmark::ClobberMemory();
    const auto end = timer::now();
    const std::chrono::duration<double> elapsed_seconds = end - start;
    state.SetIterationTime(elapsed_seconds.count());
    stan::math::recover_memory();
    benchmark::ClobberMemory();
  }
}

// Benchmark registration. toss_me is listed before the timed benchmark so
// that its allocator warm-up comes first.
// NOTE(review): this relies on benchmarks running in registration order;
// confirm for the library version in use.
// prealloc a bunch of mem like we are in the middle of iterations
BENCHMARK_TEMPLATE(toss_me, extra_alloc_r, extra_alloc_c);
// Manual timing: the reported time is the value the benchmark passes to
// state.SetIterationTime().
BENCHMARK(multiply_matrix_vector_var_double)->UseManualTime();
// Provides the program entry point that runs the registered benchmarks.
BENCHMARK_MAIN();
	#include <benchmark/benchmark.h>
	#include <stan/math.hpp>
	#include <utility>

	// True until toss_me() has performed its one-time allocator warm-up.
	static bool needs_done = true;

	/**
	 * Warm-up "benchmark": just to fill up the var stack allocator.
	 *
	 * The first invocation builds a max_alloc_r x max_alloc_c matrix of `var`
	 * and a max_alloc_c vector of `var`, repeatedly takes the gradient of
	 * sum(multiply(x, y)) inside the benchmark loop, and then calls
	 * stan::math::recover_memory(). Every later invocation only drains the
	 * benchmark loop without doing any work.
	 *
	 * @tparam max_alloc_r number of rows of the throwaway matrix
	 * @tparam max_alloc_c number of columns of the throwaway matrix, and the
	 *   length of the throwaway vector
	 * @param state Google Benchmark state object driving the loop
	 */
	template <int max_alloc_r, int max_alloc_c>
	static void toss_me(benchmark::State& state) {
	  using stan::math::var;
	  if (!needs_done) {
	    // Google Benchmark expects every invocation to run the state loop to
	    // completion; returning early would leave the requested iterations
	    // unconsumed, so drain them with an empty body.
	    for (auto _ : state) {
	    }
	    return;
	  }
	  needs_done = false;
	  Eigen::Matrix<var, -1, -1> x(
	      Eigen::MatrixXd::Random(max_alloc_r, max_alloc_c));
	  Eigen::Matrix<var, -1, 1> y(Eigen::VectorXd::Random(max_alloc_c));
	  var lp = stan::math::sum(stan::math::multiply(x, y));
	  // Keep the compiler from discarding the expression graph built above.
	  benchmark::DoNotOptimize(lp.vi_);
	  for (auto _ : state) {
	    lp.grad();
	    // Reset the adjoints so each pass starts from the same state.
	    stan::math::set_zero_all_adjoints();
	  }
	  stan::math::recover_memory();
	}


	constexpr int extra_alloc_r = 65536;
	constexpr int extra_alloc_c = 8192;
	static stan::math::stack_alloc arena_mem(extra_alloc_r * extra_alloc_c);
	inline auto make_x() {
	stan::arena_t<Eigen::MatrixXd>::Base x_val(arena_mem.alloc_array<double>(extra_alloc_r * extra_alloc_c), extra_alloc_r, extra_alloc_c);
	x_val = Eigen::MatrixXd::Random(extra_alloc_r, extra_alloc_c).eval();
	return x_val;
	}

	// Matrix operand shared by all iterations; built once during static
	// initialization.
	auto x_val = make_x();

	/**
	 * Times sum(multiply(x_val, y)) followed by the reverse pass, where
	 * `x_val` is the file-level matrix and `y` is a fresh
	 * `var_value<Eigen::VectorXd>` of length extra_alloc_c.
	 *
	 * Only the multiply/sum/grad section is timed; building `y` and calling
	 * stan::math::recover_memory() happen outside the measured interval. The
	 * measured seconds are handed to the library via SetIterationTime().
	 *
	 * @param state Google Benchmark state object driving the loop
	 */
	static void multiply_matrix_vector_var_double(benchmark::State& state) {
	  using stan::math::var;
	  // steady_clock is monotonic, so the measured interval cannot be
	  // distorted by wall-clock adjustments (high_resolution_clock gives no
	  // such guarantee).
	  using timer = std::chrono::steady_clock;
	  for (auto _ : state) {
	    stan::math::var_value<Eigen::VectorXd> y(
	        Eigen::VectorXd::Random(extra_alloc_c));
	    const auto start = timer::now();
	    var lp = stan::math::sum(stan::math::multiply(x_val, y));
	    lp.grad();
	    // Force pending writes to be visible before the clock is read again.
	    benchmark::ClobberMemory();
	    const auto end = timer::now();
	    const std::chrono::duration<double> elapsed_seconds = end - start;
	    state.SetIterationTime(elapsed_seconds.count());
	    stan::math::recover_memory();
	    benchmark::ClobberMemory();
	  }
	}

	// Benchmark registration. toss_me is listed before the timed benchmark so
	// that its allocator warm-up comes first.
	// NOTE(review): this relies on benchmarks running in registration order;
	// confirm for the library version in use.
	// prealloc a bunch of mem like we are in the middle of iterations
	BENCHMARK_TEMPLATE(toss_me, extra_alloc_r, extra_alloc_c);
	// Manual timing: the reported time is the value the benchmark passes to
	// state.SetIterationTime().
	BENCHMARK(multiply_matrix_vector_var_double)->UseManualTime();
	// Provides the program entry point that runs the registered benchmarks.
	BENCHMARK_MAIN();