drin/array_greater_equal_benchmark.cc

## array_greater_equal_benchmark.cc
// Baseline benchmark: evaluates `value >= 100` one element at a time and
// appends each boolean to an arrow::BooleanBuilder.
//
// A version that is directly comparable to
// https://gist.github.com/js8544/8569c0e0bb810f1254904e4584def167#file-benchmark-cc-L12
//
// Reports bytes/items processed based on the full input length.
static void GreaterEqual(benchmark::State& state) {  // NOLINT non-const reference
  constexpr int64_t test_size = 10000;
  constexpr int64_t max_val   = std::numeric_limits<int64_t>::max();

  // Input produced by benchmark_rng.Int64(test_size, 0, max_val), viewed as Int64Array.
  auto test_vals = benchmark_rng.Int64(test_size, 0, max_val);
  auto test_ints = std::static_pointer_cast<arrow::Int64Array>(test_vals);

  while (state.KeepRunning()) {
    arrow::BooleanBuilder builder;
    // Statuses are checked rather than stored in an unused local, so a
    // failing builder call can no longer go unnoticed.
    ASSERT_OK(builder.Reserve(test_ints->length()));

    // 64-bit index: length() is int64_t, so an `int` counter mixes widths.
    for (int64_t i = 0; i < test_ints->length(); ++i) {
      // The two-branch form is kept so the measured loop keeps its shape.
      if (test_ints->Value(i) >= 100) { ASSERT_OK(builder.Append(true));  }
      else                            { ASSERT_OK(builder.Append(false)); }
    }

    auto result = builder.Finish();
    benchmark::DoNotOptimize(result);
  }

  state.SetBytesProcessed(state.iterations() * (test_vals->length() * sizeof(int64_t)));
  state.SetItemsProcessed(state.iterations() * test_vals->length());
}

## bench_results.txt

** Run on an M1 **

Unable to determine clock rate from sysctl: hw.cpufrequency: No such file or directory

2022-08-30T10:40:36-07:00
Running ./release/arrow-misc-benchmark
Run on (8 X 24.121 MHz CPU s)

CPU Caches:
  L1 Data 64 KiB (x8)
  L1 Instruction 128 KiB (x8)
  L2 Unified 4096 KiB (x2)

Load Average: 1.60, 1.47, 1.36

-----------------------------------------------------------------------------
Benchmark                   Time             CPU   Iterations UserCounters...
-----------------------------------------------------------------------------
GreaterEqual            34893 ns        34893 ns        19840 bytes_per_second=2.13527G/s items_per_second=286.592M/s
ArrayGreaterThan        35654 ns        35654 ns        19631 bytes_per_second=2.08968G/s items_per_second=280.472M/s
ComputeGreaterThan       3182 ns         3182 ns       223978 bytes_per_second=23.4183G/s items_per_second=3.14315G/s
FastArrayGreaterThan     6256 ns         6256 ns       112027 bytes_per_second=11.9086G/s items_per_second=1.59834G/s

## compute_greater_equal_benchmark.cc
// Benchmark of the compute layer: each iteration invokes the registered
// "greater_equal" function on the input array and an Int64 scalar of 100.
static void ComputeGreaterThan(benchmark::State& state) {  // NOLINT non-const reference
  constexpr int64_t kNumValues  = 10000;
  constexpr int64_t kUpperBound = std::numeric_limits<int64_t>::max();

  // Inputs are built once, outside of the timed loop.
  auto input_array = benchmark_rng.Int64(kNumValues, 0, kUpperBound);
  auto threshold   = std::make_shared<arrow::Int64Scalar>(100);

  while (state.KeepRunning()) {
    ASSERT_OK_AND_ASSIGN(
        Datum comparison,
        compute::CallFunction("greater_equal", {input_array, threshold}));

    // Keep the result and both inputs observable to the optimizer.
    benchmark::DoNotOptimize(comparison);
    benchmark::DoNotOptimize(input_array);
    benchmark::DoNotOptimize(threshold);
  }

  // Throughput counters are derived from the input length.
  const auto bytes_per_iteration = input_array->length() * sizeof(int64_t);
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
  state.SetItemsProcessed(state.iterations() * input_array->length());
}

## fast_compute_greater_equal_benchmark.cc
// ** A version that approximates the implementation of "greater_equal" **
//
// Compares raw int64 values against 100 in batches of kBatchSize, writing one
// byte (0 or 1) per value into a scratch buffer that is then appended to a
// BooleanBuilder with AppendValues().
//
// The input length need not be a multiple of kBatchSize: the leftover values
// are handled as a final short batch, so every value counted by
// SetItemsProcessed() is actually processed.
static void ArrayGreaterThan(benchmark::State& state) {  // NOLINT non-const reference
  constexpr int64_t test_size = 10000;
  constexpr int64_t max_val   = std::numeric_limits<int64_t>::max();
  static constexpr int kBatchSize = 32;

  // Test data
  auto test_vals   = benchmark_rng.Int64(test_size, 0, max_val);
  auto test_intarr = std::static_pointer_cast<arrow::Int64Array>(test_vals);

  while (state.KeepRunning()) {
    const int64_t* int_vals = test_intarr->raw_values();
    const int64_t  num_vals = test_vals->length();

    arrow::BooleanBuilder out_builder;
    ASSERT_OK(out_builder.Reserve(num_vals));

    const int64_t num_batches = num_vals / kBatchSize;
    const int64_t tail_length = num_vals % kBatchSize;
    uint8_t temp_output[kBatchSize];

    // Full batches: the inner loop keeps its compile-time trip count.
    for (int64_t batch_ndx = 0; batch_ndx < num_batches; ++batch_ndx) {
      for (int val_ndx = 0; val_ndx < kBatchSize; ++val_ndx) {
        temp_output[val_ndx] = *(int_vals++) >= 100 ? 1 : 0;
      }

      ASSERT_OK(out_builder.AppendValues(temp_output, kBatchSize));
    }

    // Trailing partial batch (previously dropped by the integer division).
    if (tail_length > 0) {
      for (int64_t val_ndx = 0; val_ndx < tail_length; ++val_ndx) {
        temp_output[val_ndx] = *(int_vals++) >= 100 ? 1 : 0;
      }

      ASSERT_OK(out_builder.AppendValues(temp_output, tail_length));
    }

    auto out_array = out_builder.Finish();

    benchmark::DoNotOptimize(out_array);
    benchmark::DoNotOptimize(test_intarr);
    benchmark::DoNotOptimize(out_builder);
  }

  state.SetBytesProcessed(state.iterations() * (test_vals->length() * sizeof(int64_t)));
  state.SetItemsProcessed(state.iterations() * test_vals->length());
}

## misc_benchmark.cc
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include <algorithm>
#include <cstdint>
#include <limits>
#include <random>
#include <string>
#include <vector>

#include "benchmark/benchmark.h"

#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"

#include "arrow/array/builder_primitive.h"
#include "arrow/compute/exec.h"


namespace arrow {
  namespace internal {

    namespace {
      // Seed handed to the shared random array generator below.
      constexpr auto kSeed = 0x94378165;

      // Null probability constant; not referenced by the benchmarks visible here.
      constexpr double null_prob = 0.0;

      // Generator shared by the benchmark functions in this file. The unnamed
      // namespace already makes it file-local.
      random::RandomArrayGenerator benchmark_rng(kSeed);

    }  // anonymous namespace

    // ------------------------------
    // Benchmark functions
    // Baseline benchmark: evaluates `value >= 100` one element at a time and
    // appends each boolean to an arrow::BooleanBuilder. Reports bytes/items
    // processed based on the full input length.
    static void GreaterEqual(benchmark::State& state) {  // NOLINT non-const reference
      constexpr int64_t test_size = 10000;
      constexpr int64_t max_val   = std::numeric_limits<int64_t>::max();

      // Input produced by benchmark_rng.Int64(test_size, 0, max_val), viewed as Int64Array.
      auto test_vals = benchmark_rng.Int64(test_size, 0, max_val);
      auto test_ints = std::static_pointer_cast<arrow::Int64Array>(test_vals);

      while (state.KeepRunning()) {
        arrow::BooleanBuilder builder;
        // Statuses are checked rather than stored in an unused local, so a
        // failing builder call can no longer go unnoticed.
        ASSERT_OK(builder.Reserve(test_ints->length()));

        // 64-bit index: length() is int64_t, so an `int` counter mixes widths.
        for (int64_t i = 0; i < test_ints->length(); ++i) {
          // The two-branch form is kept so the measured loop keeps its shape.
          if (test_ints->Value(i) >= 100) { ASSERT_OK(builder.Append(true));  }
          else                            { ASSERT_OK(builder.Append(false)); }
        }

        auto result = builder.Finish();
        benchmark::DoNotOptimize(result);
      }

      state.SetBytesProcessed(state.iterations() * (test_vals->length() * sizeof(int64_t)));
      state.SetItemsProcessed(state.iterations() * test_vals->length());
    }

    // >> Functions that use compute layer
    // Benchmark of the compute layer: each iteration invokes the registered
    // "greater_equal" function on the input array and an Int64 scalar of 100.
    static void ComputeGreaterThan(benchmark::State& state) {  // NOLINT non-const reference
      constexpr int64_t kNumValues  = 10000;
      constexpr int64_t kUpperBound = std::numeric_limits<int64_t>::max();

      // Inputs are built once, outside of the timed loop.
      auto input_array = benchmark_rng.Int64(kNumValues, 0, kUpperBound);
      auto threshold   = std::make_shared<arrow::Int64Scalar>(100);

      while (state.KeepRunning()) {
        ASSERT_OK_AND_ASSIGN(
            Datum comparison,
            compute::CallFunction("greater_equal", {input_array, threshold}));

        // Keep the result and both inputs observable to the optimizer.
        benchmark::DoNotOptimize(comparison);
        benchmark::DoNotOptimize(input_array);
        benchmark::DoNotOptimize(threshold);
      }

      // Throughput counters are derived from the input length.
      const auto bytes_per_iteration = input_array->length() * sizeof(int64_t);
      state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
      state.SetItemsProcessed(state.iterations() * input_array->length());
    }


    // ----------------------------------------------------------------------
    // Benchmark declarations

    // Registers each benchmark function with Google Benchmark.
    // NOTE(review): ArrayGreaterThan is not defined in the visible part of this
    // file -- confirm its definition precedes this point in the real source.
    BENCHMARK(GreaterEqual);
    BENCHMARK(ArrayGreaterThan);
    BENCHMARK(ComputeGreaterThan);

  }  // namespace internal
}  // namespace arrow
	// Baseline benchmark: evaluates `value >= 100` one element at a time and
	// appends each boolean to an arrow::BooleanBuilder.
	//
	// A version that is directly comparable to
	// https://gist.github.com/js8544/8569c0e0bb810f1254904e4584def167#file-benchmark-cc-L12
	//
	// Reports bytes/items processed based on the full input length.
	static void GreaterEqual(benchmark::State& state) { // NOLINT non-const reference
	  constexpr int64_t test_size = 10000;
	  constexpr int64_t max_val = std::numeric_limits<int64_t>::max();

	  // Input produced by benchmark_rng.Int64(test_size, 0, max_val), viewed as Int64Array.
	  auto test_vals = benchmark_rng.Int64(test_size, 0, max_val);
	  auto test_ints = std::static_pointer_cast<arrow::Int64Array>(test_vals);

	  while (state.KeepRunning()) {
	    arrow::BooleanBuilder builder;
	    // Statuses are checked rather than stored in an unused local, so a
	    // failing builder call can no longer go unnoticed.
	    ASSERT_OK(builder.Reserve(test_ints->length()));

	    // 64-bit index: length() is int64_t, so an `int` counter mixes widths.
	    for (int64_t i = 0; i < test_ints->length(); ++i) {
	      // The two-branch form is kept so the measured loop keeps its shape.
	      if (test_ints->Value(i) >= 100) { ASSERT_OK(builder.Append(true)); }
	      else { ASSERT_OK(builder.Append(false)); }
	    }

	    auto result = builder.Finish();
	    benchmark::DoNotOptimize(result);
	  }

	  state.SetBytesProcessed(state.iterations() * (test_vals->length() * sizeof(int64_t)));
	  state.SetItemsProcessed(state.iterations() * test_vals->length());
	}

	Run on an M1

	Unable to determine clock rate from sysctl: hw.cpufrequency: No such file or directory

	2022-08-30T10:40:36-07:00
	Running ./release/arrow-misc-benchmark
	Run on (8 X 24.121 MHz CPU s)

	CPU Caches:
	L1 Data 64 KiB (x8)
	L1 Instruction 128 KiB (x8)
	L2 Unified 4096 KiB (x2)

	Load Average: 1.60, 1.47, 1.36

	-----------------------------------------------------------------------------
	Benchmark                   Time             CPU   Iterations UserCounters...
	-----------------------------------------------------------------------------
	GreaterEqual            34893 ns        34893 ns        19840 bytes_per_second=2.13527G/s items_per_second=286.592M/s
	ArrayGreaterThan        35654 ns        35654 ns        19631 bytes_per_second=2.08968G/s items_per_second=280.472M/s
	ComputeGreaterThan       3182 ns         3182 ns       223978 bytes_per_second=23.4183G/s items_per_second=3.14315G/s
	FastArrayGreaterThan     6256 ns         6256 ns       112027 bytes_per_second=11.9086G/s items_per_second=1.59834G/s
	// Benchmark of the compute layer: each iteration invokes the registered
	// "greater_equal" function on the input array and an Int64 scalar of 100.
	static void ComputeGreaterThan(benchmark::State& state) { // NOLINT non-const reference
	  constexpr int64_t kNumValues = 10000;
	  constexpr int64_t kUpperBound = std::numeric_limits<int64_t>::max();

	  // Inputs are built once, outside of the timed loop.
	  auto input_array = benchmark_rng.Int64(kNumValues, 0, kUpperBound);
	  auto threshold = std::make_shared<arrow::Int64Scalar>(100);

	  while (state.KeepRunning()) {
	    ASSERT_OK_AND_ASSIGN(
	        Datum comparison,
	        compute::CallFunction("greater_equal", {input_array, threshold}));

	    // Keep the result and both inputs observable to the optimizer.
	    benchmark::DoNotOptimize(comparison);
	    benchmark::DoNotOptimize(input_array);
	    benchmark::DoNotOptimize(threshold);
	  }

	  // Throughput counters are derived from the input length.
	  const auto bytes_per_iteration = input_array->length() * sizeof(int64_t);
	  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
	  state.SetItemsProcessed(state.iterations() * input_array->length());
	}
	// A version that approximates the implementation of "greater_equal"
	//
	// Compares raw int64 values against 100 in batches of kBatchSize, writing one
	// byte (0 or 1) per value into a scratch buffer that is then appended to a
	// BooleanBuilder with AppendValues().
	//
	// The input length need not be a multiple of kBatchSize: the leftover values
	// are handled as a final short batch, so every value counted by
	// SetItemsProcessed() is actually processed.
	static void ArrayGreaterThan(benchmark::State& state) { // NOLINT non-const reference
	  constexpr int64_t test_size = 10000;
	  constexpr int64_t max_val = std::numeric_limits<int64_t>::max();
	  static constexpr int kBatchSize = 32;

	  // Test data
	  auto test_vals = benchmark_rng.Int64(test_size, 0, max_val);
	  auto test_intarr = std::static_pointer_cast<arrow::Int64Array>(test_vals);

	  while (state.KeepRunning()) {
	    const int64_t* int_vals = test_intarr->raw_values();
	    const int64_t num_vals = test_vals->length();

	    arrow::BooleanBuilder out_builder;
	    ASSERT_OK(out_builder.Reserve(num_vals));

	    const int64_t num_batches = num_vals / kBatchSize;
	    const int64_t tail_length = num_vals % kBatchSize;
	    uint8_t temp_output[kBatchSize];

	    // Full batches: the inner loop keeps its compile-time trip count.
	    for (int64_t batch_ndx = 0; batch_ndx < num_batches; ++batch_ndx) {
	      for (int val_ndx = 0; val_ndx < kBatchSize; ++val_ndx) {
	        temp_output[val_ndx] = *(int_vals++) >= 100 ? 1 : 0;
	      }

	      ASSERT_OK(out_builder.AppendValues(temp_output, kBatchSize));
	    }

	    // Trailing partial batch (previously dropped by the integer division).
	    if (tail_length > 0) {
	      for (int64_t val_ndx = 0; val_ndx < tail_length; ++val_ndx) {
	        temp_output[val_ndx] = *(int_vals++) >= 100 ? 1 : 0;
	      }

	      ASSERT_OK(out_builder.AppendValues(temp_output, tail_length));
	    }

	    auto out_array = out_builder.Finish();

	    benchmark::DoNotOptimize(out_array);
	    benchmark::DoNotOptimize(test_intarr);
	    benchmark::DoNotOptimize(out_builder);
	  }

	  state.SetBytesProcessed(state.iterations() * (test_vals->length() * sizeof(int64_t)));
	  state.SetItemsProcessed(state.iterations() * test_vals->length());
	}
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <algorithm>
	#include <cstdint>
	#include <limits>
	#include <random>
	#include <string>
	#include <vector>

	#include "benchmark/benchmark.h"

	#include "arrow/testing/gtest_util.h"
	#include "arrow/testing/random.h"

	#include "arrow/array/builder_primitive.h"
	#include "arrow/compute/exec.h"


	namespace arrow {
	namespace internal {

	namespace {
	  // Seed handed to the shared random array generator below.
	  constexpr auto kSeed = 0x94378165;

	  // Null probability constant; not referenced by the benchmarks visible here.
	  constexpr double null_prob = 0.0;

	  // Generator shared by the benchmark functions in this file. The unnamed
	  // namespace already makes it file-local.
	  random::RandomArrayGenerator benchmark_rng(kSeed);

	} // anonymous namespace

	// ------------------------------
	// Benchmark functions
	// Baseline benchmark: evaluates `value >= 100` one element at a time and
	// appends each boolean to an arrow::BooleanBuilder. Reports bytes/items
	// processed based on the full input length.
	static void GreaterEqual(benchmark::State& state) { // NOLINT non-const reference
	  constexpr int64_t test_size = 10000;
	  constexpr int64_t max_val = std::numeric_limits<int64_t>::max();

	  // Input produced by benchmark_rng.Int64(test_size, 0, max_val), viewed as Int64Array.
	  auto test_vals = benchmark_rng.Int64(test_size, 0, max_val);
	  auto test_ints = std::static_pointer_cast<arrow::Int64Array>(test_vals);

	  while (state.KeepRunning()) {
	    arrow::BooleanBuilder builder;
	    // Statuses are checked rather than stored in an unused local, so a
	    // failing builder call can no longer go unnoticed.
	    ASSERT_OK(builder.Reserve(test_ints->length()));

	    // 64-bit index: length() is int64_t, so an `int` counter mixes widths.
	    for (int64_t i = 0; i < test_ints->length(); ++i) {
	      // The two-branch form is kept so the measured loop keeps its shape.
	      if (test_ints->Value(i) >= 100) { ASSERT_OK(builder.Append(true)); }
	      else { ASSERT_OK(builder.Append(false)); }
	    }

	    auto result = builder.Finish();
	    benchmark::DoNotOptimize(result);
	  }

	  state.SetBytesProcessed(state.iterations() * (test_vals->length() * sizeof(int64_t)));
	  state.SetItemsProcessed(state.iterations() * test_vals->length());
	}

	// >> Functions that use compute layer
	// Benchmark of the compute layer: each iteration invokes the registered
	// "greater_equal" function on the input array and an Int64 scalar of 100.
	static void ComputeGreaterThan(benchmark::State& state) { // NOLINT non-const reference
	  constexpr int64_t kNumValues = 10000;
	  constexpr int64_t kUpperBound = std::numeric_limits<int64_t>::max();

	  // Inputs are built once, outside of the timed loop.
	  auto input_array = benchmark_rng.Int64(kNumValues, 0, kUpperBound);
	  auto threshold = std::make_shared<arrow::Int64Scalar>(100);

	  while (state.KeepRunning()) {
	    ASSERT_OK_AND_ASSIGN(
	        Datum comparison,
	        compute::CallFunction("greater_equal", {input_array, threshold}));

	    // Keep the result and both inputs observable to the optimizer.
	    benchmark::DoNotOptimize(comparison);
	    benchmark::DoNotOptimize(input_array);
	    benchmark::DoNotOptimize(threshold);
	  }

	  // Throughput counters are derived from the input length.
	  const auto bytes_per_iteration = input_array->length() * sizeof(int64_t);
	  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
	  state.SetItemsProcessed(state.iterations() * input_array->length());
	}


	// ----------------------------------------------------------------------
	// Benchmark declarations

	// Registers each benchmark function with Google Benchmark.
	// NOTE(review): ArrayGreaterThan is not defined in the visible part of this
	// file -- confirm its definition precedes this point in the real source.
	BENCHMARK(GreaterEqual);
	BENCHMARK(ArrayGreaterThan);
	BENCHMARK(ComputeGreaterThan);

	} // namespace internal
	} // namespace arrow