// Branchless, vectorized `atan2f`. Various functions of increasing
// performance are presented. The fastest version is ~50x faster than libc
// on batch workloads, outputting a result every ~2 clock cycles, compared to
// ~110 cycles for libc. The functions all use the same `atan` approximation,
// and their max error is around ~1/10000 of a degree.
//
// They also do not handle inf / -inf and the origin (0, 0) as inputs as they
// should -- in our case these are a sign that something is wrong anyway.
// Moreover, manual_2 does not handle NaN correctly (it drops NaNs silently),
// and the auto_ functions do not handle -0 correctly. manual_1 handles
// everything correctly except +-inf and (+-0, +-0).
//
// Tested on a Xeon W-2145, a Skylake processor. Compiled with
//
// $ clang++ --version
// clang version 12.0.1
// $ clang++ -static --std=c++20 -march=skylake -O3 -Wall vectorized-atan2f.cpp -lm -o vectorized-atan2f
//
// Results:
//
// $ ./vectorized-atan2f --test-edge-cases 100000 100 1000
// Generating data... done.
// Tests will read 824kB and write 412kB (103048 points)
// Running 1000 warm ups and 1000 iterations
//
// baseline: 2.46 s, 0.32GB/s, 105.79 cycles/elem, 129.34 instrs/elem, 1.22 instrs/cycle, 27.76 branches/elem, 7.08% branch misses, 0.47% cache misses, 4.29GHz
// auto_1: 0.21 s, 3.75GB/s, 8.29 cycles/elem, 8.38 instrs/elem, 1.01 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.89GHz, 0.000109283deg max error, max error point: -0.563291132, -0.544303775
// auto_2: 97.50ms, 8.19GB/s, 3.79 cycles/elem, 5.38 instrs/elem, 1.42 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.01% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967
// auto_3: 75.95ms, 10.52GB/s, 2.95 cycles/elem, 4.13 instrs/elem, 1.40 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967
// auto_4: 55.89ms, 14.29GB/s, 2.17 cycles/elem, 3.63 instrs/elem, 1.67 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.01% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967
// manual_1: 52.57ms, 15.20GB/s, 2.04 cycles/elem, 3.63 instrs/elem, 1.78 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.03% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967
// manual_2: 50.16ms, 15.93GB/s, 1.95 cycles/elem, 3.88 instrs/elem, 1.99 instrs/cycle, 0.13 branches/elem, 0.01% branch misses, 0.02% cache misses, 3.88GHz, 0.000109283deg max error, max error point: -0.854430377, +0.107594967
//
// The atan approximation is from sheet 11 of "Approximations for digital
// computers", C. Hastings, 1955, a delightful document overall.
//
// Functions auto_1 to auto_4 are automatically vectorized. manual_1 and
// manual_2 are vectorized manually.
//
// You'll get quite different results with g++. The main problem with g++ is that
// it vectorizes less aggressively. However it inserts FMA instructions _more_
// aggressively, which is a plus. clang needs the `fp-contract` pragma
// or an explicit `fmaf`.
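//
// For illustration only (not part of this file's build): one spelling of such
// a pragma is the standard one, which lets clang contract the multiply-adds
// in a scope into FMAs:
//
//   #pragma STDC FP_CONTRACT ON
//   float horner_step(float x_sq, float a, float b) {
//       return x_sq * a + b; // may now compile to a single vfmadd
//   }
//
// The explicit alternative is `fmaf(x_sq, a, b)`, as used in
// `atan_fma_approximation` below.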
//
// Moreover, the
#include <cmath>
#include <algorithm>
#include <iostream>
#include <limits>
#include <immintrin.h>
#include <vector>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <string.h>
#include <asm/unistd.h>
#include <random>
#include <sstream>
#define USE_AVX
using namespace std;
#define NOINLINE __attribute__((noinline))
#define UNUSED __attribute__((unused))
// --------------------------------------------------------------------
// AVX utils
#ifdef USE_AVX
template<typename A>
inline void assert_avx_aligned(const A* ptr) {
if (reinterpret_cast<uintptr_t>(ptr) % 32 != 0) {
cerr << "Pointer " << ptr << " is not 32-byte aligned, exiting" << endl;
exit(EXIT_FAILURE);
}
}
inline void assert_multiple_of_8(size_t num) {
if (num % 8 != 0) {
cerr << "Array size " << num << " is not a multiple of 8, exiting" << endl;
exit(EXIT_FAILURE);
}
}
#endif
// --------------------------------------------------------------------
// functions
NOINLINE
static void atan2_baseline(size_t cases, const float* ys, const float* xs, float* out) {
for (size_t i = 0; i < cases; i++) {
out[i] = atan2f(ys[i], xs[i]);
}
}
// not tested since it is very slow.
NOINLINE UNUSED
static void atan2_fpatan(size_t cases, const float* ys, const float* xs, float* out) {
for (size_t i = 0; i < cases; i++) {
// `volatile` and the "memory" clobber keep the compiler from dropping or
// reordering this statement, since it has no output operands.
asm volatile (
"flds (%[ys], %[i], 4)\n"
"flds (%[xs], %[i], 4)\n"
"fpatan\n"
"fstps (%[out], %[i], 4)\n"
:
: [ys]"r"(ys), [xs]"r"(xs), [out]"r"(out), [i]"r"(i)
: "memory"
);
}
}
// Polynomial approximation of atan between [-1, 1]. Stated max error ~0.000001rad.
// See comment at the beginning of file for source.
inline float atan_approximation(float x) {
float a1 = 0.99997726f;
float a3 = -0.33262347f;
float a5 = 0.19354346f;
float a7 = -0.11643287f;
float a9 = 0.05265332f;
float a11 = -0.01172120f;
float x_sq = x*x;
return
x * (a1 + x_sq * (a3 + x_sq * (a5 + x_sq * (a7 + x_sq * (a9 + x_sq * a11)))));
}
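// A minimal sanity check for the stated error bound (illustrative sketch, not
// used by the benchmark; it assumes libm's atan as the reference):
//
//   float worst = 0.0f;
//   for (int i = -1000; i <= 1000; i++) {
//       float t = static_cast<float>(i) / 1000.0f;
//       worst = std::max(worst, std::fabs(atan_approximation(t) - std::atan(t)));
//   }
//   // `worst` should come out around 1e-6 rad on [-1, 1].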
// First automatic version: naive translation of the maths
NOINLINE
static void atan2_auto_1(size_t num_points, const float* ys, const float* xs, float* out) {
for (size_t i = 0; i < num_points; i++) {
// Ensure input is in [-1, +1]
float y = ys[i];
float x = xs[i];
bool swap = fabs(x) < fabs(y);
float atan_input = (swap ? x : y) / (swap ? y : x);
// Approximate atan
float res = atan_approximation(atan_input);
// If swapped, adjust atan output
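// (Identity used here: for t != 0, atan(1/t) = sign(t)*pi/2 - atan(t); since
// atan_input is x/y when swapped, this recovers atan(y/x).)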
res = swap ? (atan_input >= 0.0f ? M_PI_2 : -M_PI_2) - res : res;
// Adjust quadrants
if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant
else if (x < 0.0f && y >= 0.0f) { res = M_PI + res; } // 2nd quadrant
else if (x < 0.0f && y < 0.0f) { res = -M_PI + res; } // 3rd quadrant
else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant
// Store result
out[i] = res;
}
}
// Second automatic version: get rid of casting to double
NOINLINE
static void atan2_auto_2(size_t num_points, const float* ys, const float* xs, float* out) {
float pi = M_PI;
float pi_2 = M_PI_2;
for (size_t i = 0; i < num_points; i++) {
// Ensure input is in [-1, +1]
float y = ys[i];
float x = xs[i];
bool swap = fabs(x) < fabs(y);
float atan_input = (swap ? x : y) / (swap ? y : x);
// Approximate atan
float res = atan_approximation(atan_input);
// If swapped, adjust atan output
res = swap ? (atan_input >= 0.0f ? pi_2 : -pi_2) - res : res;
// Adjust quadrants
if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant
else if (x < 0.0f && y >= 0.0f) { res = pi + res; } // 2nd quadrant
else if (x < 0.0f && y < 0.0f) { res = -pi + res; } // 3rd quadrant
else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant
// Store result
out[i] = res;
}
}
// Third automatic version: perform positive check for x and y once -- the compiler
// can't assume that in the presence of NaNs since `0/0 >= 0` is false and `0/0 < 0` is also
// false.
NOINLINE
static void atan2_auto_3(size_t num_points, const float* ys, const float* xs, float* out) {
float pi = M_PI;
float pi_2 = M_PI_2;
for (size_t i = 0; i < num_points; i++) {
// Ensure input is in [-1, +1]
float y = ys[i];
float x = xs[i];
bool swap = fabs(x) < fabs(y);
float atan_input = (swap ? x : y) / (swap ? y : x);
// Approximate atan
float res = atan_approximation(atan_input);
// If swapped, adjust atan output
res = swap ? (atan_input >= 0.0f ? pi_2 : -pi_2) - res : res;
// Adjust the result depending on the input quadrant
if (x < 0.0f) {
res = (y >= 0.0f ? pi : -pi) + res;
}
// Store result
out[i] = res;
}
}
inline float atan_fma_approximation(float x) {
float a1 = 0.99997726f;
float a3 = -0.33262347f;
float a5 = 0.19354346f;
float a7 = -0.11643287f;
float a9 = 0.05265332f;
float a11 = -0.01172120f;
// Compute approximation using Horner's method
float x_sq = x*x;
return
x * fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, fmaf(x_sq, a11, a9), a7), a5), a3), a1);
}
// Fourth automatic version: use FMA for the polynomial
NOINLINE
static void atan2_auto_4(size_t num_points, const float* ys, const float* xs, float* out) {
float pi = M_PI;
float pi_2 = M_PI_2;
for (size_t i = 0; i < num_points; i++) {
// Ensure input is in [-1, +1]
float y = ys[i];
float x = xs[i];
bool swap = fabs(x) < fabs(y);
float atan_input = (swap ? x : y) / (swap ? y : x);
// Approximate atan
float res = atan_fma_approximation(atan_input);
// If swapped, adjust atan output
res = swap ? copysignf(pi_2, atan_input) - res : res;
// Adjust the result depending on the input quadrant
if (x < 0.0f) {
res = copysignf(pi, y) + res;
}
// Store result
out[i] = res;
}
}
#ifdef USE_AVX
inline __m256 atan_avx_approximation(__m256 x) {
__m256 a1 = _mm256_set1_ps( 0.99997726f);
__m256 a3 = _mm256_set1_ps(-0.33262347f);
__m256 a5 = _mm256_set1_ps( 0.19354346f);
__m256 a7 = _mm256_set1_ps(-0.11643287f);
__m256 a9 = _mm256_set1_ps( 0.05265332f);
__m256 a11 = _mm256_set1_ps(-0.01172120f);
__m256 x_sq = _mm256_mul_ps(x, x);
__m256 result;
result = a11;
result = _mm256_fmadd_ps(x_sq, result, a9);
result = _mm256_fmadd_ps(x_sq, result, a7);
result = _mm256_fmadd_ps(x_sq, result, a5);
result = _mm256_fmadd_ps(x_sq, result, a3);
result = _mm256_fmadd_ps(x_sq, result, a1);
result = _mm256_mul_ps(x, result);
return result;
}
// First manual version: straightforward translation of atan2_auto_4
NOINLINE
static void atan2_manual_1(size_t num_points, const float* ys, const float* xs, float* out) {
// Check that the input plays well with AVX
assert_avx_aligned(ys), assert_avx_aligned(xs), assert_avx_aligned(out);
// Store pi and pi/2 as constants
const __m256 pi = _mm256_set1_ps(M_PI);
const __m256 pi_2 = _mm256_set1_ps(M_PI_2);
// Create bit masks that we will need.
// The first one is all 1s except from the sign bit:
//
// 01111111111111111111111111111111
//
// We can use it to make a float absolute by AND'ing with it.
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
// The second is only the sign bit:
//
// 10000000000000000000000000000000
//
// we can use it to extract the sign of a number by AND'ing with it.
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
for (size_t i = 0; i < num_points; i += 8) {
// Load 8 elements from `ys` and `xs` into two vectors.
__m256 y = _mm256_load_ps(&ys[i]);
__m256 x = _mm256_load_ps(&xs[i]);
// Compare |y| > |x|. This will create an 8-vector of floats that we can
// use as a mask: the elements where the respective comparison is true will be filled
// with 1s, with 0s where the comparison is false.
//
// For example, if we were comparing the vectors
//
// 5 -5 5 -5 5 -5 5 -5
// >
// -5 5 -5 5 -5 5 -5 5
// =
// 1s 0s 1s 0s 1s 0s 1s 0s
//
// Where `1s = 0xFFFFFFFF` and `0s = 0x00000000`.
__m256 swap_mask = _mm256_cmp_ps(
_mm256_and_ps(y, abs_mask), // |y|
_mm256_and_ps(x, abs_mask), // |x|
_CMP_GT_OS
);
// Create the atan input by "blending" `y` and `x`, according to the mask computed
// above. The blend instruction will pick the first or second argument based on
// the mask we passed in. In our case we need the number of larger magnitude to
// be the denominator.
__m256 atan_input = _mm256_div_ps(
_mm256_blendv_ps(y, x, swap_mask), // numerator: the input of smaller magnitude
_mm256_blendv_ps(x, y, swap_mask) // denominator: the input of larger magnitude
);
// Approximate atan
__m256 result = atan_avx_approximation(atan_input);
// If swapped, adjust atan output. We use blending again to leave
// the output unchanged if we didn't swap anything.
//
// If we need to adjust it, we simply carry the sign over from the input
// to `pi_2` by using the `sign_mask`. This avoids a more expensive comparison,
// and also handles edge cases such as -0 better.
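//
// For instance (illustrative numbers): if a lane of `atan_input` is -0.3f,
// its sign bit is set, so `_mm256_or_ps(pi_2, _mm256_and_ps(atan_input, sign_mask))`
// yields -pi/2 in that lane, and the lane becomes -pi/2 - result, matching the
// scalar `copysignf(pi_2, atan_input) - res` in atan2_auto_4.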
result = _mm256_blendv_ps(
result,
_mm256_sub_ps(
_mm256_or_ps(pi_2, _mm256_and_ps(atan_input, sign_mask)),
result
),
swap_mask
);
// Adjust the result depending on the input quadrant.
//
// We create a mask for the sign of `x` using an arithmetic right shift:
// the mask will be all 0s if the sign is positive, and all 1s
// if the sign is negative.
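//
// For example (illustrative): x = -2.5f has its sign bit set, so the
// arithmetic shift by 31 smears it into 0xFFFFFFFF, while x = +2.5f
// yields 0x00000000.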
__m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
// Then use the mask to perform the adjustment only when x is
// negative, and use the sign bit of `y` to know whether to add
// `pi` or `-pi`.
result = _mm256_add_ps(
_mm256_and_ps(
_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)),
x_sign_mask
),
result
);
// Store result
_mm256_store_ps(&out[i], result);
}
}
// Second manual version: use the abs values we get at the beginning
// for more ILP (see comment below)
NOINLINE
static void atan2_manual_2(size_t num_points, const float* ys, const float* xs, float* out) {
assert_avx_aligned(ys), assert_avx_aligned(xs), assert_avx_aligned(out);
const __m256 pi = _mm256_set1_ps(M_PI);
const __m256 pi_2 = _mm256_set1_ps(M_PI_2);
// no sign bit -- AND with this to get absolute value
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
// only sign bit -- XOR with this to get negative
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
for (size_t i = 0; i < num_points; i += 8) {
__m256 y = _mm256_load_ps(&ys[i]);
__m256 x = _mm256_load_ps(&xs[i]);
__m256 abs_y = _mm256_and_ps(abs_mask, y);
__m256 abs_x = _mm256_and_ps(abs_mask, x);
// atan_input = min(|y|, |x|) / max(|y|, |x|)
//
// I've experimented with using blend rather than min, given that we
// compute `swap_mask` later anyway, but that delays the atan
// computation by 2+ cycles, and is less parallel, since on skylake:
//
// latency throughput
// vminps 4 0.5
// vmaxps 4 0.5
// vblendvps 2 0.66
// vcmpps 4 0.5
//
// So while it decreases the numbers of instructions needed it makes
// the overall function slower.
//
// This has the unfortunate effect of silently dropping NaNs in the
// inputs, since in case of NaNs these instructions just return
// the second argument.
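//
// For reference, the blend-based alternative mentioned above would look
// roughly like this (sketch only; as noted, it is slower overall):
//
//   __m256 swap_mask = _mm256_cmp_ps(abs_y, abs_x, _CMP_GT_OQ);
//   __m256 atan_input = _mm256_div_ps(
//       _mm256_blendv_ps(abs_y, abs_x, swap_mask), // min(|y|, |x|)
//       _mm256_blendv_ps(abs_x, abs_y, swap_mask)  // max(|y|, |x|)
//   );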
__m256 atan_input = _mm256_div_ps(
_mm256_min_ps(abs_y, abs_x), _mm256_max_ps(abs_x, abs_y)
);
// Approximate atan
__m256 result = atan_avx_approximation(atan_input);
// We first do the usual +-pi/2 - res adjustment, but in this case
// we know the sign of pi/2 since the atan input here is always positive.
//
// result = (abs_y > abs_x) ? pi_2 - res : res;
__m256 swap_mask = _mm256_cmp_ps(abs_y, abs_x, _CMP_GT_OQ);
result = _mm256_add_ps(
_mm256_xor_ps(result, _mm256_and_ps(sign_mask, swap_mask)),
_mm256_and_ps(pi_2, swap_mask)
);
// We now have to adjust the quadrant slightly differently:
//
// * If both values are positive, do nothing;
// * If y is negative and x is positive, do nothing;
// * If y is positive and x is negative, do pi + result;
// * If y is negative and x is negative, do result - pi;
//
// These can easily be verified by analyzing what happens to the output
// for each input quadrant.
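//
// For instance (one worked case): with y > 0 and x < 0 (second quadrant),
// `result` so far is atan(|y|/|x|), in [0, pi/2]; the x-sign step below turns
// it into pi - result, which lands in [pi/2, pi], and the final y-sign step
// leaves it unchanged -- exactly what atan2 should return there.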
//
// These cases can be compressed to the two branches below, and then
// made branchless without using any blend instruction.
// result = (x < 0) ? pi - res : res;
__m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
result = _mm256_add_ps(
_mm256_xor_ps(result, _mm256_and_ps(x, sign_mask)),
_mm256_and_ps(pi, x_sign_mask)
);
// result = (y < 0) ? -res : res;
result = _mm256_xor_ps(result, _mm256_and_ps(y, sign_mask));
// Store result
_mm256_store_ps(&out[i], result);
}
}
#endif
// --------------------------------------------------------------------
// data generation
NOINLINE
static void generate_data(
bool random_points,
bool test_edge_cases,
size_t desired_num_cases,
size_t* cases,
float** ys,
float** xs,
float** ref_out,
float** out
) {
cout << "Generating data..." << flush;
size_t edge = static_cast<size_t>(sqrtf(static_cast<float>(desired_num_cases)));
vector<float> extra_cases;
if (test_edge_cases) {
// We want to make sure to test some special cases, so we always add them.
// We do not include negative zero, since we do not care about the sign
// matching up in that case.
// We also do not include NaN and inf, since we assume that the input does
// not contain it.
using limits = numeric_limits<float>;
extra_cases = {
1.0f, -1.0f, 0.0f, -0.0f,
limits::epsilon(), -limits::epsilon(),
limits::max(), -limits::max(),
limits::quiet_NaN()
};
}
// The -1 is for the 0, 0 case, which we do not want.
*cases = (edge+extra_cases.size())*(edge+extra_cases.size()) - 1;
// Make sure cases is a multiple of 8 so that the AVX functions can work cleanly
*cases = *cases + (8 - (*cases % 8));
*ys = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float)));
*xs = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float)));
*ref_out = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float)));
*out = reinterpret_cast<float*>(aligned_alloc(32, *cases * sizeof(float)));
if (*ys == nullptr || *xs == nullptr || *ref_out == nullptr || *out == nullptr) {
cerr << "Could not allocate arrays" << endl;
exit(EXIT_FAILURE);
}
mt19937_64 gen(0);
{
float bound = 1.0f;
float step = (bound * 2.0f) / static_cast<float>(edge);
uniform_real_distribution<float> dist{ -1.0f, 1.0f };
const auto get_number = [random_points, &extra_cases, &gen, bound, step, &dist](size_t i) -> float {
if (i < extra_cases.size()) {
return extra_cases.at(i);
} else if (random_points) {
return dist(gen);
} else {
return -bound + static_cast<float>(i - extra_cases.size()) * step;
}
};
size_t ix = 0;
for (size_t i = 0; i < edge + extra_cases.size(); i++) {
for (size_t j = 0; j < edge + extra_cases.size(); j++) {
float y = get_number(i);
float x = get_number(j);
if (y == 0.0f && x == 0.0f) { continue; }
(*ys)[ix] = y;
(*xs)[ix] = x;
ix++;
}
}
// pad with dummies
for (; ix < *cases; ix++) {
(*ys)[ix] = 1.0f;
(*xs)[ix] = 1.0f;
}
}
// shuffle to confuse branch predictor in the case of explicit branches and
// non-random points
{
for (size_t i = 0; i < *cases - 1; i++) {
size_t swap_with = static_cast<size_t>(i + 1 + (gen() % (*cases - i - 1)));
swap((*ys)[i], (*ys)[swap_with]);
swap((*xs)[i], (*xs)[swap_with]);
}
}
cout << " done." << endl;
}
// --------------------------------------------------------------------
// Pin to first CPU
static void pin_to_cpu_0(void) {
cpu_set_t cpu_mask;
CPU_ZERO(&cpu_mask);
CPU_SET(0, &cpu_mask);
if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) != 0) {
fprintf(stderr, "Could not set CPU affinity\n");
exit(EXIT_FAILURE);
}
}
// --------------------------------------------------------------------
// perf instrumentation -- a mixture of man 3 perf_event_open and
// <https://stackoverflow.com/a/42092180>
static long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
return ret;
}
static void setup_perf_event(
struct perf_event_attr *evt,
int *fd,
uint64_t *id,
uint32_t evt_type,
uint64_t evt_config,
int group_fd
) {
memset(evt, 0, sizeof(struct perf_event_attr));
evt->type = evt_type;
evt->size = sizeof(struct perf_event_attr);
evt->config = evt_config;
evt->disabled = 1;
evt->exclude_kernel = 1;
evt->exclude_hv = 1;
evt->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
*fd = perf_event_open(evt, 0, -1, group_fd, 0);
if (*fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", evt->config);
exit(EXIT_FAILURE);
}
ioctl(*fd, PERF_EVENT_IOC_ID, id);
}
static struct perf_event_attr perf_cycles_evt;
static int perf_cycles_fd;
static uint64_t perf_cycles_id;
static struct perf_event_attr perf_clock_evt;
static int perf_clock_fd;
static uint64_t perf_clock_id;
static struct perf_event_attr perf_instrs_evt;
static int perf_instrs_fd;
static uint64_t perf_instrs_id;
static struct perf_event_attr perf_cache_misses_evt;
static int perf_cache_misses_fd;
static uint64_t perf_cache_misses_id;
static struct perf_event_attr perf_cache_references_evt;
static int perf_cache_references_fd;
static uint64_t perf_cache_references_id;
static struct perf_event_attr perf_branch_misses_evt;
static int perf_branch_misses_fd;
static uint64_t perf_branch_misses_id;
static struct perf_event_attr perf_branch_instructions_evt;
static int perf_branch_instructions_fd;
static uint64_t perf_branch_instructions_id;
static void perf_init(void) {
// Cycles
setup_perf_event(
&perf_cycles_evt, &perf_cycles_fd, &perf_cycles_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, -1
);
// Clock
setup_perf_event(
&perf_clock_evt, &perf_clock_fd, &perf_clock_id,
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, perf_cycles_fd
);
// Instructions
setup_perf_event(
&perf_instrs_evt, &perf_instrs_fd, &perf_instrs_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, perf_cycles_fd
);
// Cache misses
setup_perf_event(
&perf_cache_misses_evt, &perf_cache_misses_fd, &perf_cache_misses_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, perf_cycles_fd
);
// Cache references
setup_perf_event(
&perf_cache_references_evt, &perf_cache_references_fd, &perf_cache_references_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, perf_cycles_fd
);
// Branch misses
setup_perf_event(
&perf_branch_misses_evt, &perf_branch_misses_fd, &perf_branch_misses_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, perf_cycles_fd
);
// Branch instructions
setup_perf_event(
&perf_branch_instructions_evt, &perf_branch_instructions_fd, &perf_branch_instructions_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, perf_cycles_fd
);
}
static void perf_close(void) {
close(perf_clock_fd);
close(perf_cycles_fd);
close(perf_instrs_fd);
close(perf_cache_misses_fd);
close(perf_cache_references_fd);
}
static void disable_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
}
static void enable_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}
static void reset_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
}
struct perf_read_value {
uint64_t value;
uint64_t id;
};
struct perf_read_format {
uint64_t nr;
struct perf_read_value values[];
};
static char perf_read_buf[4096];
struct perf_count {
uint64_t cycles;
double seconds;
uint64_t instructions;
uint64_t cache_misses;
uint64_t cache_references;
uint64_t branch_misses;
uint64_t branch_instructions;
perf_count(): cycles(0), seconds(0.0), instructions(0), cache_misses(0), cache_references(0), branch_misses(0), branch_instructions(0) {}
};
static void read_perf_count(struct perf_count *count) {
if (!read(perf_cycles_fd, perf_read_buf, sizeof(perf_read_buf))) {
fprintf(stderr, "Could not read cycles from perf\n");
exit(EXIT_FAILURE);
}
struct perf_read_format* rf = (struct perf_read_format *) perf_read_buf;
if (rf->nr != 7) {
fprintf(stderr, "Bad number of perf events\n");
exit(EXIT_FAILURE);
}
for (int i = 0; i < static_cast<int>(rf->nr); i++) {
struct perf_read_value *value = &rf->values[i];
if (value->id == perf_cycles_id) {
count->cycles = value->value;
} else if (value->id == perf_clock_id) {
count->seconds = ((double) (value->value / 1000ull)) / 1000000.0;
} else if (value->id == perf_instrs_id) {
count->instructions = value->value;
} else if (value->id == perf_cache_misses_id) {
count->cache_misses = value->value;
} else if (value->id == perf_cache_references_id) {
count->cache_references = value->value;
} else if (value->id == perf_branch_misses_id) {
count->branch_misses = value->value;
} else if (value->id == perf_branch_instructions_id) {
count->branch_instructions = value->value;
} else {
fprintf(stderr, "Spurious value in perf read (%ld)\n", value->id);
exit(EXIT_FAILURE);
}
}
}
// --------------------------------------------------------------------
// running the function
struct max_error {
float y;
float x;
float error;
max_error(): y(0.0f), x(0.0f), error(0.0f) {}
void update(float y, float x, float reference, float result) {
if (isnan(reference) && !isnan(result)) {
cerr << "Expected NaN in result, but got " << result << endl;
cerr << "For point " << x << ", " << y << endl;
exit(EXIT_FAILURE);
}
if (!isnan(reference) && isnan(result)) {
cerr << "Unexpected NaN in result, expected " << reference << endl;
cerr << "For point " << x << ", " << y << endl;
exit(EXIT_FAILURE);
}
float error = abs(reference - result);
if (error > this->error) {
this->y = y;
this->x = x;
this->error = error;
}
}
};
NOINLINE
static void run_timed(
int pre_iterations,
int iterations,
bool test_negative_zero,
bool test_max,
bool test_nan,
size_t num,
const string& name,
void (*fun)(size_t, const float*, const float*, float*),
const float* ref_out, // if null no comparison will be run
const float* in_y,
const float* in_x,
float* out
) {
if (iterations < 1) {
fprintf(stderr, "iterations < 1: %d\n", iterations);
exit(EXIT_FAILURE);
}
for (int i = 0; i < pre_iterations; i++) {
fun(num, in_y, in_x, out);
}
struct perf_count counts;
disable_perf_count();
reset_perf_count();
enable_perf_count();
// The enabling / disabling adds noise for small timings, so
// wrap the loop since the loop counts for very little (one add, one cmp, one
// predicted conditional jump).
for (int i = 0; i < iterations; i++) {
fun(num, in_y, in_x, out);
}
disable_perf_count();
read_perf_count(&counts);
// two floats per element
uint64_t bytes = ((uint64_t) num) * sizeof(float) * 2;
double gb_per_s = (((double) bytes) / 1000000000.0) / (counts.seconds / ((double) iterations));
double time = counts.seconds;
const char *unit = " s";
if (time < 0.1) {
time *= 1000.0;
unit = "ms";
}
if (time < 0.1) {
time *= 1000.0;
unit = "us";
}
constexpr int padded_name_size = 10;
char padded_name[padded_name_size];
int name_len = snprintf(padded_name, padded_name_size, "%s:", name.c_str());
for (int i = name_len; i < padded_name_size; i++) {
padded_name[i] = ' ';
}
padded_name[padded_name_size-1] = '\0';
printf(
"%s %5.2f%s, %6.2fGB/s, %6.2f cycles/elem, %6.2f instrs/elem, %5.2f instrs/cycle, %5.2f branches/elem, %5.2f%% branch misses, %5.2f%% cache misses, %5.2fGHz",
padded_name, time, unit, gb_per_s,
((double) counts.cycles) / ((double) iterations) / ((double) num),
((double) counts.instructions) / ((double) iterations) / ((double) num),
((double) counts.instructions) / ((double) counts.cycles),
((double) counts.branch_instructions) / ((double) iterations) / ((double) num),
100.0 * ((double) counts.branch_misses) / ((double) counts.branch_instructions),
100.0 * ((double) counts.cache_misses) / ((double) counts.cache_references),
((double) counts.cycles) / counts.seconds / 1000000000.0
);
if (ref_out != nullptr) {
max_error max_error;
for (size_t ix = 0; ix < num; ix++) {
float y = in_y[ix];
float x = in_x[ix];
if (!test_negative_zero && (y == -0.0f || x == -0.0f)) { continue; }
if (
!test_max &&
(fabs(y) == numeric_limits<float>::max() || fabs(x) == numeric_limits<float>::max())
) { continue; }
if (!test_nan && (isnan(y) || isnan(x))) { continue; }
max_error.update(in_y[ix], in_x[ix], ref_out[ix], out[ix]);
}
printf(
", %.9fdeg max error, max error point: %+.9f, %+.9f\n",
max_error.error * 180.0f / M_PI, max_error.x, max_error.y
);
} else {
printf("\n");
}
}
#define run_timed_atan2(pre_iterations, iterations, test_negative_zero, test_max, test_nan, num, fun, ref_out, in_y, in_x, out) \
run_timed(pre_iterations, iterations, test_negative_zero, test_max, test_nan, num, #fun, atan2_ ## fun, ref_out, in_y, in_x, out)
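// For example, `run_timed_atan2(w, n, false, false, true, cases, auto_1, ref_out, ys, xs, out)`
// expands to `run_timed(w, n, false, false, true, cases, "auto_1", atan2_auto_1, ref_out, ys, xs, out)`.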
// --------------------------------------------------------------------
// main
NOINLINE
static pair<string, size_t> format_size(size_t size) {
string unit = "B";
if (size > 10000ull) {
size /= 1000ull;
unit = "kB";
}
if (size > 10000ull) {
size /= 1000ull;
unit = "MB";
}
if (size > 10000ull) {
size /= 1000ull;
unit = "GB";
}
return { unit, size };
}
struct config {
bool random_points = false;
bool test_edge_cases = false;
bool test_baseline = true;
bool test_auto_1 = true;
bool test_auto_2 = true;
bool test_auto_3 = true;
bool test_auto_4 = true;
bool test_manual_1 = true;
bool test_manual_2 = true;
};
int main(int argc, const char* argv[]) {
cout.precision(numeric_limits<float>::max_digits10);
pin_to_cpu_0();
perf_init();
config config;
const auto bad_usage = [&argv]() {
cerr << "Usage: " << argv[0] << " [--random-points] [--test-edge-cases] [--functions COMMA_SEPARATED_FUNCTIONS] NUMBER_OF_TEST_CASES NUMBER_OF_WARM_UPS NUMBER_OF_ITERATIONS" << endl;
exit(EXIT_FAILURE);
};
vector<string> arguments;
for (int i = 1; i < argc; i++) {
string arg{argv[i]};
if (arg == "--random-points") {
config.random_points = true;
} else if (arg == "--test-edge-cases") {
config.test_edge_cases = true;
} else if (arg == "--functions") {
config.test_baseline = false;
config.test_auto_1 = false;
config.test_auto_2 = false;
config.test_auto_3 = false;
config.test_auto_4 = false;
config.test_manual_1 = false;
config.test_manual_2 = false;
if (i < argc - 1) {
i++;
stringstream functions{ argv[i] };
for (string function; getline(functions, function, ','); ) {
if (function == "baseline") {
config.test_baseline = true;
} else if (function == "auto_1") {
config.test_auto_1 = true;
} else if (function == "auto_2") {
config.test_auto_2 = true;
} else if (function == "auto_3") {
config.test_auto_3 = true;
} else if (function == "auto_4") {
config.test_auto_4 = true;
}
#ifdef USE_AVX
else if (function == "manual") {
config.test_manual_1 = true;
} else if (function == "manual_2") {
config.test_manual_2 = true;
}
#endif
else {
cerr << "Bad function " << function << endl;
bad_usage();
}
}
} else {
cerr << "Expecting argument after --functions" << endl;
bad_usage();
}
} else {
arguments.emplace_back(arg);
}
}
if (arguments.size() != 3) {
bad_usage();
}
size_t desired_cases;
if (sscanf(arguments.at(0).c_str(), "%zu", &desired_cases) != 1) {
cerr << "Could not parse number of cases" << endl;
bad_usage();
}
int pre_iterations;
if (sscanf(arguments.at(1).c_str(), "%d", &pre_iterations) != 1) {
cerr << "Could not parse warm ups" << endl;
bad_usage();
}
int iterations;
if (sscanf(arguments.at(2).c_str(), "%d", &iterations) != 1) {
cerr << "Could not parse iterations" << endl;
bad_usage();
}
size_t cases;
float* ys;
float* xs;
float* ref_out;
float* out;
generate_data(
config.random_points, config.test_edge_cases,
desired_cases,
&cases, &ys, &xs,
&ref_out,
&out
);
if (!config.test_baseline) {
free(ref_out);
ref_out = nullptr;
}
const auto formatted_input_size = format_size(sizeof(float) * 2 * cases);
const auto formatted_output_size = format_size(sizeof(float) * cases);
cout << "Tests will read " << formatted_input_size.second << formatted_input_size.first << " and write " << formatted_output_size.second << formatted_output_size.first << " (" << cases << " points)" << endl;
cout << "Running " << pre_iterations << " warm ups and " << iterations << " iterations" << endl;
cout << endl;
if (config.test_baseline) {
run_timed(pre_iterations, iterations, true, true, true, cases, "baseline", atan2_baseline, nullptr, ys, xs, ref_out);
}
// auto_1 to auto_3 do not support the max value because the input gets
// reduced to negative zero in that case and the atan_input >= 0 branch
// doesn't preserve the sign properly.
//
// Similarly, none of the functions apart from manual_1 and manual_2 support
// negative zero properly because of the x < 0 branch.
//
// manual_2 silently drops NaNs; it can be made not to do that at a slight
// performance cost (see the comment in it).
if (config.test_auto_1) {
run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_1, ref_out, ys, xs, out);
}
if (config.test_auto_2) {
run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_2, ref_out, ys, xs, out);
}
if (config.test_auto_3) {
run_timed_atan2(pre_iterations, iterations, false, false, true, cases, auto_3, ref_out, ys, xs, out);
}
if (config.test_auto_4) {
run_timed_atan2(pre_iterations, iterations, false, true, true, cases, auto_4, ref_out, ys, xs, out);
}
#ifdef USE_AVX
if (config.test_manual_1) {
run_timed_atan2(pre_iterations, iterations, true, true, true, cases, manual_1, ref_out, ys, xs, out);
}
if (config.test_manual_2) {
run_timed_atan2(pre_iterations, iterations, true, true, true, cases, manual_2, ref_out, ys, xs, out);
}
#endif
cout << endl;
free(ys); free(xs); free(ref_out); free(out);
perf_close();
return EXIT_SUCCESS;
}