/* @file arm_bench.cc
* @author bwasti
* @brief Simple benchmarking suite for a few different ARM instructions.
*
* To add an instruction to be tested, see the RUN_ITERS macro, which finds
* the minimum time over a sequence of runs [0]. See usage below.
*
* To use this file standalone, just compile it (it uses extern "C" and other
* C++-only constructs, so build it with a C++ compiler):
*
* g++ -O2 arm_bench.cc -o arm_bench
*
* To use this file with another, specify -DCOMPILE_ARM_BENCH_AS_LIB and call
* runBenchmarks() from your own code:
*
* clang++ -O2 my_test.cc -DCOMPILE_ARM_BENCH_AS_LIB arm_bench.cc -o my_test
*
* To hard code a clock rate (such as when profiling an Apple device) use
* -DCLOCK_RATE=2330000000 to set a 2.33GHz clock rate.
*
* [0] Specified by outerIters. Use innerIters for your asm loop or the math
* will be wrong.
*/
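/* A minimal sketch of adding another benchmark (the ADD instruction below is
 * a hypothetical example, assuming an AArch64 target): inside runBenchmarks(),
 * wrap the instruction in ASM_ITER, passing the number of instructions per
 * repeat body as the factor, then report it with ASM_PRINT. Because the single
 * ADD chains through W1 across repeats, this particular sketch measures
 * latency rather than throughput:
 *
 *   ASM_ITER(
 *       {
 *         size_t innerIter = innerIters;
 *         asm volatile(START_ASM "ADD W1, W1, W2\n\t" END_ASM(innerIter)
 *                      : [innerIter] "+r"(innerIter)
 *                      :
 *                      : "x1", "x2");
 *       },
 *       1);
 *   ASM_PRINT("ADD latency");
 */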
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // memset
#include <time.h>
#include <unistd.h>
#ifdef __linux__
#include <asm/unistd.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#endif
#ifdef __APPLE__
#include <mach/clock.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif
extern "C" {
void runBenchmarks(void);
}
#define BILLION 1000000000
#if defined(__aarch64__) || defined(__arm__)
#define COMPILE_ARM
#endif
/*
* ASM macros here.
*
*/
#ifdef COMPILE_ARM
// Apple has an ancient assembler that doesn't support .rept directives
#ifdef __APPLE__
const static size_t innerIters = 500000;
#define ASM_REPS 1
#define START_ASM "1:\n\t"
#define END_ASM(_i) \
"SUBS %[" #_i "], %[" #_i "], #1\n\t" \
"BNE 1b\n\t"
#else
const static size_t innerIters = 10000;
#define ASM_REPS 50
#define START_ASM \
"1:\n\t" \
".REPT 50"
#define END_ASM(_i) \
".ENDR\n\t" \
"SUBS %[" #_i "], %[" #_i "], #1\n\t" \
"BNE 1b\n\t"
#endif
#else
// A #define cannot expand to a #error directive, so fail here directly.
#error ASM macros not yet supported on non-ARM platforms.
#endif
/*
* Set up timers here.
*
*/
#ifdef __linux__
static int gPerfEventFd;
static void perf_event_handler(int signum, siginfo_t *info, void *ucontext) {
if (info->si_code == POLL_IN) {
fprintf(stderr, "Recieved POLL_IN from perf_event\n");
exit(1);
}
if (info->si_code != POLL_HUP) {
fprintf(stderr, "Recieved unexpected signal from perf_event (%d)\n",
info->si_errno);
exit(1);
}
ioctl(info->si_fd, PERF_EVENT_IOC_REFRESH, 1);
}
static void shutdownPerfEvent(void) {
if (gPerfEventFd != -1) {
ioctl(gPerfEventFd, PERF_EVENT_IOC_DISABLE, 0);
close(gPerfEventFd);
}
}
#endif
#ifdef __APPLE__
static double gMachToNano;
#endif
static void setupTimers(void) {
#ifdef __linux__
struct sigaction sa;
memset(&sa, 0, sizeof(struct sigaction));
sa.sa_sigaction = perf_event_handler;
sa.sa_flags = SA_SIGINFO;
if (sigaction(SIGIO, &sa, NULL) < 0) {
fprintf(stderr,
"Error setting up signal handler. Can't set up perf_event\n");
}
struct perf_event_attr pe;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.size = sizeof(struct perf_event_attr);
pe.type = PERF_TYPE_HARDWARE;
pe.config = PERF_COUNT_HW_CPU_CYCLES;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
gPerfEventFd = syscall(__NR_perf_event_open, &pe, 0 /* pid: self */,
-1 /* any cpu */, -1 /* group fd */, 0 /* flags */);
if (gPerfEventFd == -1) {
perror("Couldn't open perf_event");
}
fcntl(gPerfEventFd, F_SETFL, O_NONBLOCK | O_ASYNC);
fcntl(gPerfEventFd, F_SETSIG, SIGIO);
fcntl(gPerfEventFd, F_SETOWN, getpid());
#endif
#ifdef __APPLE__
mach_timebase_info_data_t timebase;
(void)mach_timebase_info(&timebase);
gMachToNano = (double)timebase.numer / timebase.denom;
#endif
}
void cleanupTimers(void) {
#ifdef __linux__
shutdownPerfEvent();
#endif
}
static void startCycleCount() {
#ifdef __linux__
if (gPerfEventFd == -1)
return;
ioctl(gPerfEventFd, PERF_EVENT_IOC_RESET, 0);
ioctl(gPerfEventFd, PERF_EVENT_IOC_REFRESH, 1);
#endif
}
// Returns the number of cycles
static long long endCycleCount() {
#ifdef __linux__
if (gPerfEventFd == -1)
return 0;
long long counter;
ssize_t bytesRead = read(gPerfEventFd, &counter, sizeof(long long));
if (bytesRead < 0) {
perror("Couldn't read perf_event counter");
}
return counter;
#endif
return 0;
}
/* Some helper macros for timing chunks of code.
*/
static double gClockRate = 0;
#ifdef __linux__
#define TIME(_instr) \
long instructionNanos = 0; \
long long instructionCycles = 0; \
do { \
startCycleCount(); \
struct timespec start, stop; \
clock_gettime(CLOCK_MONOTONIC, &start); \
_instr; \
clock_gettime(CLOCK_MONOTONIC, &stop); \
instructionNanos = \
stop.tv_nsec - start.tv_nsec + (stop.tv_sec - start.tv_sec) * BILLION; \
instructionCycles = endCycleCount(); \
if (!instructionCycles) { \
instructionCycles = (long)(gClockRate * instructionNanos); \
} \
} while (0);
#else
#ifdef __APPLE__
#define TIME(_instr) \
long instructionNanos = 0; \
long long instructionCycles = 0; \
do { \
long long machTimeBegin = mach_absolute_time(); \
_instr; \
long long machTimeEnd = mach_absolute_time(); \
long long machTimePassed = machTimeEnd - machTimeBegin; \
instructionNanos = (long)(machTimePassed * gMachToNano); \
instructionCycles = (long)(gClockRate * instructionNanos); \
} while (0);
#else // Not Linux or Apple
#error Only linux or apple targets are supported.
#endif
#endif
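/* Rough usage sketch (hypothetical snippet, not part of the benchmarks below):
 * TIME expands in place and leaves instructionNanos / instructionCycles
 * defined in the enclosing scope, so it can only appear once per block:
 *
 *   {
 *     TIME({ volatile int x = 0; for (int i = 0; i < 1000; ++i) x += i; })
 *     printf("%ld ns, %lld cycles\n", instructionNanos, instructionCycles);
 *   }
 */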
#ifdef __linux__
#define FMT_STR "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq"
static long getIntFromFile(const char *filename) {
char buf[1024] = {0};
FILE *file = fopen(filename, "r");
if (!file) {
return 0;
}
fread(buf, 1, sizeof(buf) - 1, file);
fclose(file);
return atol(buf);
}
#endif
static void setupCPU(void) {
#ifdef __linux__
const int numCPUs = sysconf(_SC_NPROCESSORS_ONLN);
printf("\n\t%d CPUs found\n", numCPUs);
char buf[1024];
// Assumption here is that the first half are little
sprintf(buf, FMT_STR, 0);
long int littleSpeed = getIntFromFile(buf);
sprintf(buf, FMT_STR, numCPUs / 2);
long int bigSpeed = getIntFromFile(buf);
printf("\tbig %ldKHz\n", bigSpeed);
printf("\tLITTLE %ldKHz\n", littleSpeed);
cpu_set_t mask;
CPU_ZERO(&mask);
// Affinity to big cores (maybe?)
for (int i = numCPUs / 2; i < numCPUs; ++i) {
CPU_SET(i, &mask);
}
gClockRate = (double)bigSpeed * 1000 / BILLION;
int result = sched_setaffinity(0, sizeof(mask), &mask);
if (result) {
fprintf(stderr, "Warning: could not set CPU affinity (err %d)\n", result);
}
#endif
}
#define RUN_ITERS(_asm, _cycles_ref, _nanos_ref, _factor, _inner_iters, \
_outer_iters, _overhead_cycles, _overhead_nanos) \
do { \
unsigned long long _minInstructionCycles = BILLION; \
unsigned long long _minInstructionNanos = BILLION; \
for (size_t outerIter = 0; outerIter < (_outer_iters); ++outerIter) { \
TIME(_asm) \
if (instructionCycles < _minInstructionCycles) { \
_minInstructionCycles = instructionCycles; \
} \
if (instructionNanos < _minInstructionNanos) { \
_minInstructionNanos = instructionNanos; \
} \
} \
if ((_cycles_ref)) { \
*(_cycles_ref) = (double)(_minInstructionCycles - (_overhead_cycles)) / \
(double)((_inner_iters)*ASM_REPS * (_factor)); \
} \
if ((_nanos_ref)) { \
*(_nanos_ref) = (double)(_minInstructionNanos - (_overhead_nanos)) / \
(double)((_inner_iters)*ASM_REPS * (_factor)); \
} \
} while (0)
#define PRINT_TIME(_name, _cycles, _nanos) \
printf("%20s:\t %.3f cycles\t %4.3f ns\n", (_name), (_cycles), (_nanos));
// Number of iterations to find minimum over.
const static size_t outerIters = 100;
#ifdef COMPILE_ARM
inline static void overheadCheck(double *overheadCycles,
double *overheadNanos) {
RUN_ITERS(
{
size_t innerIter = innerIters;
asm volatile(START_ASM END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "d0");
},
overheadCycles, overheadNanos, 1, innerIters, outerIters, 0, 0);
}
/* In case we don't have the ability to count cycles, we need to
* infer how long an individual cycle is.
*/
inline static void cycleTimeCheck(double overheadCycles,
double overheadNanos, double *cycles,
double *nanos) {
RUN_ITERS(
{
size_t innerIter = innerIters;
asm volatile(START_ASM "EOR V1.8B, V0.8B, V1.8B\n\t"
"EOR V1.8B, V2.8B, V1.8B\n\t"
"EOR V1.8B, V1.8B, V3.8B\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "v0", "v1", "v2", "v3");
},
cycles, nanos, 3, innerIters, outerIters, overheadCycles, overheadNanos);
}
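/* For example (numbers are illustrative only): if each dependent EOR above
 * retires in one cycle and the chain measures ~0.5 ns per instruction, the
 * inferred clock rate is 1 / 0.5 ns = 2 GHz, which is what getStatistics()
 * falls back to storing in gClockRate when no rate was queried or hard coded.
 */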
#else
#error Cannot check cycle timings on non-ARM architectures.
#endif
void getStatistics(double *overheadCycles, double *overheadNanos) {
printf("\n\tChecking loop overhead...");
overheadCheck(overheadCycles, overheadNanos);
printf("%lfns.\n", *overheadNanos);
double cycle_ns = 0;
double cycle_cycle = 0;
printf("\tChecking individual cycle run...");
cycleTimeCheck(*overheadCycles, *overheadNanos, &cycle_cycle, &cycle_ns);
printf("%lfns.\n", cycle_ns);
if (*overheadCycles == 0 && *overheadNanos != 0) {
printf("\tWe will need to infer cycles based on clock rate.\n");
cycle_cycle = 1;
}
#ifndef CLOCK_RATE
if (gClockRate == 0) {
gClockRate = 1.0 / cycle_ns;
printf("\tInferred clock rate: %lfGHz\n", gClockRate);
} else {
printf("\tSystem queried clock rate: %lfGHz\n", gClockRate);
}
#else
gClockRate = (double)CLOCK_RATE / BILLION;
printf("\tUsing compiler flag clock rate: %lfGHz\n", gClockRate);
#endif
}
void runBenchmarks(void) {
printf("Setting up CPUs...");
setupCPU();
printf("done.\n");
printf("Setting up timers...");
setupTimers();
printf("done.\n");
double overheadCycles = 0;
double overheadNanos = 0;
printf("Getting system statistics...");
getStatistics(&overheadCycles, &overheadNanos);
printf("done.\n");
double minInstructionCycles;
double minInstructionNanos;
#define ASM_ITER(_asm, _factor) \
RUN_ITERS((_asm), &minInstructionCycles, &minInstructionNanos, (_factor), \
innerIters, outerIters, overheadCycles, overheadNanos);
#define ASM_PRINT(_str) \
PRINT_TIME((_str), minInstructionCycles, minInstructionNanos);
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(START_ASM "EOR W1, W3, W2, LSL #13\n\t"
"EOR W3, W2, W1, LSR #17\n\t"
"EOR W2, W1, W3, LSL #5\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "d3", "d1", "d2");
},
3);
ASM_PRINT("EOR latency");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(START_ASM "EOR W3, W3, W3, LSL #13\n\t"
"EOR W1, W1, W1, LSR #17\n\t"
"EOR W2, W2, W2, LSL #5\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "d0", "d1", "d2", "d3");
},
3);
ASM_PRINT("EOR throughput");
ASM_ITER(
{
const double input = 0x1.0000000000000p+1;
size_t innerIter = innerIters;
asm volatile(START_ASM "FSQRT d2, d2\n\t"
"FSQRT d2, d2\n\t"
"FSQRT d2, d2\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
: [input] "r"(input)
: "d2");
},
3);
ASM_PRINT("FSQRT latency");
ASM_ITER(
{
const double input = 0x1.0000000000000p+1;
size_t innerIter = innerIters;
asm volatile(START_ASM "FSQRT d2, d2\n\t"
"FSQRT d3, d3\n\t"
"FSQRT d4, d4\n\t"
"FSQRT d5, d5\n\t"
"FSQRT d6, d6\n\t"
"FSQRT d7, d7\n\t"
"FSQRT d8, d8\n\t"
"FSQRT d9, d9\n\t"
"FSQRT d10, d10\n\t"
"FSQRT d11, d11\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
: [input] "w"(input)
: "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18",
"d19", "d20", "d21");
},
10);
ASM_PRINT("FSQRT throughput");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(START_ASM "FMLA V0.4S, V1.4S, V2.4S\n\t"
"FMLA V1.4S, V2.4S, V0.4S\n\t" // 0 dep
"FMLA V2.4S, V0.4S, V1.4S\n\t" // 1 dep
END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "v0", "v1", "v2");
},
3);
ASM_PRINT("FMLA latency");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(
START_ASM "FMLA V0.4S, V0.4S, V0.4S\n\t"
"FMLA V1.4S, V1.4S, V1.4S\n\t"
"FMLA V2.4S, V2.4S, V2.4S\n\t"
"FMLA V3.4S, V3.4S, V3.4S\n\t"
"FMLA V4.4S, V4.4S, V4.4S\n\t"
"FMLA V5.4S, V5.4S, V5.4S\n\t"
"FMLA V6.4S, V6.4S, V6.4S\n\t"
"FMLA V7.4S, V7.4S, V7.4S\n\t"
"FMLA V8.4S, V8.4S, V8.4S\n\t"
"FMLA V9.4S, V9.4S, V9.4S\n\t"
"FMLA V10.4S, V10.4S, V10.4S\n\t"
"FMLA V11.4S, V11.4S, V11.4S\n\t"
"FMLA V12.4S, V12.4S, V12.4S\n\t"
"FMLA V13.4S, V13.4S, V13.4S\n\t"
"FMLA V14.4S, V14.4S, V14.4S\n\t"
"FMLA V15.4S, V15.4S, V15.4S\n\t"
"FMLA V16.4S, V16.4S, V16.4S\n\t"
"FMLA V17.4S, V17.4S, V17.4S\n\t"
"FMLA V18.4S, V18.4S, V18.4S\n\t"
"FMLA V19.4S, V19.4S, V19.4S\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
},
20);
ASM_PRINT("FMLA throughput");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(START_ASM "SQDMLAL2 V0.4S, V0.8H, V0.8H\n\t"
"SQDMLAL2 V1.4S, V1.8H, V0.8H\n\t" // 0 dep
"SQDMLAL2 V0.4S, V2.8H, V1.8H\n\t" // 1 dep
"SQDMLAL2 V2.4S, V0.8H, V1.8H\n\t" // 0 dep
"SQDMLAL2 V0.4S, V2.8H, V1.8H\n\t" // 2 dep
"SQDMLAL2 V0.4S, V2.8H, V0.8H\n\t" // 0 dep
"SQDMLAL2 V1.4S, V1.8H, V0.8H\n\t" // 0 dep
"SQDMLAL2 V1.4S, V1.8H, V0.8H\n\t" // 1 dep
END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "v0", "v1", "v2");
},
7);
ASM_PRINT("SQDMLA2 latency");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(
START_ASM "SQDMLAL2 V0.4S, V0.8H, V0.8H\n\t"
"SQDMLAL2 V1.4S, V1.8H, V1.8H\n\t"
"SQDMLAL2 V2.4S, V2.8H, V2.8H\n\t"
"SQDMLAL2 V3.4S, V3.8H, V3.8H\n\t"
"SQDMLAL2 V4.4S, V4.8H, V4.8H\n\t"
"SQDMLAL2 V5.4S, V5.8H, V5.8H\n\t"
"SQDMLAL2 V6.4S, V6.8H, V6.8H\n\t"
"SQDMLAL2 V7.4S, V7.8H, V7.8H\n\t"
"SQDMLAL2 V8.4S, V8.8H, V8.8H\n\t"
"SQDMLAL2 V9.4S, V9.8H, V9.8H\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
},
10);
ASM_PRINT("SQDMLA2 throughput");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(START_ASM "MLA V0.16B, V0.16B, V0.16B\n\t"
"MLA V1.16B, V1.16B, V0.16B\n\t" // 0 dep
"MLA V0.16B, V2.16B, V1.16B\n\t" // 1 dep
"MLA V2.16B, V0.16B, V1.16B\n\t" // 0 dep
"MLA V0.16B, V2.16B, V1.16B\n\t" // 2 dep
"MLA V0.16B, V2.16B, V0.16B\n\t" // 0 dep
"MLA V1.16B, V1.16B, V0.16B\n\t" // 0 dep
"MLA V1.16B, V1.16B, V0.16B\n\t" // 1 dep
END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "v0", "v1", "v2");
},
7);
ASM_PRINT("MLA latency");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(
START_ASM "MLA V0.16B, V0.16B, V0.16B\n\t"
"MLA V1.16B, V1.16B, V1.16B\n\t"
"MLA V2.16B, V2.16B, V2.16B\n\t"
"MLA V3.16B, V3.16B, V3.16B\n\t"
"MLA V4.16B, V4.16B, V4.16B\n\t"
"MLA V5.16B, V5.16B, V5.16B\n\t"
"MLA V6.16B, V6.16B, V6.16B\n\t"
"MLA V7.16B, V7.16B, V7.16B\n\t"
"MLA V8.16B, V8.16B, V8.16B\n\t"
"MLA V9.16B, V9.16B, V9.16B\n\t"
"MLA V10.16B, V10.16B, V10.16B\n\t"
"MLA V11.16B, V11.16B, V11.16B\n\t"
"MLA V12.16B, V12.16B, V12.16B\n\t"
"MLA V13.16B, V13.16B, V13.16B\n\t"
"MLA V14.16B, V14.16B, V14.16B\n\t"
"MLA V15.16B, V15.16B, V15.16B\n\t"
"MLA V16.16B, V16.16B, V16.16B\n\t"
"MLA V17.16B, V17.16B, V17.16B\n\t"
"MLA V18.16B, V18.16B, V18.16B\n\t"
"MLA V19.16B, V19.16B, V19.16B\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
},
20);
ASM_PRINT("MLA throughput");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(START_ASM "AND V0.16B, V0.16B, V0.16B\n\t"
"AND V1.16B, V1.16B, V0.16B\n\t" // 0 dep
"AND V0.16B, V2.16B, V1.16B\n\t" // 1 dep
"AND V2.16B, V0.16B, V1.16B\n\t" // 0 dep
"AND V0.16B, V2.16B, V1.16B\n\t" // 2 dep
"AND V0.16B, V2.16B, V0.16B\n\t" // 0 dep
"AND V1.16B, V1.16B, V0.16B\n\t" // 0 dep
"AND V1.16B, V1.16B, V0.16B\n\t" // 1 dep
END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "v0", "v1", "v2");
},
7);
ASM_PRINT("AND latency");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(
START_ASM "AND V0.16B, V0.16B, V0.16B\n\t"
"AND V1.16B, V1.16B, V1.16B\n\t"
"AND V2.16B, V2.16B, V2.16B\n\t"
"AND V3.16B, V3.16B, V3.16B\n\t"
"AND V4.16B, V4.16B, V4.16B\n\t"
"AND V5.16B, V5.16B, V5.16B\n\t"
"AND V6.16B, V6.16B, V6.16B\n\t"
"AND V7.16B, V7.16B, V7.16B\n\t"
"AND V8.16B, V8.16B, V8.16B\n\t"
"AND V9.16B, V9.16B, V9.16B\n\t"
"AND V10.16B, V10.16B, V10.16B\n\t"
"AND V11.16B, V11.16B, V11.16B\n\t"
"AND V12.16B, V12.16B, V12.16B\n\t"
"AND V13.16B, V13.16B, V13.16B\n\t"
"AND V14.16B, V14.16B, V14.16B\n\t"
"AND V15.16B, V15.16B, V15.16B\n\t"
"AND V16.16B, V16.16B, V16.16B\n\t"
"AND V17.16B, V17.16B, V17.16B\n\t"
"AND V18.16B, V18.16B, V18.16B\n\t"
"AND V19.16B, V19.16B, V19.16B\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
},
20);
ASM_PRINT("AND throughput");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(START_ASM "ADDV B0, V0.16B\n\t"
"ADDV B1, V1.16B\n\t"
"ADDV B2, V2.16B\n\t"
"ADDV B3, V3.16B\n\t"
"ADDV B4, V4.16B\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "d0", "d1", "d2", "d3", "d4", "v0", "v1", "v2", "v3",
"v4");
},
5);
ASM_PRINT("ADDV.16B throughput");
ASM_ITER(
{
size_t innerIter = innerIters;
asm volatile(START_ASM "ADDV S0, V0.4S\n\t"
"ADDV S1, V1.4S\n\t"
"ADDV S2, V2.4S\n\t"
"ADDV S3, V3.4S\n\t"
"ADDV S4, V4.4S\n\t"
"ADDV S5, V5.4S\n\t"
"ADDV S6, V6.4S\n\t"
"ADDV S7, V7.4S\n\t"
"ADDV S8, V8.4S\n\t"
"ADDV S9, V9.4S\n\t" END_ASM(innerIter)
: [innerIter] "+r"(innerIter)
:
: "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d0",
"d1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9");
},
10);
ASM_PRINT("ADDV.4S throughput");
cleanupTimers();
}
#ifndef COMPILE_ARM_BENCH_AS_LIB
int main(void) {
runBenchmarks();
return 0;
}
#endif