simonwagner/benchmark.cpp

## benchmark.cpp
/*
Compile with gcc, because clang does not support alignment of stack values (and does not even warn about that).

g++-mp-4.8 -march=native -mtune=native -std=c++11 benchmark.cpp -O3 -o benchmark

Results on my computer:
naive: 22.6027 (1)
hadd: 10.0168 (0.443169)
hadd_2: 9.88255 (0.437228)
movemask: 9.85698 (0.436097)
*/

#include <iostream>
#include <algorithm>
#include <memory>
#include <functional>
#include <cassert>
#include <string>
#include <chrono>
#include <fstream>

#include <emmintrin.h>
#include <xmmintrin.h>
#include <tmmintrin.h>

using namespace std;

#define assert_aligned(x) assert((uintptr_t)(x) % 16 == 0)

#define RUNS (1000)
#define SIZE (1024*1024*8)


typedef uint32_t aligned_uint32_t __attribute__ ((aligned (16)));
typedef uint8_t aligned_uint8_t __attribute__ ((aligned (16)));

size_t inline constexpr aligned_length(size_t length, int alignement) {
  return length - (length % alignement);
}

size_t inline constexpr padded_length(size_t length, int multiple) {
  return length/multiple + (length % multiple != 0 ? 1 : 0);
}

void eq_naive(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
  size_t i;
  const size_t result_length = padded_length(length, 8);

  for(i = 0; i < length; ++i) {
    uint8_t cmp = (a[i] == b[i] ? 1 : 0);
    result[i/8] |= cmp << (i % 8);
  }

  if(length % 8 > 0) {
    result[length - 1] >>= 8 - (length % 8);
  }
}

__m128i inline vector_cmp(const aligned_uint32_t* a, const aligned_uint32_t* b) {
  __m128i mmx_mask = _mm_set_epi32(8,4,2,1);
  __m128i mmx_a = _mm_load_si128((__m128i*)a);
  __m128i mmx_b = _mm_load_si128((__m128i*)b);
  __m128i mmx_cmp = _mm_cmpeq_epi32(mmx_a, mmx_b);
  __m128i mmx_and = _mm_and_si128(mmx_cmp, mmx_mask);

  return mmx_and;
}

void eq_hadd(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
  assert_aligned(a);
  assert_aligned(b);
  assert_aligned(result);

  size_t i;
  const size_t result_length = padded_length(length, 8);
  __m128i mmx_zero = _mm_setzero_si128();

  for(i = 0; i < aligned_length(length, 8); i+=8) {
    __m128i mmx_temp_result[2];

    for(int mi = 0; mi < 2; ++mi) {
      __m128i mmx_and = vector_cmp(&a[i+4*mi], &b[i+4*mi]);
      __m128i mmx_hadd1 = _mm_hadd_epi32(mmx_and, mmx_zero);
      mmx_temp_result[mi] = _mm_hadd_epi32(mmx_hadd1, mmx_zero);
    }
    __m128i shifted_mmx_temp_result = _mm_slli_epi64(mmx_temp_result[1], 4);
    __m128i mmx_result = _mm_or_si128(mmx_temp_result[0], shifted_mmx_temp_result);

    result[i/8] = _mm_extract_epi8(mmx_result, 0);
  }

  for(; i < length; ++i) {
    uint8_t cmp = (a[i] == b[i] ? 1 : 0);
    result[i/8] |= cmp << (i % 8);
  }

  if(length % 8 > 0) {
    result[length - 1] >>= 8 - (length % 8);
  }
}

void eq_hadd_2(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
  assert_aligned(a);
  assert_aligned(b);
  assert_aligned(result);

  size_t i;
  const size_t result_length = padded_length(length, 8);
  __m128i mmx_mult_shift = _mm_set_epi32(16,1,16,1);

  uint16_t* result_16bit_block = (uint16_t*)result; //we will write to the result in 16bit blocks
  for(i = 0; i < aligned_length(length, 16); i+=16) {
    __m128i mmx_a = vector_cmp(&a[i + 4*0], &b[i + 4*0]);
    __m128i mmx_b = vector_cmp(&a[i + 4*1], &b[i + 4*1]);
    __m128i mmx_c = vector_cmp(&a[i + 4*2], &b[i + 4*2]);
    __m128i mmx_d = vector_cmp(&a[i + 4*3], &b[i + 4*3]);

    __m128i mmx_x = _mm_hadd_epi32(mmx_a,mmx_b);
    __m128i mmx_y = _mm_hadd_epi32(mmx_c,mmx_d);

    __m128i mmx_z = _mm_hadd_epi32(mmx_x,mmx_y);

    __m128i mmx_z_shifted = _mm_mullo_epi32(mmx_z, mmx_mult_shift);
    __m128i mmx_result = _mm_hadd_epi32(mmx_z_shifted, mmx_z_shifted /*this argument does not matter*/);

    /*
    result[i/16 + 0] = _mm_extract_epi8(mmx_result, 0);
    result[i/16 + 1] = _mm_extract_epi8(mmx_result, 4);
    */
    //instead of writing the final values twice to memory, shuffle the result
    //and write it into memory at once
    __m128i mmx_shuffle_mask = _mm_set_epi32(0x80808080,0x80808080,0x80808080,0x80800400); /*4->1 and 0->0 */
    __m128i mmx_shuffled_result = _mm_shuffle_epi8(mmx_result, mmx_shuffle_mask);
    result_16bit_block[i/16] = _mm_extract_epi16(mmx_shuffled_result, 0);
  }

  for(; i < length; ++i) {
    uint8_t cmp = (a[i] == b[i] ? 1 : 0);
    result[i/8] |= cmp << (i % 8);
  }

  if(length % 8 > 0) {
    result[length - 1] >>= 8 - (length % 8);
  }
}


void eq_movemask(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
  assert_aligned(a);
  assert_aligned(b);
  assert_aligned(result);

  size_t i;
  const size_t result_length = padded_length(length, 8);

  for(i = 0; i < aligned_length(length, 8); i+=8) {
    uint16_t tmp_result[2];

    for(int mi = 0; mi < 2; ++mi) {
      __m128i mmx_a = _mm_load_si128((__m128i*)&a[i+4*mi]);
      __m128i mmx_b = _mm_load_si128((__m128i*)&b[i+4*mi]);
      __m128i mmx_cmp = _mm_cmpeq_epi32(mmx_a, mmx_b);
      auto tmp1 = _mm_packs_epi32(mmx_cmp, mmx_cmp);
      auto tmp2 = _mm_packs_epi32(tmp1, tmp1);
      tmp_result[mi] = _mm_movemask_epi8(tmp2) & 0x0F;
    }
    result[i/8] = (tmp_result[1] << 4) | (tmp_result[0]);
  }

  for(; i < length; ++i) {
    uint8_t cmp = (a[i] == b[i] ? 1 : 0);
    result[i/8] |= cmp << (i % 8);
  }

  if(length % 8 > 0) {
    result[length - 1] >>= 8 - (length % 8);
  }
}

template<typename ArrayType, typename ResultType>
bool exec_test(function<void(const ArrayType*, const ArrayType*, ResultType*, size_t)> tested_func, const ArrayType* a, const ArrayType* b, size_t input_length, const ResultType* exp, size_t result_length, function<ResultType*(size_t)> result_alloc, function<void(ResultType*)> result_dealloc) {
  ResultType* result = result_alloc(result_length);

  tested_func(a, b, result, input_length);

  bool passed = true;
  for(int i = 0; i < result_length; ++i) {
    if(!(exp[i] == result[i])) {
      cerr << (unsigned int)exp[i] << ", got instead " << (unsigned int)result[i] << endl;
      passed = false;
      break;
    }
  }

  uninitialized_fill_n(result, result_length, 42); //fill with garbage, so that the old result is no longer there if memory is reused

  result_dealloc(result);
  return passed;
}

template<typename ArrayType, typename ResultType>
void run_test_on_func(const string name, function<void(const ArrayType*, const ArrayType*, ResultType*, size_t)> tested_func, function<ResultType*(size_t)> alloc, function<void(ResultType*)> dealloc) {
  assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3}, (const ArrayType[]){0,1,2,3}, 4, (const ResultType[]){0b00001111}, 1, alloc, dealloc));
  assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3}, (const ArrayType[]){0,1,2,3}, 4, (const ResultType[]){0b00001011}, 1, alloc, dealloc));

  assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7}, 8, (const ResultType[]){0b11111111}, 1, alloc, dealloc));
  assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7}, 8, (const ResultType[]){0b11111011}, 1, alloc, dealloc));

  assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0}, 11, (const ResultType[]){0b11111111, 0b00000111}, 2, alloc, dealloc));
  assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,1}, 11, (const ResultType[]){0b11111011, 0b00000011}, 2, alloc, dealloc));

  assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, 16, (const ResultType[]){0b11111111, 0b11111111}, 2, alloc, dealloc));
  assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0,1,2,3,5,5}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, 16, (const ResultType[]){0b11111011, 0b10111111}, 2, alloc, dealloc));

  assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, 18, (const ResultType[]){0b11111111, 0b11111111, 0b11}, 2, alloc, dealloc));
  assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0,1,2,3,5,5,5,7}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, 18, (const ResultType[]){0b11111011, 0b10111111, 0b10}, 2, alloc, dealloc));
  cout << "ok: " << name << endl;
}

void touch(uint8_t* result, size_t length) {
  ofstream null("/dev/null");
  null << accumulate(result, result+length, 0);
}

double benchmark(function<void(const aligned_uint32_t*, const aligned_uint32_t*, aligned_uint8_t*, size_t)> f, const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* buffer, size_t length) {
  chrono::time_point<chrono::high_resolution_clock> start, end;
  chrono::nanoseconds elapsed = chrono::nanoseconds(0);

  for(int i = 0; i < RUNS; i++) {
    start = std::chrono::high_resolution_clock::now();
    f(a, b, buffer, length);
    end = std::chrono::high_resolution_clock::now();

    size_t result_length = padded_length(length, 8);
    //touch(buffer, result_length);

    elapsed += end-start;
  }

  return elapsed.count()/1.0e9;
}

void run_test() {
  function<aligned_uint8_t*(size_t)> alloc_uint8_t = [](size_t size) -> aligned_uint8_t* { return new aligned_uint8_t[size]; };
  function<void(aligned_uint8_t*)> dealloc_uint8_t = [](aligned_uint8_t* r) -> void { delete [] r;};

  run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_naive", eq_naive, alloc_uint8_t, dealloc_uint8_t);
  run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_hadd", eq_hadd, alloc_uint8_t, dealloc_uint8_t);
  run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_hadd_2", eq_hadd_2, alloc_uint8_t, dealloc_uint8_t);
  run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_movemask", eq_movemask, alloc_uint8_t, dealloc_uint8_t);
}

void run_benchmark() {
  //const aligned_uint32_t* a = (const aligned_uint32_t[]){0,1,2,3,4,5,6,7,8,9,0};
  const size_t length = SIZE;
  aligned_uint32_t* a = (aligned_uint32_t*)_mm_malloc(sizeof(uint32_t) * length, 16);
  aligned_uint32_t* b = (aligned_uint32_t*)_mm_malloc(sizeof(uint32_t) * length, 16);
  uint8_t* result = (uint8_t*)_mm_malloc(sizeof(uint8_t) * padded_length(length, 8), 16);

  cout << "running naive..." << endl;
  double naive_time = benchmark(eq_naive, a, b, result, length);
  cout << "running hadd..." << endl;
  double hadd_time = benchmark(eq_hadd, a, b, result, length);
  cout << "running hadd_2..." << endl;
  double hadd_2_time = benchmark(eq_hadd_2, a, b, result, length);
  cout << "running movemask..." << endl;
  double movemask_time = benchmark(eq_movemask, a, b, result, length);
  cout << endl;

  double base_time = naive_time;

  cout << "naive: " << naive_time << " (" << naive_time/base_time << ")" << endl;
  cout << "hadd: " << hadd_time << " (" << hadd_time/base_time << ")" <<  endl;
  cout << "hadd_2: " << hadd_2_time << " (" << hadd_2_time/base_time << ")" <<  endl;
  cout << "movemask: " << movemask_time << " (" << movemask_time/base_time << ")" <<  endl;

  _mm_free(a);
  _mm_free(b);
  _mm_free(result);
}

int main(int argc, const char** argv) {
  run_test();
  cout << "-------" << endl;
  run_benchmark();
}
	/*
	Compile with gcc, because clang does not support alignment of stack values (and does not even warn about that).

	g++-mp-4.8 -march=native -mtune=native -std=c++11 benchmark.cpp -O3 -o benchmark

	Results on my computer:
	naive: 22.6027 (1)
	hadd: 10.0168 (0.443169)
	hadd_2: 9.88255 (0.437228)
	movemask: 9.85698 (0.436097)
	*/

	#include <iostream>
	#include <algorithm>
	#include <memory>
	#include <functional>
	#include <cassert>
	#include <string>
	#include <chrono>
	#include <fstream>

	#include <emmintrin.h>
	#include <xmmintrin.h>
	#include <tmmintrin.h>

	using namespace std;

	#define assert_aligned(x) assert((uintptr_t)(x) % 16 == 0)

	#define RUNS (1000)
	#define SIZE (102410248)


	typedef uint32_t aligned_uint32_t __attribute__ ((aligned (16)));
	typedef uint8_t aligned_uint8_t __attribute__ ((aligned (16)));

	size_t inline constexpr aligned_length(size_t length, int alignement) {
	return length - (length % alignement);
	}

	size_t inline constexpr padded_length(size_t length, int multiple) {
	return length/multiple + (length % multiple != 0 ? 1 : 0);
	}

	void eq_naive(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
	size_t i;
	const size_t result_length = padded_length(length, 8);

	for(i = 0; i < length; ++i) {
	uint8_t cmp = (a[i] == b[i] ? 1 : 0);
	result[i/8] \|= cmp << (i % 8);
	}

	if(length % 8 > 0) {
	result[length - 1] >>= 8 - (length % 8);
	}
	}

	__m128i inline vector_cmp(const aligned_uint32_t* a, const aligned_uint32_t* b) {
	__m128i mmx_mask = _mm_set_epi32(8,4,2,1);
	__m128i mmx_a = _mm_load_si128((__m128i*)a);
	__m128i mmx_b = _mm_load_si128((__m128i*)b);
	__m128i mmx_cmp = _mm_cmpeq_epi32(mmx_a, mmx_b);
	__m128i mmx_and = _mm_and_si128(mmx_cmp, mmx_mask);

	return mmx_and;
	}

	void eq_hadd(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
	assert_aligned(a);
	assert_aligned(b);
	assert_aligned(result);

	size_t i;
	const size_t result_length = padded_length(length, 8);
	__m128i mmx_zero = _mm_setzero_si128();

	for(i = 0; i < aligned_length(length, 8); i+=8) {
	__m128i mmx_temp_result[2];

	for(int mi = 0; mi < 2; ++mi) {
	__m128i mmx_and = vector_cmp(&a[i+4mi], &b[i+4mi]);
	__m128i mmx_hadd1 = _mm_hadd_epi32(mmx_and, mmx_zero);
	mmx_temp_result[mi] = _mm_hadd_epi32(mmx_hadd1, mmx_zero);
	}
	__m128i shifted_mmx_temp_result = _mm_slli_epi64(mmx_temp_result[1], 4);
	__m128i mmx_result = _mm_or_si128(mmx_temp_result[0], shifted_mmx_temp_result);

	result[i/8] = _mm_extract_epi8(mmx_result, 0);
	}

	for(; i < length; ++i) {
	uint8_t cmp = (a[i] == b[i] ? 1 : 0);
	result[i/8] \|= cmp << (i % 8);
	}

	if(length % 8 > 0) {
	result[length - 1] >>= 8 - (length % 8);
	}
	}

	void eq_hadd_2(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
	assert_aligned(a);
	assert_aligned(b);
	assert_aligned(result);

	size_t i;
	const size_t result_length = padded_length(length, 8);
	__m128i mmx_mult_shift = _mm_set_epi32(16,1,16,1);

	uint16_t* result_16bit_block = (uint16_t*)result; //we will write to the result in 16bit blocks
	for(i = 0; i < aligned_length(length, 16); i+=16) {
	__m128i mmx_a = vector_cmp(&a[i + 40], &b[i + 40]);
	__m128i mmx_b = vector_cmp(&a[i + 41], &b[i + 41]);
	__m128i mmx_c = vector_cmp(&a[i + 42], &b[i + 42]);
	__m128i mmx_d = vector_cmp(&a[i + 43], &b[i + 43]);

	__m128i mmx_x = _mm_hadd_epi32(mmx_a,mmx_b);
	__m128i mmx_y = _mm_hadd_epi32(mmx_c,mmx_d);

	__m128i mmx_z = _mm_hadd_epi32(mmx_x,mmx_y);

	__m128i mmx_z_shifted = _mm_mullo_epi32(mmx_z, mmx_mult_shift);
	__m128i mmx_result = _mm_hadd_epi32(mmx_z_shifted, mmx_z_shifted /this argument does not matter/);

	/*
	result[i/16 + 0] = _mm_extract_epi8(mmx_result, 0);
	result[i/16 + 1] = _mm_extract_epi8(mmx_result, 4);
	*/
	//instead of writing the final values twice to memory, shuffle the result
	//and write it into memory at once
	__m128i mmx_shuffle_mask = _mm_set_epi32(0x80808080,0x80808080,0x80808080,0x80800400); /4->1 and 0->0 /
	__m128i mmx_shuffled_result = _mm_shuffle_epi8(mmx_result, mmx_shuffle_mask);
	result_16bit_block[i/16] = _mm_extract_epi16(mmx_shuffled_result, 0);
	}

	for(; i < length; ++i) {
	uint8_t cmp = (a[i] == b[i] ? 1 : 0);
	result[i/8] \|= cmp << (i % 8);
	}

	if(length % 8 > 0) {
	result[length - 1] >>= 8 - (length % 8);
	}
	}


	void eq_movemask(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
	assert_aligned(a);
	assert_aligned(b);
	assert_aligned(result);

	size_t i;
	const size_t result_length = padded_length(length, 8);

	for(i = 0; i < aligned_length(length, 8); i+=8) {
	uint16_t tmp_result[2];

	for(int mi = 0; mi < 2; ++mi) {
	__m128i mmx_a = _mm_load_si128((__m128i)&a[i+4mi]);
	__m128i mmx_b = _mm_load_si128((__m128i)&b[i+4mi]);
	__m128i mmx_cmp = _mm_cmpeq_epi32(mmx_a, mmx_b);
	auto tmp1 = _mm_packs_epi32(mmx_cmp, mmx_cmp);
	auto tmp2 = _mm_packs_epi32(tmp1, tmp1);
	tmp_result[mi] = _mm_movemask_epi8(tmp2) & 0x0F;
	}
	result[i/8] = (tmp_result[1] << 4) \| (tmp_result[0]);
	}

	for(; i < length; ++i) {
	uint8_t cmp = (a[i] == b[i] ? 1 : 0);
	result[i/8] \|= cmp << (i % 8);
	}

	if(length % 8 > 0) {
	result[length - 1] >>= 8 - (length % 8);
	}
	}

	template<typename ArrayType, typename ResultType>
	bool exec_test(function<void(const ArrayType, const ArrayType, ResultType, size_t)> tested_func, const ArrayType a, const ArrayType* b, size_t input_length, const ResultType* exp, size_t result_length, function<ResultType(size_t)> result_alloc, function<void(ResultType)> result_dealloc) {
	ResultType* result = result_alloc(result_length);

	tested_func(a, b, result, input_length);

	bool passed = true;
	for(int i = 0; i < result_length; ++i) {
	if(!(exp[i] == result[i])) {
	cerr << (unsigned int)exp[i] << ", got instead " << (unsigned int)result[i] << endl;
	passed = false;
	break;
	}
	}

	uninitialized_fill_n(result, result_length, 42); //fill with garbage, so that the old result is no longer there if memory is reused

	result_dealloc(result);
	return passed;
	}

	template<typename ArrayType, typename ResultType>
	void run_test_on_func(const string name, function<void(const ArrayType, const ArrayType, ResultType, size_t)> tested_func, function<ResultType(size_t)> alloc, function<void(ResultType*)> dealloc) {
	assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3}, (const ArrayType[]){0,1,2,3}, 4, (const ResultType[]){0b00001111}, 1, alloc, dealloc));
	assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3}, (const ArrayType[]){0,1,2,3}, 4, (const ResultType[]){0b00001011}, 1, alloc, dealloc));

	assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7}, 8, (const ResultType[]){0b11111111}, 1, alloc, dealloc));
	assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7}, 8, (const ResultType[]){0b11111011}, 1, alloc, dealloc));

	assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0}, 11, (const ResultType[]){0b11111111, 0b00000111}, 2, alloc, dealloc));
	assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,1}, 11, (const ResultType[]){0b11111011, 0b00000011}, 2, alloc, dealloc));

	assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, 16, (const ResultType[]){0b11111111, 0b11111111}, 2, alloc, dealloc));
	assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0,1,2,3,5,5}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, 16, (const ResultType[]){0b11111011, 0b10111111}, 2, alloc, dealloc));

	assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, 18, (const ResultType[]){0b11111111, 0b11111111, 0b11}, 2, alloc, dealloc));
	assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0,1,2,3,5,5,5,7}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, 18, (const ResultType[]){0b11111011, 0b10111111, 0b10}, 2, alloc, dealloc));
	cout << "ok: " << name << endl;
	}

	void touch(uint8_t* result, size_t length) {
	ofstream null("/dev/null");
	null << accumulate(result, result+length, 0);
	}

	double benchmark(function<void(const aligned_uint32_t, const aligned_uint32_t, aligned_uint8_t, size_t)> f, const aligned_uint32_t a, const aligned_uint32_t* b, aligned_uint8_t* buffer, size_t length) {
	chrono::time_point<chrono::high_resolution_clock> start, end;
	chrono::nanoseconds elapsed = chrono::nanoseconds(0);

	for(int i = 0; i < RUNS; i++) {
	start = std::chrono::high_resolution_clock::now();
	f(a, b, buffer, length);
	end = std::chrono::high_resolution_clock::now();

	size_t result_length = padded_length(length, 8);
	//touch(buffer, result_length);

	elapsed += end-start;
	}

	return elapsed.count()/1.0e9;
	}

	void run_test() {
	function<aligned_uint8_t(size_t)> alloc_uint8_t = [](size_t size) -> aligned_uint8_t { return new aligned_uint8_t[size]; };
	function<void(aligned_uint8_t)> dealloc_uint8_t = [](aligned_uint8_t r) -> void { delete [] r;};

	run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_naive", eq_naive, alloc_uint8_t, dealloc_uint8_t);
	run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_hadd", eq_hadd, alloc_uint8_t, dealloc_uint8_t);
	run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_hadd_2", eq_hadd_2, alloc_uint8_t, dealloc_uint8_t);
	run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_movemask", eq_movemask, alloc_uint8_t, dealloc_uint8_t);
	}

	void run_benchmark() {
	//const aligned_uint32_t* a = (const aligned_uint32_t[]){0,1,2,3,4,5,6,7,8,9,0};
	const size_t length = SIZE;
	aligned_uint32_t* a = (aligned_uint32_t)_mm_malloc(sizeof(uint32_t) length, 16);
	aligned_uint32_t* b = (aligned_uint32_t)_mm_malloc(sizeof(uint32_t) length, 16);
	uint8_t* result = (uint8_t)_mm_malloc(sizeof(uint8_t) padded_length(length, 8), 16);

	cout << "running naive..." << endl;
	double naive_time = benchmark(eq_naive, a, b, result, length);
	cout << "running hadd..." << endl;
	double hadd_time = benchmark(eq_hadd, a, b, result, length);
	cout << "running hadd_2..." << endl;
	double hadd_2_time = benchmark(eq_hadd_2, a, b, result, length);
	cout << "running movemask..." << endl;
	double movemask_time = benchmark(eq_movemask, a, b, result, length);
	cout << endl;

	double base_time = naive_time;

	cout << "naive: " << naive_time << " (" << naive_time/base_time << ")" << endl;
	cout << "hadd: " << hadd_time << " (" << hadd_time/base_time << ")" << endl;
	cout << "hadd_2: " << hadd_2_time << " (" << hadd_2_time/base_time << ")" << endl;
	cout << "movemask: " << movemask_time << " (" << movemask_time/base_time << ")" << endl;

	_mm_free(a);
	_mm_free(b);
	_mm_free(result);
	}

	int main(int argc, const char** argv) {
	run_test();
	cout << "-------" << endl;
	run_benchmark();
	}