a10y/benchmark_simd.c

## benchmark_simd.c
#include <stdlib.h>
#include <arm_neon.h>
#include <arm_acle.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <sys/time.h>
#include <time.h>


#define NOT_FOUND -1


// Find a needle value in a haystack of data. Returns the index of the needle,
// or NOT_FOUND.
//
// This implementation uses scalar instructions and does naive linear probing to find the needle.
size_t find_needle_scalar(uint16_t needle, uint16_t *haystack, size_t len) {
	for (size_t i = 0; i < len; i++) {
		if (haystack[i] == needle) {
			return i;
		}
	}
	return NOT_FOUND;
}

// Find a needle value in a haystack of data. Returns the index of the needle,
// or NOT_FOUND.
//
// This implementation uses SIMD instructions to find an 8-element section of
// the array that contains a match for the needle, then uses the naive scalar
// method above to find the exact value within that method.
size_t find_needle_neon(uint16_t needle, uint16_t *haystack, size_t len) {
    const uint16x8_t vNeedle = vdupq_n_u16(needle); // Duplicate needle across vector

    // Process in blocks of 8 uint16_t elements
	size_t i;
    for (i = 0; i <= len - 8; i += 8) {
        const uint16x8_t vHaystack = vld1q_u16(haystack + i); // Load block from haystack
        const uint16x8_t cmpResult = vceqq_u16(vHaystack, vNeedle); // Compare
        const uint64x2_t cmpResultPacked = vreinterpretq_u64_u16(cmpResult); // Pack results for testing
        if (vgetq_lane_u64(cmpResultPacked, 0) || vgetq_lane_u64(cmpResultPacked, 1)) {
            for (size_t j = 0; j < 8; ++j) {
                if (haystack[i + j] == needle) {
                    return i + j;
                }
            }
        }
    }

    for (; i < len; ++i) {
        if (haystack[i] == needle) {
            return i;
        }
    }

    return NOT_FOUND;
}

// Find a needle value in a haystack of data. Returns the index of the needle,
// or NOT_FOUND.
//
// This implementation is more aggressive than the neon function above, in that it never needs to
// fall back to scalar code and instead uses ARM NEON instructions to do a fully vectorized computation.
//
// We use the shrn trick from this Google blog post:
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
size_t find_needle_shrn(uint16_t needle, uint16_t *haystack, size_t len) {
	// Assume that the input is a multiple of 8, so we don't need to pad.
	// In general, it'd be best to ensure that the caller pre-pads the value ahead of time,
	// so that we don't need to do any branching at this level.
	assert(len % 8 == 0);

	// Splat the needle value into an 8x, 16-bit mask vector
	const uint16x8_t needleMask = vdupq_n_u16(needle);

	for (size_t i = 0; i <= (len - 8); i += 8) {
		// Load from main-memory (or ideally cacheline) into vector
		const uint16x8_t chunk = vld1q_u16(haystack + i);
		// LANEWISE check if chunk == mask
		const uint16x8_t matches = vceqq_u16(chunk, needleMask);

		// `matches` will now contain 8x uint16_t values, each either 0xFFFF (EQUAL)
		//  or 0x0000 (!EQUAL).
		//
		// We can SHRN this vector, by 4, which will yield a vector of 8x
		// values, each 0xFF or 0x00. This can be interpreted as a single 64-bit integer.
		// For example:
		// chunk         = [ 1,  2,  3, 4, 5, 6, 7, 8]
		// needle        = [ 3,  3,  3, 3, 3, 3, 3, 3]
		// matches (hex) = 0000 0000 0000 0000 0000 0000 0000 FFFF
		// reduced (hex) =   00   00   00   00   00   00   00   FF
		// result (hex) = 0xFF00000000000000

		const uint8x8_t reduced = vshrn_n_u16(matches, 4);
		uint64_t result = vget_lane_u64(vreinterpret_u64_u8(reduced), 0);

		if (result != 0) {
			// Reverse the byte order of the vector, count leading zero bits
			// and multiply by 8 to get the original offset.
			size_t ctz = __builtin_clzll(__revll(result)) >> 3;
			return i + ctz;
		}

	}

	return NOT_FOUND;
}


#define LEN ((64 * 1024 * 1024))

int main() {
	uint16_t *haystack = calloc(sizeof(uint16_t), LEN);
	uint16_t needle = 0x0A0D;
	haystack[LEN - 1] = needle;

	size_t result = 0;

	printf("SCALAR: Warming up\n");
	printf("CHECK: %lu\n", find_needle_scalar(needle, haystack, LEN));
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);

	// Timed trials
	struct timespec start, end;
	clock_gettime(CLOCK_MONOTONIC_RAW, &start);

	// 10 trials
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);

	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	uint64_t delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;


	printf("Time to do 10 trials of find_needle_scalar: %lu us\n", delta_us);

	clock_gettime(CLOCK_MONOTONIC_RAW, &start);

	printf("VECTOR: Warming up\n");
	printf("CHECK: %lu\n", find_needle_neon(needle, haystack, LEN));
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);

	// Timed trials
	clock_gettime(CLOCK_MONOTONIC_RAW, &start);

	// 10 trials
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);

	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;


	printf("Time to do 10 trials of find_needle_neon: %lu us\n", delta_us);


	printf("SHRN: Warming up\n");
	printf("CHECK: %lu\n", find_needle_shrn(needle, haystack, LEN));
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);

	// Timed trials
	clock_gettime(CLOCK_MONOTONIC_RAW, &start);

	// 10 trials
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);

	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;


	printf("Time to do 10 trials of find_needle_shrn: %lu us\n", delta_us);

	printf("printing result so we don't lose it: %lu\n", result);
}
	#include <stdlib.h>
	#include <arm_neon.h>
	#include <arm_acle.h>
	#include <stdio.h>
	#include <string.h>
	#include <assert.h>
	#include <sys/time.h>
	#include <time.h>


	#define NOT_FOUND -1


	// Find a needle value in a haystack of data. Returns the index of the needle,
	// or NOT_FOUND.
	//
	// This implementation uses scalar instructions and does naive linear probing to find the needle.
	size_t find_needle_scalar(uint16_t needle, uint16_t *haystack, size_t len) {
	for (size_t i = 0; i < len; i++) {
	if (haystack[i] == needle) {
	return i;
	}
	}
	return NOT_FOUND;
	}

	// Find a needle value in a haystack of data. Returns the index of the needle,
	// or NOT_FOUND.
	//
	// This implementation uses SIMD instructions to find an 8-element section of
	// the array that contains a match for the needle, then uses the naive scalar
	// method above to find the exact value within that method.
	size_t find_needle_neon(uint16_t needle, uint16_t *haystack, size_t len) {
	const uint16x8_t vNeedle = vdupq_n_u16(needle); // Duplicate needle across vector

	// Process in blocks of 8 uint16_t elements
	size_t i;
	for (i = 0; i <= len - 8; i += 8) {
	const uint16x8_t vHaystack = vld1q_u16(haystack + i); // Load block from haystack
	const uint16x8_t cmpResult = vceqq_u16(vHaystack, vNeedle); // Compare
	const uint64x2_t cmpResultPacked = vreinterpretq_u64_u16(cmpResult); // Pack results for testing
	if (vgetq_lane_u64(cmpResultPacked, 0) \|\| vgetq_lane_u64(cmpResultPacked, 1)) {
	for (size_t j = 0; j < 8; ++j) {
	if (haystack[i + j] == needle) {
	return i + j;
	}
	}
	}
	}

	for (; i < len; ++i) {
	if (haystack[i] == needle) {
	return i;
	}
	}

	return NOT_FOUND;
	}

	// Find a needle value in a haystack of data. Returns the index of the needle,
	// or NOT_FOUND.
	//
	// This implementation is more aggressive than the neon function above, in that it never needs to
	// fall back to scalar code and instead uses ARM NEON instructions to do a fully vectorized computation.
	//
	// We use the shrn trick from this Google blog post:
	// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
	size_t find_needle_shrn(uint16_t needle, uint16_t *haystack, size_t len) {
	// Assume that the input is a multiple of 8, so we don't need to pad.
	// In general, it'd be best to ensure that the caller pre-pads the value ahead of time,
	// so that we don't need to do any branching at this level.
	assert(len % 8 == 0);

	// Splat the needle value into an 8x, 16-bit mask vector
	const uint16x8_t needleMask = vdupq_n_u16(needle);

	for (size_t i = 0; i <= (len - 8); i += 8) {
	// Load from main-memory (or ideally cacheline) into vector
	const uint16x8_t chunk = vld1q_u16(haystack + i);
	// LANEWISE check if chunk == mask
	const uint16x8_t matches = vceqq_u16(chunk, needleMask);

	// `matches` will now contain 8x uint16_t values, each either 0xFFFF (EQUAL)
	// or 0x0000 (!EQUAL).
	//
	// We can SHRN this vector, by 4, which will yield a vector of 8x
	// values, each 0xFF or 0x00. This can be interpreted as a single 64-bit integer.
	// For example:
	// chunk = [ 1, 2, 3, 4, 5, 6, 7, 8]
	// needle = [ 3, 3, 3, 3, 3, 3, 3, 3]
	// matches (hex) = 0000 0000 0000 0000 0000 0000 0000 FFFF
	// reduced (hex) = 00 00 00 00 00 00 00 FF
	// result (hex) = 0xFF00000000000000

	const uint8x8_t reduced = vshrn_n_u16(matches, 4);
	uint64_t result = vget_lane_u64(vreinterpret_u64_u8(reduced), 0);

	if (result != 0) {
	// Reverse the byte order of the vector, count leading zero bits
	// and multiply by 8 to get the original offset.
	size_t ctz = __builtin_clzll(__revll(result)) >> 3;
	return i + ctz;
	}

	}

	return NOT_FOUND;
	}


	#define LEN ((64 * 1024 * 1024))

	int main() {
	uint16_t *haystack = calloc(sizeof(uint16_t), LEN);
	uint16_t needle = 0x0A0D;
	haystack[LEN - 1] = needle;

	size_t result = 0;

	printf("SCALAR: Warming up\n");
	printf("CHECK: %lu\n", find_needle_scalar(needle, haystack, LEN));
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);

	// Timed trials
	struct timespec start, end;
	clock_gettime(CLOCK_MONOTONIC_RAW, &start);

	// 10 trials
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);
	result += find_needle_scalar(needle, haystack, LEN);

	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	uint64_t delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;


	printf("Time to do 10 trials of find_needle_scalar: %lu us\n", delta_us);

	clock_gettime(CLOCK_MONOTONIC_RAW, &start);

	printf("VECTOR: Warming up\n");
	printf("CHECK: %lu\n", find_needle_neon(needle, haystack, LEN));
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);

	// Timed trials
	clock_gettime(CLOCK_MONOTONIC_RAW, &start);

	// 10 trials
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);
	result += find_needle_neon(needle, haystack, LEN);

	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;


	printf("Time to do 10 trials of find_needle_neon: %lu us\n", delta_us);




	printf("SHRN: Warming up\n");
	printf("CHECK: %lu\n", find_needle_shrn(needle, haystack, LEN));
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);

	// Timed trials
	clock_gettime(CLOCK_MONOTONIC_RAW, &start);

	// 10 trials
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);
	result += find_needle_shrn(needle, haystack, LEN);

	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;


	printf("Time to do 10 trials of find_needle_shrn: %lu us\n", delta_us);

	printf("printing result so we don't lose it: %lu\n", result);
	}