Skip to content

Instantly share code, notes, and snippets.

@a10y
Last active March 29, 2024 22:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save a10y/f70704180e76cf5d0cd24238653f5a02 to your computer and use it in GitHub Desktop.
Save a10y/f70704180e76cf5d0cd24238653f5a02 to your computer and use it in GitHub Desktop.
#include <stdlib.h>
#include <arm_neon.h>
#include <arm_acle.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <sys/time.h>
#include <time.h>
#define NOT_FOUND -1
// Find a needle value in a haystack of data. Returns the index of the needle,
// or NOT_FOUND.
//
// This implementation uses scalar instructions and does naive linear probing to find the needle.
size_t find_needle_scalar(uint16_t needle, uint16_t *haystack, size_t len) {
for (size_t i = 0; i < len; i++) {
if (haystack[i] == needle) {
return i;
}
}
return NOT_FOUND;
}
// Find a needle value in a haystack of data. Returns the index of the needle,
// or NOT_FOUND.
//
// This implementation uses SIMD instructions to find an 8-element section of
// the array that contains a match for the needle, then uses the naive scalar
// method above to find the exact value within that method.
size_t find_needle_neon(uint16_t needle, uint16_t *haystack, size_t len) {
const uint16x8_t vNeedle = vdupq_n_u16(needle); // Duplicate needle across vector
// Process in blocks of 8 uint16_t elements
size_t i;
for (i = 0; i <= len - 8; i += 8) {
const uint16x8_t vHaystack = vld1q_u16(haystack + i); // Load block from haystack
const uint16x8_t cmpResult = vceqq_u16(vHaystack, vNeedle); // Compare
const uint64x2_t cmpResultPacked = vreinterpretq_u64_u16(cmpResult); // Pack results for testing
if (vgetq_lane_u64(cmpResultPacked, 0) || vgetq_lane_u64(cmpResultPacked, 1)) {
for (size_t j = 0; j < 8; ++j) {
if (haystack[i + j] == needle) {
return i + j;
}
}
}
}
for (; i < len; ++i) {
if (haystack[i] == needle) {
return i;
}
}
return NOT_FOUND;
}
// Find a needle value in a haystack of data. Returns the index of the needle,
// or NOT_FOUND.
//
// This implementation is more aggressive than the neon function above, in that it never needs to
// fall back to scalar code and instead uses ARM NEON instructions to do a fully vectorized computation.
//
// We use the shrn trick from this Google blog post:
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
size_t find_needle_shrn(uint16_t needle, uint16_t *haystack, size_t len) {
// Assume that the input is a multiple of 8, so we don't need to pad.
// In general, it'd be best to ensure that the caller pre-pads the value ahead of time,
// so that we don't need to do any branching at this level.
assert(len % 8 == 0);
// Splat the needle value into an 8x, 16-bit mask vector
const uint16x8_t needleMask = vdupq_n_u16(needle);
for (size_t i = 0; i <= (len - 8); i += 8) {
// Load from main-memory (or ideally cacheline) into vector
const uint16x8_t chunk = vld1q_u16(haystack + i);
// LANEWISE check if chunk == mask
const uint16x8_t matches = vceqq_u16(chunk, needleMask);
// `matches` will now contain 8x uint16_t values, each either 0xFFFF (EQUAL)
// or 0x0000 (!EQUAL).
//
// We can SHRN this vector, by 4, which will yield a vector of 8x
// values, each 0xFF or 0x00. This can be interpreted as a single 64-bit integer.
// For example:
// chunk = [ 1, 2, 3, 4, 5, 6, 7, 8]
// needle = [ 3, 3, 3, 3, 3, 3, 3, 3]
// matches (hex) = 0000 0000 0000 0000 0000 0000 0000 FFFF
// reduced (hex) = 00 00 00 00 00 00 00 FF
// result (hex) = 0xFF00000000000000
const uint8x8_t reduced = vshrn_n_u16(matches, 4);
uint64_t result = vget_lane_u64(vreinterpret_u64_u8(reduced), 0);
if (result != 0) {
// Reverse the byte order of the vector, count leading zero bits
// and multiply by 8 to get the original offset.
size_t ctz = __builtin_clzll(__revll(result)) >> 3;
return i + ctz;
}
}
return NOT_FOUND;
}
#define LEN ((64 * 1024 * 1024))
int main() {
uint16_t *haystack = calloc(sizeof(uint16_t), LEN);
uint16_t needle = 0x0A0D;
haystack[LEN - 1] = needle;
size_t result = 0;
printf("SCALAR: Warming up\n");
printf("CHECK: %lu\n", find_needle_scalar(needle, haystack, LEN));
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
// Timed trials
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
// 10 trials
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
result += find_needle_scalar(needle, haystack, LEN);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
uint64_t delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
printf("Time to do 10 trials of find_needle_scalar: %lu us\n", delta_us);
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
printf("VECTOR: Warming up\n");
printf("CHECK: %lu\n", find_needle_neon(needle, haystack, LEN));
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
// Timed trials
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
// 10 trials
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
result += find_needle_neon(needle, haystack, LEN);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
printf("Time to do 10 trials of find_needle_neon: %lu us\n", delta_us);
printf("SHRN: Warming up\n");
printf("CHECK: %lu\n", find_needle_shrn(needle, haystack, LEN));
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
// Timed trials
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
// 10 trials
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
result += find_needle_shrn(needle, haystack, LEN);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
printf("Time to do 10 trials of find_needle_shrn: %lu us\n", delta_us);
printf("printing result so we don't lose it: %lu\n", result);
}
@a10y
Copy link
Author

a10y commented Mar 29, 2024

Compiling with

clang -O3 benchmark_simd.c

Execution on Macbook Pro (M2 Max)

SCALAR: Warming up
CHECK: 67108863
Time to do 10 trials of find_needle_scalar: 187932 us
VECTOR: Warming up
CHECK: 67108863
Time to do 10 trials of find_needle_neon: 41220 us
SHRN: Warming up
CHECK: 67108863
Time to do 10 trials of find_needle_shrn: 30735 us
printing result so we don't lose it: 2415919068

Execution on NVIDIA Jetson AGX Orin

SCALAR: Warming up
CHECK: 67108863
Time to do 10 trials of find_needle_scalar: 539544 us
VECTOR: Warming up
CHECK: 67108863
Time to do 10 trials of find_needle_neon: 137206 us
SHRN: Warming up
CHECK: 67108863
Time to do 10 trials of find_needle_shrn: 137703 us
printing result so we don't lose it: 2415919068

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment