Last active
March 29, 2024 22:13
-
-
Save a10y/f70704180e76cf5d0cd24238653f5a02 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <arm_neon.h> | |
#include <arm_acle.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <assert.h> | |
#include <sys/time.h> | |
#include <time.h> | |
#define NOT_FOUND -1 | |
// Find a needle value in a haystack of data. Returns the index of the needle, | |
// or NOT_FOUND. | |
// | |
// This implementation uses scalar instructions and does naive linear probing to find the needle. | |
size_t find_needle_scalar(uint16_t needle, uint16_t *haystack, size_t len) { | |
for (size_t i = 0; i < len; i++) { | |
if (haystack[i] == needle) { | |
return i; | |
} | |
} | |
return NOT_FOUND; | |
} | |
// Find a needle value in a haystack of data. Returns the index of the needle, | |
// or NOT_FOUND. | |
// | |
// This implementation uses SIMD instructions to find an 8-element section of | |
// the array that contains a match for the needle, then uses the naive scalar | |
// method above to find the exact value within that method. | |
size_t find_needle_neon(uint16_t needle, uint16_t *haystack, size_t len) { | |
const uint16x8_t vNeedle = vdupq_n_u16(needle); // Duplicate needle across vector | |
// Process in blocks of 8 uint16_t elements | |
size_t i; | |
for (i = 0; i <= len - 8; i += 8) { | |
const uint16x8_t vHaystack = vld1q_u16(haystack + i); // Load block from haystack | |
const uint16x8_t cmpResult = vceqq_u16(vHaystack, vNeedle); // Compare | |
const uint64x2_t cmpResultPacked = vreinterpretq_u64_u16(cmpResult); // Pack results for testing | |
if (vgetq_lane_u64(cmpResultPacked, 0) || vgetq_lane_u64(cmpResultPacked, 1)) { | |
for (size_t j = 0; j < 8; ++j) { | |
if (haystack[i + j] == needle) { | |
return i + j; | |
} | |
} | |
} | |
} | |
for (; i < len; ++i) { | |
if (haystack[i] == needle) { | |
return i; | |
} | |
} | |
return NOT_FOUND; | |
} | |
// Find a needle value in a haystack of data. Returns the index of the needle, | |
// or NOT_FOUND. | |
// | |
// This implementation is more aggressive than the neon function above, in that it never needs to | |
// fall back to scalar code and instead uses ARM NEON instructions to do a fully vectorized computation. | |
// | |
// We use the shrn trick from this Google blog post: | |
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon | |
size_t find_needle_shrn(uint16_t needle, uint16_t *haystack, size_t len) { | |
// Assume that the input is a multiple of 8, so we don't need to pad. | |
// In general, it'd be best to ensure that the caller pre-pads the value ahead of time, | |
// so that we don't need to do any branching at this level. | |
assert(len % 8 == 0); | |
// Splat the needle value into an 8x, 16-bit mask vector | |
const uint16x8_t needleMask = vdupq_n_u16(needle); | |
for (size_t i = 0; i <= (len - 8); i += 8) { | |
// Load from main-memory (or ideally cacheline) into vector | |
const uint16x8_t chunk = vld1q_u16(haystack + i); | |
// LANEWISE check if chunk == mask | |
const uint16x8_t matches = vceqq_u16(chunk, needleMask); | |
// `matches` will now contain 8x uint16_t values, each either 0xFFFF (EQUAL) | |
// or 0x0000 (!EQUAL). | |
// | |
// We can SHRN this vector, by 4, which will yield a vector of 8x | |
// values, each 0xFF or 0x00. This can be interpreted as a single 64-bit integer. | |
// For example: | |
// chunk = [ 1, 2, 3, 4, 5, 6, 7, 8] | |
// needle = [ 3, 3, 3, 3, 3, 3, 3, 3] | |
// matches (hex) = 0000 0000 0000 0000 0000 0000 0000 FFFF | |
// reduced (hex) = 00 00 00 00 00 00 00 FF | |
// result (hex) = 0xFF00000000000000 | |
const uint8x8_t reduced = vshrn_n_u16(matches, 4); | |
uint64_t result = vget_lane_u64(vreinterpret_u64_u8(reduced), 0); | |
if (result != 0) { | |
// Reverse the byte order of the vector, count leading zero bits | |
// and multiply by 8 to get the original offset. | |
size_t ctz = __builtin_clzll(__revll(result)) >> 3; | |
return i + ctz; | |
} | |
} | |
return NOT_FOUND; | |
} | |
#define LEN ((64 * 1024 * 1024)) | |
int main() { | |
uint16_t *haystack = calloc(sizeof(uint16_t), LEN); | |
uint16_t needle = 0x0A0D; | |
haystack[LEN - 1] = needle; | |
size_t result = 0; | |
printf("SCALAR: Warming up\n"); | |
printf("CHECK: %lu\n", find_needle_scalar(needle, haystack, LEN)); | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
// Timed trials | |
struct timespec start, end; | |
clock_gettime(CLOCK_MONOTONIC_RAW, &start); | |
// 10 trials | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
result += find_needle_scalar(needle, haystack, LEN); | |
clock_gettime(CLOCK_MONOTONIC_RAW, &end); | |
uint64_t delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000; | |
printf("Time to do 10 trials of find_needle_scalar: %lu us\n", delta_us); | |
clock_gettime(CLOCK_MONOTONIC_RAW, &start); | |
printf("VECTOR: Warming up\n"); | |
printf("CHECK: %lu\n", find_needle_neon(needle, haystack, LEN)); | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
// Timed trials | |
clock_gettime(CLOCK_MONOTONIC_RAW, &start); | |
// 10 trials | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
result += find_needle_neon(needle, haystack, LEN); | |
clock_gettime(CLOCK_MONOTONIC_RAW, &end); | |
delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000; | |
printf("Time to do 10 trials of find_needle_neon: %lu us\n", delta_us); | |
printf("SHRN: Warming up\n"); | |
printf("CHECK: %lu\n", find_needle_shrn(needle, haystack, LEN)); | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
// Timed trials | |
clock_gettime(CLOCK_MONOTONIC_RAW, &start); | |
// 10 trials | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
result += find_needle_shrn(needle, haystack, LEN); | |
clock_gettime(CLOCK_MONOTONIC_RAW, &end); | |
delta_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000; | |
printf("Time to do 10 trials of find_needle_shrn: %lu us\n", delta_us); | |
printf("printing result so we don't lose it: %lu\n", result); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compiling with
Execution on Macbook Pro (M2 Max)
Execution on NVIDIA Jetson AGX Orin