-
-
Save iczelia/be5b9872216183af71de65b4e2d1b07d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #define _POSIX_C_SOURCE 200809L | |
| #include <stdint.h> | |
| #include <stddef.h> | |
| #include <stdlib.h> | |
| #include <stdio.h> | |
| #include <string.h> | |
| #include <time.h> | |
| #include <inttypes.h> | |
| #include <errno.h> | |
| #if defined(__GNUC__) || defined(__clang__) | |
| #define NOINLINE __attribute__((noinline)) | |
| #else | |
| #define NOINLINE | |
| #endif | |
| NOINLINE const char * simple_memchr(const char * text, char target, size_t length) { | |
| for (size_t i = 0; i < length; ++i) { | |
| if (text[i] == target) { | |
| return &text[i]; | |
| } | |
| } | |
| return nullptr; | |
| } | |
| NOINLINE const void * memchr2(const void * __restrict__ s, int c, size_t n) { | |
| const unsigned char * __restrict__ p = (const unsigned char * __restrict__) s; | |
| const unsigned char uc = (unsigned char)c; | |
| for (; n && ((uintptr_t)p & 7u); --n, ++p) | |
| if (*p == uc) return (const void *)p; | |
| const uint64_t pattern = 0x0101010101010101ULL * (uint64_t)uc; | |
| for (; n >= 8; n -= 8, p += 8) { | |
| uint64_t w; __builtin_memcpy(&w, p, sizeof(w)); | |
| uint64_t x = w ^ pattern; | |
| if (((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL)) | |
| for (int i = 0; i < 8; ++i) | |
| if (p[i] == uc) return (const void *)(p + i); | |
| } | |
| for (; n--; ++p) if (*p == uc) return (const void *)p; | |
| return NULL; | |
| } | |
| static inline uint64_t ns_since(const struct timespec *a, const struct timespec *b) { | |
| uint64_t sec = (uint64_t)(b->tv_sec - a->tv_sec); | |
| uint64_t nsec = (uint64_t)(b->tv_nsec - a->tv_nsec); | |
| return sec * 1000000000ULL + nsec; | |
| } | |
| static inline uint64_t rotl64(uint64_t x, unsigned k) { | |
| return (x << k) | (x >> (64 - k)); | |
| } | |
| static inline uint64_t splitmix64_next(uint64_t *state) { | |
| uint64_t z = (*state += 0x9e3779b97f4a7c15ULL); | |
| z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL; | |
| z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL; | |
| return z ^ (z >> 31); | |
| } | |
| static void *xmalloc(size_t n) { | |
| void *p = NULL; | |
| #if defined(_ISOC11_SOURCE) | |
| p = aligned_alloc(64, (n + 63) & ~(size_t)63); | |
| if (!p) { | |
| fprintf(stderr, "aligned_alloc failed: %s\n", strerror(errno)); | |
| exit(1); | |
| } | |
| #else | |
| if (posix_memalign(&p, 64, (n + 63) & ~(size_t)63) != 0) { | |
| fprintf(stderr, "posix_memalign failed\n"); | |
| exit(1); | |
| } | |
| #endif | |
| return p; | |
| } | |
| typedef const void *(*memchr_fn)(const void *, int, size_t); | |
| static uint64_t bench_one(const char *name, | |
| memchr_fn fn, | |
| const unsigned char *buf, | |
| size_t buf_size, | |
| const size_t *offsets, | |
| const size_t *lengths, | |
| const unsigned char *needles, | |
| size_t queries, | |
| int iters) { | |
| // Warmup (cache + branch predictors) | |
| volatile uintptr_t sink = 0; | |
| for (size_t i = 0; i < queries; ++i) { | |
| const unsigned char *p = buf + offsets[i]; | |
| size_t n = lengths[i]; | |
| int c = needles[i]; | |
| sink ^= (uintptr_t)fn(p, c, n); | |
| } | |
| struct timespec t0, t1; | |
| if (clock_gettime(CLOCK_MONOTONIC, &t0) != 0) { | |
| perror("clock_gettime"); | |
| exit(1); | |
| } | |
| for (int it = 0; it < iters; ++it) { | |
| // mix sink each iter to discourage over-optimization | |
| sink = (uintptr_t)rotl64((uint64_t)sink + (uint64_t)it, 13); | |
| for (size_t i = 0; i < queries; ++i) { | |
| const unsigned char *p = buf + offsets[i]; | |
| size_t n = lengths[i]; | |
| int c = needles[i]; | |
| sink ^= (uintptr_t)fn(p, c, n); | |
| } | |
| } | |
| if (clock_gettime(CLOCK_MONOTONIC, &t1) != 0) { | |
| perror("clock_gettime"); | |
| exit(1); | |
| } | |
| uint64_t dt = ns_since(&t0, &t1); | |
| // Print checksum so runs are comparable and work isn't optimized away | |
| printf("%-14s time=%" PRIu64 " ns checksum=0x%016" PRIxPTR "\n", | |
| name, dt, (uintptr_t)sink); | |
| return dt; | |
| } | |
| static void verify_equal(const unsigned char *buf, | |
| size_t buf_size, | |
| const size_t *offsets, | |
| const size_t *lengths, | |
| const unsigned char *needles, | |
| size_t queries) { | |
| for (size_t i = 0; i < queries; ++i) { | |
| const unsigned char *p = buf + offsets[i]; | |
| size_t n = lengths[i]; | |
| int c = needles[i]; | |
| const void *a = memchr(p, c, n); // glibc | |
| const void *b = simple_memchr((const char *)p, c, n); | |
| const void *d = memchr2(p, c, n); | |
| if (a != b || a != d) { | |
| ptrdiff_t da = a ? (const unsigned char *)a - p : -1; | |
| ptrdiff_t db = b ? (const unsigned char *)b - p : -1; | |
| ptrdiff_t dd = d ? (const unsigned char *)d - p : -1; | |
| fprintf(stderr, | |
| "Mismatch at query %zu: off=%zu n=%zu c=%u glibc=%td simple=%td memchr2=%td\n", | |
| i, offsets[i], n, (unsigned)(unsigned char)c, da, db, dd); | |
| exit(2); | |
| } | |
| } | |
| } | |
| int main(int argc, char **argv) { | |
| size_t buf_size = (size_t)64 * 1024 * 1024; // 64 MiB | |
| int iters = 500; | |
| uint64_t seed = 1; | |
| if (argc >= 2) buf_size = (size_t)strtoull(argv[1], NULL, 0); | |
| if (argc >= 3) iters = (int)strtol(argv[2], NULL, 0); | |
| if (argc >= 4) seed = (uint64_t)strtoull(argv[3], NULL, 0); | |
| if (buf_size < 1024) buf_size = 1024; | |
| if (iters < 1) iters = 1; | |
| unsigned char *buf = (unsigned char *)xmalloc(buf_size); | |
| // Fill buffer with pseudo-random bytes (fixed seed => deterministic) | |
| uint64_t st = seed; | |
| for (size_t i = 0; i < buf_size; i += 8) { | |
| uint64_t r = splitmix64_next(&st); | |
| size_t rem = buf_size - i; | |
| if (rem >= 8) { | |
| __builtin_memcpy(buf + i, &r, 8); | |
| } else { | |
| __builtin_memcpy(buf + i, &r, rem); | |
| } | |
| } | |
| // Prepare queries: | |
| // - offsets distributed across buffer | |
| // - lengths vary (small and large) | |
| // - needles vary, including some that are likely absent in the sliced range | |
| const size_t queries = 1u << 16; // 65536 queries | |
| size_t *offsets = (size_t *)xmalloc(queries * sizeof(size_t)); | |
| size_t *lengths = (size_t *)xmalloc(queries * sizeof(size_t)); | |
| unsigned char *needles = (unsigned char *)xmalloc(queries * sizeof(unsigned char)); | |
| st = seed ^ 0xdeadbeefcafebabeULL; | |
| for (size_t i = 0; i < queries; ++i) { | |
| uint64_t r1 = splitmix64_next(&st); | |
| uint64_t r2 = splitmix64_next(&st); | |
| uint64_t r3 = splitmix64_next(&st); | |
| size_t off = (size_t)(r1 % buf_size); | |
| // Choose a length up to 4 KiB, but clamp to end of buffer. | |
| size_t maxlen = buf_size - off; | |
| size_t len = (size_t)(r2 % (4096 + 1)); | |
| if (len > maxlen) len = maxlen; | |
| // Mix in some larger searches (up to 1 MiB) | |
| if ((r2 & 0xF) == 0) { | |
| size_t big = (size_t)(r2 % (1024 * 1024 + 1)); | |
| if (big > maxlen) big = maxlen; | |
| len = big; | |
| } | |
| // Needles: | |
| // - Half the time, pick a byte guaranteed present in the range (if len>0) | |
| // - Half the time, pick an arbitrary byte (may or may not be present) | |
| unsigned char c; | |
| if (len > 0 && (r3 & 1)) { | |
| c = buf[off + (size_t)(r3 % len)]; | |
| } else { | |
| c = (unsigned char)(r3 & 0xFF); | |
| } | |
| offsets[i] = off; | |
| lengths[i] = len; | |
| needles[i] = c; | |
| } | |
| // Verify correctness across all queries once (fast, but not free) | |
| verify_equal(buf, buf_size, offsets, lengths, needles, queries); | |
| // Benchmark | |
| printf("buf_size=%zu bytes, queries=%zu, iters=%d, seed=%" PRIu64 "\n", | |
| buf_size, queries, iters, seed); | |
| uint64_t t_glibc = bench_one("glibc_memchr", memchr, buf, buf_size, offsets, lengths, needles, queries, iters); | |
| uint64_t t_simple = bench_one("simple_memchr", (memchr_fn) simple_memchr, buf, buf_size, offsets, lengths, needles, queries, iters); | |
| uint64_t t_m2 = bench_one("memchr2", (memchr_fn) memchr2, buf, buf_size, offsets, lengths, needles, queries, iters); | |
| // Throughput-ish summary | |
| // Total bytes searched is not fixed (variable lengths). Compute total searched per outer iteration. | |
| uint64_t total_bytes = 0; | |
| for (size_t i = 0; i < queries; ++i) total_bytes += (uint64_t)lengths[i]; | |
| double total_bytes_all = (double)total_bytes * (double)iters; | |
| auto gbps = [](double bytes, uint64_t ns) -> double { | |
| return (bytes / (double)ns) * 1e9 / (1024.0 * 1024.0 * 1024.0); | |
| }; | |
| printf("\nEstimated throughput (GiB/s):\n"); | |
| printf(" glibc_memchr : %.3f\n", gbps(total_bytes_all, t_glibc)); | |
| printf(" simple_memchr: %.3f\n", gbps(total_bytes_all, t_simple)); | |
| printf(" memchr2 : %.3f\n", gbps(total_bytes_all, t_m2)); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment