Skip to content

Instantly share code, notes, and snippets.

@iczelia
Created December 31, 2025 12:57
Show Gist options
  • Select an option

  • Save iczelia/be5b9872216183af71de65b4e2d1b07d to your computer and use it in GitHub Desktop.

Select an option

Save iczelia/be5b9872216183af71de65b4e2d1b07d to your computer and use it in GitHub Desktop.
#define _POSIX_C_SOURCE 200809L
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <inttypes.h>
#include <errno.h>
#if defined(__GNUC__) || defined(__clang__)
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif
NOINLINE const char * simple_memchr(const char * text, char target, size_t length) {
for (size_t i = 0; i < length; ++i) {
if (text[i] == target) {
return &text[i];
}
}
return nullptr;
}
NOINLINE const void * memchr2(const void * __restrict__ s, int c, size_t n) {
const unsigned char * __restrict__ p = (const unsigned char * __restrict__) s;
const unsigned char uc = (unsigned char)c;
for (; n && ((uintptr_t)p & 7u); --n, ++p)
if (*p == uc) return (const void *)p;
const uint64_t pattern = 0x0101010101010101ULL * (uint64_t)uc;
for (; n >= 8; n -= 8, p += 8) {
uint64_t w; __builtin_memcpy(&w, p, sizeof(w));
uint64_t x = w ^ pattern;
if (((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL))
for (int i = 0; i < 8; ++i)
if (p[i] == uc) return (const void *)(p + i);
}
for (; n--; ++p) if (*p == uc) return (const void *)p;
return NULL;
}
static inline uint64_t ns_since(const struct timespec *a, const struct timespec *b) {
uint64_t sec = (uint64_t)(b->tv_sec - a->tv_sec);
uint64_t nsec = (uint64_t)(b->tv_nsec - a->tv_nsec);
return sec * 1000000000ULL + nsec;
}
static inline uint64_t rotl64(uint64_t x, unsigned k) {
return (x << k) | (x >> (64 - k));
}
static inline uint64_t splitmix64_next(uint64_t *state) {
uint64_t z = (*state += 0x9e3779b97f4a7c15ULL);
z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
return z ^ (z >> 31);
}
static void *xmalloc(size_t n) {
void *p = NULL;
#if defined(_ISOC11_SOURCE)
p = aligned_alloc(64, (n + 63) & ~(size_t)63);
if (!p) {
fprintf(stderr, "aligned_alloc failed: %s\n", strerror(errno));
exit(1);
}
#else
if (posix_memalign(&p, 64, (n + 63) & ~(size_t)63) != 0) {
fprintf(stderr, "posix_memalign failed\n");
exit(1);
}
#endif
return p;
}
typedef const void *(*memchr_fn)(const void *, int, size_t);
static uint64_t bench_one(const char *name,
memchr_fn fn,
const unsigned char *buf,
size_t buf_size,
const size_t *offsets,
const size_t *lengths,
const unsigned char *needles,
size_t queries,
int iters) {
// Warmup (cache + branch predictors)
volatile uintptr_t sink = 0;
for (size_t i = 0; i < queries; ++i) {
const unsigned char *p = buf + offsets[i];
size_t n = lengths[i];
int c = needles[i];
sink ^= (uintptr_t)fn(p, c, n);
}
struct timespec t0, t1;
if (clock_gettime(CLOCK_MONOTONIC, &t0) != 0) {
perror("clock_gettime");
exit(1);
}
for (int it = 0; it < iters; ++it) {
// mix sink each iter to discourage over-optimization
sink = (uintptr_t)rotl64((uint64_t)sink + (uint64_t)it, 13);
for (size_t i = 0; i < queries; ++i) {
const unsigned char *p = buf + offsets[i];
size_t n = lengths[i];
int c = needles[i];
sink ^= (uintptr_t)fn(p, c, n);
}
}
if (clock_gettime(CLOCK_MONOTONIC, &t1) != 0) {
perror("clock_gettime");
exit(1);
}
uint64_t dt = ns_since(&t0, &t1);
// Print checksum so runs are comparable and work isn't optimized away
printf("%-14s time=%" PRIu64 " ns checksum=0x%016" PRIxPTR "\n",
name, dt, (uintptr_t)sink);
return dt;
}
static void verify_equal(const unsigned char *buf,
size_t buf_size,
const size_t *offsets,
const size_t *lengths,
const unsigned char *needles,
size_t queries) {
for (size_t i = 0; i < queries; ++i) {
const unsigned char *p = buf + offsets[i];
size_t n = lengths[i];
int c = needles[i];
const void *a = memchr(p, c, n); // glibc
const void *b = simple_memchr((const char *)p, c, n);
const void *d = memchr2(p, c, n);
if (a != b || a != d) {
ptrdiff_t da = a ? (const unsigned char *)a - p : -1;
ptrdiff_t db = b ? (const unsigned char *)b - p : -1;
ptrdiff_t dd = d ? (const unsigned char *)d - p : -1;
fprintf(stderr,
"Mismatch at query %zu: off=%zu n=%zu c=%u glibc=%td simple=%td memchr2=%td\n",
i, offsets[i], n, (unsigned)(unsigned char)c, da, db, dd);
exit(2);
}
}
}
int main(int argc, char **argv) {
size_t buf_size = (size_t)64 * 1024 * 1024; // 64 MiB
int iters = 500;
uint64_t seed = 1;
if (argc >= 2) buf_size = (size_t)strtoull(argv[1], NULL, 0);
if (argc >= 3) iters = (int)strtol(argv[2], NULL, 0);
if (argc >= 4) seed = (uint64_t)strtoull(argv[3], NULL, 0);
if (buf_size < 1024) buf_size = 1024;
if (iters < 1) iters = 1;
unsigned char *buf = (unsigned char *)xmalloc(buf_size);
// Fill buffer with pseudo-random bytes (fixed seed => deterministic)
uint64_t st = seed;
for (size_t i = 0; i < buf_size; i += 8) {
uint64_t r = splitmix64_next(&st);
size_t rem = buf_size - i;
if (rem >= 8) {
__builtin_memcpy(buf + i, &r, 8);
} else {
__builtin_memcpy(buf + i, &r, rem);
}
}
// Prepare queries:
// - offsets distributed across buffer
// - lengths vary (small and large)
// - needles vary, including some that are likely absent in the sliced range
const size_t queries = 1u << 16; // 65536 queries
size_t *offsets = (size_t *)xmalloc(queries * sizeof(size_t));
size_t *lengths = (size_t *)xmalloc(queries * sizeof(size_t));
unsigned char *needles = (unsigned char *)xmalloc(queries * sizeof(unsigned char));
st = seed ^ 0xdeadbeefcafebabeULL;
for (size_t i = 0; i < queries; ++i) {
uint64_t r1 = splitmix64_next(&st);
uint64_t r2 = splitmix64_next(&st);
uint64_t r3 = splitmix64_next(&st);
size_t off = (size_t)(r1 % buf_size);
// Choose a length up to 4 KiB, but clamp to end of buffer.
size_t maxlen = buf_size - off;
size_t len = (size_t)(r2 % (4096 + 1));
if (len > maxlen) len = maxlen;
// Mix in some larger searches (up to 1 MiB)
if ((r2 & 0xF) == 0) {
size_t big = (size_t)(r2 % (1024 * 1024 + 1));
if (big > maxlen) big = maxlen;
len = big;
}
// Needles:
// - Half the time, pick a byte guaranteed present in the range (if len>0)
// - Half the time, pick an arbitrary byte (may or may not be present)
unsigned char c;
if (len > 0 && (r3 & 1)) {
c = buf[off + (size_t)(r3 % len)];
} else {
c = (unsigned char)(r3 & 0xFF);
}
offsets[i] = off;
lengths[i] = len;
needles[i] = c;
}
// Verify correctness across all queries once (fast, but not free)
verify_equal(buf, buf_size, offsets, lengths, needles, queries);
// Benchmark
printf("buf_size=%zu bytes, queries=%zu, iters=%d, seed=%" PRIu64 "\n",
buf_size, queries, iters, seed);
uint64_t t_glibc = bench_one("glibc_memchr", memchr, buf, buf_size, offsets, lengths, needles, queries, iters);
uint64_t t_simple = bench_one("simple_memchr", (memchr_fn) simple_memchr, buf, buf_size, offsets, lengths, needles, queries, iters);
uint64_t t_m2 = bench_one("memchr2", (memchr_fn) memchr2, buf, buf_size, offsets, lengths, needles, queries, iters);
// Throughput-ish summary
// Total bytes searched is not fixed (variable lengths). Compute total searched per outer iteration.
uint64_t total_bytes = 0;
for (size_t i = 0; i < queries; ++i) total_bytes += (uint64_t)lengths[i];
double total_bytes_all = (double)total_bytes * (double)iters;
auto gbps = [](double bytes, uint64_t ns) -> double {
return (bytes / (double)ns) * 1e9 / (1024.0 * 1024.0 * 1024.0);
};
printf("\nEstimated throughput (GiB/s):\n");
printf(" glibc_memchr : %.3f\n", gbps(total_bytes_all, t_glibc));
printf(" simple_memchr: %.3f\n", gbps(total_bytes_all, t_simple));
printf(" memchr2 : %.3f\n", gbps(total_bytes_all, t_m2));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment