Skip to content

Instantly share code, notes, and snippets.

@bjourne
Last active July 20, 2023 14:47
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bjourne/02df9abb0712584da3a74cb325e4fbe6 to your computer and use it in GitHub Desktop.
Save bjourne/02df9abb0712584da3a74cb325e4fbe6 to your computer and use it in GitHub Desktop.
// Compile and run with:
//
// gcc -O3 -march=native fast-strlen.c -lpthread -o fast-strlen
// && ./fast-strlen
//
// Use gcc because clang is too smart and optimizes away parts of the
// benchmark. Results on Xeon(R) CPU E5-2650 v4 @ 2.20GHz with gcc
// 9.4.0:
//
// Scanning 10 times over 4.00GB...
// strlen_avx2 -> 5.09 seconds, 7.86 GB/s (length: 4000007776)
// threaded_strlen -> 1.17 seconds, 34.09 GB/s (length: 4000007776)
// slow_strlen -> 18.34 seconds, 2.18 GB/s (length: 4000007776)
// strlen -> 4.56 seconds, 8.76 GB/s (length: 4000007776)
//
//
// On Xeon(R) W-2245 CPU @ 3.90GHz with gcc 13.1.1:
//
// Scanning 10 times over 2.00GB...
// strlen_avx2 -> 1.30 seconds, 15.36 GB/s (length: 1999999999)
// strlen_avx512 -> 1.22 seconds, 16.36 GB/s (length: 1999999999)
// threaded_strlen -> 0.46 seconds, 43.56 GB/s (length: 1999999999)
// slow_strlen -> 1.19 seconds, 16.87 GB/s (length: 1999999999)
// strlen -> 1.19 seconds, 16.84 GB/s (length: 1999999999)
//
// How? The function threaded_strlen intentionally triggers a page
// fault to find an upper bound for the length of the string. It
// then has a set of threads run memchr in parallel on non-overlapping
// chunks of memory.
//
// Why? For fun. I was nerd sniped by the {n} times benchmark [1] and
// found that performance was limited by strlen (the best variant [2]
// is within 10% of strlen on the buffer). So if you make strlen
// faster that benchmark can be faster too. Besides, I haven't seen
// any multithreaded strlen's before.
//
// Obviously, this technique heavily relies on undefined behavior and
// shouldn't be used for anything other than beating benchmarks.
//
// [1]: https://owen.cafe/posts/six-times-faster-than-c/
// [2]: https://gist.github.com/Const-me/3ade77faad47f0fbb0538965ae7f8e04
#include <assert.h>
#include <immintrin.h>
#include <pthread.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
// Size of buffer to test with and number of times the benchmarked
// function is called.
#define N_BUF (2L * 1000 * 1000 * 1000)
#define N_REPS 10
#define N_GIG ((double)N_BUF / (1000 * 1000 * 1000))
__attribute__((noinline)) size_t
slow_strlen(const char *s) {
size_t i = 0;
while (s[i++] != '\0');
return --i;
}
#ifdef __AVX2__
__attribute__((noinline)) size_t
strlen_avx2(const char *s) {
assert((uintptr_t)s % 32 == 0);
const __m256i zero = _mm256_setzero_si256();
const char *p = s;
while (true) {
__m256i mask = _mm256_cmpeq_epi8(*(const __m256i *)p, zero);
if (!_mm256_testz_si256(mask, mask)) {
return p - s + slow_strlen(p);
}
p += 32;
}
}
#endif
#ifdef __AVX512F__
__attribute__((noinline)) size_t
strlen_avx512(const char *s) {
assert((uintptr_t)s % 64 == 0);
const __m512i zero = _mm512_setzero_epi32();
const char *p = s;
while (true) {
__m512i v = _mm512_loadu_si512(p);
__mmask64 mask = _mm512_cmpeq_epi8_mask(v, zero);
if (mask) {
return p - s + slow_strlen(p);
}
p += 64;
}
}
#endif
typedef struct {
const char *s;
size_t n_bytes;
size_t cnt;
bool found;
} strlen_job;
// Limit is 256TB
#define SIGSEGV_SWEEP_SPACE 256L * 1000 * 1000 * 1000 * 1000
#define SIGSEGV_SWEEP_STEP 4L * 1000 * 1000;
static char *
sigsegv_addr = NULL;
static void *
thr_sigsegv(void *arg) {
size_t at = SIGSEGV_SWEEP_STEP;
const char *s = (const char *)arg;
size_t cnt = 0;
// Without the conditional the compiler may assume the loop is
// infinite and optimize away the memory read.
while (at < SIGSEGV_SWEEP_SPACE) {
if (!s[at]) {
cnt++;
}
at += SIGSEGV_SWEEP_STEP;
}
return (void *)cnt;
}
// Plain-old memchr appears to work the best when the length is
// limited.
static void *
thr_strlen(void *arg) {
strlen_job *job = (strlen_job *)arg;
const char *s = job->s;
const char *p = memchr(s, '\0', job->n_bytes);
job->found = p != NULL;
job->cnt = p - s;
return NULL;
}
static void
sigsegv_record(int signal, siginfo_t* siginfo, void* uap) {
sigsegv_addr = siginfo->si_addr;
pthread_exit(NULL);
}
static size_t
ceil_div(size_t a, size_t b) {
return a / b + (a % b != 0);
}
__attribute__((noinline)) size_t
threaded_strlen(const char *s) {
struct sigaction act, oldact;
memset(&act, 0, sizeof(struct sigaction));
sigemptyset(&act.sa_mask);
act.sa_flags = SA_SIGINFO | SA_ONSTACK;
act.sa_sigaction = sigsegv_record;
// Launch a thread whose only job is to page fault to give us an
// upper bound on the number of bytes to search.
pthread_t h;
sigaction(SIGSEGV, &act, &oldact);
assert(!pthread_create(&h, NULL, thr_sigsegv, (void *)s));
assert(!pthread_join(h, NULL));
size_t n_bytes = sigsegv_addr - s;
// Launch a thread pool scanning for '\0' in disjoint regions of
// memory in parallel.
int n_threads = sysconf(_SC_NPROCESSORS_ONLN);
size_t n_bytes_per_thread = ceil_div(n_bytes, n_threads);
strlen_job *jobs = malloc(sizeof(strlen_job) * n_threads);
pthread_t *handles = malloc(sizeof(pthread_t) * n_threads);
for (int i = 0; i < n_threads; i++) {
jobs[i] = (strlen_job){
s + i * n_bytes_per_thread,
n_bytes_per_thread,
0,
false
};
assert(!pthread_create(&handles[i],
NULL,
thr_strlen,
(void *)&jobs[i]));
}
for (int i = 0; i < n_threads; i++) {
assert(!pthread_join(handles[i], NULL));
}
// Restore old handler here -- thr_strlen can also fault if
// the buffer is small.
sigaction(SIGSEGV, &oldact, NULL);
for (int i = 0; i < n_threads; i++) {
strlen_job j = jobs[i];
if (j.found) {
free(jobs);
free(handles);
return i * n_bytes_per_thread + j.cnt;
}
}
assert(false);
}
static uint64_t
nano_count() {
struct timespec t;
assert(!clock_gettime(CLOCK_MONOTONIC, &t));
return (uint64_t)t.tv_sec * 1000000000 + t.tv_nsec;
}
void
benchmark(const char *name,
size_t (*func)(const char *s),
const char *s) {
size_t start = nano_count();
size_t cnt = func(s);
for (int i = 1; i < N_REPS; i++) {
assert(func(s) == cnt);
}
size_t end = nano_count();
double secs = (double)(end - start) / (1000 * 1000 * 1000);
double tot_gbs = N_GIG * N_REPS;
printf("%-15s -> %5.2f seconds, %5.2f GB/s (length: %ld)\n",
name, secs, tot_gbs / secs, cnt);
}
int
main(int argc, char *argv[]) {
char *buf = NULL;
assert(!posix_memalign((void *)&buf, 64, N_BUF));
memset(buf, 'A', N_BUF);
buf[N_BUF - 1] = '\0';
printf("Scanning %d times over %.2fGB...\n", N_REPS, N_GIG);
#ifdef __AVX2__
benchmark("strlen_avx2", strlen_avx2, buf);
#endif
#ifdef __AVX512F__
benchmark("strlen_avx512", strlen_avx512, buf);
#endif
benchmark("threaded_strlen", threaded_strlen, buf);
benchmark("slow_strlen", slow_strlen, buf);
benchmark("strlen", strlen, buf);
free(buf);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment