Skip to content

Instantly share code, notes, and snippets.

@bonzini
Last active January 17, 2024 17:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bonzini/9a95b0e02d1ceb60af9e to your computer and use it in GitHub Desktop.
Save bonzini/9a95b0e02d1ceb60af9e to your computer and use it in GitHub Desktop.
Optimizing "is this memory block equal to zero?"
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#define N 100000
char buf1[1024], buf2[1024], buf3[1024], buf4[65536];
static __attribute__((noinline)) bool memeqzero1(const void *data, size_t length)
{
const unsigned char *p = data;
while (length) {
if (*p)
return false;
p++;
length--;
}
return true;
}
static __attribute__((noinline)) bool memeqzero2(const void *data, size_t length)
{
const unsigned char *p = data;
static unsigned long zeroes[16];
while (length > sizeof(zeroes)) {
if (memcmp(zeroes, p, sizeof(zeroes)))
return false;
p += sizeof(zeroes);
length -= sizeof(zeroes);
}
return memcmp(zeroes, p, length) == 0;
}
static __attribute__((noinline)) bool memeqzero3_rusty(const void *data, size_t length)
{
const unsigned char *p = data;
const unsigned long zero = 0;
size_t pre;
pre = (size_t)p % sizeof(unsigned long);
if (pre) {
size_t n = sizeof(unsigned long) - pre;
if (n > length)
n = length;
if (memcmp(p, &zero, n) != 0)
return false;
p += n;
length -= n;
}
while (length > sizeof(zero)) {
if (*(unsigned long *)p != zero)
return false;
p += sizeof(zero);
length -= sizeof(zero);
}
return memcmp(&zero, p, length) == 0;
}
static __attribute__((noinline)) bool memeqzero3_paolo(const void *data, size_t length)
{
const unsigned char *p = data;
unsigned long word;
while (length & (sizeof(word) - 1)) {
if (*p)
return false;
p++;
length--;
}
while (length) {
memcpy(&word, p, sizeof(word));
if (word)
return false;
p += sizeof(word);
length -= sizeof(word);
}
return true;
}
static __attribute__((noinline)) bool memeqzero4_rusty(const void *data, size_t length)
{
const unsigned char *p = data;
size_t len;
/* Check first 16 bytes manually */
for (len = 0; len < 16; len++) {
if (!length)
return true;
if (*p)
return false;
p++;
length--;
}
/* Now we know that's zero, memcmp with self. */
return memcmp(data, p, length) == 0;
}
static __attribute__((noinline)) bool memeqzero4_paolo(const void *data, size_t length)
{
const unsigned char *p = data;
unsigned long word;
if (!length)
return true;
while (__builtin_expect(length & (sizeof(word) - 1), 0)) {
if (*p)
return false;
p++;
length--;
if (!length)
return true;
}
/* We must always read one byte or word, even if everything is aligned!
* Otherwise, memcmp(data, data, length) is trivially true.
*/
for (;;) {
memcpy(&word, p, sizeof(word));
if (word)
return false;
if (__builtin_expect(length & (16 - sizeof(word)), 0) == 0)
break;
p += sizeof(word);
length -= sizeof(word);
if (!length)
return true;
}
/* Now we know that's zero, memcmp with self. */
return memcmp(data, p, length) == 0;
}
static inline unsigned long rdtsc() {
unsigned long cycles;
asm volatile("rdtsc; shlq $32, %%rdx; movl %%eax, %%eax; orq %%rdx, %%rax " : "=A"(cycles));
return cycles;
}
static int bench(char *buf, int size, bool(*memeqzero)(const void *, size_t))
{
int i = N;
int count;
unsigned long start = rdtsc();
while(i--) asm volatile("" : : "r" (memeqzero(buf, size)));
unsigned long end = rdtsc();
i = count = 3000000000.0 * N / (end - start);
start = rdtsc();
while(i--) asm volatile("" : : "r" (memeqzero(buf, size)));
end = rdtsc();
return (end - start) / count;
}
static __attribute__((__flatten__))
int run(bool(*memeqzero)(const void *, size_t))
{
printf ("%d\t%d\t%d\t%d\n",
bench(buf1, 1, memeqzero), bench(buf2, 8, memeqzero),
bench(buf3, 512, memeqzero), bench(buf4, 65536, memeqzero));
}
int main()
{
run(memeqzero1);
run(memeqzero2);
run(memeqzero3_rusty);
run(memeqzero3_paolo);
run(memeqzero4_rusty);
run(memeqzero4_paolo);
}
@effolkronium
Copy link

And what the result ?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment