Skip to content

Instantly share code, notes, and snippets.

@tpruzina
Last active June 5, 2018 16:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tpruzina/440a60c1ae910e0c7be850bbb6d4d766 to your computer and use it in GitHub Desktop.
Save tpruzina/440a60c1ae910e0c7be850bbb6d4d766 to your computer and use it in GitHub Desktop.
You just can't beat the compiler
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// play with these
#define EXPECT_SIZE 4096
//#define ALIGN
//#define FUNROLL
//#define RESTRICT
#if !defined(NAIVE) && !defined(INTMAXCPY) && !defined(SOCPY)
#define MAIN_TEST_ALL
#define NAIVECPY
#define INTMAXCPY
#define SOCPY
#define SSECPY
#endif
// preprocessor directives
#define EXPECT(expr, result) __builtin_expect((expr), (result))
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
#define UNLIKELY(condition) __builtin_expect((condition), 0)
#define INLINE __inline__ __attribute__((always_inline))
// define this to work with aligned alloc
#ifdef ALIGN
#define ALIGN_BYTES 8
#define ALIGNED __attribute__((aligned(ALIGN_BYTES)))
#else
#define ALIGNED
#endif
// tell the compiler to unroll loops
#ifdef FUNROLL
#define UNROLL_ENABLED __attribute__((optimize("unroll-loops")))
#else
#define UNROLL_ENABLED
#endif
// tell compiler about aliasing (see "C restrict" keyword on wiki)
#ifdef RESTRICT
#define RESTRICTED __restrict__
#else
#define RESTRICTED
#endif
#ifdef NAIVECPY
UNROLL_ENABLED void
naive_cpy(uint8_t * RESTRICTED ALIGNED src,
uint8_t * RESTRICTED ALIGNED dst, size_t size)
{
size_t i = EXPECT(size, EXPECT_SIZE);
for (; UNLIKELY(i); i--)
*dst++ = *src++;
}
#endif
#ifdef INTMAXCPY
UNROLL_ENABLED void
intmax_cpy(uint8_t * RESTRICTED ALIGNED src,
uint8_t * RESTRICTED ALIGNED dst, size_t size)
{
uintmax_t *si = (uintmax_t *) src;
uintmax_t *di = (uintmax_t *) dst;
uint8_t *end = src + size;
size_t i = EXPECT(size, EXPECT_SIZE) / sizeof(uintmax_t);
for (; i > 0; i--)
*di++ = *si++;
dst = (typeof(dst)) di;
src = (typeof(dst)) si;
while (src < end)
*dst++ = *src++;
}
#endif
#ifdef SOCPY
typedef struct {
uint8_t dummy[32];
} DT;
void SO_cpy(uint8_t * dst, uint8_t * src, size_t s)
{
// okay this marks the end of src
uint8_t *end = src + s;
// this is pure autism
// sizeof(DT*) == 8/4B (32/64bit), effectively we loaded (void*)(src_ptr - 8)
DT *d1 = (DT *) dst - 1;
DT *s1 = (DT *) src - 1;
// divide size by copy window
size_t si = s / sizeof(DT);
si = (si + 7) / 8;
switch (si % 8) {
case 0:
do {
*++d1 = *++s1;
case 7:
*++d1 = *++s1;
case 6:
*++d1 = *++s1;
case 5:
*++d1 = *++s1;
case 4:
*++d1 = *++s1;
case 3:
*++d1 = *++s1;
case 2:
*++d1 = *++s1;
case 1:
*++d1 = *++s1;
}
while (--si > 0);
}
dst = (uint8_t *) d1;
src = (uint8_t *) s1;
while (src < end) {
*++dst = *++src;
}
}
#endif
#ifdef SSECPY
#include <emmintrin.h>
static INLINE void sse_cpy(uint8_t * d, uint8_t * s, size_t size)
{
size_t pad;
void *dst = d;
const void *src = s;
// this might not matter for powers of two, but it was
// a bug in testing random buffer sizes. TODO: think about this.
pad = (16 - (((size_t) dst) & 15)) & 15;
if (pad > 0) {
// TODO: I am not sure this is bug free... or the right way to do this.
__m128i top = _mm_loadu_si128((const __m128i *) src);
_mm_storeu_si128((__m128i *) dst, top);
src += pad;
dst += pad;
size -= pad;
}
_mm_prefetch((const char *) (src), _MM_HINT_NTA);
for (; size >= 128; size -= 128) {
__m128i w0, w1, w2, w3, w4, w5, w6, w7;
w0 = _mm_load_si128(((const __m128i *) src) + 0);
w1 = _mm_load_si128(((const __m128i *) src) + 1);
w2 = _mm_load_si128(((const __m128i *) src) + 2);
w3 = _mm_load_si128(((const __m128i *) src) + 3);
w4 = _mm_load_si128(((const __m128i *) src) + 4);
w5 = _mm_load_si128(((const __m128i *) src) + 5);
w6 = _mm_load_si128(((const __m128i *) src) + 6);
w7 = _mm_load_si128(((const __m128i *) src) + 7);
_mm_prefetch((const char *) (src + 256), _MM_HINT_NTA);
src += 128;
_mm_stream_si128((((__m128i *) dst) + 0), w0);
_mm_stream_si128((((__m128i *) dst) + 1), w1);
_mm_stream_si128((((__m128i *) dst) + 2), w2);
_mm_stream_si128((((__m128i *) dst) + 3), w3);
_mm_stream_si128((((__m128i *) dst) + 4), w4);
_mm_stream_si128((((__m128i *) dst) + 5), w5);
_mm_stream_si128((((__m128i *) dst) + 6), w6);
_mm_stream_si128((((__m128i *) dst) + 7), w7);
dst += 128;
}
// TODO: Last 128 bytes matter when not aligned?
// Docs said we need this?
_mm_sfence();
}
#endif // SSECPY
typedef uint64_t ts;
static __inline__ uint64_t rdtsc(void)
{
uint32_t a, d;
__asm volatile ("rdtsc":"=a" (a), "=d"(d));
return ((uint64_t) a) | (((uint64_t) d) << 32);
}
#ifdef MAIN_TEST_ALL
// reset, copy in loop, compare last result and print out time it took to copy data
void
time_cpy(typeof(naive_cpy) cpy, uint8_t * dst, uint8_t * src, size_t size,
size_t loops, const char *msg)
{
memset(src, 0, size);
memset(dst, 0xFF, size);
ts then = rdtsc();
for (uint_fast32_t i = 0; i < loops; i++)
cpy(dst, src, size);
ts now = rdtsc();
if (memcmp(src, dst, size))
fprintf(stderr, "we dun goofed soomewhere\n");
else
printf("%12d clocks [%s]\n", (int) (now - then), msg);
}
int main(int argc, char **argv)
{
// These actually produce different code based on whether size is known at compile time
// size_t size = 4096;
// size_t size = atoi(argv[1]);
size_t size, loops;
if (argc == 2) {
size = atoi(argv[1]);
loops = 1000;
} else if (argc == 3) {
size = atoi(argv[1]);
loops = atoi(argv[2]);
} else {
size = 4096;
loops = 1000;
}
#ifdef ALIGN
uint8_t *src = (uint8_t *) aligned_alloc(ALIGN_BYTES, size);
uint8_t *dst = (uint8_t *) aligned_alloc(ALIGN_BYTES, size);
#else
uint8_t *src = (uint8_t *) malloc(size);
uint8_t *dst = (uint8_t *) malloc(size);
#endif
#ifdef ALIGN
printf("[aligned]");
#endif
#ifdef RESTRICT
printf("[restrict]");
#endif
#ifdef FUNROLL
printf("[funroll attr]");
#endif
printf(" size=%u loops=%u\n", size, loops);
time_cpy(naive_cpy, dst, src, size, loops, "NAIVE");
time_cpy(intmax_cpy, dst, src, size, loops, "INTMAX");
time_cpy(SO_cpy, dst, src, size, loops, "SOCPY");
time_cpy(sse_cpy, dst, src, size, loops, "SSE");
free(src);
free(dst);
return 0;
}
#endif //MAIN_TEST_ALL
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment