Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save scraimer/0e8965a38cb235e0565a to your computer and use it in GitHub Desktop.
Save scraimer/0e8965a38cb235e0565a to your computer and use it in GitHub Desktop.
A StackOverflow question was complaining about copying an array of uint8 to an array of uint16 as a "bottleneck". That sounded like a latency problem, and while I was looking into it, I finally got to write down the magic anti-optimizer functions that Chandler Carruth mentioned in his CppCon talk, for preventing code/data from being optimized away.
#include <cstdint>
#include <chrono>
#include <iostream>
/* The following two functions are from a talk by Chandler Carruth:
*
* CppCon 2015: Chandler Carruth "Tuning C++: Benchmarks, and CPUs, and Compilers! Oh My!"
*
* [https://www.youtube.com/watch?v=nXaxk27zwlk]
*/
// Magical escape function to prevent the optimizer from assuming p can be optimized away.
//
// The asm template is empty, so no instructions are emitted. What matters is
// what the statement declares to the compiler:
//   - "g"(p)   : p is an input operand, so the pointer value is "used" by the asm.
//   - "memory" : the asm may read or write arbitrary memory.
//   - volatile : the statement itself must not be deleted or hoisted away.
static void escape(void *p)
{
asm volatile("" : : "g"(p) : "memory");
}
// Magical function to convince the compiler that ALL the memory has been written to.
//
// The asm template is empty and there are no operands; the only thing declared
// is the "memory" clobber, which tells the compiler the statement may have read
// or written any memory. No instructions are emitted for it.
static void clobber()
{
asm volatile("" : : : "memory");
}
/* By combining escape() and clobber() we convince the compiler not to optimize
* away references to data that we never use. That is because escape() says
* "hey, I might be using this somewhere", and clobber() says "hey, I just
* touched ALL the memory". So the compiler can't assume we didn't touch the bit
* of memory we marked as escaped, and thus can't get rid of it. */
// Baseline benchmark subject: widens 128 int8_t values into 128 int16_t values
// with a plain element-by-element loop.
struct original
{
    // Value-initialized so that go() never reads an indeterminate value:
    // as far as the language is concerned, reading an uninitialized int8_t
    // is undefined behavior. Initialization happens at construction time,
    // i.e. outside of any region that times go().
    int8_t a[128] = {};
    int16_t b[128] = {};

    // Copies a[i] into b[i] for every index in [0, 128).
    // Both arrays are passed to escape() first and clobber() is called
    // afterwards, so the optimizer cannot discard the loads and stores even
    // though nothing ever reads b.
    void go()
    {
        escape(a);
        escape(b);
        for (int i = 0; i < 128; i++)
        {
            b[i] = a[i];
        }
        clobber();
    }
};
// Benchmark subject in which every single element is wrapped in a struct
// declared alignas(64), so each element starts on its own 64-byte boundary.
// The copy loop is otherwise the same element-by-element loop.
struct aligned64
{
    // Value-initialized so that go() never reads an indeterminate value:
    // as far as the language is concerned, reading an uninitialized int8_t
    // is undefined behavior. Initialization happens at construction time,
    // i.e. outside of any region that times go().
    struct alignas(64) { int8_t v; } a[128] = {};
    struct alignas(64) { int16_t v; } b[128] = {};

    // Copies a[i].v into b[i].v for every index in [0, 128).
    // Both arrays are passed to escape() first and clobber() is called
    // afterwards, so the optimizer cannot discard the loads and stores even
    // though nothing ever reads b.
    void go()
    {
        escape(a);
        escape(b);
        for (int i = 0; i < 128; i++)
        {
            b[i].v = a[i].v;
        }
        clobber();
    }
};
// Benchmark subject whose copy loop is manually unrolled eight elements per
// iteration. The hand-written unrolling is the thing being measured, so the
// eight statements are deliberately spelled out rather than generated.
struct unrolled_8s
{
    // Value-initialized so that go() never reads an indeterminate value:
    // as far as the language is concerned, reading an uninitialized int8_t
    // is undefined behavior. Initialization happens at construction time,
    // i.e. outside of any region that times go().
    int8_t a[128] = {};
    int16_t b[128] = {};

    // Copies a[i] into b[i] for every index in [0, 128), eight elements per
    // loop iteration (128 is a multiple of 8, so there is no remainder).
    // Both arrays are passed to escape() first and clobber() is called
    // afterwards, so the optimizer cannot discard the loads and stores even
    // though nothing ever reads b.
    void go()
    {
        escape(a);
        escape(b);
        for (int i = 0; i < 128; i += 8)
        {
            b[i] = a[i];
            b[i + 1] = a[i + 1];
            b[i + 2] = a[i + 2];
            b[i + 3] = a[i + 3];
            b[i + 4] = a[i + 4];
            b[i + 5] = a[i + 5];
            b[i + 6] = a[i + 6];
            b[i + 7] = a[i + 7];
        }
        clobber();
    }
};
// Benchmark subject whose copy loop is manually unrolled 64 elements per
// iteration. The hand-written unrolling is the thing being measured, so the
// 64 statements are deliberately spelled out rather than generated.
struct unrolled_64s
{
    // Value-initialized so that go() never reads an indeterminate value:
    // as far as the language is concerned, reading an uninitialized int8_t
    // is undefined behavior. Initialization happens at construction time,
    // i.e. outside of any region that times go().
    int8_t a[128] = {};
    int16_t b[128] = {};

    // Copies a[i] into b[i] for every index in [0, 128), 64 elements per
    // loop iteration (128 is a multiple of 64, so the loop runs twice with
    // no remainder).
    // Both arrays are passed to escape() first and clobber() is called
    // afterwards, so the optimizer cannot discard the loads and stores even
    // though nothing ever reads b.
    void go()
    {
        escape(a);
        escape(b);
        for (int i = 0; i < 128; i += 64)
        {
            b[i] = a[i];
            b[i + 1] = a[i + 1];
            b[i + 2] = a[i + 2];
            b[i + 3] = a[i + 3];
            b[i + 4] = a[i + 4];
            b[i + 5] = a[i + 5];
            b[i + 6] = a[i + 6];
            b[i + 7] = a[i + 7];
            b[i + 8] = a[i + 8];
            b[i + 9] = a[i + 9];
            b[i + 10] = a[i + 10];
            b[i + 11] = a[i + 11];
            b[i + 12] = a[i + 12];
            b[i + 13] = a[i + 13];
            b[i + 14] = a[i + 14];
            b[i + 15] = a[i + 15];
            b[i + 16] = a[i + 16];
            b[i + 17] = a[i + 17];
            b[i + 18] = a[i + 18];
            b[i + 19] = a[i + 19];
            b[i + 20] = a[i + 20];
            b[i + 21] = a[i + 21];
            b[i + 22] = a[i + 22];
            b[i + 23] = a[i + 23];
            b[i + 24] = a[i + 24];
            b[i + 25] = a[i + 25];
            b[i + 26] = a[i + 26];
            b[i + 27] = a[i + 27];
            b[i + 28] = a[i + 28];
            b[i + 29] = a[i + 29];
            b[i + 30] = a[i + 30];
            b[i + 31] = a[i + 31];
            b[i + 32] = a[i + 32];
            b[i + 33] = a[i + 33];
            b[i + 34] = a[i + 34];
            b[i + 35] = a[i + 35];
            b[i + 36] = a[i + 36];
            b[i + 37] = a[i + 37];
            b[i + 38] = a[i + 38];
            b[i + 39] = a[i + 39];
            b[i + 40] = a[i + 40];
            b[i + 41] = a[i + 41];
            b[i + 42] = a[i + 42];
            b[i + 43] = a[i + 43];
            b[i + 44] = a[i + 44];
            b[i + 45] = a[i + 45];
            b[i + 46] = a[i + 46];
            b[i + 47] = a[i + 47];
            b[i + 48] = a[i + 48];
            b[i + 49] = a[i + 49];
            b[i + 50] = a[i + 50];
            b[i + 51] = a[i + 51];
            b[i + 52] = a[i + 52];
            b[i + 53] = a[i + 53];
            b[i + 54] = a[i + 54];
            b[i + 55] = a[i + 55];
            b[i + 56] = a[i + 56];
            b[i + 57] = a[i + 57];
            b[i + 58] = a[i + 58];
            b[i + 59] = a[i + 59];
            b[i + 60] = a[i + 60];
            b[i + 61] = a[i + 61];
            b[i + 62] = a[i + 62];
            b[i + 63] = a[i + 63];
        }
        clobber();
    }
};
// Calls subject.go() `iterations` times back-to-back and returns the mean
// wall-clock cost of a single call, in microseconds.
//
// The elapsed time is truncated to whole microseconds before the division,
// so `iterations` should be large enough that the total run time dwarfs 1us.
template <typename Subject>
static double mean_usec_per_call(Subject &subject, uint64_t const iterations)
{
    auto const start = std::chrono::steady_clock::now();
    for (uint64_t i = 0; i < iterations; ++i)
    {
        subject.go();
    }
    auto const end = std::chrono::steady_clock::now();
    auto const delta = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    return delta / static_cast<double>(iterations);
}

// Times each copy strategy in turn and prints its mean cost per go() call.
// Each subject is constructed before its timing starts, so construction is
// not part of the measurement.
int main()
{
    uint64_t const iter_num = 10000000;

    // std::endl is used on purpose: every result is flushed as soon as it has
    // been measured instead of sitting in the buffer until the program exits.
    original orig;
    std::cout << "original: " << mean_usec_per_call(orig, iter_num) << "usec" << std::endl;

    aligned64 al64;
    std::cout << "aligned64: " << mean_usec_per_call(al64, iter_num) << "usec" << std::endl;

    unrolled_8s u8;
    std::cout << "unrolled_8s: " << mean_usec_per_call(u8, iter_num) << "usec" << std::endl;

    unrolled_64s u64;
    std::cout << "unrolled_64s: " << mean_usec_per_call(u64, iter_num) << "usec" << std::endl;

    return 0;
}
@scraimer
Copy link
Author

Output on my 3.1GHz CPU:

original: 0.0905usec
aligned64: 0.1191usec
unrolled_8s: 0.0625usec
unrolled_64s: 0.0497usec

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment