Skip to content

Instantly share code, notes, and snippets.

@e673
Created January 16, 2017 20:56
Show Gist options
  • Save e673/ded24a3cf958f55a6d5a63785d52ed12 to your computer and use it in GitHub Desktop.
Save e673/ded24a3cf958f55a6d5a63785d52ed12 to your computer and use it in GitHub Desktop.
Float vs Byte performance in C++
const int N = 512;
Image<float> img1f(N, N), img2f(N, N);
Image<byte> img1b(N, N), img2b(N, N);
for (int j = 0; j < img1f.Height(); j++)
for (int i = 0; i < img1f.Width(); i++)
{
img1f(i, j) = img2f(i, j) = 3 * i + j;
img1b(i, j) = img2b(i, j) = 3 * i + j;
}
long long freq = _Query_perf_frequency();
long long t0 = _Query_perf_counter();
// Scalar float pass: 3x3 weighted blur of img1f into img2f.
// Weights are 1 for the four diagonal neighbours, 2 for the four edge
// neighbours and 4 for the centre, normalised by 1/16.
// Only interior pixels are written; the one-pixel border is left untouched.
for (int j = 1; j < img1f.Height() - 1; j++)
{
    // Row cursors start at x = 1 (pixeladdr(x, y) is taken to return the
    // address of pixel (x, y), so [-1] / [+1] are the horizontal neighbours).
    const float *l0 = img1f.pixeladdr(1, j - 1);
    const float *l1 = img1f.pixeladdr(1, j);
    const float *l2 = img1f.pixeladdr(1, j + 1);
    float *dst = img2f.pixeladdr(1, j);
    // Stop at Width() - 1: every iteration reads the [+1] neighbour, so the
    // last column that can be processed without leaving the row is Width() - 2.
    for (int i = 1; i < img1f.Width() - 1; i++)
    {
        float q1 = l0[-1] + l0[1] + l2[-1] + l2[1]; // diagonal neighbours
        float q2 = l0[0] + l1[-1] + l1[1] + l2[0];  // edge neighbours
        float q4 = l1[0];                           // centre
        *dst = (q1 + 2.0f * q2 + 4.0f * q4) * (1.0f / 16.0f);
        l0++;
        l1++;
        l2++;
        dst++;
    }
}
// Sanity check: the blur of the 3*x + y ramp at (10, 10) is the ramp value 40.
check(img2f(10, 10) == 40.0f);
long long t1 = _Query_perf_counter();
// Scalar byte pass: the same 3x3 blur (weights 1 / 2 / 4, sum 16) on the byte
// images, accumulated in int and normalised with a right shift by 4.
// Only interior pixels are written; the one-pixel border is left untouched.
for (int j = 1; j < img1b.Height() - 1; j++)
{
    // Row cursors start at x = 1 (pixeladdr(x, y) is taken to return the
    // address of pixel (x, y), so [-1] / [+1] are the horizontal neighbours).
    const byte *l0 = img1b.pixeladdr(1, j - 1);
    const byte *l1 = img1b.pixeladdr(1, j);
    const byte *l2 = img1b.pixeladdr(1, j + 1);
    byte *dst = img2b.pixeladdr(1, j);
    // The bound comes from the byte image being traversed, and stops at
    // Width() - 1 because every iteration reads the [+1] neighbour.
    for (int i = 1; i < img1b.Width() - 1; i++)
    {
        int q1 = l0[-1] + l0[1] + l2[-1] + l2[1]; // diagonal neighbours
        int q2 = l0[0] + l1[-1] + l1[1] + l2[0];  // edge neighbours
        int q4 = l1[0];                           // centre
        // A weighted average of the inputs, so the result fits the pixel type again.
        *dst = static_cast<byte>((q1 + 2 * q2 + 4 * q4) >> 4);
        l0++;
        l1++;
        l2++;
        dst++;
    }
}
// Sanity check: the blur of the 3*x + y ramp at (10, 10) is the ramp value 40.
check(img2b(10, 10) == 40);
long long t2 = _Query_perf_counter();
const __m128 m16 = _mm_set1_ps(1.0f / 16.0f);
for (int j = 1; j < img1f.Height() - 1; j++)
{
float *l0 = img1f.pixeladdr(1, j - 1);
float *l1 = img1f.pixeladdr(1, j);
float *l2 = img1f.pixeladdr(1, j + 1);
float *dst = img2f.pixeladdr(1, j);
// We do not care about boundary processing. Just benchmark.
for (int i = 1; i < img1f.Width() - 4; i += 4)
{
__m128 q1 = _mm_add_ps(_mm_add_ps(_mm_loadu_ps(l0 - 1), _mm_loadu_ps(l0 + 1)), _mm_add_ps(_mm_loadu_ps(l2 - 1), _mm_loadu_ps(l2 + 1)));
__m128 q2 = _mm_add_ps(_mm_add_ps(_mm_loadu_ps(l0), _mm_loadu_ps(l2)), _mm_add_ps(_mm_loadu_ps(l1 - 1), _mm_loadu_ps(l1 + 1)));
__m128 q4 = _mm_loadu_ps(l1);
q2 = _mm_add_ps(q2, q2);
q4 = _mm_add_ps(q4, q4);
__m128 res = _mm_mul_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(q4, q4), q2), q1), m16);
_mm_storeu_ps(dst, res);
l0 += 4;
l1 += 4;
l2 += 4;
dst += 4;
}
}
check(img2f(10, 10) == 40.0f);
long long t3 = _Query_perf_counter();
const __m256 q16 = _mm256_set1_ps(1.0f / 16.0f);
for (int j = 1; j < img1f.Height() - 1; j++)
{
float *l0 = img1f.pixeladdr(1, j - 1);
float *l1 = img1f.pixeladdr(1, j);
float *l2 = img1f.pixeladdr(1, j + 1);
float *dst = img2f.pixeladdr(1, j);
// We do not care about boundary processing. Just benchmark.
for (int i = 1; i < img1f.Width() - 8; i += 8)
{
__m256 q1 = _mm256_add_ps(_mm256_add_ps(_mm256_loadu_ps(l0 - 1), _mm256_loadu_ps(l0 + 1)), _mm256_add_ps(_mm256_loadu_ps(l2 - 1), _mm256_loadu_ps(l2 + 1)));
__m256 q2 = _mm256_add_ps(_mm256_add_ps(_mm256_loadu_ps(l0), _mm256_loadu_ps(l2)), _mm256_add_ps(_mm256_loadu_ps(l1 - 1), _mm256_loadu_ps(l1 + 1)));
__m256 q4 = _mm256_loadu_ps(l1);
q2 = _mm256_add_ps(q2, q2);
q4 = _mm256_add_ps(q4, q4);
__m256 res = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(q4, q4), q2), q1), q16);
_mm256_storeu_ps(dst, res);
l0 += 8;
l1 += 8;
l2 += 8;
dst += 8;
}
}
check(img2f(10, 10) == 40.0f);
long long t4 = _Query_perf_counter();
// SSE byte pass: the 3x3 blur (weights 1 / 2 / 4, sum 16) on the byte images,
// eight pixels at a time. Each group of 8 bytes is zero-extended to eight
// 16-bit lanes, summed, shifted right by 4 and packed back to 8 bytes.
// With 8-bit inputs the weighted sum is at most 16 * 255 = 4080, so the
// 16-bit lanes cannot overflow.
for (int j = 1; j < img1b.Height() - 1; j++)
{
    const byte *l0 = img1b.pixeladdr(1, j - 1);
    const byte *l1 = img1b.pixeladdr(1, j);
    const byte *l2 = img1b.pixeladdr(1, j + 1);
    byte *dst = img2b.pixeladdr(1, j);
    // Zero vector used as the high half when widening bytes to 16 bits.
    const __m128i zero = _mm_setzero_si128();
    // The bound comes from the byte image being traversed. The furthest byte
    // read is x = i + 8, so i < Width() - 8 keeps every load inside the row;
    // the right-hand remainder is deliberately not processed (benchmark only).
    for (int i = 1; i < img1b.Width() - 8; i += 8)
    {
        // Diagonal neighbours (weight 1).
        __m128i v1 = _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(l0 - 1)), zero);
        __m128i v2 = _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(l0 + 1)), zero);
        __m128i v3 = _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(l2 - 1)), zero);
        __m128i v4 = _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(l2 + 1)), zero);
        __m128i q1 = _mm_add_epi16(_mm_add_epi16(v1, v2), _mm_add_epi16(v3, v4));
        // Edge neighbours (weight 2).
        __m128i v5 = _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(l0)), zero);
        __m128i v6 = _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(l2)), zero);
        __m128i v7 = _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(l1 - 1)), zero);
        __m128i v8 = _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(l1 + 1)), zero);
        __m128i q2 = _mm_add_epi16(_mm_add_epi16(v5, v6), _mm_add_epi16(v7, v8));
        // Centre pixel (weight 4).
        __m128i q4 = _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(l1)), zero);
        q2 = _mm_add_epi16(q2, q2); // 2 * edges
        q4 = _mm_add_epi16(q4, q4); // 2 * centre, doubled again below
        __m128i res = _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(q4, q4), q2), q1), 4), zero);
        // Only the low 8 bytes of the packed vector carry results.
        _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), res);
        l0 += 8;
        l1 += 8;
        l2 += 8;
        dst += 8;
    }
}
// Sanity check: the blur of the 3*x + y ramp at (10, 10) is the ramp value 40.
check(img2b(10, 10) == 40);
long long t5 = _Query_perf_counter();
printf("float: %lld us\n", (t1 - t0) * 1000000 / freq);
printf("byte: %lld us\n", (t2 - t1) * 1000000 / freq);
printf("float sse: %lld us\n", (t3 - t2) * 1000000 / freq);
printf("float avx: %lld us\n", (t4 - t3) * 1000000 / freq);
printf("byte sse: %lld us\n", (t5 - t4) * 1000000 / freq);
getchar();
return 0;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment