Skip to content

Instantly share code, notes, and snippets.

@daramkun
Created January 6, 2023 07:10
Show Gist options
  • Save daramkun/26fc51b3d42010b2b76a82d8feeaa165 to your computer and use it in GitHub Desktop.
Save daramkun/26fc51b3d42010b2b76a82d8feeaa165 to your computer and use it in GitHub Desktop.
#include <intrin.h>
#include <immintrin.h>
#include <emmintrin.h>
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <Windows.h>
double current_ticks()
{
LARGE_INTEGER freq, counter;
QueryPerformanceFrequency (&freq);
QueryPerformanceCounter (&counter);
return counter.QuadPart / static_cast<double>(freq.QuadPart);
}
// Converts a single-precision float to the bit pattern of an IEEE-754
// half-precision (binary16) value using only integer arithmetic.
//
// Behavior:
//   * Finite values in the normal half range are rounded to nearest, ties to
//     even (the same rounding mode the F16C variants in this file request).
//   * Values too large for half precision become signed infinity (0x7c00).
//   * Values too small for a normal half (including float subnormals and
//     +/-0) are flushed to signed zero; half subnormals are never produced.
//   * Infinity maps to signed infinity, NaN maps to a quiet NaN (0x7e00).
//
// Returns the 16 raw bits: 1 sign bit, 5 exponent bits, 10 mantissa bits.
uint16_t convert_f32_to_f16_plain(const float value)
{
    static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits wide");

    // Copy the bit pattern out with memcpy; reading the float through a
    // uint32_t pointer/reference would violate the aliasing rules.
    uint32_t bits;
    std::memcpy(&bits, &value, sizeof(bits));

    const uint32_t sign = (bits >> 16) & 0x8000u;      // sign moved to bit 15
    const uint32_t biased_exp = (bits >> 23) & 0xffu;  // 8-bit float exponent
    const uint32_t frac = bits & 0x007fffffu;          // 23-bit float mantissa

    // All-ones exponent: infinity (zero mantissa) or NaN (non-zero mantissa).
    if (biased_exp == 0xffu)
        return static_cast<uint16_t>(sign | (frac != 0 ? 0x7e00u : 0x7c00u));

    // Re-bias the exponent from float (127) to half (15).
    const int exp = static_cast<int>(biased_exp) - (0x7f - 0x0f);

    // Too large to represent: overflow to infinity.
    if (exp >= 31)
        return static_cast<uint16_t>(sign | 0x7c00u);

    // Too small for a normal half: flush to zero, keeping the sign.
    if (exp <= 0)
        return static_cast<uint16_t>(sign);

    // Keep the top 10 mantissa bits; the low 13 bits decide the rounding.
    uint32_t half = (static_cast<uint32_t>(exp) << 10) | (frac >> 13);
    const uint32_t dropped = frac & 0x1fffu;

    // Round to nearest, ties to even. A carry out of the mantissa correctly
    // bumps the exponent, and rolls over to infinity at the top of the range.
    if (dropped > 0x1000u || (dropped == 0x1000u && (half & 1u) != 0))
        ++half;

    return static_cast<uint16_t>(sign | half);
}
// Converts four floats to four half-precision bit patterns by calling the
// scalar plain converter once per element, in order.
//
// value  - four input floats.
// result - receives the four converted 16-bit patterns.
void convert_f32pair_to_f16pair_plain(const float value[4], uint16_t result[4])
{
    constexpr int lane_count = 4;

    const float* src = value;
    uint16_t* dst = result;
    for (const float* const end = value + lane_count; src != end; ++src, ++dst)
        *dst = convert_f32_to_f16_plain(*src);
}
// Converts one float to a half-precision bit pattern using the F16C
// conversion intrinsic with round-to-nearest and exceptions suppressed.
//
// The value is placed in lane 0 of a vector (other lanes zero), converted,
// and the low 16 bits of the result are returned.
uint16_t convert_f32_to_f16_f16c(const float value)
{
    const __m128i f16 = _mm_cvtps_ph(_mm_set_ss(value), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

    // Pull the low 32 bits straight into an integer and keep the low half.
    // This avoids _mm_store_si128 into a uint16_t stack array, which requires
    // 16-byte alignment that such an array does not guarantee.
    return static_cast<uint16_t>(_mm_cvtsi128_si32(f16));
}
// Converts four floats to four half-precision bit patterns with a single
// F16C conversion (round-to-nearest, exceptions suppressed).
//
// value  - four input floats; no particular alignment is required.
// result - receives the four converted 16-bit patterns (8 bytes); no
//          particular alignment is required.
void convert_f32pair_to_f16pair_f16c(const float value[4], uint16_t result[4])
{
    // Unaligned load: the caller's array is not guaranteed to be 16-byte
    // aligned, which the aligned _mm_load_ps would require.
    const __m128i f16 = _mm_cvtps_ph(_mm_loadu_ps(value), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

    // The four halves occupy the low 64 bits of the vector; store exactly
    // those 8 bytes into the output, with no temporary buffer or alignment
    // requirement.
    _mm_storel_epi64(reinterpret_cast<__m128i*>(result), f16);
}
// Micro-benchmark entry point: times loop_count calls of each converter
// (scalar plain, scalar F16C, 4-wide plain, 4-wide F16C) and prints the
// elapsed time reported by current_ticks() for each.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    constexpr int loop_count = 100000;

    // Inputs are read through volatile objects and results are written to a
    // volatile sink so an optimizing compiler cannot constant-fold the
    // conversions or delete the loops whose results would otherwise be unused.
    volatile float value = 10.10f;
    volatile float first_lane = 1.0f;
    volatile uint16_t sink = 0;

    // 16-byte aligned so the array is valid for aligned SSE loads as well.
    alignas(16) float values[4] = { 1, 2, 3, 4 };
    uint16_t result[4] = {};
    double last_tick, current_tick;

    printf("plain:\n");
    last_tick = current_ticks();
    for (auto i = 0; i < loop_count; ++i)
        sink = convert_f32_to_f16_plain(value);
    current_tick = current_ticks();
    printf(" %lf\n", current_tick - last_tick);

    printf("f16c:\n");
    last_tick = current_ticks();
    for (auto i = 0; i < loop_count; ++i)
        sink = convert_f32_to_f16_f16c(value);
    current_tick = current_ticks();
    printf(" %lf\n", current_tick - last_tick);

    printf("plainx4:\n");
    last_tick = current_ticks();
    for (auto i = 0; i < loop_count; ++i)
    {
        // Refresh one lane from a volatile source each iteration so the
        // conversion cannot be hoisted out of the loop.
        values[0] = first_lane;
        convert_f32pair_to_f16pair_plain(values, result);
        sink = result[0];
    }
    current_tick = current_ticks();
    printf(" %lf\n", current_tick - last_tick);

    printf("f16cx4:\n");
    last_tick = current_ticks();
    for (auto i = 0; i < loop_count; ++i)
    {
        values[0] = first_lane;
        convert_f32pair_to_f16pair_f16c(values, result);
        sink = result[0];
    }
    current_tick = current_ticks();
    printf(" %lf\n", current_tick - last_tick);

    return 0;
}
@daramkun
Copy link
Author

daramkun commented Jan 6, 2023

plain:
  0.001287
f16c:
  0.002467
plainx4:
  0.007847
f16cx4:
  0.002802

@daramkun
Copy link
Author

daramkun commented Jan 6, 2023

  • one for one: plain wins
  • x4 for x4: intrinsic wins

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment