Skip to content

Instantly share code, notes, and snippets.

@umezawatakeshi
Last active November 28, 2021 17:00
Show Gist options
  • Save umezawatakeshi/407b1607b880c5c6a6257d80e61d8353 to your computer and use it in GitHub Desktop.
Save umezawatakeshi/407b1607b880c5c6a6257d80e61d8353 to your computer and use it in GitHub Desktop.
count UTF-8 codepoint
#include <functional>
#include <utility>
#include <algorithm>
#define NOMINMAX
#include <Windows.h>
#include <intrin.h>
#include <cstdlib>
const double clock_cycle = 3.4 * 1000 * 1000 * 1000; // Core i7-4770 @3.4GHz
class timecounter
{
private:
LARGE_INTEGER liStart, liStop;
static double freq;
public:
void start() { QueryPerformanceCounter(&liStart); }
void stop() { QueryPerformanceCounter(&liStop); }
double get() { return (double)(liStop.QuadPart - liStart.QuadPart) / freq; }
};
double timecounter::freq = []() {
LARGE_INTEGER li;
QueryPerformanceFrequency(&li);
return (double)li.QuadPart;
}();
#define FUNCNAME(fn) { #fn, fn }
size_t reference_implementation(const char* mem, size_t sz)
{
size_t ret = 0;
for (size_t i = 0; i < sz; ++i)
if ((mem[i] & 0x80) == 0x00 || (mem[i] & 0xc0) == 0xC0)
++ret;
return ret;
}
inline int32_t avx2_horizontal_sum_epi8(__m256i x)
{
__m256i sumhi = _mm256_unpackhi_epi8(x, _mm256_setzero_si256());
__m256i sumlo = _mm256_unpacklo_epi8(x, _mm256_setzero_si256());
__m256i sum16x16 = _mm256_add_epi16(sumhi, sumlo);
__m256i sum16x8 = _mm256_add_epi16(sum16x16, _mm256_permute2x128_si256(sum16x16, sum16x16, 1));
__m256i sum16x4 = _mm256_add_epi16(sum16x8, _mm256_shuffle_epi32(sum16x8, _MM_SHUFFLE(0, 0, 2, 3)));
uint64_t tmp = _mm256_extract_epi64(sum16x4, 0);
int32_t result = 0;
result += (tmp >> 0) & 0xffff;
result += (tmp >> 16) & 0xffff;
result += (tmp >> 32) & 0xffff;
result += (tmp >> 48) & 0xffff;
return result;
}
#ifdef __AVX512F__
inline int32_t avx512_horizontal_sum_epi8(__m512i x)
{
__m512i sumhi = _mm512_unpackhi_epi8(x, _mm512_setzero_si512());
__m512i sumlo = _mm512_unpacklo_epi8(x, _mm512_setzero_si512());
__m512i sum16x32 = _mm512_add_epi16(sumhi, sumlo);
__m256i sum16x16 = _mm256_add_epi16(_mm512_castsi512_si256(sum16x32), _mm512_extracti64x4_epi64(sum16x32, 1));
__m128i sum16x8 = _mm_add_epi16(_mm256_castsi256_si128(sum16x16), _mm256_extracti128_si256(sum16x16, 1));
__m128i sum16x4 = _mm_add_epi16(sum16x8, _mm_srli_si128(sum16x8, 8));
uint64_t tmp = _mm_cvtsi128_si64(sum16x4);
tmp += (tmp >> 32);
tmp += (tmp >> 16);
return tmp & 0xffff;
}
#endif
size_t avx_count_utf8_codepoint(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i + 31 < sz;) {
__m256i sum = _mm256_setzero_si256();
size_t j = 0;
for (; j < 255 * 32 && (i + 31) + j < sz; j += 32) {
const __m256i table = _mm256_setr_epi8(
1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
0, 0, 0, 0, // 0x8 .. 0xB
1, 1, 1, 1, // 0xC .. 0xF
1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
0, 0, 0, 0, // 0x8 .. 0xB
1, 1, 1, 1 // 0xC .. 0xF
);
__m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
s = _mm256_and_si256(_mm256_srli_epi16(s, 4), _mm256_set1_epi8(0x0F));
s = _mm256_shuffle_epi8(table, s);
sum = _mm256_add_epi8(sum, s);
}
i += j;
result += avx2_horizontal_sum_epi8(sum);
}
return result;
}
size_t avx_count_utf8_codepoint_loopend(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i < sz;) {
__m256i sum = _mm256_setzero_si256();
size_t j = 0;
size_t limit = std::min<size_t>(255 * 32, sz - i);
for (; j < limit; j += 32) {
const __m256i table = _mm256_setr_epi8(
1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
0, 0, 0, 0, // 0x8 .. 0xB
1, 1, 1, 1, // 0xC .. 0xF
1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7
0, 0, 0, 0, // 0x8 .. 0xB
1, 1, 1, 1 // 0xC .. 0xF
);
__m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
s = _mm256_and_si256(_mm256_srli_epi16(s, 4), _mm256_set1_epi8(0x0F));
s = _mm256_shuffle_epi8(table, s);
sum = _mm256_add_epi8(sum, s);
}
i += j;
result += avx2_horizontal_sum_epi8(sum);
}
return result;
}
size_t opt_innermost_content(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i + 31 < sz;) {
__m256i sum = _mm256_setzero_si256();
size_t j = 0;
for (; j < 255 * 32 && (i + 31) + j < sz; j += 32) {
__m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
sum = _mm256_sub_epi8(sum, _mm256_cmpgt_epi8(s, _mm256_set1_epi8(-0x41)));
}
i += j;
result += avx2_horizontal_sum_epi8(sum);
}
return result;
}
size_t opt_innermost_content_loopend(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i < sz;) {
__m256i sum = _mm256_setzero_si256();
size_t j = 0;
size_t limit = std::min<size_t>(255 * 32, sz - i);
for (; j < limit; j += 32) {
__m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j));
sum = _mm256_sub_epi8(sum, _mm256_cmpgt_epi8(s, _mm256_set1_epi8(-0x41)));
}
i += j;
result += avx2_horizontal_sum_epi8(sum);
}
return result;
}
#ifdef __AVX512F__
size_t avx512_cmpgt_popcnt(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i < sz; i += 64) {
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i));
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41));
result += _mm_popcnt_u64(m);
}
return result;
}
size_t avx512_cmpgt_popcnt_nestedloop(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i < sz; ) {
size_t sum = 0;
size_t j = 0;
size_t limit = std::min<size_t>(256 * 64, sz - i);
for (; j < limit; j += 64) {
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j));
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41));
sum += _mm_popcnt_u64(m);
}
i += j;
result += sum;
}
return result;
}
size_t avx512_cmpgt_movm2b_sub(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i < sz;) {
__m512i sum = _mm512_setzero_si512();
size_t j = 0;
size_t limit = std::min<size_t>(255 * 64, sz - i);
for (; j < limit; j += 64) {
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j));
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41));
sum = _mm512_sub_epi8(sum, _mm512_movm_epi8(m));
}
i += j;
result += avx512_horizontal_sum_epi8(sum);
}
return result;
}
size_t avx512_cmpgt_movdqu8_add(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i < sz;) {
__m512i sum = _mm512_setzero_si512();
size_t j = 0;
size_t limit = std::min<size_t>(255 * 64, sz - i);
for (; j < limit; j += 64) {
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j));
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41));
sum = _mm512_add_epi8(sum, _mm512_maskz_mov_epi8(m, _mm512_set1_epi8(1)));
}
i += j;
result += avx512_horizontal_sum_epi8(sum);
}
return result;
}
size_t avx512_cmpgt_maskadd(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i < sz;) {
__m512i sum = _mm512_setzero_si512();
size_t j = 0;
size_t limit = std::min<size_t>(255 * 64, sz - i);
for (; j < limit; j += 64) {
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j));
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41));
sum = _mm512_mask_add_epi8(sum, m, sum, _mm512_set1_epi8(1));
}
i += j;
result += avx512_horizontal_sum_epi8(sum);
}
return result;
}
size_t avx512_cmpgt_maskadd128(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i < sz;) {
__m512i sum0 = _mm512_setzero_si512();
__m512i sum1 = _mm512_setzero_si512();
size_t j = 0;
size_t limit = std::min<size_t>(255 * 128, sz - i);
for (; j < limit; j += 128) {
__m512i s0 = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j));
__m512i s1 = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j + 64));
__mmask64 m0 = _mm512_cmpgt_epi8_mask(s0, _mm512_set1_epi8(-0x41));
__mmask64 m1 = _mm512_cmpgt_epi8_mask(s1, _mm512_set1_epi8(-0x41));
sum0 = _mm512_mask_add_epi8(sum0, m0, sum0, _mm512_set1_epi8(1));
sum1 = _mm512_mask_add_epi8(sum1, m1, sum1, _mm512_set1_epi8(1));
}
i += j;
result += avx512_horizontal_sum_epi8(sum0);
result += avx512_horizontal_sum_epi8(sum1);
}
return result;
}
size_t avx512_pshufb_add(const char *p, size_t sz)
{
size_t result = 0;
for (size_t i = 0; i < sz;) {
__m512i sum = _mm512_setzero_si512();
size_t j = 0;
size_t limit = std::min<size_t>(255 * 64, sz - i);
for (; j < limit; j += 64) {
const __m512i table = _mm512_set_epi8(
1, 1, 1, 1, // 0xF .. 0xC
0, 0, 0, 0, // 0xB .. 0x8
1, 1, 1, 1, 1, 1, 1, 1, // 0x7 ..
1, 1, 1, 1, // 0xF .. 0xC
0, 0, 0, 0, // 0xB .. 0x8
1, 1, 1, 1, 1, 1, 1, 1, // 0x7 ..
1, 1, 1, 1, // 0xF .. 0xC
0, 0, 0, 0, // 0xB .. 0x8
1, 1, 1, 1, 1, 1, 1, 1, // 0x7 ..
1, 1, 1, 1, // 0xF .. 0xC
0, 0, 0, 0, // 0xB .. 0x8
1, 1, 1, 1, 1, 1, 1, 1 // 0x7 ..
);
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j));
s = _mm512_and_si512(_mm512_srli_epi16(s, 4), _mm512_set1_epi8(0x0F));
s = _mm512_shuffle_epi8(table, s);
sum = _mm512_add_epi8(sum, s);
}
i += j;
result += avx512_horizontal_sum_epi8(sum);
}
return result;
}
#endif
const std::pair<size_t, int> sizereps[] = {
{ 32 * 256 * 2, 1 },
#ifndef _DEBUG
// { 32 * 255 * 2, 10000000 },
{ 32 * 256 * 2, 10000000 },
// { 32 * 255 * 28, 1000000 },
{ 32 * 256 * 28, 1000000 },
// { 32 * 255 * 768, 40000 },
{ 32 * 256 * 768, 40000 },
// { 32 * 255 * 16384, 1000 },
{ 32 * 256 * 16384, 1000 },
#endif
};
/* const をつけると clang の最適化がすごい ※1 */ std::pair<const char*, size_t(*)(const char*, size_t)> funcs[] = {
// FUNCNAME(reference_implementation),
FUNCNAME(avx_count_utf8_codepoint),
FUNCNAME(avx_count_utf8_codepoint_loopend),
FUNCNAME(opt_innermost_content),
FUNCNAME(opt_innermost_content_loopend),
#ifdef __AVX512F__
FUNCNAME(avx512_cmpgt_popcnt),
FUNCNAME(avx512_cmpgt_popcnt_nestedloop),
FUNCNAME(avx512_cmpgt_movm2b_sub),
FUNCNAME(avx512_cmpgt_movdqu8_add),
FUNCNAME(avx512_cmpgt_maskadd),
FUNCNAME(avx512_cmpgt_maskadd128),
FUNCNAME(avx512_pshufb_add),
#endif
};
int main()
{
for (const auto [sz, rep] : sizereps)
{
printf("size %zu loop %d\n", sz, rep);
char* mem = (char*)_aligned_malloc(sz, 64);
for (size_t i = 0; i < sz; ++i)
mem[i] = rand();
const size_t ncodepoint = reference_implementation(mem, sz);
for (const auto[name, fn] : funcs)
{
timecounter tc;
size_t r;
printf(" %-32s: ", name);
tc.start();
for (int i = 0; i < rep; ++i)
r = fn(mem, sz); // ※1 のところで const を付けると clang がここの呼び出しが純粋関数になることを利用してループせずに1回だけの実行になってしまう
tc.stop();
if (r != ncodepoint)
{
printf("wrong answer!\n");
return 1;
}
printf("%10.3fus %5.1fGB/s %5.2fclk/32B\n",
tc.get() / rep * (1000 * 1000),
((double)sz) * rep / tc.get() / (1000 * 1000 * 1000),
tc.get() * clock_cycle / sz / rep * 32);
}
_aligned_free(mem);
}
}
@starboy-3
Copy link

looks like shit

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment