This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <algorithm> | |
| #include <chrono> | |
| #include <cmath> | |
| #include <cstdint> | |
| #include <cstring> | |
| #include <filesystem> | |
| #include <fstream> | |
| #include <iostream> | |
| #include <memory> | |
| #include <random> | |
| #include <vector> | |
| #ifdef __ARM_NEON | |
| #include <arm_neon.h> | |
| #endif | |
| uint64_t nano() { | |
| return std::chrono::duration_cast<::std::chrono::nanoseconds>( | |
| std::chrono::steady_clock::now().time_since_epoch()) | |
| .count(); | |
| } | |
| template <typename PROCEDURE> | |
| double bench(PROCEDURE f, uint64_t threshold = 200'000'000) { | |
| uint64_t start = nano(); | |
| uint64_t finish = start; | |
| size_t count{0}; | |
| for (; finish - start < threshold;) { | |
| count += f(); | |
| finish = nano(); | |
| } | |
| return double(finish - start) / count; | |
| } | |
| void to_string_backlinear(uint64_t x, char *out) { | |
| for (int z = 0; z < 16; z++) { | |
| out[15 - z] = (x % 10) + 0x30; | |
| x /= 10; | |
| } | |
| } | |
| void to_string_linear(uint64_t x, char *out) { | |
| out[0] = x / 1000000000000000 + 0x30; | |
| x %= 1000000000000000; | |
| out[1] = x / 100000000000000 + 0x30; | |
| x %= 100000000000000; | |
| out[2] = x / 10000000000000 + 0x30; | |
| x %= 10000000000000; | |
| out[3] = x / 1000000000000 + 0x30; | |
| x %= 1000000000000; | |
| out[4] = x / 100000000000 + 0x30; | |
| x %= 100000000000; | |
| out[5] = x / 10000000000 + 0x30; | |
| x %= 10000000000; | |
| out[6] = x / 1000000000 + 0x30; | |
| x %= 1000000000; | |
| out[7] = x / 100000000 + 0x30; | |
| x %= 100000000; | |
| out[8] = x / 10000000 + 0x30; | |
| x %= 10000000; | |
| out[9] = x / 1000000 + 0x30; | |
| x %= 1000000; | |
| out[10] = x / 100000 + 0x30; | |
| x %= 100000; | |
| out[11] = x / 10000 + 0x30; | |
| x %= 10000; | |
| out[12] = x / 1000 + 0x30; | |
| x %= 1000; | |
| out[13] = x / 100 + 0x30; | |
| x %= 100; | |
| out[14] = x / 10 + 0x30; | |
| x %= 10; | |
| out[15] = x + 0x30; | |
| } | |
| // credit: Paul Khuong | |
| uint64_t encode_ten_thousands(uint64_t hi, uint64_t lo) { | |
| uint64_t merged = hi | (lo << 32); | |
| /* Truncate division by 100: 10486 / 2**20 ~= 1/100. */ | |
| uint64_t top = ((merged * 10486ULL) >> 20) & ((0x7FULL << 32) | 0x7FULL); | |
| /* Trailing 2 digits in the 1e4 chunks. */ | |
| uint64_t bot = merged - 100ULL * top; | |
| uint64_t hundreds; | |
| uint64_t tens; | |
| /* | |
| * We now have 4 radix-100 digits in little-endian order, each | |
| * in its own 16 bit area. | |
| */ | |
| hundreds = (bot << 16) + top; | |
| /* Divide and mod by 10 all 4 radix-100 digits in parallel. */ | |
| tens = (hundreds * 103ULL) >> 10; | |
| tens &= (0xFULL << 48) | (0xFULL << 32) | (0xFULL << 16) | 0xFULL; | |
| tens += (hundreds - 10ULL * tens) << 8; | |
| return tens; | |
| } | |
| uint64_t encode_ten_thousands2(uint64_t hi, uint64_t lo) { | |
| uint64_t merged = hi | (lo << 32); | |
| /* Truncate division by 100: 10486 / 2**20 ~= 1/100. */ | |
| uint64_t top = ((merged * 0x147b) >> 19) & ((0x7FULL << 32) | 0x7FULL); | |
| /* Trailing 2 digits in the 1e4 chunks. */ | |
| uint64_t bot = merged - 100ULL * top; | |
| uint64_t hundreds; | |
| uint64_t tens; | |
| hundreds = (bot << 16) + top; | |
| /* Divide and mod by 10 all 4 radix-100 digits in parallel. */ | |
| tens = (hundreds * 103ULL) >> 10; | |
| tens &= (0xFULL << 48) | (0xFULL << 32) | (0xFULL << 16) | 0xFULL; | |
| tens += (hundreds - 10ULL * tens) << 8; | |
| return tens; | |
| } | |
| void to_string_khuong(uint64_t x, char *out) { | |
| uint64_t top = x / 100000000; | |
| uint64_t bottom = x % 100000000; | |
| uint64_t first = | |
| 0x3030303030303030 + encode_ten_thousands(top / 10000, top % 10000); | |
| memcpy(out, &first, sizeof(first)); | |
| uint64_t second = | |
| 0x3030303030303030 + encode_ten_thousands(bottom / 10000, bottom % 10000); | |
| memcpy(out + 8, &second, sizeof(second)); | |
| } | |
| #ifdef __ARM_NEON | |
| // umulh x11, x10, x8 | |
| // lsr x11, x11, #26 | |
| // madd w10, w11, w9, w10 | |
| // bfi x11, x10, #32, #32 | |
| // fmov d16, x11 | |
| // sqdmulh.2s v17, v16, v0 | |
| // ushr.2s v17, v17, #0x9 | |
| // mla.2s v16, v17, v1 | |
| // zip1.8h v16, v16, v4 | |
| // sqdmulh.4s v17, v16, v2 | |
| // mla.4s v16, v17, v3 | |
| // sqdmulh.8h v17, v16, v5 | |
| // add.16b v16, v16, v6 | |
| // mla.8h v16, v17, v7 | |
| // rev64.16b v16, v16 | |
| void to_string_neon(uint64_t v, char *out) { | |
| // Equivalent to hi = v / 100000000, lo = v % 100000000, but for some reason | |
| // clang emits a UDIV. | |
| uint32_t hi = ((__uint128_t)v * 0xabcc77118461cefd) >> 90; | |
| uint32_t lo = v - hi * 100000000; | |
| uint64x1_t hundredmillions = { hi | ((uint64_t)lo << 32) }; | |
| int32x2_t high_10000 = vshr_n_u32(vqdmulh_s32(hundredmillions, vdup_n_s32(0x68db8bb)), 9); | |
| int32x2_t tenthousands = vmla_s32(hundredmillions, high_10000, vdup_n_s32(-10000 + 0x10000)); | |
| // Equivalent to `extended = vshll_n_u16(tenthousands, 0)`, but clang can see through | |
| // that and breaks the subsequent MLA into UADDW + MUL. | |
| int32x2_t zero = { 0, 0 }; | |
| int32x4_t extended = vzip1q_u16(vcombine_u16(tenthousands, zero), vcombine_u16(zero, zero)); | |
| int32x4_t high_100 = vqdmulhq_s32(extended, vdupq_n_s32(0x147b000)); | |
| int32x4_t hundreds = vmlaq_s32(extended, high_100, vdupq_n_s32(-100 + 0x10000)); | |
| int16x8_t high_10 = vqdmulhq_s16(hundreds, vdupq_n_s16(0xce0)); | |
| int16x8_t digits = vmlaq_s16(vaddq_s8(hundreds, vdupq_n_s8('0')), high_10, vdupq_n_s16(-10 + 0x100)); | |
| int8x16_t result = vrev64q_u8(digits); | |
| memcpy(out, &result, sizeof(result)); | |
| } | |
| #endif | |
| // take a 16-digit integer, < 10000000000000000, | |
| // and write it out. | |
| void to_string_tree(uint64_t x, char *out) { | |
| uint64_t top = x / 100000000; | |
| uint64_t bottom = x % 100000000; | |
| // | |
| uint64_t toptop = top / 10000; | |
| uint64_t topbottom = top % 10000; | |
| uint64_t bottomtop = bottom / 10000; | |
| uint64_t bottombottom = bottom % 10000; | |
| // | |
| uint64_t toptoptop = toptop / 100; | |
| uint64_t toptopbottom = toptop % 100; | |
| uint64_t topbottomtop = topbottom / 100; | |
| uint64_t topbottombottom = topbottom % 100; | |
| uint64_t bottomtoptop = bottomtop / 100; | |
| uint64_t bottomtopbottom = bottomtop % 100; | |
| uint64_t bottombottomtop = bottombottom / 100; | |
| uint64_t bottombottombottom = bottombottom % 100; | |
| // | |
| out[0] = toptoptop / 10 + 0x30; | |
| out[1] = toptoptop % 10 + 0x30; | |
| out[2] = toptopbottom / 10 + 0x30; | |
| out[3] = toptopbottom % 10 + 0x30; | |
| out[4] = topbottomtop / 10 + 0x30; | |
| out[5] = topbottomtop % 10 + 0x30; | |
| out[6] = topbottombottom / 10 + 0x30; | |
| out[7] = topbottombottom % 10 + 0x30; | |
| out[8] = bottomtoptop / 10 + 0x30; | |
| out[9] = bottomtoptop % 10 + 0x30; | |
| out[10] = bottomtopbottom / 10 + 0x30; | |
| out[11] = bottomtopbottom % 10 + 0x30; | |
| out[12] = bottombottomtop / 10 + 0x30; | |
| out[13] = bottombottomtop % 10 + 0x30; | |
| out[14] = bottombottombottom / 10 + 0x30; | |
| out[15] = bottombottombottom % 10 + 0x30; | |
| } | |
| void write_tenthousand(uint64_t z, char *out) { | |
| z = 429538 * z; | |
| out[0] = 0x30 + ((z * 10) >> 32); | |
| z = (z * 10) & 0xffffffff; | |
| out[1] = 0x30 + ((z * 10) >> 32); | |
| z = (z * 10) & 0xffffffff; | |
| out[2] = 0x30 + ((z * 10) >> 32); | |
| z = (z * 10) & 0xffffffff; | |
| out[3] = 0x30 + ((z * 10) >> 32); | |
| } | |
| void to_string_tree_str(uint64_t x, char *out) { | |
| uint64_t top = x / 100000000; | |
| uint64_t bottom = x % 100000000; | |
| // | |
| uint64_t toptop = top / 10000; | |
| uint64_t topbottom = top % 10000; | |
| uint64_t bottomtop = bottom / 10000; | |
| uint64_t bottombottom = bottom % 10000; | |
| // | |
| write_tenthousand(toptop, out); | |
| write_tenthousand(topbottom, out + 4); | |
| write_tenthousand(bottomtop, out + 8); | |
| write_tenthousand(bottombottom, out + 12); | |
| } | |
| void to_string_tree_table(uint64_t x, char *out) { | |
| static const char table[200] = { | |
| 0x30, 0x30, 0x30, 0x31, 0x30, 0x32, 0x30, 0x33, 0x30, 0x34, 0x30, 0x35, | |
| 0x30, 0x36, 0x30, 0x37, 0x30, 0x38, 0x30, 0x39, 0x31, 0x30, 0x31, 0x31, | |
| 0x31, 0x32, 0x31, 0x33, 0x31, 0x34, 0x31, 0x35, 0x31, 0x36, 0x31, 0x37, | |
| 0x31, 0x38, 0x31, 0x39, 0x32, 0x30, 0x32, 0x31, 0x32, 0x32, 0x32, 0x33, | |
| 0x32, 0x34, 0x32, 0x35, 0x32, 0x36, 0x32, 0x37, 0x32, 0x38, 0x32, 0x39, | |
| 0x33, 0x30, 0x33, 0x31, 0x33, 0x32, 0x33, 0x33, 0x33, 0x34, 0x33, 0x35, | |
| 0x33, 0x36, 0x33, 0x37, 0x33, 0x38, 0x33, 0x39, 0x34, 0x30, 0x34, 0x31, | |
| 0x34, 0x32, 0x34, 0x33, 0x34, 0x34, 0x34, 0x35, 0x34, 0x36, 0x34, 0x37, | |
| 0x34, 0x38, 0x34, 0x39, 0x35, 0x30, 0x35, 0x31, 0x35, 0x32, 0x35, 0x33, | |
| 0x35, 0x34, 0x35, 0x35, 0x35, 0x36, 0x35, 0x37, 0x35, 0x38, 0x35, 0x39, | |
| 0x36, 0x30, 0x36, 0x31, 0x36, 0x32, 0x36, 0x33, 0x36, 0x34, 0x36, 0x35, | |
| 0x36, 0x36, 0x36, 0x37, 0x36, 0x38, 0x36, 0x39, 0x37, 0x30, 0x37, 0x31, | |
| 0x37, 0x32, 0x37, 0x33, 0x37, 0x34, 0x37, 0x35, 0x37, 0x36, 0x37, 0x37, | |
| 0x37, 0x38, 0x37, 0x39, 0x38, 0x30, 0x38, 0x31, 0x38, 0x32, 0x38, 0x33, | |
| 0x38, 0x34, 0x38, 0x35, 0x38, 0x36, 0x38, 0x37, 0x38, 0x38, 0x38, 0x39, | |
| 0x39, 0x30, 0x39, 0x31, 0x39, 0x32, 0x39, 0x33, 0x39, 0x34, 0x39, 0x35, | |
| 0x39, 0x36, 0x39, 0x37, 0x39, 0x38, 0x39, 0x39, | |
| }; | |
| uint64_t top = x / 100000000; | |
| uint64_t bottom = x % 100000000; | |
| // | |
| uint64_t toptop = top / 10000; | |
| uint64_t topbottom = top % 10000; | |
| uint64_t bottomtop = bottom / 10000; | |
| uint64_t bottombottom = bottom % 10000; | |
| // | |
| uint64_t toptoptop = toptop / 100; | |
| uint64_t toptopbottom = toptop % 100; | |
| uint64_t topbottomtop = topbottom / 100; | |
| uint64_t topbottombottom = topbottom % 100; | |
| uint64_t bottomtoptop = bottomtop / 100; | |
| uint64_t bottomtopbottom = bottomtop % 100; | |
| uint64_t bottombottomtop = bottombottom / 100; | |
| uint64_t bottombottombottom = bottombottom % 100; | |
| // | |
| memcpy(out, &table[2 * toptoptop], 2); | |
| memcpy(out + 2, &table[2 * toptopbottom], 2); | |
| memcpy(out + 4, &table[2 * topbottomtop], 2); | |
| memcpy(out + 6, &table[2 * topbottombottom], 2); | |
| memcpy(out + 8, &table[2 * bottomtoptop], 2); | |
| memcpy(out + 10, &table[2 * bottomtopbottom], 2); | |
| memcpy(out + 12, &table[2 * bottombottomtop], 2); | |
| memcpy(out + 14, &table[2 * bottombottombottom], 2); | |
| } | |
| // http://www.cs.uiowa.edu/~jones/bcd/decimal.html | |
| /* | |
| void putdec( uint64_t n, char *out ) { | |
| uint32_t d4, d3, d2, d1, d0, q; | |
| d0 = n & 0xFFFF; | |
| d1 = (n>>16) & 0xFFFF; | |
| d2 = (n>>32) & 0xFFFF; | |
| d3 = (n>>48) & 0xFFFF; | |
| d0 = 656 * d3 + 7296 * d2 + 5536 * d1 + d0; | |
| q = d0 / 10000; | |
| d0 = d0 % 10000; | |
| d1 = q + 7671 * d3 + 9496 * d2 + 6 * d1; | |
| q = d1 / 10000; | |
| d1 = d1 % 10000; | |
| d2 = q + 4749 * d3 + 42 * d2; | |
| q = d2 / 10000; | |
| d2 = d2 % 10000; | |
| d3 = q + 281 * d3; | |
| q = d3 / 10000; | |
| d3 = d3 % 10000; | |
| d4 = q; | |
| printf( "%4.4u", d4 ); | |
| printf( "%4.4u", d3 ); | |
| printf( "%4.4u", d2 ); | |
| printf( "%4.4u", d1 ); | |
| printf( "%4.4u", d0 ); | |
| } | |
| */ | |
| #ifdef __SSE2__ | |
| // mula | |
| #include <x86intrin.h> | |
| void to_string_sse2(uint64_t v, char *out) { | |
| // v is 16-digit number = abcdefghijklmnop | |
| const __m128i div_10000 = _mm_set1_epi32(0xd1b71759); | |
| const __m128i mul_10000 = _mm_set1_epi32(10000); | |
| const int div_10000_shift = 45; | |
| const __m128i div_100 = _mm_set1_epi16(0x147b); | |
| const __m128i mul_100 = _mm_set1_epi16(100); | |
| const int div_100_shift = 3; | |
| const __m128i div_10 = _mm_set1_epi16(0x199a); | |
| const __m128i mul_10 = _mm_set1_epi16(10); | |
| const __m128i ascii0 = _mm_set1_epi8('0'); | |
| // can't be easliy done in SSE | |
| const uint32_t a = v / 100000000; // 8-digit number: abcdefgh | |
| const uint32_t b = v % 100000000; // 8-digit number: ijklmnop | |
| // [ 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1 | |
| // | 0 ] | |
| // x = [ 0 | ijklmnop | 0 | abcdefgh ] | |
| __m128i x = _mm_set_epi64x(b, a); | |
| // x div 10^4 = [ 0 | ijkl | 0 | abcd ] | |
| __m128i x_div_10000; | |
| x_div_10000 = _mm_mul_epu32(x, div_10000); | |
| x_div_10000 = _mm_srli_epi64(x_div_10000, div_10000_shift); | |
| // x mod 10^4 = [ 0 | mnop | 0 | efgh ] | |
| __m128i x_mod_10000; | |
| x_mod_10000 = _mm_mul_epu32(x_div_10000, mul_10000); | |
| x_mod_10000 = _mm_sub_epi32(x, x_mod_10000); | |
| // y = [ mnop | ijkl | efgh | abcd ] | |
| __m128i y = _mm_or_si128(x_div_10000, _mm_slli_epi64(x_mod_10000, 32)); | |
| // y_div_100 = [ 0 | mn | 0 | ij | 0 | ef | 0 | ab | |
| // ] | |
| __m128i y_div_100; | |
| y_div_100 = _mm_mulhi_epu16(y, div_100); | |
| y_div_100 = _mm_srli_epi16(y_div_100, div_100_shift); | |
| // y_mod_100 = [ 0 | op | 0 | kl | 0 | gh | 0 | cd | |
| // ] | |
| __m128i y_mod_100; | |
| y_mod_100 = _mm_mullo_epi16(y_div_100, mul_100); | |
| y_mod_100 = _mm_sub_epi16(y, y_mod_100); | |
| // z = [ mn | op | ij | kl | ef | gh | ab | cd | |
| // ] | |
| __m128i z = _mm_or_si128(y_div_100, _mm_slli_epi32(y_mod_100, 16)); | |
| // z_div_10 = [ 0 | m | 0 | o | 0 | i | 0 | k | 0 | e | 0 | g | 0 | a | 0 | |
| // | c ] | |
| __m128i z_div_10 = _mm_mulhi_epu16(z, div_10); | |
| // z_mod_10 = [ 0 | n | 0 | p | 0 | j | 0 | l | 0 | f | 0 | h | 0 | b | 0 | |
| // | d ] | |
| __m128i z_mod_10; | |
| z_mod_10 = _mm_mullo_epi16(z_div_10, mul_10); | |
| z_mod_10 = _mm_sub_epi16(z, z_mod_10); | |
| // interleave z_mod_10 and z_div_10 - | |
| // tmp = [ m | n | o | p | i | j | k | l | e | f | g | h | a | b | c | |
| // | d ] | |
| __m128i tmp = _mm_or_si128(z_div_10, _mm_slli_epi16(z_mod_10, 8)); | |
| // convert to ascii | |
| tmp = _mm_add_epi8(tmp, ascii0); | |
| // and save result | |
| _mm_storeu_si128((__m128i *)out, tmp); | |
| } | |
| #endif | |
| #ifdef __SSE4_1__ | |
| void to_string_sse2__pshufb(uint64_t v, char *out) { | |
| // v is 16-digit number = abcdefghijklmnop | |
| const __m128i div_10000 = _mm_set1_epi32(0xd1b71759); | |
| const __m128i mul_10000 = _mm_set1_epi32(10000); | |
| const int div_10000_shift = 45; | |
| const __m128i div_100 = _mm_set1_epi16(0x147b); | |
| const __m128i mul_100 = _mm_set1_epi16(100); | |
| const int div_100_shift = 3; | |
| const __m128i div_10 = _mm_set1_epi16(0x199a); | |
| const __m128i mul_10 = _mm_set1_epi16(10); | |
| const __m128i ascii0 = _mm_set1_epi8('0'); | |
| // can't be easliy done in SSE | |
| const uint32_t a = v / 100000000; // 8-digit number: abcdefgh | |
| const uint32_t b = v % 100000000; // 8-digit number: ijklmnop | |
| // [ 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1 | |
| // | 0 ] | |
| // x = [ 0 | ijklmnop | 0 | abcdefgh ] | |
| __m128i x = _mm_set_epi64x(b, a); | |
| // x div 10^4 = [ 0 | ijkl | 0 | abcd ] | |
| __m128i x_div_10000; | |
| x_div_10000 = _mm_mul_epu32(x, div_10000); | |
| x_div_10000 = _mm_srli_epi64(x_div_10000, div_10000_shift); | |
| // x mod 10^4 = [ 0 | mnop | 0 | efgh ] | |
| __m128i x_mod_10000; | |
| x_mod_10000 = _mm_mul_epu32(x_div_10000, mul_10000); | |
| x_mod_10000 = _mm_sub_epi32(x, x_mod_10000); | |
| // y = [ mnop | ijkl | efgh | abcd ] | |
| __m128i y = _mm_or_si128(x_div_10000, _mm_slli_epi64(x_mod_10000, 32)); | |
| // y_div_100 = [ 0 | mn | 0 | ij | 0 | ef | 0 | ab | |
| // ] | |
| __m128i y_div_100; | |
| y_div_100 = _mm_mulhi_epu16(y, div_100); | |
| y_div_100 = _mm_srli_epi16(y_div_100, div_100_shift); | |
| // y_mod_100 = [ 0 | op | 0 | kl | 0 | gh | 0 | cd | |
| // ] | |
| __m128i y_mod_100; | |
| y_mod_100 = _mm_mullo_epi16(y_div_100, mul_100); | |
| y_mod_100 = _mm_sub_epi16(y, y_mod_100); | |
| // z = [ mn | ij | ef | ab | op | kl | gh | cd | |
| // ] | |
| __m128i z = _mm_packus_epi32(y_div_100, y_mod_100); | |
| // z_div_10 = [ 0 | m | 0 | i | 0 | e | 0 | a | 0 | o | 0 | k | 0 | g | 0 | |
| // | c ] | |
| __m128i z_div_10 = _mm_mulhi_epu16(z, div_10); | |
| // z_mod_10 = [ 0 | n | 0 | j | 0 | f | 0 | b | 0 | p | 0 | l | 0 | h | 0 | |
| // | d ] | |
| __m128i z_mod_10; | |
| z_mod_10 = _mm_mullo_epi16(z_div_10, mul_10); | |
| z_mod_10 = _mm_sub_epi16(z, z_mod_10); | |
| // interleave z_mod_10 and z_div_10 - | |
| // tmp = [ m | i | e | a | o | k | g | c | n | j | f | b | p | l | h | |
| // | d ] | |
| __m128i tmp = _mm_packus_epi16(z_div_10, z_mod_10); | |
| const __m128i reorder = | |
| _mm_set_epi8(15, 7, 11, 3, 14, 6, 10, 2, 13, 5, 9, 1, 12, 4, 8, 0); | |
| tmp = _mm_shuffle_epi8(tmp, reorder); | |
| // convert to ascii | |
| tmp = _mm_add_epi8(tmp, ascii0); | |
| // and save result | |
| _mm_storeu_si128((__m128i *)out, tmp); | |
| } | |
| #endif | |
| #ifdef __AVX2__ | |
| void to_string_avx2(uint64_t v, char *out) { | |
| // begin: copy of to_string_sse2 | |
| // v is 16-digit number = abcdefghijklmnop | |
| const __m128i div_10000 = _mm_set1_epi32(0xd1b71759); | |
| const __m128i mul_10000 = _mm_set1_epi32(10000); | |
| const int div_10000_shift = 45; | |
| // can't be easliy done in SSE | |
| const uint32_t a = v / 100000000; // 8-digit number: abcdefgh | |
| const uint32_t b = v % 100000000; // 8-digit number: ijklmnop | |
| // [ 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1 | |
| // | 0 ] | |
| // x = [ 0 | ijklmnop | 0 | abcdefgh ] | |
| __m128i x = _mm_set_epi64x(b, a); | |
| // x div 10^4 = [ 0 | ijkl | 0 | abcd ] | |
| __m128i x_div_10000; | |
| x_div_10000 = _mm_mul_epu32(x, div_10000); | |
| x_div_10000 = _mm_srli_epi64(x_div_10000, div_10000_shift); | |
| // x mod 10^4 = [ 0 | mnop | 0 | efgh ] | |
| __m128i x_mod_10000; | |
| x_mod_10000 = _mm_mul_epu32(x_div_10000, mul_10000); | |
| x_mod_10000 = _mm_sub_epi32(x, x_mod_10000); | |
| // y = [ mnop | ijkl | efgh | abcd ] | |
| __m128i y = _mm_or_si128(x_div_10000, _mm_slli_epi64(x_mod_10000, 32)); | |
| // end of copy, AVX2 code now | |
| #include "bigtable.h" | |
| const __m128i ascii = | |
| _mm_i32gather_epi32(reinterpret_cast<int const *>(&bigtable), y, 4); | |
| _mm_storeu_si128((__m128i *)out, ascii); | |
| } | |
| #endif | |
| void to_string_tree_bigtable(uint64_t x, char *out) { | |
| #include "bigtable.h" | |
| uint64_t top = x / 100000000; | |
| uint64_t bottom = x % 100000000; | |
| // | |
| uint64_t toptop = top / 10000; | |
| uint64_t topbottom = top % 10000; | |
| uint64_t bottomtop = bottom / 10000; | |
| uint64_t bottombottom = bottom % 10000; | |
| memcpy(out, &bigtable[4 * toptop], 4); | |
| memcpy(out + 4, &bigtable[4 * topbottom], 4); | |
| memcpy(out + 8, &bigtable[4 * bottomtop], 4); | |
| memcpy(out + 12, &bigtable[4 * bottombottom], 4); | |
| } | |
| int main() { | |
| std::vector<uint64_t> data; | |
| for (size_t i = 0; i < 50000; i++) { | |
| data.push_back(rand() % 10000000000000000); | |
| } | |
| char *buf = new char[data.size() * 16]; | |
| auto backlinear_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_backlinear(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| auto linear_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_linear(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| auto tree_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_tree(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| auto tree_table_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_tree_table(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| auto tree_str_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_tree_str(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| auto tree_bigtable_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_tree_bigtable(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| auto khuong_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_khuong(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| #ifdef __ARM_NEON | |
| auto neon_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_neon(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| #endif | |
| #ifdef __SSE2__ | |
| auto sse2_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_sse2(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| #endif | |
| #ifdef __SSE4_1__ | |
| auto sse2_approach_v2 = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_sse2__pshufb(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| #endif | |
| #ifdef __AVX2__ | |
| auto avx2_approach = [&data, buf]() -> size_t { | |
| char *b = buf; | |
| for (auto val : data) { | |
| to_string_avx2(val, b); | |
| b += 16; | |
| } | |
| return data.size(); | |
| }; | |
| #endif | |
| for (size_t i = 0; i < 3; i++) { | |
| std::cout << "khuong "; | |
| std::cout << bench(khuong_approach) << std::endl; | |
| #ifdef __ARM_NEON | |
| std::cout << "neon "; | |
| std::cout << bench(neon_approach) << std::endl; | |
| #endif | |
| std::cout << "backlinear "; | |
| std::cout << bench(backlinear_approach) << std::endl; | |
| std::cout << "linear "; | |
| std::cout << bench(linear_approach) << std::endl; | |
| std::cout << "tree "; | |
| std::cout << bench(tree_approach) << std::endl; | |
| std::cout << "treetst "; | |
| std::cout << bench(tree_str_approach) << std::endl; | |
| std::cout << "treest "; | |
| std::cout << bench(tree_table_approach) << std::endl; | |
| std::cout << "treebt "; | |
| std::cout << bench(tree_bigtable_approach) << std::endl; | |
| #ifdef __SSE2__ | |
| std::cout << "sse2 "; | |
| std::cout << bench(sse2_approach) << std::endl; | |
| #endif | |
| #ifdef __SSE4_1__ | |
| std::cout << "sse2(2) "; | |
| std::cout << bench(sse2_approach_v2) << std::endl; | |
| #endif | |
| #ifdef __AVX2__ | |
| std::cout << "avx2 "; | |
| std::cout << bench(avx2_approach) << std::endl; | |
| #endif | |
| std::cout << std::endl; | |
| } | |
| delete[] buf; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment