neon-pmovmskb-interleaved
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
  const uint8x16_t bitmask1 = { 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10,
                                0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10};
  const uint8x16_t bitmask2 = { 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20,
                                0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20};
  const uint8x16_t bitmask3 = { 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40,
                                0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40};
  const uint8x16_t bitmask4 = { 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80,
                                0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80};
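The embedded preview stops at the constants. A plausible completion of the routine, assuming each input is ANDed against its own interleaved mask, the four results are ORed, and one pairwise add packs them so the four 16-bit masks land bit-interleaved in the 64-bit result:

  // hedged sketch, not necessarily the gist's actual tail: every comparison
  // result contributes a different bit of each nibble, so a single OR plus one
  // pairwise add folds everything into the low eight bytes
  uint8x16_t t0 = vandq_u8(p0, bitmask1);
  uint8x16_t t1 = vandq_u8(p1, bitmask2);
  uint8x16_t t2 = vandq_u8(p2, bitmask3);
  uint8x16_t t3 = vandq_u8(p3, bitmask4);
  uint8x16_t tmp = vorrq_u8(vorrq_u8(t0, t1), vorrq_u8(t2, t3));
  uint8x16_t sum = vpaddq_u8(tmp, tmp);
  return vgetq_lane_u64(vreinterpretq_u64_u8(sum), 0);
}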
neon-pmovmskb
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
  const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
  uint8x16_t t0 = vandq_u8(p0, bitmask);
  uint8x16_t t1 = vandq_u8(p1, bitmask);
  uint8x16_t t2 = vandq_u8(p2, bitmask);
  uint8x16_t t3 = vandq_u8(p3, bitmask);
  uint8x16_t sum0 = vpaddq_u8(t0, t1);
  uint8x16_t sum1 = vpaddq_u8(t2, t3);
  sum0 = vpaddq_u8(sum0, sum1);
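The preview cuts off one reduction short; the presumed tail, assuming the usual chain of pairwise adds followed by a 64-bit lane extract, would be:

  // hedged sketch of the missing lines: a final pairwise add leaves the whole
  // 64-bit movemask in the low eight bytes, which are then extracted as a lane
  sum0 = vpaddq_u8(sum0, sum0);
  return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
}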
View old-quote-detection
//////////////////////////////////////////////////////////////////////////////////////////// | |
// Step 2: detect insides of quote pairs | |
//////////////////////////////////////////////////////////////////////////////////////////// | |
u64 quote_bits = cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"')); | |
quote_bits = quote_bits & ~odd_ends; | |
dumpbits(quote_bits, "quote_bits"); | |
// pdep pattern is alternating 0 and 1 bits, starting with 0 or 1 depending on whether | |
// we're in a quote-pair from the previous iteration | |
u64 pdep_pattern = even_bits ^ prev_iter_inside_quote; |
quotes-fragment.cpp
really_inline uint64_t find_quote_mask_and_bits(
    __m256i input_lo, __m256i input_hi, uint64_t odd_ends,
    uint64_t &prev_iter_inside_quote, uint64_t &quote_bits) {
  quote_bits =
      cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
  quote_bits = quote_bits & ~odd_ends;
  uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
      _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
  quote_mask ^= prev_iter_inside_quote;
  // right shift of a signed value expected to be well-defined and standard
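The preview ends at that comment; a sketch of the presumed remainder, in which the top bit of quote_mask (whether this 64-byte block ends inside a string) is smeared across a word by an arithmetic right shift and carried into the next iteration:

  // hedged sketch of the cut-off tail
  prev_iter_inside_quote =
      static_cast<uint64_t>(static_cast<int64_t>(quote_mask) >> 63);
  return quote_mask;
}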
smh_throughput.cpp
template <typename T>
void match_multiple_smh(T & smh, std::vector<u8 *> & buffers, std::vector<size_t> & lengths,
                        std::vector<u32> & results) {
    u32 i = 0;
#ifndef NO_UNROLL
    for (; i+7 < buffers.size(); i+=8) {
        results[i+0] = smh.match(buffers[i+0], lengths[i+0]); LFENCE
        results[i+1] = smh.match(buffers[i+1], lengths[i+1]); LFENCE
        results[i+2] = smh.match(buffers[i+2], lengths[i+2]); LFENCE
        results[i+3] = smh.match(buffers[i+3], lengths[i+3]); LFENCE
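LFENCE here is presumably a preprocessor macro defined elsewhere in the harness; a hypothetical definition (not from the gist) that would sit above this function and let the fragment compile, expanding either to a serializing fence or to nothing:

// hypothetical: serialize each match for fence-separated runs, or compile the
// token away entirely when measuring pure throughput
#ifdef SERIALIZE_MATCHES
#include <emmintrin.h>
#define LFENCE _mm_lfence();
#else
#define LFENCE
#endif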
smh_latency.cpp
template <typename T>
void match_multiple_smh_latency_test(T & smh, std::vector<u8 *> & buffers, std::vector<size_t> & lengths,
                                     std::vector<u32> & results) {
    u32 i = 0;
    u32 tmp = 0;
#ifndef NO_UNROLL
    // NOTE: experimental code only. The addition of 'tmp' - the id of a possible match -
    // could take us RIGHT outside our buffer if we actually matched something. We don't match
    // anything in this particular run, but so it goes. Saner would be to build up an all-zero id vector.
    for (; i+7 < buffers.size(); i+=8) {
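The preview stops before the unrolled body; a sketch (names and exact form are assumptions) of the kind of statement the NOTE above describes, where the previous match id is folded into the next buffer pointer so the matches form a serial dependency chain:

        // hypothetical body lines: 'tmp' is zero when nothing matches, so the
        // address is unchanged, but the data dependency still serializes the calls
        tmp = results[i+0] = smh.match(buffers[i+0] + tmp, lengths[i+0]);
        tmp = results[i+1] = smh.match(buffers[i+1] + tmp, lengths[i+1]);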
gist:2b873b7b166cafadfba164e58323b73e
109 111 117 115:101 0 0 0| 0 0 0 0: 0 0 0 0|109 111 117 115:101 0 0 0| 0 0 0 0: 0 0 0 0| input | |
0 1 2 3: 4 128 0 1| 2 3 4 128: 0 1 2 128| 0 1 2 128:128 128 128 128|128 128 128 128:128 128 128 128| shuf_mask | |
109 111 117 115:101 0 109 111|117 115 101 0:109 111 117 0|109 111 117 0: 0 0 0 0| 0 0 0 0: 0 0 0 0| shuf result | |
109 111 111 115:101 255 109 111|117 115 101 255: 99 97 116 255|100 111 103 255:255 255 255 255|255 255 255 255:255 255 255 255| cmp_mask | |
255 255 0 255:255 0 255 255|255 255 255 0: 0 0 0 0| 0 255 0 0: 0 0 0 0| 0 0 0 0: 0 0 0 0| cmp result | |
11_11_11111______1______________________________________________ input to gpr-smh | |
_____1_____1___1___1____________________________________________ hi | |
1_____1_____1___1_______________________________________________ low | |
__111______11___11______________________________________________ after_add |
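The last four lines of the trace show the GPR confirm step. A hedged reconstruction of what they appear to compute (variable names are mine, not the gist's): "low" has a bit at the first byte of each literal's region, "hi" a bit one past its last required byte; adding "low" to the per-byte match bits lets a carry ripple through a fully-matched run and land exactly on the "hi" bit, so (input + low) & hi leaves one bit per literal whose every byte matched.

#include <stdint.h>
#include <stdio.h>

// hedged reconstruction of the gpr-smh step, using the bit positions from the
// dump above (bit strings in the trace are written least-significant-bit first)
int main() {
    uint64_t input = 0, low = 0, hi = 0;
    int in_pos[]  = {0, 1, 3, 4, 6, 7, 8, 9, 10, 17}; // per-byte match bits from SIMD stage
    int low_pos[] = {0, 6, 12, 16};                   // first byte of each literal's region
    int hi_pos[]  = {5, 11, 15, 19};                  // one past each literal's last byte
    for (unsigned i = 0; i < sizeof(in_pos) / sizeof(in_pos[0]); i++) input |= 1ULL << in_pos[i];
    for (unsigned i = 0; i < 4; i++) { low |= 1ULL << low_pos[i]; hi |= 1ULL << hi_pos[i]; }

    uint64_t after_add = input + low;    // carries propagate through complete runs only
    uint64_t confirmed = after_add & hi; // one bit per fully-matched literal
    printf("after_add = %#llx, confirmed = %#llx\n",
           (unsigned long long)after_add, (unsigned long long)confirmed);
    // with this trace only the second literal (bytes 6..10) fully matches: bit 11 is set
    return 0;
}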
smh good bits
struct SIMD_SMH_PART {
    m256 shuf_mask;
    m256 cmp_mask;
    m256 and_mask; // not yet used
    m256 sub_mask; // not yet used
    u32 doit(m256 d) {
        return _mm256_movemask_epi8(
            _mm256_cmpeq_epi8(_mm256_shuffle_epi8(d, shuf_mask),
                              cmp_mask));
    }
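A hedged usage sketch (the typedefs and literal layout are my assumptions, and it presumes the SIMD_SMH_PART definition above is in scope): shuf_mask gathers the input bytes a literal needs into fixed byte lanes, cmp_mask holds the literal itself, and the movemask sets one bit per byte that matched.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef __m256i  m256;   // assumed aliases for the gist's shorthand types
typedef uint32_t u32;
typedef uint8_t  u8;

// ... SIMD_SMH_PART as defined above ...

int main() {
    // check for the 5-byte literal "mouse" at offset 0: shuffle indices 0..4
    // gather the first five input bytes, 0x80 lanes shuffle in zero, and 0xFF
    // in cmp_mask keeps the unused lanes from ever comparing equal
    u8 shuf[32], cmp[32];
    memset(shuf, 0x80, sizeof(shuf));
    memset(cmp, 0xFF, sizeof(cmp));
    const char *lit = "mouse";
    for (int i = 0; i < 5; i++) { shuf[i] = (u8)i; cmp[i] = (u8)lit[i]; }

    SIMD_SMH_PART part;
    part.shuf_mask = _mm256_loadu_si256((const m256 *)shuf);
    part.cmp_mask  = _mm256_loadu_si256((const m256 *)cmp);

    u8 data[32];
    memcpy(data, "mouse in the house, plus padding", 32);
    u32 m = part.doit(_mm256_loadu_si256((const m256 *)data));
    printf("match bits: 0x%08x (low five bits set => literal present)\n", m);
    return 0;
}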
sheng-output
$ sudo nice --20 taskset -c 1 ./sheng | |
Sheng | |
0/1 1/1 2/1 3/1 4/1 5/1 6/1 7/1 8/1 9/1 | |
10/1 11/1 12/1 13/1 14/1 15/1 16/1 17/1 18/1 19/1 | |
20/1 21/1 22/1 23/1 24/1 25/2 26/3 27/4 28/5 29/5 | |
30/5 31/5 32/5 33/5 34/5 35/5 36/5 37/5 38/5 39/5 | |
40/5 41/5 42/5 43/5 44/5 45/5 46/5 47/5 48/5 49/5 | |
50/5 51/5 52/5 53/5 54/5 55/5 56/5 57/5 58/5 59/5 | |
60/6 61/7 62/8 63/9 64/10 65/1 66/1 67/1 68/1 |
sheng-gist
struct Sheng {
    typedef m128 State;
    m128 transitions[256];
    State start_state;
    Sheng(std::vector<std::tuple<u32, u32, u8>> & trans_vec, u8 start_state_, u8 default_state) {
        // fill all transitions with default state
        for (u32 i = 0; i < 256; ++i) {
            transitions[i] = _mm_set1_epi8(default_state);
        }
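The preview stops inside the constructor. A hedged sketch of the likely rest, assuming the tuples are ordered (from_state, to_state, byte) and that state ids stay below 16 so PSHUFB can index them; the step function is the core Sheng trick of one table shuffle per input byte:

        // hedged sketch: install each explicit transition by patching one byte
        // of the 16-lane transition row for that input character
        for (auto &t : trans_vec) {
            u8 row[16];
            _mm_storeu_si128((__m128i *)row, transitions[std::get<2>(t)]);
            row[std::get<0>(t)] = (u8)std::get<1>(t);
            transitions[std::get<2>(t)] = _mm_loadu_si128((const __m128i *)row);
        }
        start_state = _mm_set1_epi8(start_state_);
    }

    // hypothetical step: with the current state id broadcast in every byte lane,
    // PSHUFB against transitions[c] looks up the next state in one instruction
    State step(State s, u8 c) const {
        return _mm_shuffle_epi8(transitions[c], s);
    }
};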