Skip to content

Instantly share code, notes, and snippets.

Avatar

geofflangdale

View GitHub Profile
@geofflangdale
geofflangdale / neon-pmovmskb-interleaved
Created Apr 1, 2019
ARM NEON PMOVMSKB substitute that turns 4 _interleaved_ predicate results (128 bits each) into a single 64-bit value
View neon-pmovmskb-interleaved
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
const uint8x16_t bitmask1 = { 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10,
0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10};
const uint8x16_t bitmask2 = { 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20,
0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20};
const uint8x16_t bitmask3 = { 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40,
0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40};
const uint8x16_t bitmask4 = { 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80,
0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80};
@geofflangdale
geofflangdale / neon-pmovmskb
Last active Apr 1, 2019
ARM NEON PMOVMSKB substitute that turns 4 predicate results (128 bits each) into a single 64-bit value
View neon-pmovmskb
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
uint8x16_t t0 = vandq_u8(p0, bitmask);
uint8x16_t t1 = vandq_u8(p1, bitmask);
uint8x16_t t2 = vandq_u8(p2, bitmask);
uint8x16_t t3 = vandq_u8(p3, bitmask);
uint8x16_t sum0 = vpaddq_u8(t0, t1);
uint8x16_t sum1 = vpaddq_u8(t2, t3);
sum0 = vpaddq_u8(sum0, sum1);
@geofflangdale
geofflangdale / old-quote-detection
Created Mar 13, 2019
Ponderous older version of our "are we inside quotes" code
View old-quote-detection
////////////////////////////////////////////////////////////////////////////////////////////
// Step 2: detect insides of quote pairs
////////////////////////////////////////////////////////////////////////////////////////////
u64 quote_bits = cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
dumpbits(quote_bits, "quote_bits");
// pdep pattern is alternating 0 and 1 bits, starting with 0 or 1 depending on whether
// we're in a quote-pair from the previous iteration
u64 pdep_pattern = even_bits ^ prev_iter_inside_quote;
@geofflangdale
geofflangdale / quotes-fragment.cpp
Created Mar 6, 2019
Finding quote pairs with PCLMULQDQ
View quotes-fragment.cpp
really_inline uint64_t find_quote_mask_and_bits(
__m256i input_lo, __m256i input_hi, uint64_t odd_ends,
uint64_t &prev_iter_inside_quote, uint64_t &quote_bits) {
quote_bits =
cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
quote_bits = quote_bits & ~odd_ends;
uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
_mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFF), 0));
quote_mask ^= prev_iter_inside_quote;
// right shift of a signed value expected to be well-defined and standard
View smh_throughput.cpp
template <typename T>
void match_multiple_smh(T & smh, std::vector<u8 *> & buffers, std::vector<size_t> & lengths,
std::vector<u32> & results) {
u32 i = 0;
#ifndef NO_UNROLL
for (; i+7 < buffers.size(); i+=8) {
results[i+0] = smh.match(buffers[i+0], lengths[i+0]); LFENCE
results[i+1] = smh.match(buffers[i+1], lengths[i+1]); LFENCE
results[i+2] = smh.match(buffers[i+2], lengths[i+2]); LFENCE
results[i+3] = smh.match(buffers[i+3], lengths[i+3]); LFENCE
View smh_latency.cpp
template <typename T>
void match_multiple_smh_latency_test(T & smh, std::vector<u8 *> & buffers, std::vector<size_t> & lengths,
std::vector<u32> & results) {
u32 i = 0;
u32 tmp = 0;
#ifndef NO_UNROLL
// NOTE: experimental code only. Note that the addition of 'tmp' - being the id of a possible
// match - could take us RIGHT outside our buffer if we actually matched something. We aren't
// in this particular run, but so it goes. Saner would be to build up an all-zero id vector
for (; i+7 < buffers.size(); i+=8) {
View gist:2b873b7b166cafadfba164e58323b73e
109 111 117 115:101 0 0 0| 0 0 0 0: 0 0 0 0|109 111 117 115:101 0 0 0| 0 0 0 0: 0 0 0 0| input
0 1 2 3: 4 128 0 1| 2 3 4 128: 0 1 2 128| 0 1 2 128:128 128 128 128|128 128 128 128:128 128 128 128| shuf_mask
109 111 117 115:101 0 109 111|117 115 101 0:109 111 117 0|109 111 117 0: 0 0 0 0| 0 0 0 0: 0 0 0 0| shuf result
109 111 111 115:101 255 109 111|117 115 101 255: 99 97 116 255|100 111 103 255:255 255 255 255|255 255 255 255:255 255 255 255| cmp_mask
255 255 0 255:255 0 255 255|255 255 255 0: 0 0 0 0| 0 255 0 0: 0 0 0 0| 0 0 0 0: 0 0 0 0| cmp result
11_11_11111______1______________________________________________ input to gpr-smh
_____1_____1___1___1____________________________________________ hi
1_____1_____1___1_______________________________________________ low
__111______11___11______________________________________________ after_add
@geofflangdale
geofflangdale / smh good bits
Created May 30, 2018
SMH "good bits", without DEBUG code
View smh good bits
struct SIMD_SMH_PART {
m256 shuf_mask;
m256 cmp_mask;
m256 and_mask; // not yet used
m256 sub_mask; // not yet used
// Shuffles the bytes of `d` according to shuf_mask, compares the shuffled
// bytes for equality against cmp_mask, and returns the resulting byte
// movemask (one bit per byte lane of the comparison result).
u32 doit(m256 d) {
    const auto shuffled = _mm256_shuffle_epi8(d, shuf_mask);
    const auto matched = _mm256_cmpeq_epi8(shuffled, cmp_mask);
    return _mm256_movemask_epi8(matched);
}
View sheng-output
$ sudo nice --20 taskset -c 1 ./sheng
Sheng
0/1 1/1 2/1 3/1 4/1 5/1 6/1 7/1 8/1 9/1
10/1 11/1 12/1 13/1 14/1 15/1 16/1 17/1 18/1 19/1
20/1 21/1 22/1 23/1 24/1 25/2 26/3 27/4 28/5 29/5
30/5 31/5 32/5 33/5 34/5 35/5 36/5 37/5 38/5 39/5
40/5 41/5 42/5 43/5 44/5 45/5 46/5 47/5 48/5 49/5
50/5 51/5 52/5 53/5 54/5 55/5 56/5 57/5 58/5 59/5
60/6 61/7 62/8 63/9 64/10 65/1 66/1 67/1 68/1
View sheng-gist
struct Sheng {
typedef m128 State;
m128 transitions[256];
State start_state;
Sheng(std::vector<std::tuple<u32, u32, u8>> & trans_vec, u8 start_state_, u8 default_state) {
// fill all transitions with default state
for (u32 i = 0; i < 256; ++i) {
transitions[i] = _mm_set1_epi8(default_state);
}