// Source: GitHub gist Const-me/c267b378abdf7f50a3d0fa69953db5b4 (last active July 21, 2020 13:12).
// Note: GitHub flagged this file as containing bidirectional Unicode text that may be interpreted
// or compiled differently than it appears; review it in an editor that reveals hidden Unicode characters.
// Count newlines, 32 characters at a time.
//
// ascii:    the 32 input bytes to scan.
// counters: running totals, updated in place. Each of the four 64-bit lanes is
//           incremented by the number of '\n' bytes found in the corresponding
//           8-byte group of `ascii`; the lanes must be summed afterwards to get
//           the overall count.
__forceinline void countNewlines( __m256i ascii, __m256i& counters )
{
	// Find '\n' characters, 32 bytes at a time: matching bytes become 0xFF, all others 0x00
	__m256i newlines = _mm256_cmpeq_epi8( ascii, _mm256_set1_epi8( '\n' ) );
	// Convert 0xFF to 1
	newlines = _mm256_and_si256( newlines, _mm256_set1_epi8( 1 ) );
	// Accumulate the counters.
	// The key part is the _mm256_sad_epu8 instruction, here's a copy-paste from the docs:
	// Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b",
	// then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers,
	// and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".
	// The second operand is all zeros, so each "difference" is just the 0/1 byte itself,
	// and every 64-bit element of the result is the newline count of its 8 input bytes.
	counters = _mm256_add_epi64( counters, _mm256_sad_epu8( newlines, _mm256_setzero_si256() ) );
}
// Horizontal sum of 64-bit integer lanes.
// You should only call this once, after the complete file is processed.
//
// counters: vector holding four 64-bit integer lanes.
// Returns the sum of the four lanes as a single 64-bit integer.
int64_t horizontalAdd( __m256i counters )
{
	// Add 4 lanes into 2: the low 128-bit half plus the high 128-bit half
	const __m128i lowHalf = _mm256_castsi256_si128( counters );
	const __m128i highHalf = _mm256_extracti128_si256( counters, 1 );
	__m128i tmp = _mm_add_epi64( lowHalf, highHalf );
	// Add 2 lanes into 1: the unpack copies the upper lane into the lower position,
	// so after the addition the lowest lane holds the total
	tmp = _mm_add_epi64( tmp, _mm_unpackhi_epi64( tmp, tmp ) );
	// Return the result, i.e. the lowest 64-bit lane
	return _mm_cvtsi128_si64( tmp );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment