Skip to content

Instantly share code, notes, and snippets.

@amirshukayev
Created May 15, 2023 20:34
Show Gist options
  • Save amirshukayev/9f603359e6c92730572ce8bebf220b7f to your computer and use it in GitHub Desktop.
Save amirshukayev/9f603359e6c92730572ce8bebf220b7f to your computer and use it in GitHub Desktop.
Count uint8
#include <iostream>
#include <unistd.h>
#include <sys/mman.h>
#include <cstdint>
#include <immintrin.h>
using namespace std;
// Counts the bytes equal to 127 in the file attached to standard input and
// prints the total, followed by a newline, on standard output.
//
// The file is memory-mapped and scanned 128 bytes per iteration using four
// 32-byte AVX2 compares; bytes left over after the last full 128-byte chunk
// are handled by a scalar loop.
//
// Returns 0 on success, 1 if standard input is not seekable or cannot be
// mapped (a diagnostic is written to standard error in that case).
int main() {
    // Seeking to the end reports the file size. Pipes and terminals cannot be
    // seeked, so lseek returns -1 for them; report that instead of printing 0.
    const off_t fsize = lseek(0, 0, SEEK_END);
    if (fsize < 0) {
        std::cerr << "error: standard input must be a seekable file\n";
        return 1;
    }

    // mmap rejects zero-length mappings; an empty file trivially has no matches.
    if (fsize == 0) {
        std::cout << 0 << '\n';
        return 0;
    }

    const size_t map_len = static_cast<size_t>(fsize);
    void* const mapping = mmap(0, map_len, PROT_READ, MAP_PRIVATE | MAP_POPULATE, 0, 0);
    if (mapping == MAP_FAILED) {
        std::cerr << "error: mmap of standard input failed\n";
        return 1;
    }
    const char* const buffer = static_cast<const char*>(mapping);

    // Every byte lane holds the value being searched for.
    const __m256i needle = _mm256_set1_epi8(127);

    // 64-bit so the total cannot wrap on inputs with more than 2^32 matches.
    uint64_t count = 0;

    // Index with off_t (not int) so files of 2 GiB and larger do not overflow it.
    const off_t chunk = 128;
    const off_t vec_end = fsize - (fsize % chunk);
    off_t i = 0;
    for (; i < vec_end; i += chunk) {
        // Aligned loads are valid here: the mapping starts on a page boundary
        // and every offset used is a multiple of 32.
        const __m256i v2_1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i));
        const __m256i v2_2 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 32));
        const __m256i v2_3 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 64));
        const __m256i v2_4 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 96));

        // cmpeq sets a lane to 0xFF where the byte matches; movemask packs the
        // top bit of each lane into a 32-bit mask, so popcount = match count.
        const uint32_t mask1 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_1)));
        const uint32_t mask2 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_2)));
        const uint32_t mask3 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_3)));
        const uint32_t mask4 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_4)));

        count += static_cast<uint32_t>(_mm_popcnt_u32(mask1) + _mm_popcnt_u32(mask2) +
                                       _mm_popcnt_u32(mask3) + _mm_popcnt_u32(mask4));
    }

    // Process the remaining (fewer than 128) bytes one at a time.
    for (; i < fsize; ++i) {
        if (buffer[i] == 127) {
            ++count;
        }
    }

    munmap(mapping, map_len);

    std::cout << count << '\n';
    return 0;
}
#include <iostream>
#include <unistd.h>
#include <sys/mman.h>
#include <cstdint>
#include <immintrin.h>
using namespace std;
// Counts the bytes equal to 127 in the file attached to standard input and
// prints the total, followed by a newline, on standard output.
//
// The file is memory-mapped and scanned 256 bytes per iteration using eight
// 32-byte AVX2 compares; bytes left over after the last full 256-byte chunk
// are handled by a scalar loop.
//
// Returns 0 on success, 1 if standard input is not seekable or cannot be
// mapped (a diagnostic is written to standard error in that case).
int main() {
    // Seeking to the end reports the file size. Pipes and terminals cannot be
    // seeked, so lseek returns -1 for them; report that instead of printing 0.
    const off_t fsize = lseek(0, 0, SEEK_END);
    if (fsize < 0) {
        std::cerr << "error: standard input must be a seekable file\n";
        return 1;
    }

    // mmap rejects zero-length mappings; an empty file trivially has no matches.
    if (fsize == 0) {
        std::cout << 0 << '\n';
        return 0;
    }

    const size_t map_len = static_cast<size_t>(fsize);
    void* const mapping = mmap(0, map_len, PROT_READ, MAP_PRIVATE | MAP_POPULATE, 0, 0);
    if (mapping == MAP_FAILED) {
        std::cerr << "error: mmap of standard input failed\n";
        return 1;
    }
    const char* const buffer = static_cast<const char*>(mapping);

    // Every byte lane holds the value being searched for.
    const __m256i needle = _mm256_set1_epi8(127);

    // 64-bit so the total cannot wrap on inputs with more than 2^32 matches.
    uint64_t count = 0;

    // Index with off_t (not int) so files of 2 GiB and larger do not overflow it.
    const off_t chunk = 256;
    const off_t vec_end = fsize - (fsize % chunk);
    off_t i = 0;
    for (; i < vec_end; i += chunk) {
        // Aligned loads are valid here: the mapping starts on a page boundary
        // and every offset used is a multiple of 32.
        const __m256i v2_1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i));
        const __m256i v2_2 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 32));
        const __m256i v2_3 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 64));
        const __m256i v2_4 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 96));
        const __m256i v2_5 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 128));
        const __m256i v2_6 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 160));
        const __m256i v2_7 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 192));
        const __m256i v2_8 = _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer + i + 224));

        // cmpeq sets a lane to 0xFF where the byte matches; movemask packs the
        // top bit of each lane into a 32-bit mask, so popcount = match count.
        // First half of the chunk (bytes 0..127).
        const uint32_t mask1 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_1)));
        const uint32_t mask2 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_2)));
        const uint32_t mask3 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_3)));
        const uint32_t mask4 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_4)));
        count += static_cast<uint32_t>(_mm_popcnt_u32(mask1) + _mm_popcnt_u32(mask2) +
                                       _mm_popcnt_u32(mask3) + _mm_popcnt_u32(mask4));

        // Second half of the chunk (bytes 128..255).
        const uint32_t mask5 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_5)));
        const uint32_t mask6 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_6)));
        const uint32_t mask7 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_7)));
        const uint32_t mask8 = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(needle, v2_8)));
        count += static_cast<uint32_t>(_mm_popcnt_u32(mask5) + _mm_popcnt_u32(mask6) +
                                       _mm_popcnt_u32(mask7) + _mm_popcnt_u32(mask8));
    }

    // Process the remaining (fewer than 256) bytes one at a time.
    for (; i < fsize; ++i) {
        if (buffer[i] == 127) {
            ++count;
        }
    }

    munmap(mapping, map_len);

    std::cout << count << '\n';
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment