Skip to content

Instantly share code, notes, and snippets.

@skejeton
Last active July 25, 2022 16:47
Show Gist options
  • Save skejeton/495464204f3db8f34d01e6b2ee92a2bb to your computer and use it in GitHub Desktop.
Save skejeton/495464204f3db8f34d01e6b2ee92a2bb to your computer and use it in GitHub Desktop.
My first attempt at SIMD :P
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <emmintrin.h>
#include <immintrin.h>
// Benchmark(name, times) { body }
//
// Runs `body` `times` times, measures the elapsed clock() time for the whole
// run, and prints " <name> Ts / Ntimes = Ts" (total, count, per-iteration).
//
// `times` is evaluated exactly once and converted to int, so expressions such
// as `n + 1` are safe. The macro declares the loop-scoped names `t`, `final`,
// `start`, `i` and `benchmark_times_`; they shadow any same-named variables
// inside `body`. `i` is the current iteration index.
#define Benchmark(name, times) \
    for (int t = 1, benchmark_times_ = (times); t;) \
        for (double final; t;) \
            for (clock_t start = clock(); t; \
                 final = (double)(clock() - start) / CLOCKS_PER_SEC, \
                 printf(" <%s> %gs / %dtimes = %gs\n", (name), final, \
                        benchmark_times_, final / benchmark_times_), \
                 t = 0) \
                for (int i = 0; i < benchmark_times_; ++i)
// Number of input bytes Utf8_decode_ascii consumes per vector step; also the
// number of Runes a Utf8_RuneDestination holds.
#define Utf8_BYTES_PER_ASCII 32
// One decoded character, stored as an int.
typedef int Rune;
// Output buffer for Utf8_decode_ascii and Utf8_decode_ascii_naive: 32 Runes.
typedef Rune Utf8_RuneDestination[Utf8_BYTES_PER_ASCII];
// Output buffer for Utf8_decode_ascii_sse2: 16 Runes, one per byte of a
// single 128-bit load.
typedef Rune Utf8_RuneDestinationSSE2[16];
// Cursor over the not-yet-decoded part of a byte string.
//
// string     - next byte to decode; the Utf8_decode_ascii* functions advance it.
// string_len - number of bytes remaining starting at `string`.
typedef struct Utf8_Streaming {
    const char *string;
    size_t string_len;
} Utf8_Streaming;
Utf8_Streaming Utf8_begin_stream(const char *string) {
return (Utf8_Streaming) {
.string = string,
.string_len = strlen(string)
};
}
// Prints the first `max` ints of `pack` as zero-padded 8-digit hex values,
// each followed by a space, then ends the line.
void Utf8_printpack_base(int *pack, int max) {
    int idx = 0;
    while (idx < max) {
        printf("%08x ", pack[idx]);
        idx += 1;
    }
    printf("\n");
}
// Debug helper: prints the eight 32-bit lanes of `p` in memory order as hex.
void Utf8_printpack_32x8(__m256i p) {
    enum { LANES = 8 };
    int lanes[LANES];
    // Unaligned store: `lanes` is a plain int array with no 32-byte alignment.
    _mm256_storeu_si256((__m256i_u*)&lanes[0], p);
    Utf8_printpack_base(lanes, LANES);
}
// Debug helper: prints the four 32-bit lanes of `p` in memory order as hex.
void Utf8_printpack_32x4(__m128i p) {
    enum { LANES = 4 };
    int lanes[LANES];
    // Unaligned store: `lanes` is a plain int array with no 16-byte alignment.
    _mm_storeu_si128((__m128i_u*)&lanes[0], p);
    Utf8_printpack_base(lanes, LANES);
}
// Debug helper: prints the sixteen 8-bit lanes of `p` in memory order, each
// widened to an int and shown as hex.
void Utf8_printpack_8x16(__m128i p) {
    // unsigned char keeps every lane in 0..255; plain char would print bytes
    // >= 0x80 as ffffffXX on platforms where char is signed and as 000000XX
    // where it is unsigned.
    unsigned char bytes[16];
    int widened[16];
    _mm_storeu_si128((__m128i_u*)bytes, p);
    for (int i = 0; i < 16; ++i) {
        widened[i] = bytes[i];
    }
    Utf8_printpack_base(widened, 16);
}
// Decodes up to 16 bytes from `stream` into `dest`, one Rune per byte, using
// SSE2.
//
// - 16 or more bytes remaining: exactly 16 bytes are zero-extended to 32 bits
//   and written to dest[0..15]; no terminator is written.
// - Fewer than 16 remaining: the remaining bytes are copied one at a time,
//   followed by a 0 Rune, and the stream becomes empty.
//
// The stream is advanced past the consumed bytes. Bytes are not validated:
// values >= 0x80 are widened as-is rather than decoded as multi-byte UTF-8.
static inline
void Utf8_decode_ascii_sse2(Utf8_Streaming *stream, Utf8_RuneDestinationSSE2 dest) {
    if (stream->string_len >= 16) {
        const __m128i zero = _mm_setzero_si128();
        __m128i data = _mm_loadu_si128((const __m128i_u*)stream->string);
        // Interleaving with zero widens each lane: 8 -> 16 bits here...
        __m128i lo = _mm_unpacklo_epi8(data, zero);
        __m128i hi = _mm_unpackhi_epi8(data, zero);
        // ...and 16 -> 32 bits here, turning 16 ASCII chars into 16 Runes.
        _mm_storeu_si128((__m128i_u*)dest + 0, _mm_unpacklo_epi16(lo, zero));
        _mm_storeu_si128((__m128i_u*)dest + 1, _mm_unpackhi_epi16(lo, zero));
        _mm_storeu_si128((__m128i_u*)dest + 2, _mm_unpacklo_epi16(hi, zero));
        _mm_storeu_si128((__m128i_u*)dest + 3, _mm_unpackhi_epi16(hi, zero));
        stream->string_len -= 16;
        stream->string += 16;
    } else {
        size_t remaining = stream->string_len;
        for (size_t i = 0; i < remaining; ++i) {
            // Convert via unsigned char so the tail zero-extends exactly like
            // the vector path, whatever the signedness of plain char.
            dest[i] = (unsigned char)stream->string[i];
        }
        // remaining < 16 here, so the terminator always fits in dest.
        dest[remaining] = 0;
        stream->string += remaining;
        stream->string_len = 0;
    }
}
// Scalar reference decoder: copies up to 32 bytes from `stream` into `dest`,
// one Rune per byte.
//
// If fewer than 32 bytes were copied, a 0 Rune is written after them; a full
// 32-byte chunk gets no terminator. The stream is advanced past the copied
// bytes. Bytes are not validated as ASCII.
static inline
void Utf8_decode_ascii_naive(Utf8_Streaming *stream, Utf8_RuneDestination dest) {
    // Matches the 32-Rune capacity of Utf8_RuneDestination.
    const size_t capacity = 32;
    size_t i;
    for (i = 0; i < stream->string_len && i < capacity; ++i) {
        // Convert via unsigned char so bytes >= 0x80 zero-extend, matching
        // the SIMD decoders regardless of the signedness of plain char.
        dest[i] = (unsigned char)stream->string[i];
    }
    if (i < capacity) {
        dest[i] = 0;
    }
    stream->string += i;
    stream->string_len -= i;
}
// Decodes up to Utf8_BYTES_PER_ASCII (32) bytes from `stream` into `dest`,
// one Rune per byte, using AVX2.
//
// - 32 or more bytes remaining: exactly 32 bytes are zero-extended to 32 bits
//   and written to dest[0..31]; no terminator is written.
// - Fewer than 32 remaining: the remaining bytes are copied one at a time,
//   followed by a 0 Rune, and the stream becomes empty.
//
// The stream is advanced past the consumed bytes. Bytes are not validated:
// values >= 0x80 are widened as-is rather than decoded as multi-byte UTF-8.
static inline
void Utf8_decode_ascii(Utf8_Streaming *stream, Utf8_RuneDestination dest) {
    if (stream->string_len >= Utf8_BYTES_PER_ASCII) {
        const char *src = stream->string;
        // Each step loads 8 bytes and widens them straight to eight 32-bit
        // lanes, filling one 256-bit slice of dest.
        for (int block = 0; block < Utf8_BYTES_PER_ASCII / 8; ++block) {
            __m128i eight = _mm_loadl_epi64((const __m128i_u*)(src + 8 * block));
            _mm256_storeu_si256((__m256i_u*)dest + block, _mm256_cvtepu8_epi32(eight));
        }
        stream->string_len -= Utf8_BYTES_PER_ASCII;
        stream->string += Utf8_BYTES_PER_ASCII;
    } else {
        size_t remaining = stream->string_len;
        for (size_t i = 0; i < remaining; ++i) {
            // Convert via unsigned char so the tail zero-extends exactly like
            // the vector path, whatever the signedness of plain char.
            dest[i] = (unsigned char)stream->string[i];
        }
        // remaining < Utf8_BYTES_PER_ASCII here, so the terminator fits.
        dest[remaining] = 0;
        stream->string += remaining;
        stream->string_len = 0;
    }
}
// Reports whether `stream` still has bytes left to decode.
bool Utf8_stream_running(Utf8_Streaming *stream) {
    if (stream->string_len == 0) {
        return false;
    }
    return true;
}
// Reads the whole file at `path` into a newly malloc'ed, NUL-terminated
// buffer.
//
// Returns the buffer, which the caller must free(), or NULL if the file
// cannot be opened, sized or read, or if allocation fails. The file is closed
// on every path.
char *read_file(const char *path) {
    char *contents = NULL;
    long size;
    size_t got;

    FILE *f = fopen(path, "r");
    if (f == NULL) {
        return NULL;
    }

    // Size the file by seeking to its end; ftell reports -1 on failure.
    if (fseek(f, 0, SEEK_END) != 0) {
        goto done;
    }
    size = ftell(f);
    if (size < 0) {
        goto done;
    }
    if (fseek(f, 0, SEEK_SET) != 0) {
        goto done;
    }

    contents = malloc((size_t)size + 1);
    if (contents == NULL) {
        goto done;
    }

    // fread may return fewer bytes than `size` (e.g. text-mode newline
    // translation), so terminate at the count actually read.
    got = fread(contents, 1, (size_t)size, f);
    if (ferror(f)) {
        free(contents);
        contents = NULL;
        goto done;
    }
    contents[got] = '\0';

done:
    fclose(f);
    return contents;
}
// Loads TestFile.txt and times 10000 full decodes of it with each decoder
// (SSE2, scalar, AVX2), plus an empty loop as a baseline.
//
// Returns 0 on success, 1 if the input file could not be read.
int main(void) {
    char *file = read_file("TestFile.txt");
    if (file == NULL) {
        fprintf(stderr, "could not read TestFile.txt\n");
        return 1;
    }

    Utf8_Streaming input = Utf8_begin_stream(file);
    printf("String length: %zu\n", input.string_len);

    // NOTE(review): `dest` is never read in the loops below, so an optimizing
    // compiler may discard some or all of the decode work - confirm the
    // timings reflect real work at the optimization level in use.
    Benchmark("BLANK", 10000) {}
    Benchmark("SSE2", 10000) {
        // Each iteration restarts from a fresh copy of the stream.
        Utf8_Streaming stream = input;
        while (Utf8_stream_running(&stream)) {
            Utf8_RuneDestinationSSE2 dest;
            Utf8_decode_ascii_sse2(&stream, dest);
        }
    }
    Benchmark("NAIVE", 10000) {
        Utf8_Streaming stream = input;
        while (Utf8_stream_running(&stream)) {
            Utf8_RuneDestination dest;
            Utf8_decode_ascii_naive(&stream, dest);
        }
    }
    Benchmark("AVX2", 10000) {
        Utf8_Streaming stream = input;
        while (Utf8_stream_running(&stream)) {
            Utf8_RuneDestination dest;
            Utf8_decode_ascii(&stream, dest);
        }
    }

    free(file);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment