Skip to content

Instantly share code, notes, and snippets.

@bluetech
Last active July 11, 2018 15:03
Show Gist options
  • Save bluetech/36ac1d0b21864a4f42fa723de569e5f8 to your computer and use it in GitHub Desktop.
Save bluetech/36ac1d0b21864a4f42fa723de569e5f8 to your computer and use it in GitHub Desktop.
Websocket masking comparison
// gcc -O3 -march=native mask.c -o mask
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <immintrin.h>
// Byte-at-a-time masking (the approach used by wsaccel).
// XORs each of the `len` bytes of `data`, in place, with the mask byte
// selected by the byte's index modulo 4.
static void mask_8_by_8(uint8_t *data, size_t len, const uint8_t mask[4]) {
    size_t pos = 0;
    while (pos != len) {
        const uint8_t key = mask[pos % 4];
        data[pos] = (uint8_t) (data[pos] ^ key);
        pos++;
    }
}
// Masks `data` in place one 32-bit word at a time; the trailing `len % 4`
// bytes are finished by mask_8_by_8().
//
// Words are moved with memcpy() instead of dereferencing casted pointers:
// reading uint8_t storage through a uint32_t lvalue breaks the strict
// aliasing rule and may be a misaligned access (`mask` in particular carries
// no alignment guarantee). Optimizing compilers typically lower these
// fixed-size copies to single loads/stores. As a consequence `data` does not
// need to be 4-byte aligned.
static void mask_32_by_32(uint8_t *data, size_t len, const uint8_t mask[4]) {
    uint32_t mask32;
    memcpy(&mask32, mask, sizeof mask32);

    const size_t tail = len % sizeof mask32;
    const size_t body = len - tail;
    for (size_t off = 0; off < body; off += sizeof mask32) {
        uint32_t word;
        memcpy(&word, data + off, sizeof word);
        word ^= mask32;
        memcpy(data + off, &word, sizeof word);
    }
    // `body` is a multiple of 4, so the tail starts at mask byte 0 again.
    mask_8_by_8(data + body, tail, mask);
}
// Masks `data` in place one 64-bit word at a time; the trailing `len % 8`
// bytes are finished by mask_8_by_8().
//
// The 4-byte mask is widened by repeating it in both halves of a 64-bit
// word. Words are moved with memcpy() instead of dereferencing casted
// pointers: reading uint8_t storage through a uint32_t/uint64_t lvalue breaks
// the strict aliasing rule and may be a misaligned access. Optimizing
// compilers typically lower these fixed-size copies to single loads/stores.
// As a consequence `data` does not need to be 8-byte aligned.
static void mask_64_by_64(uint8_t *data, size_t len, const uint8_t mask[4]) {
    uint32_t mask32;
    memcpy(&mask32, mask, sizeof mask32);
    // Both halves are identical, so the result is the same on any endianness.
    const uint64_t mask64 = ((uint64_t) mask32 << 32) | mask32;

    const size_t tail = len % sizeof mask64;
    const size_t body = len - tail;
    for (size_t off = 0; off < body; off += sizeof mask64) {
        uint64_t word;
        memcpy(&word, data + off, sizeof word);
        word ^= mask64;
        memcpy(data + off, &word, sizeof word);
    }
    // `body` is a multiple of 8 (hence of 4), so the tail starts at mask byte 0.
    mask_8_by_8(data + body, tail, mask);
}
// Masks `data` in place 16 bytes at a time using SSE2 integer XOR; the
// trailing `len % 16` bytes are finished by mask_8_by_8().
//
// Precondition: `data` is 16-byte aligned, because the aligned load/store
// intrinsics are used. Real general code should handle the unaligned case.
static void mask_128_by_128(uint8_t *data, size_t len, const uint8_t mask[4]) {
    assert((uintptr_t) data % 16 == 0);

    // Load the mask with memcpy(): `mask` is plain uint8_t storage with no
    // alignment guarantee, so reading it through a uint32_t pointer would be
    // undefined behavior.
    uint32_t mask32;
    memcpy(&mask32, mask, sizeof mask32);
    // Broadcast the 4 mask bytes into all four 32-bit lanes.
    const __m128i mask128 = _mm_set1_epi32((int) mask32);

    __m128i *data128 = (__m128i *) data;
    const size_t tail = len % 16;
    const size_t vectors = len / 16;
    for (size_t i = 0; i < vectors; i++) {
        const __m128i chunk = _mm_load_si128(data128 + i);
        _mm_store_si128(data128 + i, _mm_xor_si128(chunk, mask128));
    }
    // The tail offset is a multiple of 16, so it starts at mask byte 0 again.
    mask_8_by_8(data + len - tail, tail, mask);
}
// Masks `data` in place 32 bytes at a time using AVX2 integer XOR; the
// trailing `len % 32` bytes are finished by mask_8_by_8().
//
// Precondition: `data` is 32-byte aligned, because the aligned load/store
// intrinsics are used. Real general code should handle the unaligned case.
static void mask_256_by_256(uint8_t *data, size_t len, const uint8_t mask[4]) {
    assert((uintptr_t) data % 32 == 0);

    // Load the mask with memcpy(): `mask` is plain uint8_t storage with no
    // alignment guarantee, so reading it through a uint32_t pointer would be
    // undefined behavior.
    uint32_t mask32;
    memcpy(&mask32, mask, sizeof mask32);
    // Broadcast the 4 mask bytes into all eight 32-bit lanes.
    const __m256i mask256 = _mm256_set1_epi32((int) mask32);

    __m256i *data256 = (__m256i *) data;
    const size_t tail = len % 32;
    const size_t vectors = len / 32;
    for (size_t i = 0; i < vectors; i++) {
        const __m256i chunk = _mm256_load_si256(data256 + i);
        _mm256_store_si256(data256 + i, _mm256_xor_si256(chunk, mask256));
    }
    // The tail offset is a multiple of 32, so it starts at mask byte 0 again.
    mask_8_by_8(data + len - tail, tail, mask);
}
// Asserts that every one of the first `len` bytes of `data` is 0xff.
// Used after masking an all-zero buffer with an all-0xff mask.
static void check(uint8_t *data, size_t len) {
    size_t i = 0;
    while (i != len) {
        assert(data[i] == 0xff);
        i += 1;
    }
}
// Benchmark driver. For each masking variant it:
//   1. masks a zeroed buffer once and verifies the result with check();
//   2. times ITERS repeated maskings of the buffer and prints the elapsed
//      processor time in clock() ticks.
// Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
int main(void) {
    const uint8_t mask[] = {0xff, 0xff, 0xff, 0xff};
    // aligned_alloc() requires the size to be a multiple of the alignment;
    // 100000 == 32 * 3125. 32-byte alignment satisfies every variant.
    const size_t SIZE = 100000;
    const int ITERS = 10000;
    uint8_t *data = aligned_alloc(32, SIZE);
    if (data == NULL) {
        fprintf(stderr, "aligned_alloc(32, %zu) failed\n", SIZE);
        return EXIT_FAILURE;
    }
    clock_t start;

    // clock_t is an unspecified arithmetic type, so every elapsed value is
    // cast explicitly to match the %lu conversion.

    memset(data, 0, SIZE);
    mask_8_by_8(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_8_by_8(data, SIZE, mask);
    }
    printf("8 by 8: %lu\n", (unsigned long) (clock() - start));

    memset(data, 0, SIZE);
    mask_32_by_32(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_32_by_32(data, SIZE, mask);
    }
    printf("32 by 32 : %lu\n", (unsigned long) (clock() - start));

    memset(data, 0, SIZE);
    mask_64_by_64(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_64_by_64(data, SIZE, mask);
    }
    printf("64 by 64 : %lu\n", (unsigned long) (clock() - start));

    memset(data, 0, SIZE);
    mask_128_by_128(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_128_by_128(data, SIZE, mask);
    }
    printf("128 by 128: %lu\n", (unsigned long) (clock() - start));

    memset(data, 0, SIZE);
    mask_256_by_256(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_256_by_256(data, SIZE, mask);
    }
    printf("256 by 256: %lu\n", (unsigned long) (clock() - start));

    free(data);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment