Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@kozlowsqi
Last active March 4, 2021 03:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kozlowsqi/b7f7e501176eefb7497ef4f13eb82897 to your computer and use it in GitHub Desktop.
Save kozlowsqi/b7f7e501176eefb7497ef4f13eb82897 to your computer and use it in GitHub Desktop.
Broken Swift SIMD
#include <stdint.h>
#include <smmintrin.h>
#include <immintrin.h>
#include <emmintrin.h>
// Defines a 16-byte-aligned, 16-element byte array named NAME_x16 in which
// every element is VALUE.
// The element type is `unsigned char` so that any VALUE in 0x00...0xFF is
// stored exactly; with plain `char` (signed on x86) values >= 0x80 would be
// an out-of-range, implementation-defined conversion.
#define __VECTOR_REPEATING_X16(NAME, VALUE) \
unsigned char NAME##_x16[] __attribute__ ((aligned(16))) = { \
    VALUE, VALUE, VALUE, VALUE, \
    VALUE, VALUE, VALUE, VALUE, \
    VALUE, VALUE, VALUE, VALUE, \
    VALUE, VALUE, VALUE, VALUE \
}
// Defines a 16-byte-aligned, 16-element byte array named NAME_x16 that holds
// VALUE at byte 0 and byte 8 and zero everywhere else, i.e. VALUE in the
// lowest byte of each 64-bit half when the array is loaded as a
// little-endian 128-bit vector.
// The element type is `unsigned char` because the masks passed in (0xF8,
// 0xC0, 0xF0, 0x80, 0xE0, ...) do not fit in a signed `char`.
#define __COMPACTING_MASK_LITERAL_X16(NAME, VALUE) \
unsigned char NAME##_x16[] __attribute__ ((aligned(16))) = { \
    VALUE, 0x00, 0x00, 0x00, \
    0x00, 0x00, 0x00, 0x00, \
    VALUE, 0x00, 0x00, 0x00, \
    0x00, 0x00, 0x00, 0x00 \
}
// Declares a `const __m128i` named NAME and fills it with an aligned 128-bit
// load from the array LITERALNAME_x16. The caller supplies the trailing
// semicolon.
#define __LOAD_STATIC_VECTOR_X16(NAME, LITERALNAME) \
    const __m128i NAME = _mm_load_si128((const __m128i *) (LITERALNAME##_x16))
// Constant tables consumed by __unsafe_b32_decode_intel_x16.
//
// Per-byte constants for turning ASCII characters into 5-bit values:
// 0x5F is ANDed with every input byte.
__VECTOR_REPEATING_X16(__b32_charactermask_literal, 0x5F);
// 0x41 (ASCII 'A') is subtracted from the masked byte on the letter path.
__VECTOR_REPEATING_X16(__b32_letteroffset_literal, 0x41);
// 0x08 is added to the masked byte on the digit path.
__VECTOR_REPEATING_X16(__b32_numberoffset_literal, 0x08);
// Masked bytes below 0x41 select the digit path in the blend.
__VECTOR_REPEATING_X16(__b32_blendmask_literal, 0x41);
// Bit masks for output byte A (written at offsets 0 and 5).
__COMPACTING_MASK_LITERAL_X16(__b32_amask0_literal, 0xF8);
__COMPACTING_MASK_LITERAL_X16(__b32_amask1_literal, 0x07);
// Bit masks for output byte B (written at offsets 1 and 6).
__COMPACTING_MASK_LITERAL_X16(__b32_bmask0_literal, 0xC0);
__COMPACTING_MASK_LITERAL_X16(__b32_bmask1_literal, 0x3E);
__COMPACTING_MASK_LITERAL_X16(__b32_bmask2_literal, 0x01);
// Bit masks for output byte C (written at offsets 2 and 7).
__COMPACTING_MASK_LITERAL_X16(__b32_cmask0_literal, 0xF0);
__COMPACTING_MASK_LITERAL_X16(__b32_cmask1_literal, 0x0F);
// Bit masks for output byte D (written at offsets 3 and 8).
__COMPACTING_MASK_LITERAL_X16(__b32_dmask0_literal, 0x80);
__COMPACTING_MASK_LITERAL_X16(__b32_dmask1_literal, 0x7C);
__COMPACTING_MASK_LITERAL_X16(__b32_dmask2_literal, 0x03);
// Bit masks for output byte E (written at offsets 4 and 9).
__COMPACTING_MASK_LITERAL_X16(__b32_emask0_literal, 0xE0);
__COMPACTING_MASK_LITERAL_X16(__b32_emask1_literal, 0x1F);
// Decodes exactly 16 Base32 characters read from `_in_buffer` into exactly
// 10 bytes written to `_out_buffer`, using SSE intrinsics.
//
// - `_in_buffer` may be unaligned (it is read with an unaligned load) but
//   must have 16 readable bytes.
// - `_out_buffer` must have room for 10 bytes; bytes 0-4 come from the first
//   8 characters, bytes 5-9 from the last 8.
// - No validation is performed: every input byte is masked and mapped, so
//   characters outside the Base32 alphabet silently produce garbage.
//   Masking with 0x5F clears bit 5, so lower-case letters decode like their
//   upper-case counterparts.
extern inline void
__unsafe_b32_decode_intel_x16(void const * const _in_buffer, void *_out_buffer)
{
    __LOAD_STATIC_VECTOR_X16(charactermask, __b32_charactermask_literal);
    __LOAD_STATIC_VECTOR_X16(letteroffset, __b32_letteroffset_literal);
    __LOAD_STATIC_VECTOR_X16(numberoffset, __b32_numberoffset_literal);
    __LOAD_STATIC_VECTOR_X16(blendmask, __b32_blendmask_literal);
    __LOAD_STATIC_VECTOR_X16(amask0, __b32_amask0_literal);
    __LOAD_STATIC_VECTOR_X16(amask1, __b32_amask1_literal);
    __LOAD_STATIC_VECTOR_X16(bmask0, __b32_bmask0_literal);
    __LOAD_STATIC_VECTOR_X16(bmask1, __b32_bmask1_literal);
    __LOAD_STATIC_VECTOR_X16(bmask2, __b32_bmask2_literal);
    __LOAD_STATIC_VECTOR_X16(cmask0, __b32_cmask0_literal);
    __LOAD_STATIC_VECTOR_X16(cmask1, __b32_cmask1_literal);
    __LOAD_STATIC_VECTOR_X16(dmask0, __b32_dmask0_literal);
    __LOAD_STATIC_VECTOR_X16(dmask1, __b32_dmask1_literal);
    __LOAD_STATIC_VECTOR_X16(dmask2, __b32_dmask2_literal);
    __LOAD_STATIC_VECTOR_X16(emask0, __b32_emask0_literal);
    __LOAD_STATIC_VECTOR_X16(emask1, __b32_emask1_literal);

    // Byte-typed view of the output: pointer arithmetic on `void *` is a GNU
    // extension and is not valid ISO C.
    uint8_t * const out = (uint8_t *) _out_buffer;

    // Map each ASCII character to its 5-bit value, one value per byte.
    __m128i input = _mm_lddqu_si128((__m128i const *) _in_buffer);
    __m128i charactervalue = _mm_and_si128(input, charactermask);
    __m128i lettervalues = _mm_sub_epi8(charactervalue, letteroffset);
    __m128i numbervalues = _mm_add_epi8(charactervalue, numberoffset);
    __m128i is_number = _mm_cmpgt_epi8(blendmask, charactervalue);
    __m128i value = _mm_blendv_epi8(lettervalues, numbervalues, is_number);

    // Each 64-bit half now holds eight 5-bit values v0..v7 (v0 in the lowest
    // byte). Every section below shifts the wanted bits into the lowest byte
    // of each half, masks them, and stores that byte once per half.
    __m128i partial0, partial1, partial2, result;

    // A = v0 << 3 | v1 >> 2
    partial0 = _mm_and_si128(_mm_slli_epi64(value, 0x03), amask0);
    partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x0A), amask1);
    result = _mm_or_si128(partial0, partial1);
    out[0x00] = (uint8_t) _mm_extract_epi64(result, 0);
    out[0x05] = (uint8_t) _mm_extract_epi64(result, 1);

    // B = v1 << 6 | v2 << 1 | v3 >> 4
    partial0 = _mm_and_si128(_mm_srli_epi64(value, 0x02), bmask0);
    partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x0F), bmask1);
    partial2 = _mm_and_si128(_mm_srli_epi64(value, 0x1C), bmask2);
    result = _mm_or_si128(_mm_or_si128(partial0, partial1), partial2);
    out[0x01] = (uint8_t) _mm_extract_epi64(result, 0);
    out[0x06] = (uint8_t) _mm_extract_epi64(result, 1);

    // C = v3 << 4 | v4 >> 1
    partial0 = _mm_and_si128(_mm_srli_epi64(value, 0x14), cmask0);
    partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x21), cmask1);
    result = _mm_or_si128(partial0, partial1);
    out[0x02] = (uint8_t) _mm_extract_epi64(result, 0);
    out[0x07] = (uint8_t) _mm_extract_epi64(result, 1);

    // D = v4 << 7 | v5 << 2 | v6 >> 3
    partial0 = _mm_and_si128(_mm_srli_epi64(value, 0x19), dmask0);
    partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x26), dmask1);
    partial2 = _mm_and_si128(_mm_srli_epi64(value, 0x33), dmask2);
    result = _mm_or_si128(_mm_or_si128(partial0, partial1), partial2);
    out[0x03] = (uint8_t) _mm_extract_epi64(result, 0);
    out[0x08] = (uint8_t) _mm_extract_epi64(result, 1);

    // E = v6 << 5 | v7
    partial0 = _mm_and_si128(_mm_srli_epi64(value, 0x2B), emask0);
    partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x38), emask1);
    result = _mm_or_si128(partial0, partial1);
    out[0x04] = (uint8_t) _mm_extract_epi64(result, 0);
    out[0x09] = (uint8_t) _mm_extract_epi64(result, 1);
}
// Decodes `count` Base32 characters from `_in_buffer` into `_out_buffer`,
// one 16-character / 10-byte chunk per iteration.
// Any trailing `count % 16` characters are left undecoded.
void
__unsafe_b32_decode(uint8_t const * _in_buffer, uint8_t *_out_buffer, int count)
{
    for (; count >= 16; count -= 16) {
        __unsafe_b32_decode_intel_x16(_in_buffer, _out_buffer);
        _in_buffer += 16;
        _out_buffer += 10;
    }
}
//
// The original Swift-only implementation
//
import Foundation
/// A reference-type wrapper around a single immutable value.
class Box<T> {
    /// The wrapped value, fixed at initialization.
    let value: T

    /// Creates a box holding `wrapped`.
    init(_ wrapped: T) { value = wrapped }
}
/// Decodes Base32 text into raw bytes without validating the input.
///
/// Input is consumed in 16-character chunks (10 output bytes each) and then
/// in 8-character chunks (5 output bytes each). Any trailing `count % 8`
/// characters are left undecoded.
///
/// - Parameters:
///   - data: Pointer to at least `count` readable Base32 characters.
///   - buffer: Destination with room for `count / 8 * 5` bytes.
///   - count: Number of input characters available at `data`.
public func unsafeDecode(
    base32Encoded data: UnsafePointer<UInt8>,
    into buffer: UnsafeMutablePointer<UInt8>,
    _ count: Int
) {
    var remaining = count
    var source = data
    var destination = buffer

    // NOTE: 64- and 32-character wide paths were sketched here previously but
    // were disabled; only the 16- and 8-character decoders are wired up.

    while remaining >= 16 {
        // Load by value. Bit-casting the pointer to a `Box` class reference
        // (as done before) makes ARC and the field access treat the input
        // bytes as an object header, which is undefined behaviour.
        let vector = loadBase32Vector(from: source, as: SIMD16<UInt8>.self)
        UnsafeDecodeBase32x16.decode(base32Encoded: vector, into: destination)
        destination = destination.advanced(by: 10)
        source = source.advanced(by: 16)
        remaining -= 16
    }

    while remaining >= 8 {
        let vector = loadBase32Vector(from: source, as: SIMD8<UInt8>.self)
        UnsafeDecodeBase32x8.decode(base32Encoded: vector, into: destination)
        destination = destination.advanced(by: 5)
        source = source.advanced(by: 8)
        remaining -= 8
    }
}

/// Copies `V.scalarCount` bytes starting at `source` into a new SIMD vector.
///
/// A byte-wise copy has no alignment requirement, so `source` may point
/// anywhere inside a byte buffer.
@inline(__always)
private func loadBase32Vector<V: SIMD>(
    from source: UnsafePointer<UInt8>,
    as _: V.Type
) -> V where V.Scalar == UInt8 {
    var vector = V()
    withUnsafeMutableBytes(of: &vector) { storage in
        storage.copyMemory(
            from: UnsafeRawBufferPointer(start: source, count: V.scalarCount)
        )
    }
    return vector
}
/// Decodes one 8-character Base32 group into 5 bytes, without validation.
struct UnsafeDecodeBase32x8 {
    /// ANDed with every input byte; clears bit 5, so lower-case letters map
    /// onto their upper-case counterparts.
    static let valueMask = SIMD8<UInt8>(repeating: 0x5F)
    /// Wrapping offset for letters: adding 0xBF equals subtracting 0x41,
    /// so "A" (0x41) becomes 0.
    static let letters = SIMD8<UInt8>(repeating: 0xBF)
    /// Offset for digits: "2" masks to 0x12, and 0x12 + 0x08 is 26.
    static let numbers = SIMD8<UInt8>(repeating: 0x08)
    /// Masked bytes below this threshold take the digit offset.
    static let characterMask = SIMD8<UInt8>(repeating: 0x41)

    /// Writes the 5 decoded bytes of `input` to `buffer[0...4]`.
    ///
    /// Every output byte is assigned outright, so the destination does not
    /// need to be zeroed (or even initialized) beforehand.
    ///
    /// - Parameters:
    ///   - input: Eight Base32 characters, first character in lane 0.
    ///   - buffer: Destination with room for at least 5 bytes.
    static func decode(
        base32Encoded input: SIMD8<UInt8>,
        into buffer: UnsafeMutablePointer<UInt8>
    ) {
        var value = input & valueMask
        value &+= letters
            .replacing(with: numbers, where: characterMask .> value)

        // Lane 0 lands in the least significant byte (little-endian layout),
        // giving eight 5-bit values v0...v7 at bit offsets 0, 8, ..., 56.
        let bits = unsafeBitCast(value, to: UInt64.self)

        // byte 0 = v0 << 3 | v1 >> 2
        buffer[0] = UInt8(
            truncatingIfNeeded: (bits &<< 0x03 & 0xF8)
                | (bits &>> 0x0A & 0x07)
        )
        // byte 1 = v1 << 6 | v2 << 1 | v3 >> 4
        buffer[1] = UInt8(
            truncatingIfNeeded: (bits &>> 0x02 & 0xC0)
                | (bits &>> 0x0F & 0x3E)
                | (bits &>> 0x1C & 0x01)
        )
        // byte 2 = v3 << 4 | v4 >> 1
        buffer[2] = UInt8(
            truncatingIfNeeded: (bits &>> 0x14 & 0xF0)
                | (bits &>> 0x21 & 0x0F)
        )
        // byte 3 = v4 << 7 | v5 << 2 | v6 >> 3
        buffer[3] = UInt8(
            truncatingIfNeeded: (bits &>> 0x19 & 0x80)
                | (bits &>> 0x26 & 0x7C)
                | (bits &>> 0x33 & 0x03)
        )
        // byte 4 = v6 << 5 | v7
        buffer[4] = UInt8(
            truncatingIfNeeded: (bits &>> 0x2B & 0xE0)
                | (bits &>> 0x38 & 0x1F)
        )
    }
}
/// Decodes two 8-character Base32 groups (16 characters) into 10 bytes,
/// without validation.
struct UnsafeDecodeBase32x16 {
    // Per-byte constants used to map ASCII characters to 5-bit values.
    static let valueMask = SIMD16<UInt8>(repeating: 0x5F)
    static let letters = SIMD16<UInt8>(repeating: 0xBF)
    static let numbers = SIMD16<UInt8>(repeating: 0x08)
    static let characterMask = SIMD16<UInt8>(repeating: 0x41)

    // Shift amounts and bit masks for each of the five output bytes (A-E).
    // Both 64-bit lanes always use the same shift and the same mask.
    static let ashift: (SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0x03), SIMD2(repeating: 0x0A)
    )
    static let amask: (SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0xF8), SIMD2(repeating: 0x07)
    )
    static let bshift: (SIMD2<UInt64>, SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0x02), SIMD2(repeating: 0x0F), SIMD2(repeating: 0x1C)
    )
    static let bmask: (SIMD2<UInt64>, SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0xC0), SIMD2(repeating: 0x3E), SIMD2(repeating: 0x01)
    )
    static let cshift: (SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0x14), SIMD2(repeating: 0x21)
    )
    static let cmask: (SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0xF0), SIMD2(repeating: 0x0F)
    )
    static let dshift: (SIMD2<UInt64>, SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0x19), SIMD2(repeating: 0x26), SIMD2(repeating: 0x33)
    )
    static let dmask: (SIMD2<UInt64>, SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0x80), SIMD2(repeating: 0x7C), SIMD2(repeating: 0x03)
    )
    static let eshift: (SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0x2B), SIMD2(repeating: 0x38)
    )
    static let emask: (SIMD2<UInt64>, SIMD2<UInt64>) = (
        SIMD2(repeating: 0xE0), SIMD2(repeating: 0x1F)
    )

    /// Writes the 10 decoded bytes of `input` to `buffer[0...9]`.
    ///
    /// Bytes 0-4 come from lanes 0-7 of `input`, bytes 5-9 from lanes 8-15.
    ///
    /// - Parameters:
    ///   - input: Sixteen Base32 characters, first character in lane 0.
    ///   - buffer: Destination with room for at least 10 bytes.
    static func decode(
        base32Encoded input: SIMD16<UInt8>,
        into buffer: UnsafeMutablePointer<UInt8>
    ) {
        let masked = input & valueMask
        let offsets = letters
            .replacing(with: numbers, where: characterMask .> masked)
        let lanes = unsafeBitCast(masked &+ offsets, to: SIMD2<UInt64>.self)

        // Stores the low byte of each 64-bit lane: the first group's byte at
        // `offset`, the second group's byte five positions later.
        func store(_ packed: SIMD2<UInt64>, at offset: Int) {
            buffer[offset] = UInt8(truncatingIfNeeded: packed[0])
            buffer[offset + 5] = UInt8(truncatingIfNeeded: packed[1])
        }

        // A
        let a0 = lanes &<< ashift.0 & amask.0
        let a1 = lanes &>> ashift.1 & amask.1
        store(a0 | a1, at: 0)

        // B
        let b0 = lanes &>> bshift.0 & bmask.0
        let b1 = lanes &>> bshift.1 & bmask.1
        let b2 = lanes &>> bshift.2 & bmask.2
        store(b0 | b1 | b2, at: 1)

        // C
        let c0 = lanes &>> cshift.0 & cmask.0
        let c1 = lanes &>> cshift.1 & cmask.1
        store(c0 | c1, at: 2)

        // D
        let d0 = lanes &>> dshift.0 & dmask.0
        let d1 = lanes &>> dshift.1 & dmask.1
        let d2 = lanes &>> dshift.2 & dmask.2
        store(d0 | d1 | d2, at: 3)

        // E
        let e0 = lanes &>> eshift.0 & emask.0
        let e1 = lanes &>> eshift.1 & emask.1
        store(e0 | e1, at: 4)
    }
}
// Benchmark driver for the Swift-only decoder: decodes the same input 1024
// times. `string` is expected to be defined before this point.
let data = Array(string.utf8)
let buffer = UnsafeMutableBufferPointer<UInt8>
    .allocate(capacity: data.count * 5 / 8)
// Zero-fill the output: `allocate` returns uninitialized memory, and a
// decoder must never observe garbage in its destination.
buffer.initialize(repeating: 0)
defer { buffer.deallocate() }
print("----")
print("BASE32 DECODE")
print("----")
print("+ size: \(data.count)")
print("+ allocated: \(buffer.count)")
// Skip the loop instead of force-unwrapping when there is nothing to write to.
if let output = buffer.baseAddress {
    for _ in 0..<(1024) {
        data.withUnsafeBufferPointer { input in
            // `baseAddress` may be nil for an empty array.
            guard let source = input.baseAddress else { return }
            unsafeDecode(
                base32Encoded: source,
                into: output,
                data.count
            )
        }
    }
}
//
// The Swift-to-C implementation (Swift driver calling the C decoder)
//
// Benchmark driver for the C decoder: decodes the same input 1024 times.
// `string` is expected to be defined before this point.
let data = Array(string.utf8)
let buffer = UnsafeMutableBufferPointer<UInt8>
    .allocate(capacity: data.count * 5 / 8)
defer { buffer.deallocate() }
print("----")
print("BASE32 DECODE")
print("----")
print("+ size: \(data.count)")
print("+ allocated: \(buffer.count)")
// The C function takes `int`, which Swift imports as `Int32`; convert
// explicitly (this traps if the input is longer than Int32.max).
let length = Int32(data.count)
for _ in 0..<(1024) {
    data.withUnsafeBufferPointer { input in
        __unsafe_b32_decode(input.baseAddress, buffer.baseAddress, length)
    }
}
@kozlowsqi
Copy link
Author

Screen Shot 2020-03-12 at 18 04 03

Screen Shot 2020-03-12 at 18 04 23

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment