-
-
Save kozlowsqi/b7f7e501176eefb7497ef4f13eb82897 to your computer and use it in GitHub Desktop.
Broken Swift SIMD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <smmintrin.h> | |
#include <immintrin.h> | |
#include <emmintrin.h> | |
// Defines `NAME_x16`: a 16-byte-aligned, read-only table holding sixteen
// copies of VALUE, suitable for an aligned 128-bit load.
//
// The element type is `uint8_t` rather than plain `char`: several tables hold
// values above 0x7F (0xF8, 0xC0, ...), which do not fit a signed `char`.
#define __VECTOR_REPEATING_X16(NAME, VALUE) \
    const uint8_t NAME##_x16[] __attribute__ ((aligned(16))) = { \
        VALUE, VALUE, VALUE, VALUE, \
        VALUE, VALUE, VALUE, VALUE, \
        VALUE, VALUE, VALUE, VALUE, \
        VALUE, VALUE, VALUE, VALUE \
    }

// Defines `NAME_x16`: a 16-byte-aligned, read-only table whose bytes 0 and 8
// are VALUE and whose other bytes are zero. Viewed as two 64-bit lanes, that
// is a mask selecting bits of the lowest byte of each lane.
#define __COMPACTING_MASK_LITERAL_X16(NAME, VALUE) \
    const uint8_t NAME##_x16[] __attribute__ ((aligned(16))) = { \
        VALUE, 0x00, 0x00, 0x00, \
        0x00, 0x00, 0x00, 0x00, \
        VALUE, 0x00, 0x00, 0x00, \
        0x00, 0x00, 0x00, 0x00 \
    }

// Declares a local `const __m128i NAME` loaded (aligned) from the table
// `LITERALNAME_x16` created by one of the two macros above.
#define __LOAD_STATIC_VECTOR_X16(NAME, LITERALNAME) \
    const __m128i NAME = _mm_load_si128((__m128i const*) LITERALNAME##_x16)

// Per-character constants used to turn ASCII base32 characters into 5-bit values.
__VECTOR_REPEATING_X16(__b32_charactermask_literal, 0x5F);
__VECTOR_REPEATING_X16(__b32_letteroffset_literal, 0x41);
__VECTOR_REPEATING_X16(__b32_numberoffset_literal, 0x08);
__VECTOR_REPEATING_X16(__b32_blendmask_literal, 0x41);

// Bit masks for the pieces that make up output bytes A..E of each 5-byte group.
__COMPACTING_MASK_LITERAL_X16(__b32_amask0_literal, 0xF8);
__COMPACTING_MASK_LITERAL_X16(__b32_amask1_literal, 0x07);
__COMPACTING_MASK_LITERAL_X16(__b32_bmask0_literal, 0xC0);
__COMPACTING_MASK_LITERAL_X16(__b32_bmask1_literal, 0x3E);
__COMPACTING_MASK_LITERAL_X16(__b32_bmask2_literal, 0x01);
__COMPACTING_MASK_LITERAL_X16(__b32_cmask0_literal, 0xF0);
__COMPACTING_MASK_LITERAL_X16(__b32_cmask1_literal, 0x0F);
__COMPACTING_MASK_LITERAL_X16(__b32_dmask0_literal, 0x80);
__COMPACTING_MASK_LITERAL_X16(__b32_dmask1_literal, 0x7C);
__COMPACTING_MASK_LITERAL_X16(__b32_dmask2_literal, 0x03);
__COMPACTING_MASK_LITERAL_X16(__b32_emask0_literal, 0xE0);
__COMPACTING_MASK_LITERAL_X16(__b32_emask1_literal, 0x1F);
extern inline void | |
__unsafe_b32_decode_intel_x16(void const * const _in_buffer, void *_out_buffer) | |
{ | |
__LOAD_STATIC_VECTOR_X16(charactermask, __b32_charactermask_literal); | |
__LOAD_STATIC_VECTOR_X16(letteroffset, __b32_letteroffset_literal); | |
__LOAD_STATIC_VECTOR_X16(numberoffset, __b32_numberoffset_literal); | |
__LOAD_STATIC_VECTOR_X16(blendmask, __b32_blendmask_literal); | |
__LOAD_STATIC_VECTOR_X16(amask0, __b32_amask0_literal); | |
__LOAD_STATIC_VECTOR_X16(amask1, __b32_amask1_literal); | |
__LOAD_STATIC_VECTOR_X16(bmask0, __b32_bmask0_literal); | |
__LOAD_STATIC_VECTOR_X16(bmask1, __b32_bmask1_literal); | |
__LOAD_STATIC_VECTOR_X16(bmask2, __b32_bmask2_literal); | |
__LOAD_STATIC_VECTOR_X16(cmask0, __b32_cmask0_literal); | |
__LOAD_STATIC_VECTOR_X16(cmask1, __b32_cmask1_literal); | |
__LOAD_STATIC_VECTOR_X16(dmask0, __b32_dmask0_literal); | |
__LOAD_STATIC_VECTOR_X16(dmask1, __b32_dmask1_literal); | |
__LOAD_STATIC_VECTOR_X16(dmask2, __b32_dmask2_literal); | |
__LOAD_STATIC_VECTOR_X16(emask0, __b32_emask0_literal); | |
__LOAD_STATIC_VECTOR_X16(emask1, __b32_emask1_literal); | |
__m128i input = _mm_lddqu_si128((__m128i const *) _in_buffer); | |
__m128i charactervalue = _mm_and_si128(input, charactermask); | |
__m128i lettervalues = _mm_sub_epi8(charactervalue, letteroffset); | |
__m128i numbervalues = _mm_add_epi8(charactervalue, numberoffset); | |
__m128i is_number = _mm_cmpgt_epi8(blendmask, charactervalue); | |
__m128i value = _mm_blendv_epi8(lettervalues, numbervalues, is_number); | |
__m128i partial0, partial1, partial2, result; | |
// A | |
partial0 = _mm_and_si128(_mm_slli_epi64(value, 0x03), amask0); | |
partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x0A), amask1); | |
result = _mm_or_si128(partial0, partial1); | |
*(char *) (_out_buffer + 0x00) = (char) (_mm_extract_epi64(result, 0)); | |
*(char *) (_out_buffer + 0x05) = (char) (_mm_extract_epi64(result, 1)); | |
// B | |
partial0 = _mm_and_si128(_mm_srli_epi64(value, 0x02), bmask0); | |
partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x0F), bmask1); | |
partial2 = _mm_and_si128(_mm_srli_epi64(value, 0x1C), bmask2); | |
result = _mm_or_si128(_mm_or_si128(partial0, partial1), partial2); | |
*(char *) (_out_buffer + 0x01) = (char) (_mm_extract_epi64(result, 0)); | |
*(char *) (_out_buffer + 0x06) = (char) (_mm_extract_epi64(result, 1)); | |
// C | |
partial0 = _mm_and_si128(_mm_srli_epi64(value, 0x14), cmask0); | |
partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x21), cmask1); | |
result = _mm_or_si128(partial0, partial1); | |
*(char *) (_out_buffer + 0x02) = (char) (_mm_extract_epi64(result, 0)); | |
*(char *) (_out_buffer + 0x07) = (char) (_mm_extract_epi64(result, 1)); | |
// D | |
partial0 = _mm_and_si128(_mm_srli_epi64(value, 0x19), dmask0); | |
partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x26), dmask1); | |
partial2 = _mm_and_si128(_mm_srli_epi64(value, 0x33), dmask2); | |
result = _mm_or_si128(_mm_or_si128(partial0, partial1), partial2); | |
*(char *) (_out_buffer + 0x03) = (char) (_mm_extract_epi64(result, 0)); | |
*(char *) (_out_buffer + 0x08) = (char) (_mm_extract_epi64(result, 1)); | |
// E | |
partial0 = _mm_and_si128(_mm_srli_epi64(value, 0x2B), emask0); | |
partial1 = _mm_and_si128(_mm_srli_epi64(value, 0x38), emask1); | |
result = _mm_or_si128(partial0, partial1); | |
*(char *) (_out_buffer + 0x04) = (char) (_mm_extract_epi64(result, 0)); | |
*(char *) (_out_buffer + 0x09) = (char) (_mm_extract_epi64(result, 1)); | |
} | |
// Decodes `count` base32 characters from `_in_buffer` into `_out_buffer`,
// one 16-character group (10 output bytes) at a time.
//
// Only whole groups of 16 are processed; a trailing remainder of fewer than
// 16 characters is left undecoded. A `count` below 16 does nothing.
void
__unsafe_b32_decode(uint8_t const * _in_buffer, uint8_t *_out_buffer, int count)
{
    for (; count >= 16; count -= 16) {
        __unsafe_b32_decode_intel_x16(_in_buffer, _out_buffer);
        // 16 characters of 5 bits each yield 80 bits = 10 bytes.
        _in_buffer += 16;
        _out_buffer += 10;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// The original swift-only implementation | |
// | |
import Foundation | |
/// A minimal reference-type wrapper around a single immutable value.
class Box<T> {
    /// The wrapped value, fixed at initialisation.
    let value: T

    /// Wraps `wrapped` in a new box.
    init(_ wrapped: T) {
        value = wrapped
    }
}
/// Copies the first `Vector.scalarCount` bytes at `pointer` into a SIMD value.
///
/// A byte-wise copy is used so the source needs no particular alignment.
@inline(__always)
private func _loadBase32Lanes<Vector: SIMD>(
    from pointer: UnsafePointer<UInt8>,
    as _: Vector.Type
) -> Vector where Vector.Scalar == UInt8 {
    var vector = Vector()
    let byteCount = vector.scalarCount
    withUnsafeMutableBytes(of: &vector) { storage in
        storage.copyMemory(
            from: UnsafeRawBufferPointer(start: pointer, count: byteCount)
        )
    }
    return vector
}

/// Decodes `count` base32 characters from `data` into `buffer`.
///
/// Input is consumed in groups of 16 characters (10 output bytes each) and
/// then groups of 8 characters (5 output bytes each). A trailing remainder of
/// fewer than 8 characters is left undecoded. Characters are not validated.
///
/// - Parameters:
///   - data: Pointer to at least `count` readable bytes of base32 text.
///   - buffer: Pointer to writable storage for the decoded bytes; it must
///     hold at least 5 bytes for every 8 characters that get decoded.
///   - count: Number of input characters available at `data`.
public func unsafeDecode(
    base32Encoded data: UnsafePointer<UInt8>,
    into buffer: UnsafeMutablePointer<UInt8>,
    _ count: Int
) {
    var count = count
    var data = data
    var buffer = buffer

    // Wider 32- and 64-lane paths were prototyped but are not enabled.

    // The lanes are loaded from the bytes themselves. Bit-casting the pointer
    // to a `Box<SIMDn<UInt8>>` reference would make the runtime treat the
    // input bytes as a Swift object, which is undefined behaviour and does
    // not read the bytes that `data` points to.
    while count >= 16 {
        UnsafeDecodeBase32x16.decode(
            base32Encoded: _loadBase32Lanes(from: data, as: SIMD16<UInt8>.self),
            into: buffer
        )
        buffer = buffer.advanced(by: 10)
        data = data.advanced(by: 16)
        count -= 16
    }

    while count >= 8 {
        UnsafeDecodeBase32x8.decode(
            base32Encoded: _loadBase32Lanes(from: data, as: SIMD8<UInt8>.self),
            into: buffer
        )
        buffer = buffer.advanced(by: 5)
        data = data.advanced(by: 8)
        count -= 8
    }
}
/// Decodes one group of 8 base32 characters into 5 bytes.
struct UnsafeDecodeBase32x8 {
    /// Clears bits 0x20 and 0x80 of each character, which maps ASCII
    /// lowercase letters onto uppercase.
    static let valueMask = SIMD8<UInt8>(repeating: 0x5F)
    /// Wrapping-add offset for letters: adding 0xBF equals subtracting 0x41.
    static let letters = SIMD8<UInt8>(repeating: 0xBF)
    /// Offset for digits: masked '2'...'7' (0x12...0x17) become 26...31.
    static let numbers = SIMD8<UInt8>(repeating: 0x08)
    /// Masked characters below this value are treated as digits.
    static let characterMask = SIMD8<UInt8>(repeating: 0x41)

    /// Decodes `input` and stores the 5 resulting bytes at `buffer[0...4]`.
    ///
    /// The bytes are assigned, not OR-ed in, so the previous contents of
    /// `buffer` (which may be uninitialised) never influence the result.
    /// Characters are not validated.
    static func decode(
        base32Encoded input: SIMD8<UInt8>,
        into buffer: UnsafeMutablePointer<UInt8>
    ) {
        var value = input & valueMask
        value &+= letters
            .replacing(with: numbers, where: characterMask .> value)

        // Eight 5-bit values v0...v7, one per byte of `bits`, v0 lowest.
        let bits = unsafeBitCast(value, to: UInt64.self)
        var byte: UInt64

        // A = v0 << 3 | v1 >> 2
        byte = (bits &<< 0x03) & 0xF8
        byte |= (bits &>> 0x0A) & 0x07
        buffer[0] = UInt8(truncatingIfNeeded: byte)

        // B = v1 << 6 | v2 << 1 | v3 >> 4
        byte = (bits &>> 0x02) & 0xC0
        byte |= (bits &>> 0x0F) & 0x3E
        byte |= (bits &>> 0x1C) & 0x01
        buffer[1] = UInt8(truncatingIfNeeded: byte)

        // C = v3 << 4 | v4 >> 1
        byte = (bits &>> 0x14) & 0xF0
        byte |= (bits &>> 0x21) & 0x0F
        buffer[2] = UInt8(truncatingIfNeeded: byte)

        // D = v4 << 7 | v5 << 2 | v6 >> 3
        byte = (bits &>> 0x19) & 0x80
        byte |= (bits &>> 0x26) & 0x7C
        byte |= (bits &>> 0x33) & 0x03
        buffer[3] = UInt8(truncatingIfNeeded: byte)

        // E = v6 << 5 | v7
        byte = (bits &>> 0x2B) & 0xE0
        byte |= (bits &>> 0x38) & 0x1F
        buffer[4] = UInt8(truncatingIfNeeded: byte)
    }
}
/// Decodes one group of 16 base32 characters into 10 bytes, handling the two
/// 8-character halves as the two 64-bit lanes of a `SIMD2<UInt64>`.
struct UnsafeDecodeBase32x16 {
    // Character -> 5-bit value constants.
    static let valueMask = SIMD16<UInt8>(repeating: 0x5F)
    static let letters = SIMD16<UInt8>(repeating: 0xBF)
    static let numbers = SIMD16<UInt8>(repeating: 0x08)
    static let characterMask = SIMD16<UInt8>(repeating: 0x41)

    // Shift amounts and masks for the pieces of output bytes A...E.
    // Tuple element N of a shift pairs with tuple element N of its mask.
    static let ashift: (SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0x03), .init(repeating: 0x0A))
    static let amask: (SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0xF8), .init(repeating: 0x07))

    static let bshift: (SIMD2<UInt64>, SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0x02), .init(repeating: 0x0F), .init(repeating: 0x1C))
    static let bmask: (SIMD2<UInt64>, SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0xC0), .init(repeating: 0x3E), .init(repeating: 0x01))

    static let cshift: (SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0x14), .init(repeating: 0x21))
    static let cmask: (SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0xF0), .init(repeating: 0x0F))

    static let dshift: (SIMD2<UInt64>, SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0x19), .init(repeating: 0x26), .init(repeating: 0x33))
    static let dmask: (SIMD2<UInt64>, SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0x80), .init(repeating: 0x7C), .init(repeating: 0x03))

    static let eshift: (SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0x2B), .init(repeating: 0x38))
    static let emask: (SIMD2<UInt64>, SIMD2<UInt64>) =
        (.init(repeating: 0xE0), .init(repeating: 0x1F))

    /// Decodes `input` and assigns the 10 resulting bytes to `buffer[0...9]`.
    ///
    /// Lane 0 (characters 0...7) produces `buffer[0...4]`; lane 1
    /// (characters 8...15) produces `buffer[5...9]`. Characters are not
    /// validated.
    static func decode(
        base32Encoded input: SIMD16<UInt8>,
        into buffer: UnsafeMutablePointer<UInt8>
    ) {
        // Map every character to its 5-bit value: masked bytes below 0x41
        // get the digit offset, all others the (wrapping) letter offset.
        let folded = input & valueMask
        let offsets = letters
            .replacing(with: numbers, where: characterMask .> folded)
        let lanes = unsafeBitCast(folded &+ offsets, to: SIMD2<UInt64>.self)

        // A
        let a0 = (lanes &<< ashift.0) & amask.0
        let a1 = (lanes &>> ashift.1) & amask.1
        let a = a0 | a1
        buffer[0] = UInt8(truncatingIfNeeded: a[0])
        buffer[5] = UInt8(truncatingIfNeeded: a[1])

        // B
        let b0 = (lanes &>> bshift.0) & bmask.0
        let b1 = (lanes &>> bshift.1) & bmask.1
        let b2 = (lanes &>> bshift.2) & bmask.2
        let b = b0 | b1 | b2
        buffer[1] = UInt8(truncatingIfNeeded: b[0])
        buffer[6] = UInt8(truncatingIfNeeded: b[1])

        // C
        let c0 = (lanes &>> cshift.0) & cmask.0
        let c1 = (lanes &>> cshift.1) & cmask.1
        let c = c0 | c1
        buffer[2] = UInt8(truncatingIfNeeded: c[0])
        buffer[7] = UInt8(truncatingIfNeeded: c[1])

        // D
        let d0 = (lanes &>> dshift.0) & dmask.0
        let d1 = (lanes &>> dshift.1) & dmask.1
        let d2 = (lanes &>> dshift.2) & dmask.2
        let d = d0 | d1 | d2
        buffer[3] = UInt8(truncatingIfNeeded: d[0])
        buffer[8] = UInt8(truncatingIfNeeded: d[1])

        // E
        let e0 = (lanes &>> eshift.0) & emask.0
        let e1 = (lanes &>> eshift.1) & emask.1
        let e = e0 | e1
        buffer[4] = UInt8(truncatingIfNeeded: e[0])
        buffer[9] = UInt8(truncatingIfNeeded: e[1])
    }
}
// Benchmark driver: decodes the base32 text in `string` 1024 times with the
// pure-Swift decoder.
let data = Array(string.utf8)

// 8 input characters decode to 5 output bytes.
let buffer = UnsafeMutableBufferPointer<UInt8>
    .allocate(capacity: data.count * 5 / 8)
// Freshly allocated memory is uninitialised; zero it so that no decoder path
// ever reads indeterminate bytes from the output buffer.
buffer.initialize(repeating: 0)
defer { buffer.deallocate() }

print("----")
print("BASE32 DECODE")
print("----")
print("+ size: \(data.count)")
print("+ allocated: \(buffer.count)")

for _ in 0..<(1024) {
    // `Array` always exposes contiguous storage, so use the non-optional
    // accessor instead of discarding the `Void?` from
    // `withContiguousStorageIfAvailable`.
    data.withUnsafeBufferPointer { (encoded: UnsafeBufferPointer<UInt8>) -> Void in
        // A buffer's base address may be nil (for example when empty);
        // skip the call instead of force-unwrapping it unchecked.
        guard let source = encoded.baseAddress,
              let destination = buffer.baseAddress
        else { return }
        unsafeDecode(
            base32Encoded: source,
            into: destination,
            encoded.count
        )
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// The swift to C implementation | |
// | |
// Benchmark driver: decodes the base32 text in `string` 1024 times by calling
// the C routine `__unsafe_b32_decode`.
let data = Array(string.utf8)

// 8 input characters decode to 5 output bytes.
let buffer = UnsafeMutableBufferPointer<UInt8>
    .allocate(capacity: data.count * 5 / 8)
defer { buffer.deallocate() }

print("----")
print("BASE32 DECODE")
print("----")
print("+ size: \(data.count)")
print("+ allocated: \(buffer.count)")

for _ in 0..<(1024) {
    // `Array` always exposes contiguous storage, so use the non-optional
    // accessor instead of discarding the `Void?` from
    // `withContiguousStorageIfAvailable`.
    data.withUnsafeBufferPointer { encoded in
        // The C parameter is declared `int`, which Swift imports as `Int32`,
        // so the `Int` count needs an explicit conversion. `Int32(_:)` traps
        // rather than truncating if the count does not fit.
        __unsafe_b32_decode(
            encoded.baseAddress,
            buffer.baseAddress,
            Int32(encoded.count)
        )
    }
}
Author
kozlowsqi
commented
Mar 12, 2020
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment