Skip to content

Instantly share code, notes, and snippets.

@emmansun
Last active November 2, 2023 02:33
Show Gist options
  • Save emmansun/c0f174a614a005f80f51b033500fd7fc to your computer and use it in GitHub Desktop.
Save emmansun/c0f174a614a005f80f51b033500fd7fc to your computer and use it in GitHub Desktop.
package main
import (
"encoding/base64"
"encoding/binary"
"fmt"
)
// __m128i is a software stand-in for a 128-bit SIMD register, stored as 16
// raw bytes.
type __m128i struct {
// bytes holds the register contents; the helpers in this file index it per
// 8-bit lane and read wider lanes from it with binary.LittleEndian.
bytes [16]byte
}
// set64 builds a register from two 64-bit halves: lo fills bytes 0-7 and hi
// fills bytes 8-15, each stored little-endian.
func set64(hi, lo uint64) (m __m128i) {
	for i, half := range [2]uint64{lo, hi} {
		binary.LittleEndian.PutUint64(m.bytes[8*i:], half)
	}
	return m
}
// mm_set_epi32 builds a register from four 32-bit lanes stored little-endian.
// The first argument lands in the lowest lane (bytes 0-3) and the last in the
// highest lane (bytes 12-15).
func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) {
	for lane, v := range [4]uint32{e0, e1, e2, e3} {
		binary.LittleEndian.PutUint32(m.bytes[4*lane:], v)
	}
	return m
}
// mm_and_si128 returns the bitwise AND of a and b.
func mm_and_si128(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x & b.bytes[i]
	}
	return m
}
// mm_or_si128 returns the bitwise OR of a and b.
func mm_or_si128(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x | b.bytes[i]
	}
	return m
}
// mm_andnot_si128 returns (NOT a) AND b, i.e. the bits of b that are clear
// in a.
func mm_andnot_si128(a, b __m128i) (m __m128i) {
	for i, keep := range b.bytes {
		m.bytes[i] = keep &^ a.bytes[i]
	}
	return m
}
// mm_shuffle_epi8 picks bytes of a according to the control bytes in b.
// A control byte with its top bit set yields 0; otherwise its low four bits
// select the source byte of a.
func mm_shuffle_epi8(a, b __m128i) (m __m128i) {
	for i, sel := range b.bytes {
		if sel&0x80 != 0 {
			continue // m starts zeroed, so this lane stays 0
		}
		m.bytes[i] = a.bytes[sel&0x0f]
	}
	return m
}
// mm_srli_epi32 logically shifts each little-endian 32-bit lane of a right by
// imm8 bits. A count above 31 produces an all-zero register.
func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) {
	for off := 0; off < 16; off += 4 {
		var lane uint32
		if imm8 <= 31 {
			lane = binary.LittleEndian.Uint32(a.bytes[off:]) >> imm8
		}
		binary.LittleEndian.PutUint32(m.bytes[off:], lane)
	}
	return m
}
// mm_slli_epi32 shifts each little-endian 32-bit lane of a left by imm8 bits.
// A count above 31 produces an all-zero register.
func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) {
	for off := 0; off < 16; off += 4 {
		var lane uint32
		if imm8 <= 31 {
			lane = binary.LittleEndian.Uint32(a.bytes[off:]) << imm8
		}
		binary.LittleEndian.PutUint32(m.bytes[off:], lane)
	}
	return m
}
// mm_srli_epi64 logically shifts each little-endian 64-bit lane of a right by
// imm8 bits. A count above 63 produces an all-zero register.
func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) {
	for off := 0; off < 16; off += 8 {
		var lane uint64
		if imm8 <= 63 {
			lane = binary.LittleEndian.Uint64(a.bytes[off:]) >> imm8
		}
		binary.LittleEndian.PutUint64(m.bytes[off:], lane)
	}
	return m
}
// mm_set_epi8 builds a register from bytes listed highest lane first: the
// first argument goes to byte 15, the second to byte 14, and so on. Arguments
// beyond the sixteenth are ignored; lanes without an argument stay zero.
func mm_set_epi8(in ...byte) (m __m128i) {
	if len(in) > 16 {
		in = in[:16]
	}
	for i, v := range in {
		m.bytes[15-i] = v
	}
	return m
}
// mm_setr_epi8 builds a register from bytes listed lowest lane first: the
// first argument goes to byte 0. Arguments beyond the sixteenth are ignored;
// lanes without an argument stay zero.
func mm_setr_epi8(in ...byte) (m __m128i) {
	// copy stops at the shorter of the two, which gives the 16-byte cap.
	copy(m.bytes[:], in)
	return m
}
// mm_set1_epi32 broadcasts a into all four 32-bit lanes, each stored
// little-endian.
func mm_set1_epi32(a uint32) (m __m128i) {
	for i := range m.bytes {
		// Byte i%4 of every lane is the matching byte of a, lowest first.
		m.bytes[i] = byte(a >> (8 * uint(i%4)))
	}
	return m
}
// mm_cmplt_epi8 compares a and b lane by lane as signed 8-bit integers, which
// is how the SSE2 _mm_cmplt_epi8 intrinsic it emulates behaves. Each result
// byte is 0xff where a < b and 0x00 otherwise.
func mm_cmplt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) < int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}
// mm_cmpgt_epi8 compares a and b lane by lane as signed 8-bit integers, which
// is how the SSE2 _mm_cmpgt_epi8 intrinsic it emulates behaves. Each result
// byte is 0xff where a > b and 0x00 otherwise.
func mm_cmpgt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) > int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}
// mm_cmpeq_epi8 compares a and b lane by lane; each result byte is 0xff where
// the two bytes are equal and 0x00 otherwise.
func mm_cmpeq_epi8(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		if x != b.bytes[i] {
			continue // lane stays 0x00
		}
		m.bytes[i] = 0xff
	}
	return m
}
// mm_set1_epi8 broadcasts the two's-complement byte pattern of a into all 16
// lanes.
func mm_set1_epi8(a int8) (m __m128i) {
	fill := byte(a)
	for i := range m.bytes {
		m.bytes[i] = fill
	}
	return m
}
// xor returns the bitwise exclusive OR of a and b.
func xor(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x ^ b.bytes[i]
	}
	return m
}
// mm_subs_epu8 subtracts b from a lane by lane as unsigned bytes, clamping at
// zero instead of wrapping.
func mm_subs_epu8(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		if y := b.bytes[i]; x > y {
			m.bytes[i] = x - y
		}
	}
	return m
}
// mm_add_epi8 adds a and b lane by lane, wrapping modulo 256.
func mm_add_epi8(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x + b.bytes[i]
	}
	return m
}
// mm_sub_epi8 subtracts b from a lane by lane, wrapping modulo 256. The
// wrapped bit pattern is the same whether the lanes are read as signed or
// unsigned.
func mm_sub_epi8(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x - b.bytes[i]
	}
	return m
}
// mm_mulhi_epu16 multiplies the eight little-endian unsigned 16-bit lanes of
// a and b and keeps the upper 16 bits of each 32-bit product.
func mm_mulhi_epu16(a, b __m128i) (m __m128i) {
	for off := 0; off < 16; off += 2 {
		x := uint32(a.bytes[off+1])<<8 | uint32(a.bytes[off])
		y := uint32(b.bytes[off+1])<<8 | uint32(b.bytes[off])
		// The largest product, 0xFFFF*0xFFFF, still fits in a uint32.
		high := uint16((x * y) >> 16)
		m.bytes[off], m.bytes[off+1] = byte(high), byte(high>>8)
	}
	return m
}
// mm_mullo_epi16 multiplies the eight little-endian 16-bit lanes of a and b
// and keeps the lower 16 bits of each product.
func mm_mullo_epi16(a, b __m128i) (m __m128i) {
	for off := 0; off < 16; off += 2 {
		x := uint32(a.bytes[off+1])<<8 | uint32(a.bytes[off])
		y := uint32(b.bytes[off+1])<<8 | uint32(b.bytes[off])
		// Only bits 0-15 are kept, so wrap-around in the upper bits is harmless.
		low := x * y
		m.bytes[off], m.bytes[off+1] = byte(low), byte(low>>8)
	}
	return m
}
// enc_reshuffle converts the 12 input bytes held in bytes 0-11 of in into
// sixteen 6-bit values, one per output byte, in Base64 output order. Bytes
// 12-15 of in are never selected by the shuffle. Every intermediate register
// is printed as hex for debugging.
func enc_reshuffle(in __m128i) __m128i {
	// Shuffle control (arguments run from byte 15 down to byte 0): each
	// 3-byte input group (a, b, c) fills one 32-bit lane with the bytes
	// b, a, c, b, lowest byte first.
	spread := mm_set_epi8(
		10, 11, 9, 10,
		7, 8, 6, 7,
		4, 5, 3, 4,
		1, 2, 0, 1)
	in = mm_shuffle_epi8(in, spread)
	fmt.Printf("%x\n", in.bytes[:])

	// Low 16-bit half: keep the top 6 bits of a. High half: keep the low
	// 4 bits of b joined with the top 2 bits of c.
	evenMasked := mm_and_si128(in, mm_set1_epi32(0x0FC0FC00))
	fmt.Printf("t0=%x\n", evenMasked.bytes[:])

	// Multiply-high by 0x0040 / 0x0400 acts as a right shift by 10 / 6, so
	// both fields end up in bits 0-5 of their 16-bit half.
	evenSextets := mm_mulhi_epu16(evenMasked, mm_set1_epi32(0x04000040))
	fmt.Printf("t1=%x\n", evenSextets.bytes[:])

	// Low half: keep the low 2 bits of a joined with the top 4 bits of b.
	// High half: keep the low 6 bits of c.
	oddMasked := mm_and_si128(in, mm_set1_epi32(0x003F03F0))
	fmt.Printf("t2=%x\n", oddMasked.bytes[:])

	// Multiply-low by 0x0010 / 0x0100 acts as a left shift by 4 / 8, so both
	// fields end up in bits 8-13 of their 16-bit half.
	oddSextets := mm_mullo_epi16(oddMasked, mm_set1_epi32(0x01000010))
	fmt.Printf("t3=%x\n", oddSextets.bytes[:])

	// The two halves occupy disjoint bytes, so OR merges them.
	return mm_or_si128(evenSextets, oddSextets)
}
// enc_translate maps sixteen 6-bit values (0..63, one per byte) to characters
// of the standard Base64 alphabet by adding a per-range offset. Intermediate
// registers are printed as hex for debugging.
//
// Offsets by value range and table index:
//
//	[0..25]  +65  index 0       A-Z
//	[26..51] +71  index 1       a-z
//	[52..61] -4   index 2..11   0-9
//	[62]     -19  index 12      +
//	[63]     -16  index 13      /
func enc_translate(in __m128i) __m128i {
	// Negative offsets are stored as their two's-complement bytes
	// (252 = -4, 237 = -19, 240 = -16).
	offsets := mm_setr_epi8(
		65, 71, 252, 252,
		252, 252, 252, 252,
		252, 252, 252, 252,
		237, 240, 0, 0)
	fmt.Printf("lut=%x\n", offsets.bytes[:])

	// Saturating in-51 gives 0 for values up to 51 and 1..12 for 52..63, which
	// is one less than the wanted index for everything above 25.
	idx := mm_subs_epu8(in, mm_set1_epi8(51))
	fmt.Printf("indices=%x\n", idx.bytes[:])

	// 0xFF (-1) for every value above 25, 0x00 for the A-Z range.
	above25 := mm_cmpgt_epi8(in, mm_set1_epi8(25))
	fmt.Printf("mask=%x\n", above25.bytes[:])

	// Subtracting -1 adds the missing 1 where it is needed.
	idx = mm_sub_epi8(idx, above25)
	fmt.Printf("indices=%x\n", idx.bytes[:])

	// Look up each offset and add it to the input value.
	picked := mm_shuffle_epi8(offsets, idx)
	return mm_add_epi8(in, picked)
}
// mm_movemask_epi8 gathers the top bit of every byte of in into a 16-bit
// mask: bit i of the result is the most significant bit of byte i.
func mm_movemask_epi8(in __m128i) int {
	mask := 0
	for i, v := range in.bytes {
		if v >= 0x80 {
			mask |= 1 << uint(i)
		}
	}
	return mask
}
// mm_maddubs_epi16 emulates the SSSE3 _mm_maddubs_epi16 intrinsic: each
// unsigned byte of a is multiplied by the matching signed byte of b, adjacent
// products are added, and the sum is saturated to the signed 16-bit range
// before being stored little-endian in the corresponding 16-bit lane.
func mm_maddubs_epi16(a, b __m128i) (m __m128i) {
	for i := 0; i < 8; i++ {
		// a is unsigned, b is signed; int32 holds the unsaturated pair sum.
		lo := int32(a.bytes[2*i]) * int32(int8(b.bytes[2*i]))
		hi := int32(a.bytes[2*i+1]) * int32(int8(b.bytes[2*i+1]))
		sum := lo + hi
		switch {
		case sum > 32767:
			sum = 32767
		case sum < -32768:
			sum = -32768
		}
		m.bytes[2*i] = byte(sum)
		m.bytes[2*i+1] = byte(sum >> 8)
	}
	return m
}
// mm_madd_epi16 emulates the SSE2 _mm_madd_epi16 intrinsic: the little-endian
// 16-bit lanes of a and b are multiplied as signed integers and each adjacent
// pair of products is added into one little-endian 32-bit lane.
func mm_madd_epi16(a, b __m128i) (m __m128i) {
	// lane16 reads the signed 16-bit lane that starts at byte off.
	lane16 := func(v *__m128i, off int) int32 {
		return int32(int16(uint16(v.bytes[off]) | uint16(v.bytes[off+1])<<8))
	}
	for i := 0; i < 4; i++ {
		base := 4 * i
		sum := lane16(&a, base+2)*lane16(&b, base+2) + lane16(&a, base)*lane16(&b, base)
		m.bytes[base] = byte(sum)
		m.bytes[base+1] = byte(sum >> 8)
		m.bytes[base+2] = byte(sum >> 16)
		m.bytes[base+3] = byte(sum >> 24)
	}
	return m
}
// dec_reshuffle packs sixteen 6-bit values (one per input byte) into 12
// output bytes in bytes 0-11 of the result; bytes 12-15 are zero.
func dec_reshuffle(in __m128i) __m128i {
	// Per byte pair: first*0x40 + second, giving one 12-bit value in each
	// 16-bit lane.
	pairWeights := mm_set1_epi32(0x01400140)
	twelveBit := mm_maddubs_epi16(in, pairWeights)

	// Per 16-bit pair: low*0x1000 + high, giving one 24-bit value in each
	// 32-bit lane.
	quadWeights := mm_set1_epi32(0x00011000)
	twentyFourBit := mm_madd_epi16(twelveBit, quadWeights)

	// Take bytes 2, 1, 0 of every 32-bit lane so the three payload bytes come
	// out most significant first, packed back to back. Control byte 255 zeroes
	// the four trailing output bytes.
	pack := mm_setr_epi8(
		2, 1, 0,
		6, 5, 4,
		10, 9, 8,
		14, 13, 12,
		255, 255, 255, 255)
	return mm_shuffle_epi8(twentyFourBit, pack)
}
// The input consists of six character sets in the Base64 alphabet, which we
// need to map back to the 6-bit values they represent. There are three ranges,
// two singles, and then there's the rest.
//
// # From To Add Characters
// 1 [43] [62] +19 +
// 2 [47] [63] +16 /
// 3 [48..57] [52..61] +4 0..9
// 4 [65..90] [0..25] -65 A..Z
// 5 [97..122] [26..51] -71 a..z
//
// (6) Everything else => invalid input
//
// We will use lookup tables for character validation and offset computation.
// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
// allows to mask with 0x2F instead of 0x0F and thus save one constant
// declaration (register and/or memory access).
//
// For offsets:
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
// 0000 = garbage
// 0001 = /
// 0010 = +
// 0011 = 0-9
// 0100 = A-Z
// 0101 = A-Z
// 0110 = a-z
// 0111 = a-z
// >= 1000 = garbage
//
// For validation, here's the table.
// A character is valid if and only if the AND of the 2 lookups equals 0:
//
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
//
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
//
// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
//
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
//
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
//
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
//
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
//
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
//
// 0100 0x04 char @ A B C D E F G H I J K L M N O
//
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
//
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 0110 0x04 char ` a b c d e f g h i j k l m n o
//
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0111 0x08 char p q r s t u v w x y z { | } ~
//
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// dec_translate_std validates sixteen characters of the standard Base64
// alphabet and converts them to their 6-bit values. It returns the converted
// register and 1 on success, or a zero register and 0 if any byte is not in
// the alphabet.
func dec_translate_std(in __m128i) (m __m128i, ret int) {
	// Validation tables, indexed by low and high nibble; a byte is valid only
	// when the AND of its two lookups is zero.
	loTable := mm_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A)
	hiTable := mm_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10)
	// Offsets to add, indexed by high nibble; '/' is redirected to index 1.
	deltas := mm_setr_epi8(
		0, 16, 19, 4, 256-65, 256-65, 256-71, 256-71,
		0, 0, 0, 0, 0, 0, 0, 0)

	// 0x2F doubles as the nibble mask (the shuffle only reads the low four
	// bits and the top bit) and as the '/' character compared against below.
	slash := mm_set1_epi8(0x2F)
	hiNib := mm_and_si128(mm_srli_epi32(in, 4), slash)
	loNib := mm_and_si128(in, slash)

	flags := mm_and_si128(mm_shuffle_epi8(loTable, loNib), mm_shuffle_epi8(hiTable, hiNib))
	invalid := mm_cmpgt_epi8(flags, mm_set1_epi8(0))
	if mm_movemask_epi8(invalid) != 0 {
		return __m128i{}, 0
	}

	// Adding 0xFF (-1) for '/' moves its table index from 2 to 1.
	isSlash := mm_cmpeq_epi8(in, slash)
	roll := mm_shuffle_epi8(deltas, mm_add_epi8(isSlash, hiNib))
	// Now simply add the delta values to the input:
	return mm_add_epi8(in, roll), 1
}
// The input consists of six character sets in the Base64 alphabet, which we
// need to map back to the 6-bit values they represent. There are three ranges,
// two singles, and then there's the rest.
//
// # From To Add Characters
// 1 [45] [62] +17 -
// 2 [48..57] [52..61] +4 0..9
// 3 [65..90] [0..25] -65 A..Z
// 4 [95] [63] -32 _
// 5 [97..122] [26..51] -71 a..z
//
// (6) Everything else => invalid input
//
// We will use lookup tables for character validation and offset computation.
//
// For offsets:
// Perfect hash for lut = ((src >> 4) & 0x0F) - ((src > 0x5e) ? 0xFF : 0x00)
// 0000 = garbage
// 0001 = garbage
// 0010 = -
// 0011 = 0-9
// 0100 = A-Z
// 0101 = A-Z
// 0110 = _
// 0111 = a-z
// 1000 = a-z
// > 1000 = garbage
//
// For validation, here's the table.
// A character is valid if and only if the AND of the 2 lookups equals 0:
//
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
//
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1B 0x1B 0x1A 0x1B 0x33
//
// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
//
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
//
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
//
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01
//
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
//
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
//
// 0100 0x04 char @ A B C D E F G H I J K L M N O
//
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
//
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x00
//
// 0110 0x04 char ` a b c d e f g h i j k l m n o
//
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0111 0x28 char p q r s t u v w x y z { | } ~
//
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x20
//
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// dec_translate_url validates sixteen characters of the URL-safe Base64
// alphabet ('-' and '_' instead of '+' and '/') and converts them to their
// 6-bit values. It returns the converted register and 1 on success, or a zero
// register and 0 if any byte is not in the alphabet.
func dec_translate_url(in __m128i) (m __m128i, ret int) {
	// Validation tables, indexed by low and high nibble; a byte is valid only
	// when the AND of its two lookups is zero.
	loTable := mm_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1B, 0x1B, 0x1A, 0x1B, 0x33)
	hiTable := mm_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x28,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10)
	// Offsets to add, indexed by high nibble, plus one for bytes above 0x5E.
	deltas := mm_setr_epi8(
		0, 0, 17, 4, 256-65, 256-65, 256-32, 256-71,
		256-71, 0, 0, 0, 0, 0, 0, 0)

	nibble := mm_set1_epi8(0x0F)
	hiNib := mm_and_si128(mm_srli_epi32(in, 4), nibble)
	loNib := mm_and_si128(in, nibble)

	flags := mm_and_si128(mm_shuffle_epi8(loTable, loNib), mm_shuffle_epi8(hiTable, hiNib))
	invalid := mm_cmpgt_epi8(flags, mm_set1_epi8(0))
	if mm_movemask_epi8(invalid) != 0 {
		return __m128i{}, 0
	}

	// Subtracting 0xFF (-1) bumps the index by one for '_' and a-z.
	above5E := mm_cmpgt_epi8(in, mm_set1_epi8(0x5E))
	roll := mm_shuffle_epi8(deltas, mm_sub_epi8(hiNib, above5E))
	// Now simply add the delta values to the input:
	return mm_add_epi8(in, roll), 1
}
// main decodes one 16-character block with each decoder and prints the 12
// recovered bytes, or an error line if the block is rejected.
func main() {
	// std
	stdText := base64.StdEncoding.EncodeToString([]byte("abcdefghijkl"))
	encoded := mm_setr_epi8([]byte(stdText)...)
	decoded, ret := dec_translate_std(encoded)
	if ret != 1 {
		fmt.Println("invalid base64 encoded")
	} else {
		decoded = dec_reshuffle(decoded)
		fmt.Printf("%v\n", string(decoded.bytes[:12]))
	}

	// url
	urlText := base64.URLEncoding.EncodeToString([]byte("!?$*&()'-=@~"))
	encoded = mm_setr_epi8([]byte(urlText)...)
	decoded, ret = dec_translate_url(encoded)
	if ret != 1 {
		fmt.Println("invalid base64 encoded")
	} else {
		decoded = dec_reshuffle(decoded)
		fmt.Printf("%v\n", string(decoded.bytes[:12]))
	}
}
package main
import (
"encoding/base64"
"encoding/binary"
"fmt"
)
// __m128i is a software stand-in for a 128-bit SIMD register, stored as 16
// raw bytes.
type __m128i struct {
// bytes holds the register contents; the helpers in this file index it per
// 8-bit lane and read wider lanes from it with binary.LittleEndian.
bytes [16]byte
}
// set64 builds a register from two 64-bit halves: lo fills bytes 0-7 and hi
// fills bytes 8-15, each stored little-endian.
func set64(hi, lo uint64) (m __m128i) {
	for i, half := range [2]uint64{lo, hi} {
		binary.LittleEndian.PutUint64(m.bytes[8*i:], half)
	}
	return m
}
// mm_set_epi32 builds a register from four 32-bit lanes stored little-endian.
// The first argument lands in the lowest lane (bytes 0-3) and the last in the
// highest lane (bytes 12-15).
func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) {
	for lane, v := range [4]uint32{e0, e1, e2, e3} {
		binary.LittleEndian.PutUint32(m.bytes[4*lane:], v)
	}
	return m
}
// mm_and_si128 returns the bitwise AND of a and b.
func mm_and_si128(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x & b.bytes[i]
	}
	return m
}
// mm_or_si128 returns the bitwise OR of a and b.
func mm_or_si128(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x | b.bytes[i]
	}
	return m
}
// mm_andnot_si128 returns (NOT a) AND b, i.e. the bits of b that are clear
// in a.
func mm_andnot_si128(a, b __m128i) (m __m128i) {
	for i, keep := range b.bytes {
		m.bytes[i] = keep &^ a.bytes[i]
	}
	return m
}
// mm_shuffle_epi8 picks bytes of a according to the control bytes in b.
// A control byte with its top bit set yields 0; otherwise its low four bits
// select the source byte of a.
func mm_shuffle_epi8(a, b __m128i) (m __m128i) {
	for i, sel := range b.bytes {
		if sel&0x80 != 0 {
			continue // m starts zeroed, so this lane stays 0
		}
		m.bytes[i] = a.bytes[sel&0x0f]
	}
	return m
}
// mm_srli_epi32 logically shifts each little-endian 32-bit lane of a right by
// imm8 bits. A count above 31 produces an all-zero register.
func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) {
	for off := 0; off < 16; off += 4 {
		var lane uint32
		if imm8 <= 31 {
			lane = binary.LittleEndian.Uint32(a.bytes[off:]) >> imm8
		}
		binary.LittleEndian.PutUint32(m.bytes[off:], lane)
	}
	return m
}
// mm_slli_epi32 shifts each little-endian 32-bit lane of a left by imm8 bits.
// A count above 31 produces an all-zero register.
func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) {
	for off := 0; off < 16; off += 4 {
		var lane uint32
		if imm8 <= 31 {
			lane = binary.LittleEndian.Uint32(a.bytes[off:]) << imm8
		}
		binary.LittleEndian.PutUint32(m.bytes[off:], lane)
	}
	return m
}
// mm_srli_epi64 logically shifts each little-endian 64-bit lane of a right by
// imm8 bits. A count above 63 produces an all-zero register.
func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) {
	for off := 0; off < 16; off += 8 {
		var lane uint64
		if imm8 <= 63 {
			lane = binary.LittleEndian.Uint64(a.bytes[off:]) >> imm8
		}
		binary.LittleEndian.PutUint64(m.bytes[off:], lane)
	}
	return m
}
// mm_setr_epi8 builds a register from a byte slice listed lowest lane first:
// in[0] goes to byte 0. Elements beyond the sixteenth are ignored; lanes
// without an element stay zero.
func mm_setr_epi8(in []byte) (m __m128i) {
	// copy stops at the shorter of the two, which gives the 16-byte cap.
	copy(m.bytes[:], in)
	return m
}
// mm_set1_epi32 broadcasts a into all four 32-bit lanes, each stored
// little-endian.
func mm_set1_epi32(a uint32) (m __m128i) {
	for i := range m.bytes {
		// Byte i%4 of every lane is the matching byte of a, lowest first.
		m.bytes[i] = byte(a >> (8 * uint(i%4)))
	}
	return m
}
// mm_cmplt_epi8 compares a and b lane by lane as signed 8-bit integers, which
// is how the SSE2 _mm_cmplt_epi8 intrinsic it emulates behaves. Each result
// byte is 0xff where a < b and 0x00 otherwise.
func mm_cmplt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) < int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}
// mm_set1_epi8 broadcasts the two's-complement byte pattern of a into all 16
// lanes.
func mm_set1_epi8(a int8) (m __m128i) {
	fill := byte(a)
	for i := range m.bytes {
		m.bytes[i] = fill
	}
	return m
}
// mm_add_epi8 adds a and b lane by lane, wrapping modulo 256.
func mm_add_epi8(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x + b.bytes[i]
	}
	return m
}
// xor returns the bitwise exclusive OR of a and b.
func xor(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x ^ b.bytes[i]
	}
	return m
}
// main Base64-encodes the first 12 bytes of a fixed 16-byte string with the
// emulated SIMD helpers, prints the 16 resulting characters, and prints the
// standard library's encoding of the same 12 bytes for comparison.
func main() {
	// Load string:
	input := mm_setr_epi8([]byte("ABCDEFGHIJKLMMMM"))

	// Give every 3-byte group its own 32-bit lane, stored so that the lane
	// reads (as a little-endian integer) first byte on top, and with the third
	// byte duplicated into the lowest position. Having the group in this order
	// lets shifted bits carry across adjacent bytes, and the duplicate means
	// the last sextet needs only a mask, not a shift.
	input = mm_shuffle_epi8(input,
		mm_setr_epi8([]byte{2, 2, 1, 0, 5, 5, 4, 3, 8, 8, 7, 6, 11, 11, 10, 9}))

	// Extract the four 6-bit fields. Field i is obtained by shifting the lane
	// right by shifts[i] and keeping the 6 bits selected by 0x3F000000 moved
	// down i bytes; the last field uses the duplicated byte unshifted.
	var sextets __m128i
	for i, shift := range [4]byte{2, 4, 6, 0} {
		laneMask := mm_set1_epi32(uint32(0x3F000000) >> (8 * uint(i)))
		field := mm_and_si128(mm_srli_epi32(input, shift), laneMask)
		sextets = mm_or_si128(field, sextets)
	}

	// Reverse the bytes of every 32-bit lane so the fields come out in order.
	sextets = mm_shuffle_epi8(sextets,
		mm_setr_epi8([]byte{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}))

	// below reports, per lane, which sextets are less than limit.
	below := func(limit int8) __m128i {
		return mm_cmplt_epi8(sextets, mm_set1_epi8(limit))
	}

	// Build one mask per character set; claimed accumulates the lanes already
	// assigned so each lane ends up in exactly one set.
	// set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	upperMask := below(26)
	claimed := upperMask
	// set 2: 26..51, "abcdefghijklmnopqrstuvwxyz"
	lowerMask := mm_andnot_si128(claimed, below(52))
	claimed = mm_or_si128(lowerMask, claimed)
	// set 3: 52..61, "0123456789"
	digitMask := mm_andnot_si128(claimed, below(62))
	claimed = mm_or_si128(digitMask, claimed)
	// set 4: 62, "+"
	plusMask := mm_andnot_si128(claimed, below(63))
	claimed = mm_or_si128(plusMask, claimed)

	// Translate each set and merge; whatever is still unclaimed becomes '/'.
	result := mm_and_si128(upperMask, mm_add_epi8(sextets, mm_set1_epi8('A')))
	result = mm_or_si128(result, mm_and_si128(lowerMask, mm_add_epi8(sextets, mm_set1_epi8('a'-26))))
	result = mm_or_si128(result, mm_and_si128(digitMask, mm_add_epi8(sextets, mm_set1_epi8('0'-52))))
	result = mm_or_si128(result, mm_and_si128(plusMask, mm_set1_epi8('+')))
	result = mm_or_si128(result, mm_andnot_si128(claimed, mm_set1_epi8('/')))

	fmt.Printf("%s\n", string(result.bytes[:]))
	fmt.Printf("%s\n", base64.StdEncoding.EncodeToString([]byte("ABCDEFGHIJKL")))
}
package main
import (
"encoding/base64"
"encoding/binary"
"fmt"
)
// __m128i is a software stand-in for a 128-bit SIMD register, stored as 16
// raw bytes.
type __m128i struct {
// bytes holds the register contents; the helpers in this file index it per
// 8-bit lane and read wider lanes from it with binary.LittleEndian.
bytes [16]byte
}
// set64 builds a register from two 64-bit halves: lo fills bytes 0-7 and hi
// fills bytes 8-15, each stored little-endian.
func set64(hi, lo uint64) (m __m128i) {
	for i, half := range [2]uint64{lo, hi} {
		binary.LittleEndian.PutUint64(m.bytes[8*i:], half)
	}
	return m
}
// mm_set_epi32 builds a register from four 32-bit lanes stored little-endian.
// The first argument lands in the lowest lane (bytes 0-3) and the last in the
// highest lane (bytes 12-15).
func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) {
	for lane, v := range [4]uint32{e0, e1, e2, e3} {
		binary.LittleEndian.PutUint32(m.bytes[4*lane:], v)
	}
	return m
}
// mm_and_si128 returns the bitwise AND of a and b.
func mm_and_si128(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x & b.bytes[i]
	}
	return m
}
// mm_or_si128 returns the bitwise OR of a and b.
func mm_or_si128(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x | b.bytes[i]
	}
	return m
}
// mm_andnot_si128 returns (NOT a) AND b, i.e. the bits of b that are clear
// in a.
func mm_andnot_si128(a, b __m128i) (m __m128i) {
	for i, keep := range b.bytes {
		m.bytes[i] = keep &^ a.bytes[i]
	}
	return m
}
// mm_shuffle_epi8 picks bytes of a according to the control bytes in b.
// A control byte with its top bit set yields 0; otherwise its low four bits
// select the source byte of a.
func mm_shuffle_epi8(a, b __m128i) (m __m128i) {
	for i, sel := range b.bytes {
		if sel&0x80 != 0 {
			continue // m starts zeroed, so this lane stays 0
		}
		m.bytes[i] = a.bytes[sel&0x0f]
	}
	return m
}
// mm_srli_epi32 logically shifts each little-endian 32-bit lane of a right by
// imm8 bits. A count above 31 produces an all-zero register.
func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) {
	for off := 0; off < 16; off += 4 {
		var lane uint32
		if imm8 <= 31 {
			lane = binary.LittleEndian.Uint32(a.bytes[off:]) >> imm8
		}
		binary.LittleEndian.PutUint32(m.bytes[off:], lane)
	}
	return m
}
// mm_slli_epi32 shifts each little-endian 32-bit lane of a left by imm8 bits.
// A count above 31 produces an all-zero register.
func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) {
	for off := 0; off < 16; off += 4 {
		var lane uint32
		if imm8 <= 31 {
			lane = binary.LittleEndian.Uint32(a.bytes[off:]) << imm8
		}
		binary.LittleEndian.PutUint32(m.bytes[off:], lane)
	}
	return m
}
// mm_srli_epi64 logically shifts each little-endian 64-bit lane of a right by
// imm8 bits. A count above 63 produces an all-zero register.
func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) {
	for off := 0; off < 16; off += 8 {
		var lane uint64
		if imm8 <= 63 {
			lane = binary.LittleEndian.Uint64(a.bytes[off:]) >> imm8
		}
		binary.LittleEndian.PutUint64(m.bytes[off:], lane)
	}
	return m
}
// mm_set_epi8 builds a register from bytes listed highest lane first: the
// first argument goes to byte 15, the second to byte 14, and so on. Arguments
// beyond the sixteenth are ignored; lanes without an argument stay zero.
func mm_set_epi8(in ...byte) (m __m128i) {
	if len(in) > 16 {
		in = in[:16]
	}
	for i, v := range in {
		m.bytes[15-i] = v
	}
	return m
}
// mm_setr_epi8 builds a register from bytes listed lowest lane first: the
// first argument goes to byte 0. Arguments beyond the sixteenth are ignored;
// lanes without an argument stay zero.
func mm_setr_epi8(in ...byte) (m __m128i) {
	// copy stops at the shorter of the two, which gives the 16-byte cap.
	copy(m.bytes[:], in)
	return m
}
// mm_set1_epi32 broadcasts a into all four 32-bit lanes, each stored
// little-endian.
func mm_set1_epi32(a uint32) (m __m128i) {
	for i := range m.bytes {
		// Byte i%4 of every lane is the matching byte of a, lowest first.
		m.bytes[i] = byte(a >> (8 * uint(i%4)))
	}
	return m
}
// mm_cmplt_epi8 compares a and b lane by lane as signed 8-bit integers, which
// is how the SSE2 _mm_cmplt_epi8 intrinsic it emulates behaves. Each result
// byte is 0xff where a < b and 0x00 otherwise.
func mm_cmplt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) < int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}
// mm_cmpgt_epi8 compares a and b lane by lane as signed 8-bit integers, which
// is how the SSE2 _mm_cmpgt_epi8 intrinsic it emulates behaves. Each result
// byte is 0xff where a > b and 0x00 otherwise.
func mm_cmpgt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) > int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}
// mm_set1_epi8 broadcasts the two's-complement byte pattern of a into all 16
// lanes.
func mm_set1_epi8(a int8) (m __m128i) {
	fill := byte(a)
	for i := range m.bytes {
		m.bytes[i] = fill
	}
	return m
}
// xor returns the bitwise exclusive OR of a and b.
func xor(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x ^ b.bytes[i]
	}
	return m
}
// mm_subs_epu8 subtracts b from a lane by lane as unsigned bytes, clamping at
// zero instead of wrapping.
func mm_subs_epu8(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		if y := b.bytes[i]; x > y {
			m.bytes[i] = x - y
		}
	}
	return m
}
// mm_add_epi8 adds a and b lane by lane, wrapping modulo 256.
func mm_add_epi8(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x + b.bytes[i]
	}
	return m
}
// mm_sub_epi8 subtracts b from a lane by lane, wrapping modulo 256. The
// wrapped bit pattern is the same whether the lanes are read as signed or
// unsigned.
func mm_sub_epi8(a, b __m128i) (m __m128i) {
	for i, x := range a.bytes {
		m.bytes[i] = x - b.bytes[i]
	}
	return m
}
// mm_mulhi_epu16 multiplies the eight little-endian unsigned 16-bit lanes of
// a and b and keeps the upper 16 bits of each 32-bit product.
func mm_mulhi_epu16(a, b __m128i) (m __m128i) {
	for off := 0; off < 16; off += 2 {
		x := uint32(a.bytes[off+1])<<8 | uint32(a.bytes[off])
		y := uint32(b.bytes[off+1])<<8 | uint32(b.bytes[off])
		// The largest product, 0xFFFF*0xFFFF, still fits in a uint32.
		high := uint16((x * y) >> 16)
		m.bytes[off], m.bytes[off+1] = byte(high), byte(high>>8)
	}
	return m
}
// mm_mullo_epi16 multiplies the eight little-endian 16-bit lanes of a and b
// and keeps the lower 16 bits of each product.
func mm_mullo_epi16(a, b __m128i) (m __m128i) {
	for off := 0; off < 16; off += 2 {
		x := uint32(a.bytes[off+1])<<8 | uint32(a.bytes[off])
		y := uint32(b.bytes[off+1])<<8 | uint32(b.bytes[off])
		// Only bits 0-15 are kept, so wrap-around in the upper bits is harmless.
		low := x * y
		m.bytes[off], m.bytes[off+1] = byte(low), byte(low>>8)
	}
	return m
}
// enc_reshuffle converts the 12 input bytes held in bytes 0-11 of in into
// sixteen 6-bit values, one per output byte, in Base64 output order. Bytes
// 12-15 of in are never selected by the shuffle. Every intermediate register
// is printed as hex for debugging.
func enc_reshuffle(in __m128i) __m128i {
	// Shuffle control (arguments run from byte 15 down to byte 0): each
	// 3-byte input group (a, b, c) fills one 32-bit lane with the bytes
	// b, a, c, b, lowest byte first.
	spread := mm_set_epi8(
		10, 11, 9, 10,
		7, 8, 6, 7,
		4, 5, 3, 4,
		1, 2, 0, 1)
	in = mm_shuffle_epi8(in, spread)
	fmt.Printf("%x\n", in.bytes[:])

	// Low 16-bit half: keep the top 6 bits of a. High half: keep the low
	// 4 bits of b joined with the top 2 bits of c.
	evenMasked := mm_and_si128(in, mm_set1_epi32(0x0FC0FC00))
	fmt.Printf("t0=%x\n", evenMasked.bytes[:])

	// Multiply-high by 0x0040 / 0x0400 acts as a right shift by 10 / 6, so
	// both fields end up in bits 0-5 of their 16-bit half.
	evenSextets := mm_mulhi_epu16(evenMasked, mm_set1_epi32(0x04000040))
	fmt.Printf("t1=%x\n", evenSextets.bytes[:])

	// Low half: keep the low 2 bits of a joined with the top 4 bits of b.
	// High half: keep the low 6 bits of c.
	oddMasked := mm_and_si128(in, mm_set1_epi32(0x003F03F0))
	fmt.Printf("t2=%x\n", oddMasked.bytes[:])

	// Multiply-low by 0x0010 / 0x0100 acts as a left shift by 4 / 8, so both
	// fields end up in bits 8-13 of their 16-bit half.
	oddSextets := mm_mullo_epi16(oddMasked, mm_set1_epi32(0x01000010))
	fmt.Printf("t3=%x\n", oddSextets.bytes[:])

	// The two halves occupy disjoint bytes, so OR merges them.
	return mm_or_si128(evenSextets, oddSextets)
}
// enc_translate maps sixteen 6-bit values (0..63, one per byte) to characters
// of the standard Base64 alphabet by adding a per-range offset. Intermediate
// registers are printed as hex for debugging.
//
// Offsets by value range and table index:
//
//	[0..25]  +65  index 0       A-Z
//	[26..51] +71  index 1       a-z
//	[52..61] -4   index 2..11   0-9
//	[62]     -19  index 12      +
//	[63]     -16  index 13      /
func enc_translate(in __m128i) __m128i {
	// Negative offsets are stored as their two's-complement bytes
	// (252 = -4, 237 = -19, 240 = -16).
	offsets := mm_setr_epi8(
		65, 71, 252, 252,
		252, 252, 252, 252,
		252, 252, 252, 252,
		237, 240, 0, 0)
	fmt.Printf("lut=%x\n", offsets.bytes[:])

	// Saturating in-51 gives 0 for values up to 51 and 1..12 for 52..63, which
	// is one less than the wanted index for everything above 25.
	idx := mm_subs_epu8(in, mm_set1_epi8(51))
	fmt.Printf("indices=%x\n", idx.bytes[:])

	// 0xFF (-1) for every value above 25, 0x00 for the A-Z range.
	above25 := mm_cmpgt_epi8(in, mm_set1_epi8(25))
	fmt.Printf("mask=%x\n", above25.bytes[:])

	// Subtracting -1 adds the missing 1 where it is needed.
	idx = mm_sub_epi8(idx, above25)
	fmt.Printf("indices=%x\n", idx.bytes[:])

	// Look up each offset and add it to the input value.
	picked := mm_shuffle_epi8(offsets, idx)
	return mm_add_epi8(in, picked)
}
// main encodes the first 12 bytes of a fixed block with the emulated SIMD
// encoder, printing the reshuffled register and the final characters, then
// prints the standard library's encoding of the same 12 bytes for comparison.
func main() {
	// Load string: twelve payload bytes followed by four filler '0' bytes.
	block := mm_setr_epi8([]byte("abcdefghijkl0000")...)

	// Reshuffle:
	sextets := enc_reshuffle(block)
	fmt.Printf("resuffle %x\n", sextets.bytes[:])

	// Translate reshuffled bytes to the Base64 alphabet:
	encoded := enc_translate(sextets)
	fmt.Printf("%s\n", string(encoded.bytes[:]))

	fmt.Printf("%s\n", base64.StdEncoding.EncodeToString([]byte("abcdefghijkl")))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment