Last active
November 2, 2023 02:33
-
-
Save emmansun/c0f174a614a005f80f51b033500fd7fc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/base64" | |
"encoding/binary" | |
"fmt" | |
) | |
// __m128i is a software stand-in for a 128-bit SSE register, stored as 16 raw
// bytes. The helpers in this file treat bytes[0] as the lowest-order byte.
type __m128i struct{ bytes [16]byte }
func set64(hi, lo uint64) (m __m128i) { | |
binary.LittleEndian.PutUint64(m.bytes[:], lo) | |
binary.LittleEndian.PutUint64(m.bytes[8:], hi) | |
return | |
} | |
func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) { | |
binary.LittleEndian.PutUint32(m.bytes[:], e0) | |
binary.LittleEndian.PutUint32(m.bytes[4:], e1) | |
binary.LittleEndian.PutUint32(m.bytes[8:], e2) | |
binary.LittleEndian.PutUint32(m.bytes[12:], e3) | |
return | |
} | |
func mm_and_si128(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] & b.bytes[i] | |
} | |
return | |
} | |
func mm_or_si128(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] | b.bytes[i] | |
} | |
return | |
} | |
func mm_andnot_si128(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = (^a.bytes[i]) & b.bytes[i] | |
} | |
return | |
} | |
func mm_shuffle_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
if b.bytes[i]&0x80 == 0x80 { | |
m.bytes[i] = 0 | |
} else { | |
idx := b.bytes[i] & 0x0f | |
m.bytes[i] = a.bytes[idx] | |
} | |
} | |
return | |
} | |
func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) { | |
e0 := binary.LittleEndian.Uint32(a.bytes[:]) | |
e1 := binary.LittleEndian.Uint32(a.bytes[4:]) | |
e2 := binary.LittleEndian.Uint32(a.bytes[8:]) | |
e3 := binary.LittleEndian.Uint32(a.bytes[12:]) | |
if imm8 > 31 { | |
e0 = 0 | |
e1 = 0 | |
e2 = 0 | |
e3 = 0 | |
} else { | |
e0 = e0 >> imm8 | |
e1 = e1 >> imm8 | |
e2 = e2 >> imm8 | |
e3 = e3 >> imm8 | |
} | |
binary.LittleEndian.PutUint32(m.bytes[:], e0) | |
binary.LittleEndian.PutUint32(m.bytes[4:], e1) | |
binary.LittleEndian.PutUint32(m.bytes[8:], e2) | |
binary.LittleEndian.PutUint32(m.bytes[12:], e3) | |
return | |
} | |
func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) { | |
e0 := binary.LittleEndian.Uint32(a.bytes[:]) | |
e1 := binary.LittleEndian.Uint32(a.bytes[4:]) | |
e2 := binary.LittleEndian.Uint32(a.bytes[8:]) | |
e3 := binary.LittleEndian.Uint32(a.bytes[12:]) | |
if imm8 > 31 { | |
e0 = 0 | |
e1 = 0 | |
e2 = 0 | |
e3 = 0 | |
} else { | |
e0 = e0 << imm8 | |
e1 = e1 << imm8 | |
e2 = e2 << imm8 | |
e3 = e3 << imm8 | |
} | |
binary.LittleEndian.PutUint32(m.bytes[:], e0) | |
binary.LittleEndian.PutUint32(m.bytes[4:], e1) | |
binary.LittleEndian.PutUint32(m.bytes[8:], e2) | |
binary.LittleEndian.PutUint32(m.bytes[12:], e3) | |
return | |
} | |
func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) { | |
lo := binary.LittleEndian.Uint64(a.bytes[:]) | |
hi := binary.LittleEndian.Uint64(a.bytes[8:]) | |
if imm8 > 63 { | |
lo = 0 | |
hi = 0 | |
} else { | |
lo = lo >> imm8 | |
hi = hi >> imm8 | |
} | |
binary.LittleEndian.PutUint64(m.bytes[:], lo) | |
binary.LittleEndian.PutUint64(m.bytes[8:], hi) | |
return | |
} | |
func mm_set_epi8(in ...byte) (m __m128i) { | |
n := len(in) | |
if n > 16 { | |
n = 16 | |
} | |
for i := 0; i < n; i++ { | |
m.bytes[15-i] = in[i] | |
} | |
return | |
} | |
func mm_setr_epi8(in ...byte) (m __m128i) { | |
n := len(in) | |
if n > 16 { | |
n = 16 | |
} | |
for i := 0; i < n; i++ { | |
m.bytes[i] = in[i] | |
} | |
return | |
} | |
func mm_set1_epi32(a uint32) (m __m128i) { | |
return mm_set_epi32(a, a, a, a) | |
} | |
func mm_cmplt_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
if a.bytes[i] < b.bytes[i] { | |
m.bytes[i] = 0xff | |
} else { | |
m.bytes[i] = 0x00 | |
} | |
} | |
return | |
} | |
func mm_cmpgt_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
if a.bytes[i] > b.bytes[i] { | |
m.bytes[i] = 0xff | |
} else { | |
m.bytes[i] = 0x00 | |
} | |
} | |
return | |
} | |
func mm_cmpeq_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
if a.bytes[i] == b.bytes[i] { | |
m.bytes[i] = 0xff | |
} else { | |
m.bytes[i] = 0x00 | |
} | |
} | |
return | |
} | |
func mm_set1_epi8(a int8) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = byte(a) | |
} | |
return | |
} | |
func xor(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] ^ b.bytes[i] | |
} | |
return | |
} | |
func mm_subs_epu8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = 0 | |
if a.bytes[i] > b.bytes[i] { | |
m.bytes[i] = a.bytes[i] - b.bytes[i] | |
} | |
} | |
return | |
} | |
func mm_add_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] + b.bytes[i] | |
} | |
return | |
} | |
func mm_sub_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = byte(int8(a.bytes[i]) - int8(b.bytes[i])) | |
} | |
return | |
} | |
func mm_mulhi_epu16(a, b __m128i) (m __m128i) { | |
for i := 0; i < 8; i++ { | |
t1 := uint32(a.bytes[2*i]) | uint32(a.bytes[2*i+1])<<8 | |
t2 := uint32(b.bytes[2*i]) | uint32(b.bytes[2*i+1])<<8 | |
t3 := (t1 * t2) >> 16 | |
m.bytes[2*i] = byte(t3) | |
m.bytes[2*i+1] = byte(t3 >> 8) | |
} | |
return | |
} | |
func mm_mullo_epi16(a, b __m128i) (m __m128i) { | |
for i := 0; i < 8; i++ { | |
t1 := uint32(a.bytes[2*i]) | uint32(a.bytes[2*i+1])<<8 | |
t2 := uint32(b.bytes[2*i]) | uint32(b.bytes[2*i+1])<<8 | |
t3 := int32(t1) * int32(t2) | |
m.bytes[2*i] = byte(t3) | |
m.bytes[2*i+1] = byte(t3 >> 8) | |
} | |
return | |
} | |
func enc_reshuffle(in __m128i) __m128i { | |
// Input, bytes MSB to LSB: | |
// 0 0 0 0 l k j i h g f e d c b a | |
in = mm_shuffle_epi8(in, mm_set_epi8( | |
10, 11, 9, 10, | |
7, 8, 6, 7, | |
4, 5, 3, 4, | |
1, 2, 0, 1)) | |
fmt.Printf("%x\n", (in.bytes[:])) | |
// in, bytes MSB to LSB: | |
// k l j k | |
// h i g h | |
// e f d e | |
// b c a b | |
t0 := mm_and_si128(in, mm_set1_epi32(0x0FC0FC00)) | |
// bits, upper case are most significant bits, lower case are least significant bits | |
// 0000kkkk LL000000 JJJJJJ00 00000000 | |
// 0000hhhh II000000 GGGGGG00 00000000 | |
// 0000eeee FF000000 DDDDDD00 00000000 | |
// 0000bbbb CC000000 AAAAAA00 00000000 | |
fmt.Printf("t0=%x\n", (t0.bytes[:])) | |
t1 := mm_mulhi_epu16(t0, mm_set1_epi32(0x04000040)) | |
// 00000000 00kkkkLL 00000000 00JJJJJJ | |
// 00000000 00hhhhII 00000000 00GGGGGG | |
// 00000000 00eeeeFF 00000000 00DDDDDD | |
// 00000000 00bbbbCC 00000000 00AAAAAA | |
fmt.Printf("t1=%x\n", (t1.bytes[:])) | |
t2 := mm_and_si128(in, mm_set1_epi32(0x003F03F0)) | |
// 00000000 00llllll 000000jj KKKK0000 | |
// 00000000 00iiiiii 000000gg HHHH0000 | |
// 00000000 00ffffff 000000dd EEEE0000 | |
// 00000000 00cccccc 000000aa BBBB0000 | |
fmt.Printf("t2=%x\n", (t2.bytes[:])) | |
t3 := mm_mullo_epi16(t2, mm_set1_epi32(0x01000010)) | |
// 00llllll 00000000 00jjKKKK 00000000 | |
// 00iiiiii 00000000 00ggHHHH 00000000 | |
// 00ffffff 00000000 00ddEEEE 00000000 | |
// 00cccccc 00000000 00aaBBBB 00000000 | |
fmt.Printf("t3=%x\n", (t3.bytes[:])) | |
return mm_or_si128(t1, t3) | |
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ | |
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG | |
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD | |
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA | |
} | |
func enc_translate(in __m128i) __m128i { | |
lut := mm_setr_epi8( | |
65, 71, 252, 252, | |
252, 252, 252, 252, | |
252, 252, 252, 252, | |
237, 240, 0, 0) | |
fmt.Printf("lut=%x\n", (lut.bytes[:])) | |
// Translate values 0..63 to the Base64 alphabet. There are five sets: | |
// # From To Abs Index Characters | |
// 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ | |
// 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz | |
// 2 [52..61] [48..57] -4 [2..11] 0123456789 | |
// 3 [62] [43] -19 12 + | |
// 4 [63] [47] -16 13 / | |
// Create LUT indices from the input. The index for range #0 is right, | |
// others are 1 less than expected: | |
indices := mm_subs_epu8(in, mm_set1_epi8(51)) | |
fmt.Printf("indices=%x\n", (indices.bytes[:])) | |
// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: | |
mask := mm_cmpgt_epi8(in, mm_set1_epi8(25)) | |
fmt.Printf("mask=%x\n", (mask.bytes[:])) | |
// Subtract -1, so add 1 to indices for range #[1..4]. All indices are | |
// now correct: | |
indices = mm_sub_epi8(indices, mask) | |
fmt.Printf("indices=%x\n", (indices.bytes[:])) | |
// Add offsets to input values: | |
return mm_add_epi8(in, mm_shuffle_epi8(lut, indices)) | |
} | |
func mm_movemask_epi8(in __m128i) int { | |
ret := 0 | |
for i := 0; i < 16; i++ { | |
ret |= int((in.bytes[i]&0x80)>>7) << i | |
} | |
return ret | |
} | |
func mm_maddubs_epi16(a, b __m128i) (m __m128i) { | |
for i := 0; i < 8; i++ { | |
ret := int16(a.bytes[2*i+1])*int16(b.bytes[2*i+1]) + int16(a.bytes[2*i])*int16(b.bytes[2*i]) | |
m.bytes[2*i] = byte(ret) | |
m.bytes[2*i+1] = byte(ret >> 8) | |
} | |
return | |
} | |
func mm_madd_epi16(a, b __m128i) (m __m128i) { | |
for i := 0; i < 4; i++ { | |
ah := int32(a.bytes[4*i+2]) | (int32(a.bytes[4*i+3]) << 8) | |
al := int32(a.bytes[4*i]) | (int32(a.bytes[4*i+1]) << 8) | |
bh := int32(b.bytes[4*i+2]) | (int32(b.bytes[4*i+3]) << 8) | |
bl := int32(b.bytes[4*i]) | (int32(b.bytes[4*i+1]) << 8) | |
ret := ah*bh + al*bl | |
m.bytes[4*i] = byte(ret) | |
m.bytes[4*i+1] = byte(ret >> 8) | |
m.bytes[4*i+2] = byte(ret >> 16) | |
m.bytes[4*i+3] = byte(ret >> 24) | |
} | |
return | |
} | |
func dec_reshuffle(in __m128i) __m128i { | |
// in, bits, upper case are most significant bits, lower case are least significant bits | |
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ | |
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG | |
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD | |
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA | |
merge_ab_and_bc := mm_maddubs_epi16(in, mm_set1_epi32(0x01400140)) | |
// 0000kkkk LLllllll 0000JJJJ JJjjKKKK | |
// 0000hhhh IIiiiiii 0000GGGG GGggHHHH | |
// 0000eeee FFffffff 0000DDDD DDddEEEE | |
// 0000bbbb CCcccccc 0000AAAA AAaaBBBB | |
out := mm_madd_epi16(merge_ab_and_bc, mm_set1_epi32(0x00011000)) | |
// 00000000 JJJJJJjj KKKKkkkk LLllllll | |
// 00000000 GGGGGGgg HHHHhhhh IIiiiiii | |
// 00000000 DDDDDDdd EEEEeeee FFffffff | |
// 00000000 AAAAAAaa BBBBbbbb CCcccccc | |
return mm_shuffle_epi8(out, mm_setr_epi8(2, 1, 0, | |
6, 5, 4, | |
10, 9, 8, | |
14, 13, 12, | |
255, 255, 255, 255)) | |
// 00000000 00000000 00000000 00000000 | |
// LLllllll KKKKkkkk JJJJJJjj IIiiiiii | |
// HHHHhhhh GGGGGGgg FFffffff EEEEeeee | |
// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa | |
} | |
// The input consists of six character sets in the Base64 alphabet, which we | |
// need to map back to the 6-bit values they represent. There are three ranges, | |
// two singles, and then there's the rest. | |
// | |
// # From To Add Characters | |
// 1 [43] [62] +19 + | |
// 2 [47] [63] +16 / | |
// 3 [48..57] [52..61] +4 0..9 | |
// 4 [65..90] [0..25] -65 A..Z | |
// 5 [97..122] [26..51] -71 a..z | |
// | |
// (6) Everything else => invalid input | |
// | |
// We will use lookup tables for character validation and offset computation. | |
// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this | |
// allows to mask with 0x2F instead of 0x0F and thus save one constant | |
// declaration (register and/or memory access). | |
// | |
// For offsets: | |
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00) | |
// 0000 = garbage | |
// 0001 = / | |
// 0010 = + | |
// 0011 = 0-9 | |
// 0100 = A-Z | |
// 0101 = A-Z | |
// 0110 = a-z | |
// 0111 = a-z | |
// 1000 >= garbage | |
// | |
// For validation, here's the table. | |
// A character is valid if and only if the AND of the 2 lookups equals 0: | |
// | |
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 | |
// | |
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A | |
// | |
// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI | |
// | |
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// | |
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US | |
// | |
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// | |
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . / | |
// | |
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00 | |
// | |
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ? | |
// | |
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02 | |
// | |
// 0100 0x04 char @ A B C D E F G H I J K L M N O | |
// | |
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 | |
// | |
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _ | |
// | |
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 | |
// | |
// 0110 0x04 char ` a b c d e f g h i j k l m n o | |
// | |
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 | |
// | |
// 0111 0x08 char p q r s t u v w x y z { | } ~ | |
// | |
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 | |
// | |
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
func dec_translate_std(in __m128i) (m __m128i, ret int) { | |
lut_lo := mm_setr_epi8( | |
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, | |
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A) | |
lut_hi := mm_setr_epi8(0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, | |
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10) | |
lut_roll := mm_setr_epi8(0, 16, 19, 4, 256-65, 256-65, 256-71, 256-71, | |
0, 0, 0, 0, 0, 0, 0, 0) | |
mask_2F := mm_set1_epi8(0x2F) | |
hi_nibbles := mm_and_si128(mm_srli_epi32(in, 4), mask_2F) | |
lo_nibbles := mm_and_si128(in, mask_2F) | |
hi := mm_shuffle_epi8(lut_hi, hi_nibbles) | |
lo := mm_shuffle_epi8(lut_lo, lo_nibbles) | |
if mm_movemask_epi8(mm_cmpgt_epi8(mm_and_si128(lo, hi), mm_set1_epi8(0))) != 0 { | |
return | |
} | |
eq_2F := mm_cmpeq_epi8(in, mask_2F) | |
roll := mm_shuffle_epi8(lut_roll, mm_add_epi8(eq_2F, hi_nibbles)) | |
// Now simply add the delta values to the input: | |
return mm_add_epi8(in, roll), 1 | |
} | |
// The input consists of six character sets in the Base64 alphabet, which we | |
// need to map back to the 6-bit values they represent. There are three ranges, | |
// two singles, and then there's the rest. | |
// | |
// # From To Add Characters | |
// 1 [45] [62] +17 - | |
// 2 [48..57] [52..61] +4 0..9 | |
// 3 [65..90] [0..25] -65 A..Z | |
// 4 [95] [63] -32 _ | |
// 5 [97..122] [26..51] -71 a..z | |
// | |
// (6) Everything else => invalid input | |
// | |
// We will use lookup tables for character validation and offset computation. | |
// | |
// For offsets: | |
// Perfect hash for lut = ((src >> 4) & 0x0F) - ((src > 0x5e) ? 0xFF : 0x00) | |
// 0000 = garbage | |
// 0001 = garbage | |
// 0010 = - | |
// 0011 = 0-9 | |
// 0100 = A-Z | |
// 0101 = A-Z | |
// 0110 = _ | |
// 0111 = a-z | |
// 1000 = a-z | |
// 1000 > garbage | |
// | |
// For validation, here's the table. | |
// A character is valid if and only if the AND of the 2 lookups equals 0: | |
// | |
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 | |
// | |
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1B 0x1B 0x1A 0x1B 0x33 | |
// | |
// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI | |
// | |
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// | |
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US | |
// | |
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// | |
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . / | |
// | |
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 | |
// | |
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ? | |
// | |
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02 | |
// | |
// 0100 0x04 char @ A B C D E F G H I J K L M N O | |
// | |
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 | |
// | |
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _ | |
// | |
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x00 | |
// | |
// 0110 0x04 char ` a b c d e f g h i j k l m n o | |
// | |
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 | |
// | |
// 0111 0x28 char p q r s t u v w x y z { | } ~ | |
// | |
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x20 | |
// | |
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 | |
func dec_translate_url(in __m128i) (m __m128i, ret int) { | |
lut_lo := mm_setr_epi8( | |
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, | |
0x11, 0x11, 0x13, 0x1B, 0x1B, 0x1A, 0x1B, 0x33) | |
lut_hi := mm_setr_epi8(0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x28, | |
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10) | |
lut_roll := mm_setr_epi8(0, 0, 17, 4, 256-65, 256-65, 256-32, 256-71, 256-71, 0, 0, 0, 0, 0, 0, 0) | |
mask_0F := mm_set1_epi8(0x0F) | |
hi_nibbles := mm_and_si128(mm_srli_epi32(in, 4), mask_0F) | |
lo_nibbles := mm_and_si128(in, mask_0F) | |
hi := mm_shuffle_epi8(lut_hi, hi_nibbles) | |
lo := mm_shuffle_epi8(lut_lo, lo_nibbles) | |
if mm_movemask_epi8(mm_cmpgt_epi8(mm_and_si128(lo, hi), mm_set1_epi8(0))) != 0 { | |
return | |
} | |
gt_5e := mm_cmpgt_epi8(in, mm_set1_epi8(0x5E)) | |
roll := mm_shuffle_epi8(lut_roll, mm_sub_epi8(hi_nibbles, gt_5e)) | |
// Now simply add the delta values to the input: | |
return mm_add_epi8(in, roll), 1 | |
} | |
func main() { | |
// std | |
encoded := mm_setr_epi8([]byte(base64.StdEncoding.EncodeToString([]byte("abcdefghijkl")))...) | |
decoded, ret := dec_translate_std(encoded) | |
if ret == 1 { | |
decoded = dec_reshuffle(decoded) | |
fmt.Printf("%v\n", string(decoded.bytes[:12])) | |
} else { | |
fmt.Println("invalid base64 encoded") | |
} | |
// url | |
encoded = mm_setr_epi8([]byte(base64.URLEncoding.EncodeToString([]byte("!?$*&()'-=@~")))...) | |
decoded, ret = dec_translate_url(encoded) | |
if ret == 1 { | |
decoded = dec_reshuffle(decoded) | |
fmt.Printf("%v\n", string(decoded.bytes[:12])) | |
} else { | |
fmt.Println("invalid base64 encoded") | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/base64" | |
"encoding/binary" | |
"fmt" | |
) | |
// __m128i is a software stand-in for a 128-bit SSE register, stored as 16 raw
// bytes. The helpers in this file treat bytes[0] as the lowest-order byte.
type __m128i struct{ bytes [16]byte }
func set64(hi, lo uint64) (m __m128i) { | |
binary.LittleEndian.PutUint64(m.bytes[:], lo) | |
binary.LittleEndian.PutUint64(m.bytes[8:], hi) | |
return | |
} | |
func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) { | |
binary.LittleEndian.PutUint32(m.bytes[:], e0) | |
binary.LittleEndian.PutUint32(m.bytes[4:], e1) | |
binary.LittleEndian.PutUint32(m.bytes[8:], e2) | |
binary.LittleEndian.PutUint32(m.bytes[12:], e3) | |
return | |
} | |
func mm_and_si128(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] & b.bytes[i] | |
} | |
return | |
} | |
func mm_or_si128(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] | b.bytes[i] | |
} | |
return | |
} | |
func mm_andnot_si128(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = (^a.bytes[i]) & b.bytes[i] | |
} | |
return | |
} | |
func mm_shuffle_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
if b.bytes[i]&0x80 == 0x80 { | |
m.bytes[i] = 0 | |
} else { | |
idx := b.bytes[i] & 0x0f | |
m.bytes[i] = a.bytes[idx] | |
} | |
} | |
return | |
} | |
func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) { | |
e0 := binary.LittleEndian.Uint32(a.bytes[:]) | |
e1 := binary.LittleEndian.Uint32(a.bytes[4:]) | |
e2 := binary.LittleEndian.Uint32(a.bytes[8:]) | |
e3 := binary.LittleEndian.Uint32(a.bytes[12:]) | |
if imm8 > 31 { | |
e0 = 0 | |
e1 = 0 | |
e2 = 0 | |
e3 = 0 | |
} else { | |
e0 = e0 >> imm8 | |
e1 = e1 >> imm8 | |
e2 = e2 >> imm8 | |
e3 = e3 >> imm8 | |
} | |
binary.LittleEndian.PutUint32(m.bytes[:], e0) | |
binary.LittleEndian.PutUint32(m.bytes[4:], e1) | |
binary.LittleEndian.PutUint32(m.bytes[8:], e2) | |
binary.LittleEndian.PutUint32(m.bytes[12:], e3) | |
return | |
} | |
func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) { | |
e0 := binary.LittleEndian.Uint32(a.bytes[:]) | |
e1 := binary.LittleEndian.Uint32(a.bytes[4:]) | |
e2 := binary.LittleEndian.Uint32(a.bytes[8:]) | |
e3 := binary.LittleEndian.Uint32(a.bytes[12:]) | |
if imm8 > 31 { | |
e0 = 0 | |
e1 = 0 | |
e2 = 0 | |
e3 = 0 | |
} else { | |
e0 = e0 << imm8 | |
e1 = e1 << imm8 | |
e2 = e2 << imm8 | |
e3 = e3 << imm8 | |
} | |
binary.LittleEndian.PutUint32(m.bytes[:], e0) | |
binary.LittleEndian.PutUint32(m.bytes[4:], e1) | |
binary.LittleEndian.PutUint32(m.bytes[8:], e2) | |
binary.LittleEndian.PutUint32(m.bytes[12:], e3) | |
return | |
} | |
func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) { | |
lo := binary.LittleEndian.Uint64(a.bytes[:]) | |
hi := binary.LittleEndian.Uint64(a.bytes[8:]) | |
if imm8 > 63 { | |
lo = 0 | |
hi = 0 | |
} else { | |
lo = lo >> imm8 | |
hi = hi >> imm8 | |
} | |
binary.LittleEndian.PutUint64(m.bytes[:], lo) | |
binary.LittleEndian.PutUint64(m.bytes[8:], hi) | |
return | |
} | |
func mm_setr_epi8(in []byte) (m __m128i) { | |
n := len(in) | |
if n > 16 { | |
n = 16 | |
} | |
for i := 0; i < n; i++ { | |
m.bytes[i] = in[i] | |
} | |
return | |
} | |
func mm_set1_epi32(a uint32) (m __m128i) { | |
return mm_set_epi32(a, a, a, a) | |
} | |
func mm_cmplt_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
if a.bytes[i] < b.bytes[i] { | |
m.bytes[i] = 0xff | |
} else { | |
m.bytes[i] = 0x00 | |
} | |
} | |
return | |
} | |
func mm_set1_epi8(a int8) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = byte(a) | |
} | |
return | |
} | |
func mm_add_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] + b.bytes[i] | |
} | |
return | |
} | |
func xor(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] ^ b.bytes[i] | |
} | |
return | |
} | |
func main() { | |
// Load string: | |
str := mm_setr_epi8([]byte("ABCDEFGHIJKLMMMM")) | |
// Reorder to 32-bit big-endian, duplicating the third byte in every block of four. | |
// This copies the third byte to its final destination, so we can include it later | |
// by just masking instead of shifting and masking. | |
// The workset must be in big-endian, otherwise the shifted bits do not carry over | |
// properly among adjacent bytes: | |
str = mm_shuffle_epi8(str, | |
mm_setr_epi8([]byte{2, 2, 1, 0, 5, 5, 4, 3, 8, 8, 7, 6, 11, 11, 10, 9})) | |
// Mask to pass through only the lower 6 bits of one byte; | |
mask := mm_set1_epi32(0x3F000000) | |
// Shift bits by 2, mask in only the first byte: | |
res := mm_srli_epi32(str, 2) | |
res = mm_and_si128(res, mask) | |
mask = mm_srli_epi32(mask, 8) | |
// Shift bits by 4, mask in only the second byte: | |
res = mm_or_si128(mm_and_si128(mm_srli_epi32(str, 4), mask), res) | |
mask = mm_srli_epi32(mask, 8) | |
// Shift bits by 6, mask in only the third byte: | |
res = mm_or_si128(mm_and_si128(mm_srli_epi32(str, 6), mask), res) | |
mask = mm_srli_epi32(mask, 8) | |
// No shift necessary for the fourth byte because we duplicated | |
// the third byte to this position; just mask: | |
res = mm_or_si128(mm_and_si128(str, mask), res) | |
// Reorder to 32-bit little-endian: | |
res = mm_shuffle_epi8(res, | |
mm_setr_epi8([]byte{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12})) | |
// set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ" | |
s1mask := mm_cmplt_epi8(res, mm_set1_epi8(26)) | |
blockmask := s1mask | |
// set 2: 26..51, "abcdefghijklmnopqrstuvwxyz" | |
s2mask := mm_andnot_si128(blockmask, mm_cmplt_epi8(res, mm_set1_epi8(52))) | |
blockmask = mm_or_si128(s2mask, blockmask) | |
// set 3: 52..61, "0123456789" | |
s3mask := mm_andnot_si128(blockmask, mm_cmplt_epi8(res, mm_set1_epi8(62))) | |
blockmask = mm_or_si128(s3mask, blockmask) | |
s4mask := mm_andnot_si128(blockmask, mm_cmplt_epi8(res, mm_set1_epi8(63))) | |
blockmask = mm_or_si128(s4mask, blockmask) | |
// Create the masked character sets: | |
s1 := mm_and_si128(s1mask, mm_add_epi8(res, mm_set1_epi8('A'))) | |
s2 := mm_and_si128(s2mask, mm_add_epi8(res, mm_set1_epi8('a'-26))) | |
s3 := mm_and_si128(s3mask, mm_add_epi8(res, mm_set1_epi8('0'-52))) | |
s4 := mm_and_si128(s4mask, mm_set1_epi8('+')) | |
s5 := mm_andnot_si128(blockmask, mm_set1_epi8('/')) | |
result := mm_or_si128(s1, s2) | |
result = mm_or_si128(result, s3) | |
result = mm_or_si128(result, s4) | |
result = mm_or_si128(result, s5) | |
fmt.Printf("%s\n", string(result.bytes[:])) | |
fmt.Printf("%s\n", base64.StdEncoding.EncodeToString([]byte("ABCDEFGHIJKL"))) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/base64" | |
"encoding/binary" | |
"fmt" | |
) | |
// __m128i is a software stand-in for a 128-bit SSE register, stored as 16 raw
// bytes. The helpers in this file treat bytes[0] as the lowest-order byte.
type __m128i struct{ bytes [16]byte }
func set64(hi, lo uint64) (m __m128i) { | |
binary.LittleEndian.PutUint64(m.bytes[:], lo) | |
binary.LittleEndian.PutUint64(m.bytes[8:], hi) | |
return | |
} | |
func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) { | |
binary.LittleEndian.PutUint32(m.bytes[:], e0) | |
binary.LittleEndian.PutUint32(m.bytes[4:], e1) | |
binary.LittleEndian.PutUint32(m.bytes[8:], e2) | |
binary.LittleEndian.PutUint32(m.bytes[12:], e3) | |
return | |
} | |
func mm_and_si128(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] & b.bytes[i] | |
} | |
return | |
} | |
func mm_or_si128(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] | b.bytes[i] | |
} | |
return | |
} | |
func mm_andnot_si128(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = (^a.bytes[i]) & b.bytes[i] | |
} | |
return | |
} | |
func mm_shuffle_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
if b.bytes[i]&0x80 == 0x80 { | |
m.bytes[i] = 0 | |
} else { | |
idx := b.bytes[i] & 0x0f | |
m.bytes[i] = a.bytes[idx] | |
} | |
} | |
return | |
} | |
func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) { | |
e0 := binary.LittleEndian.Uint32(a.bytes[:]) | |
e1 := binary.LittleEndian.Uint32(a.bytes[4:]) | |
e2 := binary.LittleEndian.Uint32(a.bytes[8:]) | |
e3 := binary.LittleEndian.Uint32(a.bytes[12:]) | |
if imm8 > 31 { | |
e0 = 0 | |
e1 = 0 | |
e2 = 0 | |
e3 = 0 | |
} else { | |
e0 = e0 >> imm8 | |
e1 = e1 >> imm8 | |
e2 = e2 >> imm8 | |
e3 = e3 >> imm8 | |
} | |
binary.LittleEndian.PutUint32(m.bytes[:], e0) | |
binary.LittleEndian.PutUint32(m.bytes[4:], e1) | |
binary.LittleEndian.PutUint32(m.bytes[8:], e2) | |
binary.LittleEndian.PutUint32(m.bytes[12:], e3) | |
return | |
} | |
func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) { | |
e0 := binary.LittleEndian.Uint32(a.bytes[:]) | |
e1 := binary.LittleEndian.Uint32(a.bytes[4:]) | |
e2 := binary.LittleEndian.Uint32(a.bytes[8:]) | |
e3 := binary.LittleEndian.Uint32(a.bytes[12:]) | |
if imm8 > 31 { | |
e0 = 0 | |
e1 = 0 | |
e2 = 0 | |
e3 = 0 | |
} else { | |
e0 = e0 << imm8 | |
e1 = e1 << imm8 | |
e2 = e2 << imm8 | |
e3 = e3 << imm8 | |
} | |
binary.LittleEndian.PutUint32(m.bytes[:], e0) | |
binary.LittleEndian.PutUint32(m.bytes[4:], e1) | |
binary.LittleEndian.PutUint32(m.bytes[8:], e2) | |
binary.LittleEndian.PutUint32(m.bytes[12:], e3) | |
return | |
} | |
func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) { | |
lo := binary.LittleEndian.Uint64(a.bytes[:]) | |
hi := binary.LittleEndian.Uint64(a.bytes[8:]) | |
if imm8 > 63 { | |
lo = 0 | |
hi = 0 | |
} else { | |
lo = lo >> imm8 | |
hi = hi >> imm8 | |
} | |
binary.LittleEndian.PutUint64(m.bytes[:], lo) | |
binary.LittleEndian.PutUint64(m.bytes[8:], hi) | |
return | |
} | |
func mm_set_epi8(in ...byte) (m __m128i) { | |
n := len(in) | |
if n > 16 { | |
n = 16 | |
} | |
for i := 0; i < n; i++ { | |
m.bytes[15-i] = in[i] | |
} | |
return | |
} | |
func mm_setr_epi8(in ...byte) (m __m128i) { | |
n := len(in) | |
if n > 16 { | |
n = 16 | |
} | |
for i := 0; i < n; i++ { | |
m.bytes[i] = in[i] | |
} | |
return | |
} | |
func mm_set1_epi32(a uint32) (m __m128i) { | |
return mm_set_epi32(a, a, a, a) | |
} | |
func mm_cmplt_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
if a.bytes[i] < b.bytes[i] { | |
m.bytes[i] = 0xff | |
} else { | |
m.bytes[i] = 0x00 | |
} | |
} | |
return | |
} | |
func mm_cmpgt_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
if a.bytes[i] > b.bytes[i] { | |
m.bytes[i] = 0xff | |
} else { | |
m.bytes[i] = 0x00 | |
} | |
} | |
return | |
} | |
func mm_set1_epi8(a int8) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = byte(a) | |
} | |
return | |
} | |
func xor(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] ^ b.bytes[i] | |
} | |
return | |
} | |
func mm_subs_epu8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = 0 | |
if a.bytes[i] > b.bytes[i] { | |
m.bytes[i] = a.bytes[i] - b.bytes[i] | |
} | |
} | |
return | |
} | |
func mm_add_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = a.bytes[i] + b.bytes[i] | |
} | |
return | |
} | |
func mm_sub_epi8(a, b __m128i) (m __m128i) { | |
for i := 0; i < 16; i++ { | |
m.bytes[i] = byte(int8(a.bytes[i]) - int8(b.bytes[i])) | |
} | |
return | |
} | |
func mm_mulhi_epu16(a, b __m128i) (m __m128i) { | |
for i := 0; i < 8; i++ { | |
t1 := uint32(a.bytes[2*i]) | uint32(a.bytes[2*i+1])<<8 | |
t2 := uint32(b.bytes[2*i]) | uint32(b.bytes[2*i+1])<<8 | |
t3 := (t1 * t2) >> 16 | |
m.bytes[2*i] = byte(t3) | |
m.bytes[2*i+1] = byte(t3 >> 8) | |
} | |
return | |
} | |
func mm_mullo_epi16(a, b __m128i) (m __m128i) { | |
for i := 0; i < 8; i++ { | |
t1 := uint32(a.bytes[2*i]) | uint32(a.bytes[2*i+1])<<8 | |
t2 := uint32(b.bytes[2*i]) | uint32(b.bytes[2*i+1])<<8 | |
t3 := int32(t1) * int32(t2) | |
m.bytes[2*i] = byte(t3) | |
m.bytes[2*i+1] = byte(t3 >> 8) | |
} | |
return | |
} | |
func enc_reshuffle(in __m128i) __m128i { | |
// Input, bytes MSB to LSB: | |
// 0 0 0 0 l k j i h g f e d c b a | |
in = mm_shuffle_epi8(in, mm_set_epi8( | |
10, 11, 9, 10, | |
7, 8, 6, 7, | |
4, 5, 3, 4, | |
1, 2, 0, 1)) | |
fmt.Printf("%x\n", (in.bytes[:])) | |
// in, bytes MSB to LSB: | |
// k l j k | |
// h i g h | |
// e f d e | |
// b c a b | |
t0 := mm_and_si128(in, mm_set1_epi32(0x0FC0FC00)) | |
// bits, upper case are most significant bits, lower case are least significant bits | |
// 0000kkkk LL000000 JJJJJJ00 00000000 | |
// 0000hhhh II000000 GGGGGG00 00000000 | |
// 0000eeee FF000000 DDDDDD00 00000000 | |
// 0000bbbb CC000000 AAAAAA00 00000000 | |
fmt.Printf("t0=%x\n", (t0.bytes[:])) | |
t1 := mm_mulhi_epu16(t0, mm_set1_epi32(0x04000040)) | |
// 00000000 00kkkkLL 00000000 00JJJJJJ | |
// 00000000 00hhhhII 00000000 00GGGGGG | |
// 00000000 00eeeeFF 00000000 00DDDDDD | |
// 00000000 00bbbbCC 00000000 00AAAAAA | |
fmt.Printf("t1=%x\n", (t1.bytes[:])) | |
t2 := mm_and_si128(in, mm_set1_epi32(0x003F03F0)) | |
// 00000000 00llllll 000000jj KKKK0000 | |
// 00000000 00iiiiii 000000gg HHHH0000 | |
// 00000000 00ffffff 000000dd EEEE0000 | |
// 00000000 00cccccc 000000aa BBBB0000 | |
fmt.Printf("t2=%x\n", (t2.bytes[:])) | |
t3 := mm_mullo_epi16(t2, mm_set1_epi32(0x01000010)) | |
// 00llllll 00000000 00jjKKKK 00000000 | |
// 00iiiiii 00000000 00ggHHHH 00000000 | |
// 00ffffff 00000000 00ddEEEE 00000000 | |
// 00cccccc 00000000 00aaBBBB 00000000 | |
fmt.Printf("t3=%x\n", (t3.bytes[:])) | |
return mm_or_si128(t1, t3) | |
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ | |
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG | |
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD | |
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA | |
} | |
func enc_translate(in __m128i) __m128i { | |
lut := mm_setr_epi8( | |
65, 71, 252, 252, | |
252, 252, 252, 252, | |
252, 252, 252, 252, | |
237, 240, 0, 0) | |
fmt.Printf("lut=%x\n", (lut.bytes[:])) | |
// Translate values 0..63 to the Base64 alphabet. There are five sets: | |
// # From To Abs Index Characters | |
// 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ | |
// 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz | |
// 2 [52..61] [48..57] -4 [2..11] 0123456789 | |
// 3 [62] [43] -19 12 + | |
// 4 [63] [47] -16 13 / | |
// Create LUT indices from the input. The index for range #0 is right, | |
// others are 1 less than expected: | |
indices := mm_subs_epu8(in, mm_set1_epi8(51)) | |
fmt.Printf("indices=%x\n", (indices.bytes[:])) | |
// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: | |
mask := mm_cmpgt_epi8(in, mm_set1_epi8(25)) | |
fmt.Printf("mask=%x\n", (mask.bytes[:])) | |
// Subtract -1, so add 1 to indices for range #[1..4]. All indices are | |
// now correct: | |
indices = mm_sub_epi8(indices, mask) | |
fmt.Printf("indices=%x\n", (indices.bytes[:])) | |
// Add offsets to input values: | |
return mm_add_epi8(in, mm_shuffle_epi8(lut, indices)) | |
} | |
func main() { | |
// Load string: | |
str := mm_setr_epi8('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', '0', '0', '0', '0') | |
// Reshuffle: | |
result := enc_reshuffle(str) | |
fmt.Printf("resuffle %x\n", (result.bytes[:])) | |
// Translate reshuffled bytes to the Base64 alphabet: | |
result = enc_translate(result) | |
fmt.Printf("%s\n", string(result.bytes[:])) | |
fmt.Printf("%s\n", base64.StdEncoding.EncodeToString([]byte("abcdefghijkl"))) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment