emmansun/base64_decode_sse.go

## base64_decode_sse.go
package main

import (
	"encoding/base64"
	"encoding/binary"
	"fmt"
)

// __m128i is a software stand-in for a 128-bit SIMD register, held as 16 raw
// bytes. The helpers in this file read and write multi-byte lanes in
// little-endian order, so bytes[0] is the least significant byte.
type __m128i struct {
	bytes [16]byte // register contents, lowest byte first
}

// set64 builds a 128-bit value from two 64-bit halves: lo fills bytes 0-7 and
// hi fills bytes 8-15, each stored little-endian.
func set64(hi, lo uint64) (m __m128i) {
	for idx, half := range [2]uint64{lo, hi} {
		binary.LittleEndian.PutUint64(m.bytes[8*idx:], half)
	}
	return m
}

// mm_set_epi32 packs four 32-bit values into a 128-bit value, little-endian.
// e0 lands in the lowest lane (bytes 0-3) and e3 in the highest (bytes 12-15).
// Note that Intel's _mm_set_epi32 takes its arguments highest lane first.
func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) {
	for lane, v := range [4]uint32{e0, e1, e2, e3} {
		binary.LittleEndian.PutUint32(m.bytes[4*lane:], v)
	}
	return m
}

// mm_and_si128 returns the bitwise AND of a and b.
func mm_and_si128(a, b __m128i) (m __m128i) {
	for idx, av := range a.bytes {
		m.bytes[idx] = av & b.bytes[idx]
	}
	return m
}

// mm_or_si128 returns the bitwise OR of a and b.
func mm_or_si128(a, b __m128i) (m __m128i) {
	for idx, av := range a.bytes {
		m.bytes[idx] = av | b.bytes[idx]
	}
	return m
}

// mm_andnot_si128 returns (NOT a) AND b: every bit of b that is clear in a.
func mm_andnot_si128(a, b __m128i) (m __m128i) {
	for idx, bv := range b.bytes {
		m.bytes[idx] = bv &^ a.bytes[idx]
	}
	return m
}

// mm_shuffle_epi8 picks bytes of a according to the selectors in b. A selector
// with its top bit set produces 0; otherwise its low four bits index into a.
func mm_shuffle_epi8(a, b __m128i) (m __m128i) {
	for lane, sel := range b.bytes {
		if sel&0x80 != 0 {
			continue // m starts zeroed, so the lane stays 0
		}
		m.bytes[lane] = a.bytes[sel&0x0f]
	}
	return m
}

// mm_srli_epi32 logically shifts each of the four little-endian 32-bit lanes
// of a right by imm8 bits. Shift counts above 31 yield an all-zero result.
func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) {
	if imm8 > 31 {
		return m
	}
	for off := 0; off < 16; off += 4 {
		lane := binary.LittleEndian.Uint32(a.bytes[off:])
		binary.LittleEndian.PutUint32(m.bytes[off:], lane>>imm8)
	}
	return m
}

// mm_slli_epi32 shifts each of the four little-endian 32-bit lanes of a left
// by imm8 bits. Shift counts above 31 yield an all-zero result.
func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) {
	if imm8 > 31 {
		return m
	}
	for off := 0; off < 16; off += 4 {
		lane := binary.LittleEndian.Uint32(a.bytes[off:])
		binary.LittleEndian.PutUint32(m.bytes[off:], lane<<imm8)
	}
	return m
}

// mm_srli_epi64 logically shifts each of the two little-endian 64-bit lanes of
// a right by imm8 bits. Shift counts above 63 yield an all-zero result.
func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) {
	if imm8 > 63 {
		return m
	}
	for off := 0; off < 16; off += 8 {
		half := binary.LittleEndian.Uint64(a.bytes[off:])
		binary.LittleEndian.PutUint64(m.bytes[off:], half>>imm8)
	}
	return m
}

// mm_set_epi8 fills a 128-bit value from the highest byte downwards: the first
// argument goes to byte 15, the second to byte 14, and so on. Arguments beyond
// the sixteenth are ignored; bytes that receive no argument stay zero.
func mm_set_epi8(in ...byte) (m __m128i) {
	if len(in) > 16 {
		in = in[:16]
	}
	for pos, v := range in {
		m.bytes[15-pos] = v
	}
	return m
}

// mm_setr_epi8 fills a 128-bit value in memory order: the first argument goes
// to byte 0, the second to byte 1, and so on. Arguments beyond the sixteenth
// are ignored; bytes that receive no argument stay zero.
func mm_setr_epi8(in ...byte) (m __m128i) {
	copy(m.bytes[:], in) // copy stops at the shorter of the two lengths
	return m
}

// mm_set1_epi32 broadcasts a into all four 32-bit lanes.
func mm_set1_epi32(a uint32) (m __m128i) {
	m = mm_set_epi32(a, a, a, a)
	return m
}

// mm_cmplt_epi8 compares a and b byte by byte as signed 8-bit integers, as
// the _mm_cmplt_epi8 intrinsic does. A lane of the result is 0xff where
// a < b and 0x00 otherwise.
func mm_cmplt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so that bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) < int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}

// mm_cmpgt_epi8 compares a and b byte by byte as signed 8-bit integers, as
// the _mm_cmpgt_epi8 intrinsic does. A lane of the result is 0xff where
// a > b and 0x00 otherwise.
func mm_cmpgt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so that bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) > int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}

// mm_cmpeq_epi8 compares a and b byte by byte. A lane of the result is 0xff
// where the bytes are equal and 0x00 where they differ.
func mm_cmpeq_epi8(a, b __m128i) (m __m128i) {
	for lane, av := range a.bytes {
		if av != b.bytes[lane] {
			continue // m starts zeroed
		}
		m.bytes[lane] = 0xff
	}
	return m
}

// mm_set1_epi8 broadcasts a, reinterpreted as an unsigned byte, into all 16
// byte lanes.
func mm_set1_epi8(a int8) (m __m128i) {
	fill := byte(a)
	for lane := range m.bytes {
		m.bytes[lane] = fill
	}
	return m
}

// xor returns the bitwise exclusive OR of a and b.
func xor(a, b __m128i) (m __m128i) {
	for idx, av := range a.bytes {
		m.bytes[idx] = av ^ b.bytes[idx]
	}
	return m
}

// mm_subs_epu8 subtracts b from a byte by byte with unsigned saturation: a
// lane whose difference would be negative is clamped to 0.
func mm_subs_epu8(a, b __m128i) (m __m128i) {
	for lane, x := range a.bytes {
		if y := b.bytes[lane]; x > y {
			m.bytes[lane] = x - y
		}
	}
	return m
}

// mm_add_epi8 adds a and b byte by byte; each lane wraps around modulo 256.
func mm_add_epi8(a, b __m128i) (m __m128i) {
	for lane, av := range a.bytes {
		m.bytes[lane] = av + b.bytes[lane]
	}
	return m
}

// mm_sub_epi8 subtracts b from a byte by byte; each lane wraps around modulo
// 256, which gives the same bit pattern for signed and unsigned readings.
func mm_sub_epi8(a, b __m128i) (m __m128i) {
	for lane, av := range a.bytes {
		m.bytes[lane] = av - b.bytes[lane]
	}
	return m
}

// mm_mulhi_epu16 multiplies the eight unsigned little-endian 16-bit lanes of a
// and b and keeps the upper 16 bits of each 32-bit product.
func mm_mulhi_epu16(a, b __m128i) (m __m128i) {
	for lane := 0; lane < 8; lane++ {
		lo, hi := 2*lane, 2*lane+1
		x := uint32(a.bytes[hi])<<8 | uint32(a.bytes[lo])
		y := uint32(b.bytes[hi])<<8 | uint32(b.bytes[lo])
		top := uint16((x * y) >> 16)
		m.bytes[lo] = byte(top)
		m.bytes[hi] = byte(top >> 8)
	}
	return m
}

// mm_mullo_epi16 multiplies the eight little-endian 16-bit lanes of a and b
// and keeps the lower 16 bits of each product.
func mm_mullo_epi16(a, b __m128i) (m __m128i) {
	for lane := 0; lane < 8; lane++ {
		lo, hi := 2*lane, 2*lane+1
		x := uint16(a.bytes[hi])<<8 | uint16(a.bytes[lo])
		y := uint16(b.bytes[hi])<<8 | uint16(b.bytes[lo])
		prod := x * y // uint16 arithmetic keeps exactly the low 16 bits
		m.bytes[lo] = byte(prod)
		m.bytes[hi] = byte(prod >> 8)
	}
	return m
}

// enc_reshuffle spreads the low 12 bytes of in over sixteen bytes that each
// hold one 6-bit value, ready for translation to the Base64 alphabet. It
// prints every intermediate value for inspection.
func enc_reshuffle(in __m128i) __m128i {
	// Source bytes, MSB to LSB: 0 0 0 0 l k j i h g f e d c b a.
	// Each 32-bit lane receives one 3-byte group with its middle byte
	// duplicated, so the lanes read, MSB to LSB:
	//   k l j k | h i g h | e f d e | b c a b
	spread := mm_set_epi8(
		10, 11, 9, 10,
		7, 8, 6, 7,
		4, 5, 3, 4,
		1, 2, 0, 1)
	in = mm_shuffle_epi8(in, spread)
	fmt.Printf("%x\n", in.bytes[:])

	// Keep the two fields that must move right. Upper-case letters stand for
	// a byte's most significant bits, lower-case for its least significant:
	//   0000bbbb CC000000 AAAAAA00 00000000   (same pattern in every lane)
	movesRight := mm_and_si128(in, mm_set1_epi32(0x0FC0FC00))
	fmt.Printf("t0=%x\n", movesRight.bytes[:])

	// The high half of a 16-bit multiply acts as a right shift (by 6 in the
	// upper half-lane, by 10 in the lower one):
	//   00000000 00bbbbCC 00000000 00AAAAAA
	evenSextets := mm_mulhi_epu16(movesRight, mm_set1_epi32(0x04000040))
	fmt.Printf("t1=%x\n", evenSextets.bytes[:])

	// Keep the two fields that must move left:
	//   00000000 00cccccc 000000aa BBBB0000
	movesLeft := mm_and_si128(in, mm_set1_epi32(0x003F03F0))
	fmt.Printf("t2=%x\n", movesLeft.bytes[:])

	// The low half of a 16-bit multiply acts as a left shift (by 8 in the
	// upper half-lane, by 4 in the lower one):
	//   00cccccc 00000000 00aaBBBB 00000000
	oddSextets := mm_mullo_epi16(movesLeft, mm_set1_epi32(0x01000010))
	fmt.Printf("t3=%x\n", oddSextets.bytes[:])

	// Merge both halves:
	//   00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	//   00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	//   00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	//   00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
	return mm_or_si128(evenSextets, oddSextets)
}

// enc_translate maps sixteen 6-bit values (0..63, one per byte) to their
// Base64 alphabet characters by adding a per-class offset taken from a lookup
// table. It prints every intermediate value for inspection.
func enc_translate(in __m128i) __m128i {
	// Offsets modulo 256, indexed by character class:
	//   class   values    result     offset  characters
	//   0       0..25     65..90     +65     A-Z
	//   1       26..51    97..122    +71     a-z
	//   2..11   52..61    48..57     -4      0-9
	//   12      62        43         -19     +
	//   13      63        47         -16     /
	lut := mm_setr_epi8(
		65, 71, 252, 252,
		252, 252, 252, 252,
		252, 252, 252, 252,
		237, 240, 0, 0)
	fmt.Printf("lut=%x\n", lut.bytes[:])

	// Saturating "in - 51" yields 0 for 0..51 and 1..12 for 52..63. That is
	// already right for class 0 and one too low for every other class.
	indices := mm_subs_epu8(in, mm_set1_epi8(51))
	fmt.Printf("indices=%x\n", indices.bytes[:])

	// 0xFF (-1) in every lane whose value lies above 25, i.e. outside class 0.
	mask := mm_cmpgt_epi8(in, mm_set1_epi8(25))
	fmt.Printf("mask=%x\n", mask.bytes[:])

	// Subtracting -1 adds the missing 1 to those lanes.
	indices = mm_sub_epi8(indices, mask)
	fmt.Printf("indices=%x\n", indices.bytes[:])

	// Look up each lane's offset and apply it.
	offsets := mm_shuffle_epi8(lut, indices)
	return mm_add_epi8(in, offsets)
}

// mm_movemask_epi8 gathers the top bit of every byte of in into a 16-bit
// mask: bit i of the result is the most significant bit of byte i.
func mm_movemask_epi8(in __m128i) int {
	mask := 0
	for lane, v := range in.bytes {
		if v&0x80 != 0 {
			mask |= 1 << lane
		}
	}
	return mask
}

// mm_maddubs_epi16 emulates _mm_maddubs_epi16: for each pair of adjacent
// bytes it multiplies the unsigned byte of a by the signed byte of b, adds the
// two products, and stores the sum with signed saturation as a little-endian
// 16-bit lane.
func mm_maddubs_epi16(a, b __m128i) (m __m128i) {
	for i := 0; i < 8; i++ {
		lo, hi := 2*i, 2*i+1
		// Widen to int32 so neither the products nor their sum can overflow
		// before saturation; b is sign-extended, a is zero-extended.
		sum := int32(a.bytes[lo])*int32(int8(b.bytes[lo])) +
			int32(a.bytes[hi])*int32(int8(b.bytes[hi]))
		if sum > 32767 {
			sum = 32767
		} else if sum < -32768 {
			sum = -32768
		}
		m.bytes[lo] = byte(sum)
		m.bytes[hi] = byte(sum >> 8)
	}
	return m
}

// mm_madd_epi16 emulates _mm_madd_epi16: it multiplies the signed
// little-endian 16-bit lanes of a and b, then adds each pair of adjacent
// 32-bit products into one little-endian 32-bit lane of the result.
func mm_madd_epi16(a, b __m128i) (m __m128i) {
	// s16 reads the 16-bit lane starting at off and sign-extends it.
	s16 := func(v __m128i, off int) int32 {
		return int32(int16(binary.LittleEndian.Uint16(v.bytes[off:])))
	}
	for off := 0; off < 16; off += 4 {
		sum := s16(a, off)*s16(b, off) + s16(a, off+2)*s16(b, off+2)
		// The sum only overflows when all four inputs are 0x8000; int32
		// arithmetic then wraps to 0x80000000.
		binary.LittleEndian.PutUint32(m.bytes[off:], uint32(sum))
	}
	return m
}

// dec_reshuffle packs sixteen 6-bit values (one per byte) into 12 output
// bytes; the top four bytes of the result are zero.
func dec_reshuffle(in __m128i) __m128i {
	// Input lanes; upper-case letters are a byte's most significant bits,
	// lower-case its least significant bits:
	//   00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	//   00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	//   00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	//   00cccccc 00bbbbCC 00aaBBBB 00AAAAAA

	// Join neighbouring 6-bit values into 12-bit values:
	//   0000bbbb CCcccccc 0000AAAA AAaaBBBB   (same pattern in every lane)
	pairs := mm_maddubs_epi16(in, mm_set1_epi32(0x01400140))

	// Join neighbouring 12-bit values into 24-bit values:
	//   00000000 AAAAAAaa BBBBbbbb CCcccccc
	triples := mm_madd_epi16(pairs, mm_set1_epi32(0x00011000))

	// Reverse the three payload bytes of every lane, pack them together and
	// zero the last four bytes (selector 255 has its top bit set):
	//   00000000 00000000 00000000 00000000
	//   LLllllll KKKKkkkk JJJJJJjj IIiiiiii
	//   HHHHhhhh GGGGGGgg FFffffff EEEEeeee
	//   DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
	pack := mm_setr_epi8(
		2, 1, 0,
		6, 5, 4,
		10, 9, 8,
		14, 13, 12,
		255, 255, 255, 255)
	return mm_shuffle_epi8(triples, pack)
}

// The input consists of six character sets in the Base64 alphabet, which we
// need to map back to the 6-bit values they represent. There are three ranges,
// two singles, and then there's the rest.
//
//	#  From       To        Add  Characters
//	1  [43]       [62]      +19  +
//	2  [47]       [63]      +16  /
//	3  [48..57]   [52..61]   +4  0..9
//	4  [65..90]   [0..25]   -65  A..Z
//	5  [97..122]  [26..51]  -71  a..z
//
// (6) Everything else => invalid input
//
// We will use lookup tables for character validation and offset computation.
// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
// allows to mask with 0x2F instead of 0x0F and thus save one constant
// declaration (register and/or memory access).
//
// For offsets:
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
// 0000 = garbage
// 0001 = /
// 0010 = +
// 0011 = 0-9
// 0100 = A-Z
// 0101 = A-Z
// 0110 = a-z
// 0111 = a-z
// 1000 >= garbage
//
// For validation, here's the table.
// A character is valid if and only if the AND of the 2 lookups equals 0:
//
// hi \ lo              0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
//
//	LUT             0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
//
// 0000 0x10 char        NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL   BS   HT   LF   VT   FF   CR   SO   SI
//
//	andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0001 0x10 char        DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN   EM  SUB  ESC   FS   GS   RS   US
//
//	andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0010 0x01 char               !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
//
//	andlut     0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
//
// 0011 0x02 char          0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
//
//	andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
//
// 0100 0x04 char          @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
//
//	andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0101 0x08 char          P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
//
//	andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 0110 0x04 char          `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
//
//	andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0111 0x08 char          p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
//
//	andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 1000 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1001 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1010 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1011 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1100 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1101 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1110 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
func dec_translate_std(in __m128i) (m __m128i, ret int) {
	// Validation tables, selected by the low and the high nibble of every
	// input byte. A byte is accepted only if its two entries share no bit.
	lowTable := mm_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A)
	highTable := mm_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10)

	// Offsets (modulo 256) that turn a character into its 6-bit value,
	// selected by the high nibble. Slot 1 is reserved for '/'.
	offsets := mm_setr_epi8(
		0, 16, 19, 4, 256-65, 256-65, 256-71, 256-71,
		0, 0, 0, 0, 0, 0, 0, 0)

	nibbleMask := mm_set1_epi8(0x2F)
	highNibbles := mm_and_si128(mm_srli_epi32(in, 4), nibbleMask)
	lowNibbles := mm_and_si128(in, nibbleMask)

	// Any lane with a non-zero overlap marks an invalid character; in that
	// case the zero value and ret == 0 are returned.
	overlap := mm_and_si128(
		mm_shuffle_epi8(lowTable, lowNibbles),
		mm_shuffle_epi8(highTable, highNibbles))
	if mm_movemask_epi8(mm_cmpgt_epi8(overlap, mm_set1_epi8(0))) != 0 {
		return __m128i{}, 0
	}

	// '/' (0x2F) shares high nibble 2 with '+'. Adding 0xFF (-1) in exactly
	// those lanes moves their selector from slot 2 to slot 1.
	isSlash := mm_cmpeq_epi8(in, nibbleMask)
	delta := mm_shuffle_epi8(offsets, mm_add_epi8(isSlash, highNibbles))

	// Apply the per-lane offset to obtain the 6-bit values.
	return mm_add_epi8(in, delta), 1
}

// The input consists of six character sets in the Base64 alphabet, which we
// need to map back to the 6-bit values they represent. There are three ranges,
// two singles, and then there's the rest.
//
//	#  From       To        Add  Characters
//	1  [45]       [62]      +17  -
//	2  [48..57]   [52..61]   +4  0..9
//	3  [65..90]   [0..25]   -65  A..Z
//	4  [95]       [63]      -32  _
//	5  [97..122]  [26..51]  -71  a..z
//
// (6) Everything else => invalid input
//
// We will use lookup tables for character validation and offset computation.
//
// For offsets:
// Perfect hash for lut = ((src >> 4) & 0x0F) - ((src > 0x5e) ? 0xFF : 0x00)
// 0000 = garbage
// 0001 = garbage
// 0010 = -
// 0011 = 0-9
// 0100 = A-Z
// 0101 = A-Z
// 0110 = _
// 0111 = a-z
// 1000 = a-z
// 1000 > garbage
//
// For validation, here's the table.
// A character is valid if and only if the AND of the 2 lookups equals 0:
//
// hi \ lo              0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
//
//	LUT             0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1B 0x1B 0x1A 0x1B 0x33
//
// 0000 0x10 char        NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL   BS   HT   LF   VT   FF   CR   SO   SI
//
//	andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0001 0x10 char        DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN   EM  SUB  ESC   FS   GS   RS   US
//
//	andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0010 0x01 char               !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
//
//	andlut     0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01
//
// 0011 0x02 char          0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
//
//	andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
//
// 0100 0x04 char          @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
//
//	andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0101 0x08 char          P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
//
//	andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x00
//
// 0110 0x04 char          `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
//
//	andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0111 0x28 char          p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
//
//	andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x20
//
// 1000 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1001 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1010 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1011 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1100 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1101 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1110 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
func dec_translate_url(in __m128i) (m __m128i, ret int) {
	// Validation tables, selected by the low and the high nibble of every
	// input byte. A byte is accepted only if its two entries share no bit.
	lowTable := mm_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1B, 0x1B, 0x1A, 0x1B, 0x33)
	highTable := mm_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x28,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10)

	// Offsets (modulo 256) that turn a character into its 6-bit value.
	// Characters above 0x5E use the slot one past their high nibble.
	offsets := mm_setr_epi8(
		0, 0, 17, 4, 256-65, 256-65, 256-32, 256-71,
		256-71, 0, 0, 0, 0, 0, 0, 0)

	nibbleMask := mm_set1_epi8(0x0F)
	highNibbles := mm_and_si128(mm_srli_epi32(in, 4), nibbleMask)
	lowNibbles := mm_and_si128(in, nibbleMask)

	// Any lane with a non-zero overlap marks an invalid character; in that
	// case the zero value and ret == 0 are returned.
	overlap := mm_and_si128(
		mm_shuffle_epi8(lowTable, lowNibbles),
		mm_shuffle_epi8(highTable, highNibbles))
	if mm_movemask_epi8(mm_cmpgt_epi8(overlap, mm_set1_epi8(0))) != 0 {
		return __m128i{}, 0
	}

	// Subtracting 0xFF (-1) bumps the selector by one for '_' and a-z, which
	// separates '_' (slot 6) from the upper-case letters sharing nibble 5.
	aboveCaret := mm_cmpgt_epi8(in, mm_set1_epi8(0x5E))
	delta := mm_shuffle_epi8(offsets, mm_sub_epi8(highNibbles, aboveCaret))

	// Apply the per-lane offset to obtain the 6-bit values.
	return mm_add_epi8(in, delta), 1
}

func main() {
	// std
	encoded := mm_setr_epi8([]byte(base64.StdEncoding.EncodeToString([]byte("abcdefghijkl")))...)
	decoded, ret := dec_translate_std(encoded)
	if ret == 1 {
		decoded = dec_reshuffle(decoded)
		fmt.Printf("%v\n", string(decoded.bytes[:12]))
	} else {
		fmt.Println("invalid base64 encoded")
	}
	// url
	encoded = mm_setr_epi8([]byte(base64.URLEncoding.EncodeToString([]byte("!?$*&()'-=@~")))...)
	decoded, ret = dec_translate_url(encoded)
	if ret == 1 {
		decoded = dec_reshuffle(decoded)
		fmt.Printf("%v\n", string(decoded.bytes[:12]))
	} else {
		fmt.Println("invalid base64 encoded")
	}
}

## base64_encode_sse.go
package main

import (
	"encoding/base64"
	"encoding/binary"
	"fmt"
)

// __m128i is a software stand-in for a 128-bit SIMD register, held as 16 raw
// bytes. The helpers in this file read and write multi-byte lanes in
// little-endian order, so bytes[0] is the least significant byte.
type __m128i struct {
	bytes [16]byte // register contents, lowest byte first
}

// set64 builds a 128-bit value from two 64-bit halves: lo fills bytes 0-7 and
// hi fills bytes 8-15, each stored little-endian.
func set64(hi, lo uint64) (m __m128i) {
	for idx, half := range [2]uint64{lo, hi} {
		binary.LittleEndian.PutUint64(m.bytes[8*idx:], half)
	}
	return m
}

// mm_set_epi32 packs four 32-bit values into a 128-bit value, little-endian.
// e0 lands in the lowest lane (bytes 0-3) and e3 in the highest (bytes 12-15).
// Note that Intel's _mm_set_epi32 takes its arguments highest lane first.
func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) {
	for lane, v := range [4]uint32{e0, e1, e2, e3} {
		binary.LittleEndian.PutUint32(m.bytes[4*lane:], v)
	}
	return m
}

// mm_and_si128 returns the bitwise AND of a and b.
func mm_and_si128(a, b __m128i) (m __m128i) {
	for idx, av := range a.bytes {
		m.bytes[idx] = av & b.bytes[idx]
	}
	return m
}

// mm_or_si128 returns the bitwise OR of a and b.
func mm_or_si128(a, b __m128i) (m __m128i) {
	for idx, av := range a.bytes {
		m.bytes[idx] = av | b.bytes[idx]
	}
	return m
}

// mm_andnot_si128 returns (NOT a) AND b: every bit of b that is clear in a.
func mm_andnot_si128(a, b __m128i) (m __m128i) {
	for idx, bv := range b.bytes {
		m.bytes[idx] = bv &^ a.bytes[idx]
	}
	return m
}

// mm_shuffle_epi8 picks bytes of a according to the selectors in b. A selector
// with its top bit set produces 0; otherwise its low four bits index into a.
func mm_shuffle_epi8(a, b __m128i) (m __m128i) {
	for lane, sel := range b.bytes {
		if sel&0x80 != 0 {
			continue // m starts zeroed, so the lane stays 0
		}
		m.bytes[lane] = a.bytes[sel&0x0f]
	}
	return m
}

// mm_srli_epi32 logically shifts each of the four little-endian 32-bit lanes
// of a right by imm8 bits. Shift counts above 31 yield an all-zero result.
func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) {
	if imm8 > 31 {
		return m
	}
	for off := 0; off < 16; off += 4 {
		lane := binary.LittleEndian.Uint32(a.bytes[off:])
		binary.LittleEndian.PutUint32(m.bytes[off:], lane>>imm8)
	}
	return m
}

// mm_slli_epi32 shifts each of the four little-endian 32-bit lanes of a left
// by imm8 bits. Shift counts above 31 yield an all-zero result.
func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) {
	if imm8 > 31 {
		return m
	}
	for off := 0; off < 16; off += 4 {
		lane := binary.LittleEndian.Uint32(a.bytes[off:])
		binary.LittleEndian.PutUint32(m.bytes[off:], lane<<imm8)
	}
	return m
}

// mm_srli_epi64 logically shifts each of the two little-endian 64-bit lanes of
// a right by imm8 bits. Shift counts above 63 yield an all-zero result.
func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) {
	if imm8 > 63 {
		return m
	}
	for off := 0; off < 16; off += 8 {
		half := binary.LittleEndian.Uint64(a.bytes[off:])
		binary.LittleEndian.PutUint64(m.bytes[off:], half>>imm8)
	}
	return m
}

// mm_setr_epi8 fills a 128-bit value in memory order from in: in[0] goes to
// byte 0, in[1] to byte 1, and so on. Elements beyond the sixteenth are
// ignored; bytes without a matching element stay zero.
func mm_setr_epi8(in []byte) (m __m128i) {
	copy(m.bytes[:], in) // copy stops at the shorter of the two lengths
	return m
}

// mm_set1_epi32 broadcasts a into all four 32-bit lanes.
func mm_set1_epi32(a uint32) (m __m128i) {
	m = mm_set_epi32(a, a, a, a)
	return m
}

// mm_cmplt_epi8 compares a and b byte by byte as signed 8-bit integers, as
// the _mm_cmplt_epi8 intrinsic does. A lane of the result is 0xff where
// a < b and 0x00 otherwise.
func mm_cmplt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so that bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) < int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}

// mm_set1_epi8 broadcasts a, reinterpreted as an unsigned byte, into all 16
// byte lanes.
func mm_set1_epi8(a int8) (m __m128i) {
	fill := byte(a)
	for lane := range m.bytes {
		m.bytes[lane] = fill
	}
	return m
}

// mm_add_epi8 adds a and b byte by byte; each lane wraps around modulo 256.
func mm_add_epi8(a, b __m128i) (m __m128i) {
	for lane, av := range a.bytes {
		m.bytes[lane] = av + b.bytes[lane]
	}
	return m
}

// xor returns the bitwise exclusive OR of a and b.
func xor(a, b __m128i) (m __m128i) {
	for idx, av := range a.bytes {
		m.bytes[idx] = av ^ b.bytes[idx]
	}
	return m
}

// main Base64-encodes the first 12 bytes of a sample string with the emulated
// SSE helpers, prints the 16 resulting characters, and then prints the
// standard library's encoding of the same 12 bytes for comparison.
func main() {
	// Load the sample; the shuffle below only reads bytes 0..11 of it.
	str := mm_setr_epi8([]byte("ABCDEFGHIJKLMMMM"))

	// Turn every 3-byte group into one 32-bit big-endian word in which the
	// third byte appears twice. The duplicate already sits where the fourth
	// 6-bit value is taken from, so that value needs masking only. Big-endian
	// order is required so that shifted bits carry into the adjacent byte.
	toBigEndian := mm_setr_epi8([]byte{2, 2, 1, 0, 5, 5, 4, 3, 8, 8, 7, 6, 11, 11, 10, 9})
	str = mm_shuffle_epi8(str, toBigEndian)

	// Extract the four 6-bit values of every word. The mask starts on the
	// top byte and moves down one byte per step, while the word is shifted
	// right by 2, 4, 6 and finally 0 bits.
	var res __m128i
	mask := mm_set1_epi32(0x3F000000)
	for _, shift := range []byte{2, 4, 6, 0} {
		field := mm_and_si128(mm_srli_epi32(str, shift), mask)
		res = mm_or_si128(field, res)
		mask = mm_srli_epi32(mask, 8)
	}

	// Reorder to 32-bit little-endian.
	toLittleEndian := mm_setr_epi8([]byte{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12})
	res = mm_shuffle_epi8(res, toLittleEndian)

	// below(n) is 0xFF in every lane whose value is less than n.
	below := func(limit int8) __m128i {
		return mm_cmplt_epi8(res, mm_set1_epi8(limit))
	}

	// Classify every lane; blockmask accumulates the lanes already claimed
	// so that each lane ends up in exactly one set.
	s1mask := below(26) // set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	blockmask := s1mask

	s2mask := mm_andnot_si128(blockmask, below(52)) // set 2: 26..51, "abcdefghijklmnopqrstuvwxyz"
	blockmask = mm_or_si128(s2mask, blockmask)

	s3mask := mm_andnot_si128(blockmask, below(62)) // set 3: 52..61, "0123456789"
	blockmask = mm_or_si128(s3mask, blockmask)

	s4mask := mm_andnot_si128(blockmask, below(63)) // set 4: 62, "+"
	blockmask = mm_or_si128(s4mask, blockmask)

	// Translate each set separately; lanes left unclaimed (value 63) get '/'.
	s1 := mm_and_si128(s1mask, mm_add_epi8(res, mm_set1_epi8('A')))
	s2 := mm_and_si128(s2mask, mm_add_epi8(res, mm_set1_epi8('a'-26)))
	s3 := mm_and_si128(s3mask, mm_add_epi8(res, mm_set1_epi8('0'-52)))
	s4 := mm_and_si128(s4mask, mm_set1_epi8('+'))
	s5 := mm_andnot_si128(blockmask, mm_set1_epi8('/'))

	// Merge the five disjoint sets.
	result := mm_or_si128(s1, s2)
	for _, set := range []__m128i{s3, s4, s5} {
		result = mm_or_si128(result, set)
	}

	fmt.Printf("%s\n", string(result.bytes[:]))

	fmt.Printf("%s\n", base64.StdEncoding.EncodeToString([]byte("ABCDEFGHIJKL")))
}

## base64_encode_sse_2.go
package main

import (
	"encoding/base64"
	"encoding/binary"
	"fmt"
)

// __m128i is a software stand-in for a 128-bit SIMD register, held as 16 raw
// bytes. The helpers in this file read and write multi-byte lanes in
// little-endian order, so bytes[0] is the least significant byte.
type __m128i struct {
	bytes [16]byte // register contents, lowest byte first
}

// set64 builds a 128-bit value from two 64-bit halves: lo fills bytes 0-7 and
// hi fills bytes 8-15, each stored little-endian.
func set64(hi, lo uint64) (m __m128i) {
	for idx, half := range [2]uint64{lo, hi} {
		binary.LittleEndian.PutUint64(m.bytes[8*idx:], half)
	}
	return m
}

// mm_set_epi32 packs four 32-bit values into a 128-bit value, little-endian.
// e0 lands in the lowest lane (bytes 0-3) and e3 in the highest (bytes 12-15).
// Note that Intel's _mm_set_epi32 takes its arguments highest lane first.
func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) {
	for lane, v := range [4]uint32{e0, e1, e2, e3} {
		binary.LittleEndian.PutUint32(m.bytes[4*lane:], v)
	}
	return m
}

// mm_and_si128 returns the bitwise AND of a and b.
func mm_and_si128(a, b __m128i) (m __m128i) {
	for idx, av := range a.bytes {
		m.bytes[idx] = av & b.bytes[idx]
	}
	return m
}

// mm_or_si128 returns the bitwise OR of a and b.
func mm_or_si128(a, b __m128i) (m __m128i) {
	for idx, av := range a.bytes {
		m.bytes[idx] = av | b.bytes[idx]
	}
	return m
}

// mm_andnot_si128 returns (NOT a) AND b: every bit of b that is clear in a.
func mm_andnot_si128(a, b __m128i) (m __m128i) {
	for idx, bv := range b.bytes {
		m.bytes[idx] = bv &^ a.bytes[idx]
	}
	return m
}

// mm_shuffle_epi8 picks bytes of a according to the selectors in b. A selector
// with its top bit set produces 0; otherwise its low four bits index into a.
func mm_shuffle_epi8(a, b __m128i) (m __m128i) {
	for lane, sel := range b.bytes {
		if sel&0x80 != 0 {
			continue // m starts zeroed, so the lane stays 0
		}
		m.bytes[lane] = a.bytes[sel&0x0f]
	}
	return m
}

// mm_srli_epi32 logically shifts each of the four little-endian 32-bit lanes
// of a right by imm8 bits. Shift counts above 31 yield an all-zero result.
func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) {
	if imm8 > 31 {
		return m
	}
	for off := 0; off < 16; off += 4 {
		lane := binary.LittleEndian.Uint32(a.bytes[off:])
		binary.LittleEndian.PutUint32(m.bytes[off:], lane>>imm8)
	}
	return m
}

// mm_slli_epi32 shifts each of the four little-endian 32-bit lanes of a left
// by imm8 bits. Shift counts above 31 yield an all-zero result.
func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) {
	if imm8 > 31 {
		return m
	}
	for off := 0; off < 16; off += 4 {
		lane := binary.LittleEndian.Uint32(a.bytes[off:])
		binary.LittleEndian.PutUint32(m.bytes[off:], lane<<imm8)
	}
	return m
}

// mm_srli_epi64 logically shifts each of the two little-endian 64-bit lanes of
// a right by imm8 bits. Shift counts above 63 yield an all-zero result.
func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) {
	if imm8 > 63 {
		return m
	}
	for off := 0; off < 16; off += 8 {
		half := binary.LittleEndian.Uint64(a.bytes[off:])
		binary.LittleEndian.PutUint64(m.bytes[off:], half>>imm8)
	}
	return m
}

// mm_set_epi8 fills a 128-bit value from the highest byte downwards: the first
// argument goes to byte 15, the second to byte 14, and so on. Arguments beyond
// the sixteenth are ignored; bytes that receive no argument stay zero.
func mm_set_epi8(in ...byte) (m __m128i) {
	if len(in) > 16 {
		in = in[:16]
	}
	for pos, v := range in {
		m.bytes[15-pos] = v
	}
	return m
}

// mm_setr_epi8 fills a 128-bit value in memory order: the first argument goes
// to byte 0, the second to byte 1, and so on. Arguments beyond the sixteenth
// are ignored; bytes that receive no argument stay zero.
func mm_setr_epi8(in ...byte) (m __m128i) {
	copy(m.bytes[:], in) // copy stops at the shorter of the two lengths
	return m
}

// mm_set1_epi32 broadcasts a into all four 32-bit lanes.
func mm_set1_epi32(a uint32) (m __m128i) {
	m = mm_set_epi32(a, a, a, a)
	return m
}

// mm_cmplt_epi8 compares a and b byte by byte as signed 8-bit integers, as
// the _mm_cmplt_epi8 intrinsic does. A lane of the result is 0xff where
// a < b and 0x00 otherwise.
func mm_cmplt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so that bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) < int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}

// mm_cmpgt_epi8 compares a and b byte by byte as signed 8-bit integers, as
// the _mm_cmpgt_epi8 intrinsic does. A lane of the result is 0xff where
// a > b and 0x00 otherwise.
func mm_cmpgt_epi8(a, b __m128i) (m __m128i) {
	for i := range m.bytes {
		// Reinterpret as int8 so that bytes >= 0x80 count as negative.
		if int8(a.bytes[i]) > int8(b.bytes[i]) {
			m.bytes[i] = 0xff
		}
	}
	return m
}

// mm_set1_epi8 broadcasts a, reinterpreted as an unsigned byte, into all 16
// byte lanes.
func mm_set1_epi8(a int8) (m __m128i) {
	fill := byte(a)
	for lane := range m.bytes {
		m.bytes[lane] = fill
	}
	return m
}

// xor returns the bitwise exclusive OR of a and b.
func xor(a, b __m128i) (m __m128i) {
	for idx, av := range a.bytes {
		m.bytes[idx] = av ^ b.bytes[idx]
	}
	return m
}

// mm_subs_epu8 subtracts b from a byte by byte with unsigned saturation: a
// lane whose difference would be negative is clamped to 0.
func mm_subs_epu8(a, b __m128i) (m __m128i) {
	for lane, x := range a.bytes {
		if y := b.bytes[lane]; x > y {
			m.bytes[lane] = x - y
		}
	}
	return m
}

// mm_add_epi8 adds a and b byte by byte; each lane wraps around modulo 256.
func mm_add_epi8(a, b __m128i) (m __m128i) {
	for lane, av := range a.bytes {
		m.bytes[lane] = av + b.bytes[lane]
	}
	return m
}

// mm_sub_epi8 subtracts b from a byte by byte; each lane wraps around modulo
// 256, which gives the same bit pattern for signed and unsigned readings.
func mm_sub_epi8(a, b __m128i) (m __m128i) {
	for lane, av := range a.bytes {
		m.bytes[lane] = av - b.bytes[lane]
	}
	return m
}

// mm_mulhi_epu16 multiplies the eight unsigned little-endian 16-bit lanes of a
// and b and keeps the upper 16 bits of each 32-bit product.
func mm_mulhi_epu16(a, b __m128i) (m __m128i) {
	for lane := 0; lane < 8; lane++ {
		lo, hi := 2*lane, 2*lane+1
		x := uint32(a.bytes[hi])<<8 | uint32(a.bytes[lo])
		y := uint32(b.bytes[hi])<<8 | uint32(b.bytes[lo])
		top := uint16((x * y) >> 16)
		m.bytes[lo] = byte(top)
		m.bytes[hi] = byte(top >> 8)
	}
	return m
}

// mm_mullo_epi16 multiplies the eight little-endian 16-bit lanes of a and b
// and keeps the lower 16 bits of each product.
func mm_mullo_epi16(a, b __m128i) (m __m128i) {
	for lane := 0; lane < 8; lane++ {
		lo, hi := 2*lane, 2*lane+1
		x := uint16(a.bytes[hi])<<8 | uint16(a.bytes[lo])
		y := uint16(b.bytes[hi])<<8 | uint16(b.bytes[lo])
		prod := x * y // uint16 arithmetic keeps exactly the low 16 bits
		m.bytes[lo] = byte(prod)
		m.bytes[hi] = byte(prod >> 8)
	}
	return m
}

// enc_reshuffle spreads the low 12 bytes of in over sixteen bytes that each
// hold one 6-bit value, ready for translation to the Base64 alphabet. It
// prints every intermediate value for inspection.
func enc_reshuffle(in __m128i) __m128i {
	// Source bytes, MSB to LSB: 0 0 0 0 l k j i h g f e d c b a.
	// Each 32-bit lane receives one 3-byte group with its middle byte
	// duplicated, so the lanes read, MSB to LSB:
	//   k l j k | h i g h | e f d e | b c a b
	spread := mm_set_epi8(
		10, 11, 9, 10,
		7, 8, 6, 7,
		4, 5, 3, 4,
		1, 2, 0, 1)
	in = mm_shuffle_epi8(in, spread)
	fmt.Printf("%x\n", in.bytes[:])

	// Keep the two fields that must move right. Upper-case letters stand for
	// a byte's most significant bits, lower-case for its least significant:
	//   0000bbbb CC000000 AAAAAA00 00000000   (same pattern in every lane)
	movesRight := mm_and_si128(in, mm_set1_epi32(0x0FC0FC00))
	fmt.Printf("t0=%x\n", movesRight.bytes[:])

	// The high half of a 16-bit multiply acts as a right shift (by 6 in the
	// upper half-lane, by 10 in the lower one):
	//   00000000 00bbbbCC 00000000 00AAAAAA
	evenSextets := mm_mulhi_epu16(movesRight, mm_set1_epi32(0x04000040))
	fmt.Printf("t1=%x\n", evenSextets.bytes[:])

	// Keep the two fields that must move left:
	//   00000000 00cccccc 000000aa BBBB0000
	movesLeft := mm_and_si128(in, mm_set1_epi32(0x003F03F0))
	fmt.Printf("t2=%x\n", movesLeft.bytes[:])

	// The low half of a 16-bit multiply acts as a left shift (by 8 in the
	// upper half-lane, by 4 in the lower one):
	//   00cccccc 00000000 00aaBBBB 00000000
	oddSextets := mm_mullo_epi16(movesLeft, mm_set1_epi32(0x01000010))
	fmt.Printf("t3=%x\n", oddSextets.bytes[:])

	// Merge both halves:
	//   00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	//   00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	//   00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	//   00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
	return mm_or_si128(evenSextets, oddSextets)
}

// enc_translate converts sixteen 6-bit values (0..63) into Base64 alphabet
// characters by adding a per-range offset taken from a 16-entry table. The
// table and every intermediate vector are printed for debugging.
//
// The five ranges and their offsets:
//
//	#  From      To         Offset  Table index  Characters
//	0  [0..25]   [65..90]   +65     0            ABCDEFGHIJKLMNOPQRSTUVWXYZ
//	1  [26..51]  [97..122]  +71     1            abcdefghijklmnopqrstuvwxyz
//	2  [52..61]  [48..57]   -4      [2..11]      0123456789
//	3  [62]      [43]       -19     12           +
//	4  [63]      [47]       -16     13           /
func enc_translate(in __m128i) __m128i {
	// Negative offsets are stored modulo 256: 252 = -4, 237 = -19, 240 = -16.
	offsets := mm_setr_epi8(
		65, 71, 252, 252,
		252, 252, 252, 252,
		252, 252, 252, 252,
		237, 240, 0, 0)
	fmt.Printf("lut=%x\n", (offsets.bytes[:]))

	// A saturating subtract of 51 already gives the right table index for
	// range #0; every other range comes out one too small.
	idx := mm_subs_epu8(in, mm_set1_epi8(51))
	fmt.Printf("indices=%x\n", (idx.bytes[:]))

	// 0xFF (-1) wherever the value lies in ranges #1..#4, 0x00 for range #0.
	aboveUpper := mm_cmpgt_epi8(in, mm_set1_epi8(25))
	fmt.Printf("mask=%x\n", (aboveUpper.bytes[:]))

	// Subtracting -1 adds the missing 1 for ranges #1..#4.
	idx = mm_sub_epi8(idx, aboveUpper)
	fmt.Printf("indices=%x\n", (idx.bytes[:]))

	// Look up the offset for each byte and add it to the input value.
	perByteOffset := mm_shuffle_epi8(offsets, idx)
	return mm_add_epi8(in, perByteOffset)
}

// main encodes a fixed 12-byte string with the emulated SSE routines and then
// prints the standard library's encoding of the same string for comparison.
func main() {
	const plain = "abcdefghijkl"

	// The vector holds 16 bytes; the four unused trailing lanes are filled
	// with the character '0'.
	block := mm_setr_epi8([]byte(plain + "0000")...)

	// Split the payload into 6-bit groups, one per byte.
	sextets := enc_reshuffle(block)
	fmt.Printf("resuffle %x\n", (sextets.bytes[:]))

	// Map every 6-bit group onto the Base64 alphabet.
	encoded := enc_translate(sextets)
	fmt.Printf("%s\n", string(encoded.bytes[:]))

	// Reference output.
	fmt.Printf("%s\n", base64.StdEncoding.EncodeToString([]byte(plain)))
}
	package main

	import (
	"encoding/base64"
	"encoding/binary"
	"fmt"
	)

	// __m128i is a 16-byte vector value used by the mm_* helpers in this file.
	// The helpers that read or write wider lanes (16, 32 or 64 bits) store them
	// little-endian, so bytes[0] is the least significant byte.
	type __m128i struct {
	bytes [16]byte
	}

	// set64 builds a vector from two 64-bit halves: lo fills bytes 0..7 and hi
	// fills bytes 8..15, both little-endian.
	func set64(hi, lo uint64) (m __m128i) {
		lower, upper := m.bytes[:8], m.bytes[8:]
		binary.LittleEndian.PutUint64(upper, hi)
		binary.LittleEndian.PutUint64(lower, lo)
		return
	}

	// mm_set_epi32 builds a vector from four 32-bit values. e0 is written to
	// bytes 0..3, e1 to bytes 4..7, e2 to bytes 8..11 and e3 to bytes 12..15,
	// each little-endian.
	func mm_set_epi32(e0, e1, e2, e3 uint32) (m __m128i) {
		for lane, v := range [4]uint32{e0, e1, e2, e3} {
			binary.LittleEndian.PutUint32(m.bytes[4*lane:], v)
		}
		return
	}

	// mm_and_si128 returns the bitwise AND of a and b.
	func mm_and_si128(a, b __m128i) (m __m128i) {
		for i, av := range a.bytes {
			m.bytes[i] = av & b.bytes[i]
		}
		return
	}

	func mm_or_si128(a, b __m128i) (m __m128i) {
	for i := 0; i < 16; i++ {
	m.bytes[i] = a.bytes[i] \| b.bytes[i]
	}
	return
	}

	// mm_andnot_si128 returns (NOT a) AND b, i.e. b with every bit that is set
	// in a cleared.
	func mm_andnot_si128(a, b __m128i) (m __m128i) {
		for i, bv := range b.bytes {
			m.bytes[i] = bv &^ a.bytes[i]
		}
		return
	}

	// mm_shuffle_epi8 permutes the bytes of a using b as the selector: output
	// byte i is zero when bit 7 of b.bytes[i] is set, otherwise it is the byte
	// of a indexed by the low four bits of b.bytes[i].
	func mm_shuffle_epi8(a, b __m128i) (m __m128i) {
		for i, sel := range b.bytes {
			if sel&0x80 != 0 {
				continue // m starts zeroed, so the lane is already 0
			}
			m.bytes[i] = a.bytes[sel&0x0f]
		}
		return
	}

	// mm_srli_epi32 logically shifts each of the four little-endian 32-bit lanes
	// of a right by imm8 bits. A shift count above 31 clears every lane.
	func mm_srli_epi32(a __m128i, imm8 byte) (m __m128i) {
		for off := 0; off < len(a.bytes); off += 4 {
			lane := binary.LittleEndian.Uint32(a.bytes[off:])
			if imm8 > 31 {
				lane = 0
			} else {
				lane >>= imm8
			}
			binary.LittleEndian.PutUint32(m.bytes[off:], lane)
		}
		return
	}

	// mm_slli_epi32 shifts each of the four little-endian 32-bit lanes of a left
	// by imm8 bits. A shift count above 31 clears every lane.
	func mm_slli_epi32(a __m128i, imm8 byte) (m __m128i) {
		for off := 0; off < len(a.bytes); off += 4 {
			lane := binary.LittleEndian.Uint32(a.bytes[off:])
			if imm8 > 31 {
				lane = 0
			} else {
				lane <<= imm8
			}
			binary.LittleEndian.PutUint32(m.bytes[off:], lane)
		}
		return
	}

	// mm_srli_epi64 logically shifts both little-endian 64-bit halves of a right
	// by imm8 bits. A shift count above 63 clears both halves.
	func mm_srli_epi64(a __m128i, imm8 byte) (m __m128i) {
		for off := 0; off < len(a.bytes); off += 8 {
			half := binary.LittleEndian.Uint64(a.bytes[off:])
			if imm8 > 63 {
				half = 0
			} else {
				half >>= imm8
			}
			binary.LittleEndian.PutUint64(m.bytes[off:], half)
		}
		return
	}

	// mm_set_epi8 fills a vector from most significant byte to least: the first
	// argument lands in bytes[15], the second in bytes[14], and so on. Arguments
	// beyond the sixteenth are ignored; missing ones leave their lanes zero.
	func mm_set_epi8(in ...byte) (m __m128i) {
		if len(in) > 16 {
			in = in[:16]
		}
		for i, v := range in {
			m.bytes[15-i] = v
		}
		return
	}

	// mm_setr_epi8 fills a vector in memory order: the first argument lands in
	// bytes[0], the second in bytes[1], and so on. Arguments beyond the sixteenth
	// are ignored; missing ones leave their lanes zero.
	func mm_setr_epi8(in ...byte) (m __m128i) {
		// copy stops at the shorter of the two slices, which gives the same
		// clamping to 16 elements.
		copy(m.bytes[:], in)
		return
	}

	// mm_set1_epi32 broadcasts a into all four 32-bit lanes of the result.
	func mm_set1_epi32(a uint32) (m __m128i) {
		m = mm_set_epi32(a, a, a, a)
		return m
	}

	// mm_cmplt_epi8 compares the sixteen bytes of a and b as signed 8-bit
	// integers. Output byte i is 0xff when a[i] < b[i] and 0x00 otherwise.
	//
	// The epi8 comparison is signed, so a byte such as 0x80 (-128) is smaller
	// than 0x01; comparing the raw unsigned bytes would get every value at or
	// above 0x80 wrong.
	func mm_cmplt_epi8(a, b __m128i) (m __m128i) {
		for i := 0; i < 16; i++ {
			if int8(a.bytes[i]) < int8(b.bytes[i]) {
				m.bytes[i] = 0xff
			}
		}
		return
	}

	// mm_cmpgt_epi8 compares the sixteen bytes of a and b as signed 8-bit
	// integers. Output byte i is 0xff when a[i] > b[i] and 0x00 otherwise.
	//
	// The epi8 comparison is signed, so a byte such as 0x80 (-128) is not
	// greater than 0x01; comparing the raw unsigned bytes would get every value
	// at or above 0x80 wrong.
	func mm_cmpgt_epi8(a, b __m128i) (m __m128i) {
		for i := 0; i < 16; i++ {
			if int8(a.bytes[i]) > int8(b.bytes[i]) {
				m.bytes[i] = 0xff
			}
		}
		return
	}

	// mm_cmpeq_epi8 compares a and b byte by byte. Output byte i is 0xff where
	// the two inputs are equal and 0x00 where they differ.
	func mm_cmpeq_epi8(a, b __m128i) (m __m128i) {
		for i, av := range a.bytes {
			if av == b.bytes[i] {
				m.bytes[i] = 0xff
			}
		}
		return
	}

	// mm_set1_epi8 broadcasts a, reinterpreted as an unsigned byte, into all
	// sixteen lanes of the result.
	func mm_set1_epi8(a int8) (m __m128i) {
		fill := byte(a)
		for i := range m.bytes {
			m.bytes[i] = fill
		}
		return
	}

	// xor returns the bitwise exclusive OR of a and b.
	func xor(a, b __m128i) (m __m128i) {
		for i, av := range a.bytes {
			m.bytes[i] = av ^ b.bytes[i]
		}
		return
	}

	// mm_subs_epu8 subtracts b from a byte by byte using unsigned saturation: a
	// result that would go below zero is clamped to zero.
	func mm_subs_epu8(a, b __m128i) (m __m128i) {
		for i, av := range a.bytes {
			if bv := b.bytes[i]; av > bv {
				m.bytes[i] = av - bv
			}
		}
		return
	}

	// mm_add_epi8 adds a and b byte by byte; each sum wraps modulo 256.
	func mm_add_epi8(a, b __m128i) (m __m128i) {
		for i, av := range a.bytes {
			m.bytes[i] = av + b.bytes[i]
		}
		return
	}

	// mm_sub_epi8 subtracts b from a byte by byte; each difference wraps modulo
	// 256, so the result is the same whether the bytes are viewed as signed or
	// unsigned.
	func mm_sub_epi8(a, b __m128i) (m __m128i) {
		for i, av := range a.bytes {
			m.bytes[i] = av - b.bytes[i]
		}
		return
	}

	func mm_mulhi_epu16(a, b __m128i) (m __m128i) {
	for i := 0; i < 8; i++ {
	t1 := uint32(a.bytes[2i]) \| uint32(a.bytes[2i+1])<<8
	t2 := uint32(b.bytes[2i]) \| uint32(b.bytes[2i+1])<<8
	t3 := (t1 * t2) >> 16

	m.bytes[2*i] = byte(t3)
	m.bytes[2*i+1] = byte(t3 >> 8)
	}
	return
	}

	func mm_mullo_epi16(a, b __m128i) (m __m128i) {
	for i := 0; i < 8; i++ {
	t1 := uint32(a.bytes[2i]) \| uint32(a.bytes[2i+1])<<8
	t2 := uint32(b.bytes[2i]) \| uint32(b.bytes[2i+1])<<8
	t3 := int32(t1) * int32(t2)
	m.bytes[2*i] = byte(t3)
	m.bytes[2*i+1] = byte(t3 >> 8)
	}
	return
	}

	// enc_reshuffle rearranges the 12 payload bytes in the low lanes of in so
	// that every output byte carries one 6-bit field in its low bits. Each
	// intermediate vector is printed for debugging.
	func enc_reshuffle(in __m128i) __m128i {
		// Starting layout, bytes MSB to LSB:
		// 0 0 0 0 l k j i h g f e d c b a
		//
		// Spread the bytes so that each 32-bit lane holds one 3-byte group
		// (bytes MSB to LSB):
		// k l j k
		// h i g h
		// e f d e
		// b c a b
		spread := mm_set_epi8(
			10, 11, 9, 10,
			7, 8, 6, 7,
			4, 5, 3, 4,
			1, 2, 0, 1)
		in = mm_shuffle_epi8(in, spread)
		fmt.Printf("%x\n", (in.bytes[:]))

		// Pick the two fields that have to move right. Upper case letters are
		// the most significant bits of a byte, lower case the least significant:
		// 0000kkkk LL000000 JJJJJJ00 00000000
		// 0000hhhh II000000 GGGGGG00 00000000
		// 0000eeee FF000000 DDDDDD00 00000000
		// 0000bbbb CC000000 AAAAAA00 00000000
		rightSrc := mm_and_si128(in, mm_set1_epi32(0x0FC0FC00))
		fmt.Printf("t0=%x\n", (rightSrc.bytes[:]))

		// A 16-bit multiply-high by 0x0040 (low lane) and 0x0400 (high lane)
		// acts as a right shift by 10 and by 6 respectively:
		// 00000000 00kkkkLL 00000000 00JJJJJJ
		// 00000000 00hhhhII 00000000 00GGGGGG
		// 00000000 00eeeeFF 00000000 00DDDDDD
		// 00000000 00bbbbCC 00000000 00AAAAAA
		rightDone := mm_mulhi_epu16(rightSrc, mm_set1_epi32(0x04000040))
		fmt.Printf("t1=%x\n", (rightDone.bytes[:]))

		// Pick the two fields that have to move left:
		// 00000000 00llllll 000000jj KKKK0000
		// 00000000 00iiiiii 000000gg HHHH0000
		// 00000000 00ffffff 000000dd EEEE0000
		// 00000000 00cccccc 000000aa BBBB0000
		leftSrc := mm_and_si128(in, mm_set1_epi32(0x003F03F0))
		fmt.Printf("t2=%x\n", (leftSrc.bytes[:]))

		// A 16-bit multiply-low by 0x0010 (low lane) and 0x0100 (high lane)
		// acts as a left shift by 4 and by 8 respectively:
		// 00llllll 00000000 00jjKKKK 00000000
		// 00iiiiii 00000000 00ggHHHH 00000000
		// 00ffffff 00000000 00ddEEEE 00000000
		// 00cccccc 00000000 00aaBBBB 00000000
		leftDone := mm_mullo_epi16(leftSrc, mm_set1_epi32(0x01000010))
		fmt.Printf("t3=%x\n", (leftDone.bytes[:]))

		// Merging both halves yields:
		// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
		// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
		// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
		// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
		return mm_or_si128(rightDone, leftDone)
	}

	// enc_translate converts sixteen 6-bit values (0..63) into Base64 alphabet
	// characters by adding a per-range offset taken from a 16-entry table. The
	// table and every intermediate vector are printed for debugging.
	//
	// The five ranges and their offsets:
	//
	//	#  From      To         Offset  Table index  Characters
	//	0  [0..25]   [65..90]   +65     0            ABCDEFGHIJKLMNOPQRSTUVWXYZ
	//	1  [26..51]  [97..122]  +71     1            abcdefghijklmnopqrstuvwxyz
	//	2  [52..61]  [48..57]   -4      [2..11]      0123456789
	//	3  [62]      [43]       -19     12           +
	//	4  [63]      [47]       -16     13           /
	func enc_translate(in __m128i) __m128i {
		// Negative offsets are stored modulo 256: 252 = -4, 237 = -19, 240 = -16.
		offsets := mm_setr_epi8(
			65, 71, 252, 252,
			252, 252, 252, 252,
			252, 252, 252, 252,
			237, 240, 0, 0)
		fmt.Printf("lut=%x\n", (offsets.bytes[:]))

		// A saturating subtract of 51 already gives the right table index for
		// range #0; every other range comes out one too small.
		idx := mm_subs_epu8(in, mm_set1_epi8(51))
		fmt.Printf("indices=%x\n", (idx.bytes[:]))

		// 0xFF (-1) wherever the value lies in ranges #1..#4, 0x00 for range #0.
		aboveUpper := mm_cmpgt_epi8(in, mm_set1_epi8(25))
		fmt.Printf("mask=%x\n", (aboveUpper.bytes[:]))

		// Subtracting -1 adds the missing 1 for ranges #1..#4.
		idx = mm_sub_epi8(idx, aboveUpper)
		fmt.Printf("indices=%x\n", (idx.bytes[:]))

		// Look up the offset for each byte and add it to the input value.
		perByteOffset := mm_shuffle_epi8(offsets, idx)
		return mm_add_epi8(in, perByteOffset)
	}

	func mm_movemask_epi8(in __m128i) int {
	ret := 0
	for i := 0; i < 16; i++ {
	ret \|= int((in.bytes[i]&0x80)>>7) << i
	}
	return ret
	}

	func mm_maddubs_epi16(a, b __m128i) (m __m128i) {
	for i := 0; i < 8; i++ {
	ret := int16(a.bytes[2i+1])int16(b.bytes[2i+1]) + int16(a.bytes[2i])int16(b.bytes[2i])
	m.bytes[2*i] = byte(ret)
	m.bytes[2*i+1] = byte(ret >> 8)
	}
	return
	}

	func mm_madd_epi16(a, b __m128i) (m __m128i) {
	for i := 0; i < 4; i++ {
	ah := int32(a.bytes[4i+2]) \| (int32(a.bytes[4i+3]) << 8)
	al := int32(a.bytes[4i]) \| (int32(a.bytes[4i+1]) << 8)
	bh := int32(b.bytes[4i+2]) \| (int32(b.bytes[4i+3]) << 8)
	bl := int32(b.bytes[4i]) \| (int32(b.bytes[4i+1]) << 8)
	ret := ahbh + albl
	m.bytes[4*i] = byte(ret)
	m.bytes[4*i+1] = byte(ret >> 8)
	m.bytes[4*i+2] = byte(ret >> 16)
	m.bytes[4*i+3] = byte(ret >> 24)
	}
	return
	}

	// dec_reshuffle packs sixteen 6-bit values (one per byte) into twelve
	// contiguous output bytes; the four highest bytes of the result are zero.
	func dec_reshuffle(in __m128i) __m128i {
		// Input bits; upper case letters are the most significant bits of a
		// field, lower case the least significant:
		// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
		// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
		// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
		// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA

		// Step 1: join adjacent byte pairs into 12-bit values, one per 16-bit
		// lane (even byte * 0x40 + odd byte * 0x01):
		// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
		// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
		// 0000eeee FFffffff 0000DDDD DDddEEEE
		// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
		pairWeights := mm_set1_epi32(0x01400140)
		twelveBit := mm_maddubs_epi16(in, pairWeights)

		// Step 2: join adjacent 16-bit lanes into 24-bit values, one per 32-bit
		// lane (low lane * 0x1000 + high lane * 0x0001):
		// 00000000 JJJJJJjj KKKKkkkk LLllllll
		// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
		// 00000000 DDDDDDdd EEEEeeee FFffffff
		// 00000000 AAAAAAaa BBBBbbbb CCcccccc
		laneWeights := mm_set1_epi32(0x00011000)
		twentyFourBit := mm_madd_epi16(twelveBit, laneWeights)

		// Step 3: drop the empty top byte of every lane, reverse the three data
		// bytes inside each lane, and zero the final four bytes (selector 255):
		// 00000000 00000000 00000000 00000000
		// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
		// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
		// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
		compact := mm_setr_epi8(
			2, 1, 0,
			6, 5, 4,
			10, 9, 8,
			14, 13, 12,
			255, 255, 255, 255)
		return mm_shuffle_epi8(twentyFourBit, compact)
	}

	// The input consists of six character sets in the Base64 alphabet, which we
	// need to map back to the 6-bit values they represent. There are three ranges,
	// two singles, and then there's the rest.
	//
	// # From To Add Characters
	// 1 [43] [62] +19 +
	// 2 [47] [63] +16 /
	// 3 [48..57] [52..61] +4 0..9
	// 4 [65..90] [0..25] -65 A..Z
	// 5 [97..122] [26..51] -71 a..z
	//
	// (6) Everything else => invalid input
	//
	// We will use lookup tables for character validation and offset computation.
	// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
	// allows to mask with 0x2F instead of 0x0F and thus save one constant
	// declaration (register and/or memory access).
	//
	// For offsets:
	// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
	// 0000 = garbage
	// 0001 = /
	// 0010 = +
	// 0011 = 0-9
	// 0100 = A-Z
	// 0101 = A-Z
	// 0110 = a-z
	// 0111 = a-z
	// 1000 >= garbage
	//
	// For validation, here's the table.
	// A character is valid if and only if the AND of the 2 lookups equals 0:
	//
	// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
	//
	// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
	//
	// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
	//
	// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	//
	// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
	//
	// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	//
	// 0010 0x01 char SP ! " # $ % & ' ( ) * + , - . /
	//
	// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
	//
	// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
	//
	// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
	//
	// 0100 0x04 char @ A B C D E F G H I J K L M N O
	//
	// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
	//
	// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
	//
	// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
	//
	// 0110 0x04 char ` a b c d e f g h i j k l m n o
	//
	// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
	//
	// 0111 0x08 char p q r s t u v w x y z { | } ~ DEL
	//
	// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
	//
	// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	//
	// dec_translate_std validates sixteen standard-alphabet Base64 characters and
	// maps them to their 6-bit values. It returns the translated vector with
	// ret == 1 on success; if any byte is flagged invalid it returns a zero
	// vector with ret == 0.
	func dec_translate_std(in __m128i) (m __m128i, ret int) {
		// Validation tables, one indexed by the low nibble and one by the high
		// nibble. A byte is accepted only if the AND of its two entries is 0.
		loFlags := mm_setr_epi8(
			0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
			0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A)
		hiFlags := mm_setr_epi8(
			0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
			0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10)

		// Offsets to add, indexed by the hash computed below (negative values
		// are stored modulo 256).
		offsets := mm_setr_epi8(
			0, 16, 19, 4, 256-65, 256-65, 256-71, 256-71,
			0, 0, 0, 0, 0, 0, 0, 0)

		// 0x2F serves both as the nibble mask (the shuffle only looks at the
		// low four bits of an index) and as the '/' character to compare with.
		slash := mm_set1_epi8(0x2F)

		hiIdx := mm_and_si128(mm_srli_epi32(in, 4), slash)
		loIdx := mm_and_si128(in, slash)
		verdict := mm_and_si128(
			mm_shuffle_epi8(loFlags, loIdx),
			mm_shuffle_epi8(hiFlags, hiIdx))
		rejected := mm_cmpgt_epi8(verdict, mm_set1_epi8(0))
		if mm_movemask_epi8(rejected) != 0 {
			return m, 0
		}

		// '/' shares its high nibble with '+'; adding 0xFF (-1) for '/' moves
		// it to its own table slot.
		isSlash := mm_cmpeq_epi8(in, slash)
		delta := mm_shuffle_epi8(offsets, mm_add_epi8(isSlash, hiIdx))

		// Adding the per-byte offset yields the 6-bit value.
		return mm_add_epi8(in, delta), 1
	}

	// The input consists of six character sets in the Base64 alphabet, which we
	// need to map back to the 6-bit values they represent. There are three ranges,
	// two singles, and then there's the rest.
	//
	// # From To Add Characters
	// 1 [45] [62] +17 -
	// 2 [48..57] [52..61] +4 0..9
	// 3 [65..90] [0..25] -65 A..Z
	// 4 [95] [63] -32 _
	// 5 [97..122] [26..51] -71 a..z
	//
	// (6) Everything else => invalid input
	//
	// We will use lookup tables for character validation and offset computation.
	//
	// For offsets:
	// Perfect hash for lut = ((src >> 4) & 0x0F) - ((src > 0x5e) ? 0xFF : 0x00)
	// 0000 = garbage
	// 0001 = garbage
	// 0010 = -
	// 0011 = 0-9
	// 0100 = A-Z
	// 0101 = A-Z
	// 0110 = _
	// 0111 = a-z
	// 1000 = a-z
	// > 1000 = garbage
	//
	// For validation, here's the table.
	// A character is valid if and only if the AND of the 2 lookups equals 0:
	//
	// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
	//
	// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1B 0x1B 0x1A 0x1B 0x33
	//
	// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
	//
	// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	//
	// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
	//
	// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	//
	// 0010 0x01 char SP ! " # $ % & ' ( ) * + , - . /
	//
	// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01
	//
	// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
	//
	// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
	//
	// 0100 0x04 char @ A B C D E F G H I J K L M N O
	//
	// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
	//
	// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
	//
	// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x00
	//
	// 0110 0x04 char ` a b c d e f g h i j k l m n o
	//
	// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
	//
	// 0111 0x28 char p q r s t u v w x y z { | } ~ DEL
	//
	// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x20
	//
	// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
	//
	// dec_translate_url validates sixteen URL-safe-alphabet Base64 characters and
	// maps them to their 6-bit values. It returns the translated vector with
	// ret == 1 on success; if any byte is flagged invalid it returns a zero
	// vector with ret == 0.
	func dec_translate_url(in __m128i) (m __m128i, ret int) {
		// Validation tables, one indexed by the low nibble and one by the high
		// nibble. A byte is accepted only if the AND of its two entries is 0.
		loFlags := mm_setr_epi8(
			0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
			0x11, 0x11, 0x13, 0x1B, 0x1B, 0x1A, 0x1B, 0x33)
		hiFlags := mm_setr_epi8(
			0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x28,
			0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10)

		// Offsets to add, indexed by the hash computed below (negative values
		// are stored modulo 256).
		offsets := mm_setr_epi8(
			0, 0, 17, 4, 256-65, 256-65, 256-32, 256-71,
			256-71, 0, 0, 0, 0, 0, 0, 0)

		nibble := mm_set1_epi8(0x0F)

		hiIdx := mm_and_si128(mm_srli_epi32(in, 4), nibble)
		loIdx := mm_and_si128(in, nibble)
		verdict := mm_and_si128(
			mm_shuffle_epi8(loFlags, loIdx),
			mm_shuffle_epi8(hiFlags, hiIdx))
		rejected := mm_cmpgt_epi8(verdict, mm_set1_epi8(0))
		if mm_movemask_epi8(rejected) != 0 {
			return m, 0
		}

		// Bytes above 0x5E ('_' and the lower-case letters) use the table slot
		// one past their high nibble; subtracting 0xFF (-1) adds that 1.
		above5E := mm_cmpgt_epi8(in, mm_set1_epi8(0x5E))
		delta := mm_shuffle_epi8(offsets, mm_sub_epi8(hiIdx, above5E))

		// Adding the per-byte offset yields the 6-bit value.
		return mm_add_epi8(in, delta), 1
	}

	func main() {
	// std
	encoded := mm_setr_epi8([]byte(base64.StdEncoding.EncodeToString([]byte("abcdefghijkl")))...)
	decoded, ret := dec_translate_std(encoded)
	if ret == 1 {
	decoded = dec_reshuffle(decoded)
	fmt.Printf("%v\n", string(decoded.bytes[:12]))
	} else {
	fmt.Println("invalid base64 encoded")
	}
	// url
	encoded = mm_setr_epi8([]byte(base64.URLEncoding.EncodeToString([]byte("!?$*&()'-=@~")))...)
	decoded, ret = dec_translate_url(encoded)
	if ret == 1 {
	decoded = dec_reshuffle(decoded)
	fmt.Printf("%v\n", string(decoded.bytes[:12]))
	} else {
	fmt.Println("invalid base64 encoded")
	}
	}