Skip to content

Instantly share code, notes, and snippets.

@aguinet
Created May 20, 2020 11:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aguinet/34be8b8378f7a259b43d033180010539 to your computer and use it in GitHub Desktop.
Save aguinet/34be8b8378f7a259b43d033180010539 to your computer and use it in GitHub Desktop.
// Extracted from https://github.com/pyjamask-cipher/pyjamask-reference-implementation/blob/master/pyjamask.c
#include <cstdint>
// Rotate the 32-bit unsigned lvalue `row` right by one bit, in place.
//
// The macro expands to a single parenthesized assignment expression with no
// trailing semicolon, so it behaves like an ordinary statement when followed
// by `;` (including inside an unbraced if/else).
//
// Caveats:
//   - `row` is evaluated more than once; pass a side-effect-free lvalue.
//   - The shift amounts (1 and 31) assume `row` is exactly 32 bits wide.
#define right_rotate(row) \
    ((row) = ((row) >> 1) | ((row) << 31))

// 32-bit constants intended as the `mat_col` argument of mat_mult().
// Only COL_M0 is referenced by the code visible in this file.
#define COL_M0 0xa3861085
#define COL_M1 0x63417021
#define COL_M2 0x692cf280
#define COL_M3 0x48a54813
// XOR together the rotations of `mat_col` selected by the bits of `vec`.
//
// Bit 31 of `vec` selects `mat_col` unrotated, bit 30 selects `mat_col`
// rotated right by 1, ... and bit 0 selects `mat_col` rotated right by 31.
// The selection uses an all-ones / all-zeros mask rather than a branch.
//
// Returns the XOR of all selected rotations (0 when `vec` is 0).
static uint32_t mat_mult(uint32_t mat_col, uint32_t vec)
{
    uint32_t acc = 0;

#pragma unroll
    for (int step = 0; step < 32; ++step)
    {
        // Top bit of vec -> 0xFFFFFFFF when set, 0 when clear.
        const uint32_t select = 0u - (vec >> 31);
        acc ^= mat_col & select;

        // Move on to the next lower bit of vec and the next rotation.
        vec <<= 1;
        right_rotate(mat_col);
    }

    return acc;
}
// C-linkage entry point: applies mat_mult() to `vec` with the fixed
// constant COL_M0 and returns the 32-bit result.
extern "C" uint32_t mat_mult_m0(uint32_t vec)
{
    const uint32_t product = mat_mult(COL_M0, vec);
    return product;
}
.text
.intel_syntax noprefix
.file "pyjamask.cpp"
# .LCPI0_0: 64-byte read-only constant, 64-byte aligned (.p2align 6).
# It is loaded as the zmmword memory operand of the first vgf2p8affineqb in
# mat_mult_m0, i.e. it supplies one 8-byte (8x8 bit) matrix per 64-bit lane.
# The 64 bytes are built from four distinct 8-byte patterns, called A, B, C, D
# in the comments below, laid out as: A B C D | D A B C.
# NOTE(review): these were emitted by the compiler, presumably derived from
# COL_M0 -- the derivation is not checked here.
.section .rodata,"a",@progbits
.p2align 6 # -- Begin function mat_mult_m0
.LCPI0_0:
# qword 0: pattern A
.byte 208 # 0xd0
.byte 104 # 0x68
.byte 180 # 0xb4
.byte 90 # 0x5a
.byte 45 # 0x2d
.byte 22 # 0x16
.byte 139 # 0x8b
.byte 197 # 0xc5
# qword 1: pattern B
.byte 132 # 0x84
.byte 66 # 0x42
.byte 33 # 0x21
.byte 16 # 0x10
.byte 8 # 0x8
.byte 132 # 0x84
.byte 66 # 0x42
.byte 161 # 0xa1
# qword 2: pattern C
.byte 48 # 0x30
.byte 24 # 0x18
.byte 12 # 0xc
.byte 134 # 0x86
.byte 67 # 0x43
.byte 33 # 0x21
.byte 16 # 0x10
.byte 8 # 0x8
# qword 3: pattern D
.byte 226 # 0xe2
.byte 113 # 0x71
.byte 56 # 0x38
.byte 28 # 0x1c
.byte 14 # 0xe
.byte 135 # 0x87
.byte 195 # 0xc3
.byte 97 # 0x61
# qword 4: pattern D
.byte 226 # 0xe2
.byte 113 # 0x71
.byte 56 # 0x38
.byte 28 # 0x1c
.byte 14 # 0xe
.byte 135 # 0x87
.byte 195 # 0xc3
.byte 97 # 0x61
# qword 5: pattern A
.byte 208 # 0xd0
.byte 104 # 0x68
.byte 180 # 0xb4
.byte 90 # 0x5a
.byte 45 # 0x2d
.byte 22 # 0x16
.byte 139 # 0x8b
.byte 197 # 0xc5
# qword 6: pattern B
.byte 132 # 0x84
.byte 66 # 0x42
.byte 33 # 0x21
.byte 16 # 0x10
.byte 8 # 0x8
.byte 132 # 0x84
.byte 66 # 0x42
.byte 161 # 0xa1
# qword 7: pattern C
.byte 48 # 0x30
.byte 24 # 0x18
.byte 12 # 0xc
.byte 134 # 0x86
.byte 67 # 0x43
.byte 33 # 0x21
.byte 16 # 0x10
.byte 8 # 0x8
# .LCPI0_1: second 64-byte read-only constant, placed directly after
# .LCPI0_0 in the same section. It is the zmmword memory operand of the
# second vgf2p8affineqb in mat_mult_m0 (one 8-byte matrix per 64-bit lane).
# It reuses the same four 8-byte patterns that appear in .LCPI0_0
# (A = d0 68 b4 5a 2d 16 8b c5, B = 84 42 21 10 08 84 42 a1,
#  C = 30 18 0c 86 43 21 10 08, D = e2 71 38 1c 0e 87 c3 61),
# laid out as: C D A B | B C D A.
.LCPI0_1:
# qword 0: pattern C
.byte 48 # 0x30
.byte 24 # 0x18
.byte 12 # 0xc
.byte 134 # 0x86
.byte 67 # 0x43
.byte 33 # 0x21
.byte 16 # 0x10
.byte 8 # 0x8
# qword 1: pattern D
.byte 226 # 0xe2
.byte 113 # 0x71
.byte 56 # 0x38
.byte 28 # 0x1c
.byte 14 # 0xe
.byte 135 # 0x87
.byte 195 # 0xc3
.byte 97 # 0x61
# qword 2: pattern A
.byte 208 # 0xd0
.byte 104 # 0x68
.byte 180 # 0xb4
.byte 90 # 0x5a
.byte 45 # 0x2d
.byte 22 # 0x16
.byte 139 # 0x8b
.byte 197 # 0xc5
# qword 3: pattern B
.byte 132 # 0x84
.byte 66 # 0x42
.byte 33 # 0x21
.byte 16 # 0x10
.byte 8 # 0x8
.byte 132 # 0x84
.byte 66 # 0x42
.byte 161 # 0xa1
# qword 4: pattern B
.byte 132 # 0x84
.byte 66 # 0x42
.byte 33 # 0x21
.byte 16 # 0x10
.byte 8 # 0x8
.byte 132 # 0x84
.byte 66 # 0x42
.byte 161 # 0xa1
# qword 5: pattern C
.byte 48 # 0x30
.byte 24 # 0x18
.byte 12 # 0xc
.byte 134 # 0x86
.byte 67 # 0x43
.byte 33 # 0x21
.byte 16 # 0x10
.byte 8 # 0x8
# qword 6: pattern D
.byte 226 # 0xe2
.byte 113 # 0x71
.byte 56 # 0x38
.byte 28 # 0x1c
.byte 14 # 0xe
.byte 135 # 0x87
.byte 195 # 0xc3
.byte 97 # 0x61
# qword 7: pattern A
.byte 208 # 0xd0
.byte 104 # 0x68
.byte 180 # 0xb4
.byte 90 # 0x5a
.byte 45 # 0x2d
.byte 22 # 0x16
.byte 139 # 0x8b
.byte 197 # 0xc5
# .LCPI0_2: 16-byte, 16-byte-aligned constant in a mergeable section.
# mat_mult_m0 loads it with vmovdqa and uses it as the vpshufb control mask:
# result byte 0 <- source byte 0, result byte 1 <- source byte 8.
# The remaining 14 control bytes are emitted as zero; the compiler's own
# annotation at the load marks them "u" (result bytes 2..15 are don't-care).
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI0_2:
.byte 0 # 0x0
.byte 8 # 0x8
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
#-----------------------------------------------------------------------
# uint32_t mat_mult_m0(uint32_t vec)
#
# Compiler-generated (clang 10) vectorized form of the C++ mat_mult_m0.
# Syntax: GAS, .intel_syntax noprefix (op dst, src).
# In:     edi = vec
# Out:    eax = result
# Writes: eax, edi (shifted in place), flags (shr), vector registers 0-2.
# Stack:  not used (leaf function, no pushes or rsp adjustment).
# ISA:    uses zmm forms of vgf2p8affineqb / vinserti64x4 / vpxorq, so the
#         CPU must support AVX-512 and GFNI.
#
# Outline:
#   1. Broadcast each of the four bytes of vec across 256 bits and pack
#      them as zmm0 = [byte0 | byte1], zmm1 = [byte2 | byte3].
#   2. vgf2p8affineqb applies, per 64-bit lane, the 8x8 bit matrix stored in
#      the matching qword of .LCPI0_0 / .LCPI0_1 to every byte of that lane
#      (immediate 0 = no constant added).
#   3. XOR the two results, then XOR the upper 256 bits onto the lower 256
#      bits: each of the four remaining qwords now holds one result byte.
#   4. Gather byte 0 of qwords 0..3 into the low 32 bits and return them.
#-----------------------------------------------------------------------
.text
.globl mat_mult_m0
.p2align 4, 0x90
.type mat_mult_m0,@function
mat_mult_m0: # @mat_mult_m0
.Lmat_mult_m0$local:
.cfi_startproc
# %bb.0:
# --- step 1: broadcast bytes 0 and 1 of vec, pack into zmm0 ---
vmovd xmm0, edi
vpbroadcastb ymm0, xmm0
mov eax, edi
shr eax, 8
vmovd xmm1, eax
vpbroadcastb ymm1, xmm1
# zmm0 = [ymm0 (byte 0 of vec) | ymm1 (byte 1 of vec)]
vinserti64x4 zmm0, zmm0, ymm1, 1
# --- step 1 (cont.): broadcast bytes 2 and 3 of vec ---
mov eax, edi
shr eax, 16
vmovd xmm1, eax
vpbroadcastb ymm1, xmm1
# edi itself is shifted here; vec is no longer needed afterwards
shr edi, 24
vmovd xmm2, edi
vpbroadcastb ymm2, xmm2
# --- step 2: per-lane 8x8 bit-matrix transforms ---
vgf2p8affineqb zmm0, zmm0, zmmword ptr [rip + .LCPI0_0], 0
# zmm1 = [ymm1 (byte 2 of vec) | ymm2 (byte 3 of vec)]
vinserti64x4 zmm1, zmm1, ymm2, 1
vgf2p8affineqb zmm1, zmm1, zmmword ptr [rip + .LCPI0_1], 0
# --- step 3: XOR-combine the four per-byte contributions ---
vpxorq zmm0, zmm0, zmm1
vextracti64x4 ymm1, zmm0, 1
vpxor ymm0, ymm0, ymm1
# --- step 4: collect one byte from each of the four qwords ---
# xmm1 = qwords 2,3 ; xmm0 keeps qwords 0,1
vextracti128 xmm1, ymm0, 1
vmovdqa xmm2, xmmword ptr [rip + .LCPI0_2] # xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
# after the shuffles, the low word of each register = {byte 0, byte 8}
vpshufb xmm1, xmm1, xmm2
vpshufb xmm0, xmm0, xmm2
# low dword = xmm0 word 0 followed by xmm1 word 0
vpunpcklwd xmm0, xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
vmovd eax, xmm0
# clear upper vector state before returning to the caller
vzeroupper
ret
.Lfunc_end0:
.size mat_mult_m0, .Lfunc_end0-mat_mult_m0
.cfi_endproc
# -- End function
.ident "clang version 10.0.0-4 "
.section ".note.GNU-stack","",@progbits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment