Skip to content

Instantly share code, notes, and snippets.

@abrown
Last active December 6, 2019 04:58
Show Gist options
  • Save abrown/bfc5b3d28dae29df0f3b5d69dd24b41d to your computer and use it in GitHub Desktop.
Save abrown/bfc5b3d28dae29df0f3b5d69dd24b41d to your computer and use it in GitHub Desktop.
Compare i8x16 shift implementations
# Benchmark kernels comparing ways to implement an i8x16 left shift on x86-64.
# GNU as, AT&T syntax (source operand first, destination last).
# Every symbol exported below is declared `extern` by the C driver.
.text
.global shift_scalar
.global shift_16x8
.global shift_8x16_emulated
.global shift_8x16_constant
.global shift_8x16_masked
# SHIFT_SCALAR_MACRO: emit 16 back-to-back `shl $2, %r8` instructions.
# Every shift consumes the %r8 produced by the previous one.
# Clobbers: %r8, flags
.macro SHIFT_SCALAR_MACRO
        .rept   16
        shl     $2, %r8
        .endr
.endm
# -----------------------------------------------------------------------
# int shift_scalar(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %rax = final value of the shifted scalar
# Clobb: %rcx, %r8, flags
#
# Scalar baseline: each iteration performs 10 x 16 dependent `shl $2, %r8`.
# -----------------------------------------------------------------------
shift_scalar:
        mov     %edi, %ecx              # first integer argument arrives in %edi, not %rax
        mov     $1, %r8                 # seed value that gets shifted
        test    %ecx, %ecx
        jle     shift_scalar_end        # n <= 0: skip, otherwise dec/jnz would wrap around
shift_scalar_loop:
        .rept   10
        SHIFT_SCALAR_MACRO
        .endr
        dec     %ecx
        jnz     shift_scalar_loop
shift_scalar_end:
        mov     %r8, %rax               # return the shifted value
        retq
# SHIFT_16x8_MACRO: shift each of the eight 16-bit lanes of %xmm0 left by 2 bits.
# Clobbers: %xmm0
.macro SHIFT_16x8_MACRO
psllw $2, %xmm0
.endm
# -----------------------------------------------------------------------
# int shift_16x8(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %rax = 16-bit lane 0 of the shifted vector, zero-extended
# Clobb: %rcx, %xmm0, flags
#
# Native 16-bit lane shift: each iteration performs 10 x `psllw $2, %xmm0`.
# -----------------------------------------------------------------------
shift_16x8:
        mov     %edi, %ecx              # first integer argument arrives in %edi, not %rax
        movaps  WORD_ONES(%rip), %xmm0  # every 16-bit lane starts at 1
        test    %ecx, %ecx
        jle     shift_16x8_end          # n <= 0: skip, otherwise dec/jnz would wrap around
shift_16x8_loop:
        .rept   10
        SHIFT_16x8_MACRO
        .endr
        dec     %ecx
        jnz     shift_16x8_loop
shift_16x8_end:
        pextrw  $0, %xmm0, %rax         # return lane 0 so the result is observable
        retq
# SHIFT_8x16_MACRO: left-shift the sixteen bytes of %xmm0 by (%rbx & 7) bits
# using only 16-bit SSE shifts.
#   1. Build a per-byte mask equal to 0xff >> shift in %xmm1:
#      all-ones words shifted right by (shift + 8), then narrowed to bytes.
#   2. AND the mask into %xmm0 so the bits that would spill into the
#      neighbouring byte are dropped before shifting.
#   3. Shift the 16-bit lanes left by the shift amount.
# In:       %rbx = shift amount (reduced to 0..7 in place), %xmm0 = data
# Clobbers: %rbx, %rdx, %xmm0, %xmm1, %xmm2, flags
# The trailing comments restate each step in an emitter-call notation
# (presumably the code-generator sequence this was transcribed from).
.macro SHIFT_8x16_MACRO
pcmpeqw %xmm1, %xmm1 # pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
andq $7, %rbx # andq(shift, Immediate(7));
mov %rbx, %rdx # movq(tmp, shift);
add $8, %rdx # addq(tmp, Immediate(8));
movq %rdx, %xmm2 # movq(tmp_simd, tmp);
psrlw %xmm2, %xmm1 # psrlw(kScratchDoubleReg, tmp_simd);
packuswb %xmm1, %xmm1 # packuswb(kScratchDoubleReg, kScratchDoubleReg);
pand %xmm1, %xmm0 # pand(dst, kScratchDoubleReg);
movq %rbx, %xmm2 # movq(tmp_simd, shift);
psllw %xmm2, %xmm0 # psllw(dst, tmp_simd);
.endm
# -----------------------------------------------------------------------
# int shift_8x16_emulated(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %rax = 16-bit lane 0 of the shifted vector, zero-extended
# Clobb: %rcx, %rdx, %xmm0, %xmm1, %xmm2, flags
# Stack: 8 bytes (saved %rbx)
#
# Byte shift with a run-time shift amount: each iteration expands
# SHIFT_8x16_MACRO 10 times with the shift amount held in %rbx.
# -----------------------------------------------------------------------
shift_8x16_emulated:
        push    %rbx                    # %rbx is callee-saved and the macro overwrites it
        mov     %edi, %ecx              # first integer argument arrives in %edi, not %rax
        movaps  BYTE_ONES(%rip), %xmm0  # every byte lane starts at 1
        mov     $2, %rbx                # shift amount consumed by SHIFT_8x16_MACRO
        test    %ecx, %ecx
        jle     shift_8x16_emulated_end # n <= 0: skip, otherwise dec/jnz would wrap around
shift_8x16_emulated_loop:
        .rept   10
        SHIFT_8x16_MACRO
        .endr
        dec     %ecx
        jnz     shift_8x16_emulated_loop
shift_8x16_emulated_end:
        pextrw  $0, %xmm0, %rax         # return lane 0 so the result is observable
        pop     %rbx
        retq
# SHIFT_CONSTANT: byte-wise left shift of %xmm0 by the constant 2.
# psllw shifts whole 16-bit lanes, so the top two bits of each low byte land
# in the bottom of the byte above it; the pand with BYTE_MASK clears the low
# two bits of every byte to remove them.
# Clobbers: %xmm0
.macro SHIFT_CONSTANT
psllw $2, %xmm0
pand BYTE_MASK(%rip), %xmm0
.endm
# -----------------------------------------------------------------------
# int shift_8x16_constant(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %rax = 16-bit lane 0 of the shifted vector, zero-extended
# Clobb: %rcx, %xmm0, flags
#
# Byte shift by a compile-time constant: each iteration expands
# SHIFT_CONSTANT (word shift + constant mask) 10 times.
# -----------------------------------------------------------------------
shift_8x16_constant:
        mov     %edi, %ecx              # first integer argument arrives in %edi, not %rax
        movaps  BYTE_ONES(%rip), %xmm0  # every byte lane starts at 1
        test    %ecx, %ecx
        jle     shift_8x16_constant_end # n <= 0: skip, otherwise dec/jnz would wrap around
shift_8x16_constant_loop:
        .rept   10
        SHIFT_CONSTANT
        .endr
        dec     %ecx
        jnz     shift_8x16_constant_loop
shift_8x16_constant_end:
        pextrw  $0, %xmm0, %rax         # return lane 0 so the result is observable
        retq
# SHIFT_MASKED: stand-in for a byte shift whose mask is looked up by shift amount.
# %edi is reduced to 0..7 and copied into %xmm1, %xmm0 is shifted left by 1 as
# 16-bit lanes, %rdi is multiplied by 16, and %xmm0 is ANDed with BYTE_MASK.
# NOTE(review): %xmm1 and the scaled %rdi are never consumed - the shift uses
# an immediate and the pand always reads BYTE_MASK. Per the comment on the pand,
# the scaled %rdi was meant to index a table of masks; confirm this
# approximation is intended before relying on the measured numbers.
# Clobbers: %rdi, %xmm0, %xmm1, flags
.macro SHIFT_MASKED
and $7, %edi
movd %edi, %xmm1
psllw $1, %xmm0
shl $4, %rdi
pand BYTE_MASK(%rip), %xmm0 # this should be `pand xmm0, xmmword ptr [rdi + masks]` but I don't want to set up the masks
.endm
# -----------------------------------------------------------------------
# int shift_8x16_masked(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %rax = 16-bit lane 0 of the shifted vector, zero-extended
# Clobb: %rcx, %rdi, %xmm0, %xmm1, flags
#
# Each iteration expands SHIFT_MASKED 10 times. The macro overwrites %rdi,
# so the iteration count is copied into %ecx before the first expansion.
# -----------------------------------------------------------------------
shift_8x16_masked:
        mov     %edi, %ecx              # first integer argument arrives in %edi, not %rax
        movaps  BYTE_ONES(%rip), %xmm0  # every byte lane starts at 1
        test    %ecx, %ecx
        jle     shift_8x16_masked_end   # n <= 0: skip, otherwise dec/jnz would wrap around
shift_8x16_masked_loop:
        .rept   10
        SHIFT_MASKED
        .endr
        dec     %ecx
        jnz     shift_8x16_masked_loop
shift_8x16_masked_end:
        pextrw  $0, %xmm0, %rax         # return lane 0 so the result is observable
        retq
# Read-only 128-bit constants loaded by the SIMD kernels.
# Each constant is exactly 16 bytes, so aligning the first one keeps all
# three on a 16-byte boundary (needed for movaps/pand memory operands).
.section .rodata
        .p2align 4                      # 1 << 4 = 16-byte alignment
WORD_ONES:
        .fill   8, 2, 1                 # eight 16-bit lanes, each holding 1
BYTE_ONES:
        .fill   16, 1, 1                # sixteen bytes, each holding 1
BYTE_MASK:
        .fill   16, 1, 0xfc             # sixteen bytes with the low two bits clear
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Benchmark kernels implemented in assembly. Each one takes the number of
 * loop iterations to run and returns an int derived from the shifted value.
 * They are declared with full prototypes so their type matches `ftype`
 * exactly; an empty parameter list means "unspecified" before C23 and
 * "no parameters" from C23 on.
 */
extern int shift_scalar(int);
extern int shift_16x8(int);
extern int shift_8x16_emulated(int);
extern int shift_8x16_constant(int);
extern int shift_8x16_masked(int);

/* Maximum number of characters compared when matching a kernel name. */
#define COMPARE_MAX 16

/* Signature shared by every benchmark kernel. */
typedef int (*ftype)(int);
/*
 * Map a kernel name given on the command line to its implementation.
 *
 * flag: NUL-terminated kernel name; it must match one of the known names
 *       exactly (a bounded prefix comparison would accept names such as
 *       "shift_8x16_emulaXYZ").
 *
 * Returns the matching kernel. If the name is unknown, prints the list of
 * valid names to stderr and terminates the process with exit status 1.
 */
ftype parse_function_from_flag(char *flag) {
    static const struct {
        const char *name;
        ftype kernel;
    } kernels[] = {
        {"shift_scalar", shift_scalar},
        {"shift_16x8", shift_16x8},
        {"shift_8x16_emulated", shift_8x16_emulated},
        {"shift_8x16_constant", shift_8x16_constant},
        {"shift_8x16_masked", shift_8x16_masked},
    };
    const size_t kernel_count = sizeof(kernels) / sizeof(kernels[0]);

    for (size_t i = 0; i < kernel_count; i++) {
        if (strcmp(flag, kernels[i].name) == 0) {
            return kernels[i].kernel;
        }
    }

    fprintf(stderr,
            "Invalid function type; expects one of "
            "[shift_scalar|shift_16x8|shift_8x16_emulated|"
            "shift_8x16_constant|shift_8x16_masked]\n");
    exit(1);
}
/*
 * Entry point: main <iterations> <kernel-name>
 *
 * Parses the iteration count and the kernel name, then runs the selected
 * kernel once with that count. The kernel's return value is discarded.
 * Exits with status 1 on a usage error, an invalid iteration count, or an
 * unknown kernel name.
 */
int main(int argc, char *argv[]) {
    if (argc <= 2) {
        fprintf(stderr,
                "Usage: %s [# of iterations to run body] "
                "[shift_scalar|shift_16x8|shift_8x16_emulated|"
                "shift_8x16_constant|shift_8x16_masked]\n",
                argv[0]);
        exit(1);
    }

    /* strtol instead of atoi so garbage input is rejected rather than
     * silently becoming 0; a non-positive count is meaningless here. */
    char *end = NULL;
    errno = 0;
    long iterations = strtol(argv[1], &end, 10);
    if (errno == ERANGE || end == argv[1] || *end != '\0' ||
        iterations <= 0 || iterations > INT_MAX) {
        fprintf(stderr,
                "Invalid iteration count '%s'; expected an integer in [1, %d]\n",
                argv[1], INT_MAX);
        exit(1);
    }

    ftype body = parse_function_from_flag(argv[2]);
    (void)body((int)iterations);
    return 0;
}
# Plain `make` runs the benchmarks.
.DEFAULT_GOAL = run
# Flags passed to $(CC) for both compiling and linking.
FLAGS := -Wall -Wextra
# Command prefix used to measure each benchmark run (perf stat, 10 repetitions).
PERF := perf stat -r 10 -ddd
# Iteration count passed to the benchmark binary as its first argument.
NUM_ITERATIONS := 10000000
# for reference of automatic variables, https://www.gnu.org/software/make/manual/html_node/Automatic-Variables.html
# basic utilities
# Remove every build artifact. Declared phony so a stray file named `clean`
# cannot make this target look up to date.
.PHONY: clean
clean:
	rm -rf build
# Create the output directory that receives the object files and the binary.
build:
	mkdir -p $@
# Measure every kernel with $(PERF). Declared phony so a stray file named
# `run` cannot suppress the benchmarks. $< is the benchmark binary.
.PHONY: run
run: build/main
	$(PERF) $< $(NUM_ITERATIONS) shift_scalar
	$(PERF) $< $(NUM_ITERATIONS) shift_16x8
	$(PERF) $< $(NUM_ITERATIONS) shift_8x16_emulated
	$(PERF) $< $(NUM_ITERATIONS) shift_8x16_constant
	$(PERF) $< $(NUM_ITERATIONS) shift_8x16_masked
# Assemble body.s. The build directory is an order-only prerequisite (after
# the `|`): it must exist, but its timestamp, which changes whenever a file
# is written into it, must not force a rebuild.
build/body.o: body.s | build
	$(AS) $< -o $@
# Compile main.c with the flags in FLAGS. The build directory is an
# order-only prerequisite (after the `|`) so that its changing timestamp
# does not trigger a recompile on every run.
build/main.o: main.c | build
	$(CC) $(FLAGS) $< -c -o $@
# Link the assembled kernels and the C driver into the benchmark binary.
build/main: build/body.o build/main.o
	$(CC) $(FLAGS) -o $@ $^
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment