Last active
December 6, 2019 04:58
-
-
Save abrown/bfc5b3d28dae29df0f3b5d69dd24b41d to your computer and use it in GitHub Desktop.
Compare i8x16 shift implementations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# x86-64, GNU as, AT&T syntax (source operand first, destination last).
# Code section plus the five benchmark entry points exported by this file.
        .text
        .globl  shift_scalar
        .globl  shift_16x8
        .globl  shift_8x16_emulated
        .globl  shift_8x16_constant
        .globl  shift_8x16_masked
# SHIFT_SCALAR_MACRO: sixteen consecutive `shl $2, %r8` instructions.
# Every shift reads and writes %r8, so the sixteen form one dependency chain.
#   Clobb: %r8, flags
.macro SHIFT_SCALAR_MACRO
        .rept   16
        shl     $2, %r8
        .endr
.endm
#-----------------------------------------------------------------------
# int shift_scalar(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %rax = final value of %r8
# Clobb: %rcx, %r8, flags
#
# Scalar baseline: every loop iteration expands SHIFT_SCALAR_MACRO ten
# times. Returns immediately (with %r8 still holding 1) when n <= 0.
#-----------------------------------------------------------------------
shift_scalar:
        mov     %edi, %ecx              # loop counter = first integer argument
        mov     $1, %r8                 # value that gets shifted
        test    %ecx, %ecx
        jle     shift_scalar_end        # n <= 0: dec/jnz would wrap and spin ~2^32 times
shift_scalar_loop:
        .rept   10
        SHIFT_SCALAR_MACRO
        .endr
        dec     %ecx
        jnz     shift_scalar_loop
shift_scalar_end:
        mov     %r8, %rax
        retq
# SHIFT_16x8_MACRO: shift each 16-bit lane of %xmm0 left by 2 bits.
#   Clobb: %xmm0
.macro SHIFT_16x8_MACRO
        psllw   $2, %xmm0
.endm

#-----------------------------------------------------------------------
# int shift_16x8(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %eax = lowest 16-bit lane of %xmm0, zero-extended
# Clobb: %rcx, %xmm0, flags
#
# Every loop iteration expands SHIFT_16x8_MACRO ten times, starting from
# the WORD_ONES constant. Returns immediately when n <= 0.
#-----------------------------------------------------------------------
shift_16x8:
        mov     %edi, %ecx              # loop counter = first integer argument
        movaps  WORD_ONES(%rip), %xmm0  # aligned load; WORD_ONES must be 16-byte aligned
        test    %ecx, %ecx
        jle     shift_16x8_end          # n <= 0: dec/jnz would wrap and spin ~2^32 times
shift_16x8_loop:
        .rept   10
        SHIFT_16x8_MACRO
        .endr
        dec     %ecx
        jnz     shift_16x8_loop
shift_16x8_end:
        pextrw  $0, %xmm0, %eax         # 32-bit write also clears the upper half of %rax
        retq
# SHIFT_8x16_MACRO: left-shift sixteen 8-bit lanes of %xmm0 by a run-time
# amount, built from 16-bit lane shifts.
#   In:    %xmm0 = value, %rsi = shift amount (reduced to its low 3 bits here)
#   Out:   %xmm0 = every byte shifted left by (%rsi & 7)
#   Clobb: %rsi, %rdx, %xmm1, %xmm2, flags
# The trailing pseudo-calls are the original author's annotations; the shift
# register is %rsi (caller-saved) so no callee-saved register is touched.
.macro SHIFT_8x16_MACRO
        pcmpeqw  %xmm1, %xmm1           # pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);  all bits set
        andq     $7, %rsi               # andq(shift, Immediate(7));
        mov      %rsi, %rdx             # movq(tmp, shift);
        add      $8, %rdx               # addq(tmp, Immediate(8));
        movq     %rdx, %xmm2            # movq(tmp_simd, tmp);
        psrlw    %xmm2, %xmm1           # psrlw(kScratchDoubleReg, tmp_simd);  each word = 0x00ff >> shift
        packuswb %xmm1, %xmm1           # packuswb(kScratchDoubleReg, kScratchDoubleReg);  each byte = 0xff >> shift
        pand     %xmm1, %xmm0           # pand(dst, kScratchDoubleReg);  drop bits that would spill into the next byte
        movq     %rsi, %xmm2            # movq(tmp_simd, shift);
        psllw    %xmm2, %xmm0           # psllw(dst, tmp_simd);
.endm

#-----------------------------------------------------------------------
# int shift_8x16_emulated(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %eax = lowest 16-bit lane of %xmm0, zero-extended
# Clobb: %rcx, %rdx, %rsi, %xmm0, %xmm1, %xmm2, flags
#
# Every loop iteration expands SHIFT_8x16_MACRO ten times with a shift
# amount of 2, starting from BYTE_ONES. Returns immediately when n <= 0.
#-----------------------------------------------------------------------
shift_8x16_emulated:
        mov     %edi, %ecx              # loop counter = first integer argument
        movaps  BYTE_ONES(%rip), %xmm0  # aligned load; BYTE_ONES must be 16-byte aligned
        mov     $2, %esi                # shift amount consumed by SHIFT_8x16_MACRO
        test    %ecx, %ecx
        jle     shift_8x16_emulated_end # n <= 0: dec/jnz would wrap and spin ~2^32 times
shift_8x16_emulated_loop:
        .rept   10
        SHIFT_8x16_MACRO
        .endr
        dec     %ecx
        jnz     shift_8x16_emulated_loop
shift_8x16_emulated_end:
        pextrw  $0, %xmm0, %eax         # 32-bit write also clears the upper half of %rax
        retq
# SHIFT_CONSTANT: left-shift sixteen 8-bit lanes of %xmm0 by the constant 2.
# The 16-bit shift moves the top two bits of each low byte into the bottom of
# its neighbour; BYTE_MASK (0xfc per byte) clears those two bits again.
#   Clobb: %xmm0
.macro SHIFT_CONSTANT
        psllw   $2, %xmm0
        pand    BYTE_MASK(%rip), %xmm0  # memory operand must be 16-byte aligned
.endm

#-----------------------------------------------------------------------
# int shift_8x16_constant(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %eax = lowest 16-bit lane of %xmm0, zero-extended
# Clobb: %rcx, %xmm0, flags
#
# Every loop iteration expands SHIFT_CONSTANT ten times, starting from
# BYTE_ONES. Returns immediately when n <= 0.
#-----------------------------------------------------------------------
shift_8x16_constant:
        mov     %edi, %ecx              # loop counter = first integer argument
        movaps  BYTE_ONES(%rip), %xmm0  # aligned load; BYTE_ONES must be 16-byte aligned
        test    %ecx, %ecx
        jle     shift_8x16_constant_end # n <= 0: dec/jnz would wrap and spin ~2^32 times
shift_8x16_constant_loop:
        .rept   10
        SHIFT_CONSTANT
        .endr
        dec     %ecx
        jnz     shift_8x16_constant_loop
shift_8x16_constant_end:
        pextrw  $0, %xmm0, %eax         # 32-bit write also clears the upper half of %rax
        retq
# SHIFT_MASKED: instruction sequence for a byte shift whose mask would be
# looked up in a table indexed by the shift amount.
#   Clobb: %rdi, %xmm0, %xmm1, flags
# NOTE(review): %xmm1 is written but never read (psllw uses the immediate 1),
# and BYTE_MASK is 0xfc per byte, which matches a shift of 2 rather than 1.
# This looks like a deliberate timing stand-in; confirm before relying on the
# numeric result.
.macro SHIFT_MASKED
        and     $7, %edi                # keep the low 3 bits of the shift amount
        movd    %edi, %xmm1
        psllw   $1, %xmm0
        shl     $4, %rdi                # scale by 16, the size of one mask entry
        pand    BYTE_MASK(%rip), %xmm0  # this should be `pand xmm0, xmmword ptr [rdi + masks]` but I don't want to set up the masks
.endm

#-----------------------------------------------------------------------
# int shift_8x16_masked(int n)
# ABI:   System V AMD64
# In:    %edi = n, number of loop iterations
# Out:   %eax = lowest 16-bit lane of %xmm0, zero-extended
# Clobb: %rcx, %rdi, %xmm0, %xmm1, flags
#
# Every loop iteration expands SHIFT_MASKED ten times, starting from
# BYTE_ONES. Returns immediately when n <= 0.
#-----------------------------------------------------------------------
shift_8x16_masked:
        mov     %edi, %ecx              # copy the count before SHIFT_MASKED overwrites %rdi
        movaps  BYTE_ONES(%rip), %xmm0  # aligned load; BYTE_ONES must be 16-byte aligned
        test    %ecx, %ecx
        jle     shift_8x16_masked_end   # n <= 0: dec/jnz would wrap and spin ~2^32 times
shift_8x16_masked_loop:
        .rept   10
        SHIFT_MASKED
        .endr
        dec     %ecx
        jnz     shift_8x16_masked_loop
shift_8x16_masked_end:
        pextrw  $0, %xmm0, %eax         # 32-bit write also clears the upper half of %rax
        retq
# Read-only 128-bit constants. Each one is read with movaps or as a legacy-SSE
# memory operand, both of which fault on addresses that are not 16-byte
# aligned, so every label is aligned explicitly.
        .section .rodata

        .p2align 4                      # align to 16 = 1<<4
WORD_ONES:                              # eight 16-bit lanes, each 1
        .word   1, 1, 1, 1, 1, 1, 1, 1

        .p2align 4
BYTE_ONES:                              # sixteen 8-bit lanes, each 1
        .byte   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

        .p2align 4
BYTE_MASK:                              # per-byte mask clearing the low two bits
        .byte   0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc

# Mark the object as not needing an executable stack; without this note the
# GNU linker assumes it might and makes the whole program's stack executable.
        .section .note.GNU-stack,"",@progbits
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Benchmark bodies implemented in assembly. Each takes the number of loop
 * iterations to run. Full prototypes let the compiler check the argument and
 * keep the declarations compatible with a pointer to int(int).
 */
extern int shift_scalar(int n);
extern int shift_16x8(int n);
extern int shift_8x16_emulated(int n);
extern int shift_8x16_constant(int n);
extern int shift_8x16_masked(int n);
#define COMPARE_MAX 16 | |
typedef int (*ftype)(int); | |
ftype parse_function_from_flag(char *flag) { | |
if (!strncmp(flag, "shift_scalar", COMPARE_MAX)) { | |
return shift_scalar; | |
} else if (!strncmp(flag, "shift_16x8", COMPARE_MAX)) { | |
return shift_16x8; | |
} else if (!strncmp(flag, "shift_8x16_emulated", COMPARE_MAX)) { | |
return shift_8x16_emulated; | |
} else if (!strncmp(flag, "shift_8x16_constant", COMPARE_MAX)) { | |
return shift_8x16_constant; | |
} else if (!strncmp(flag, "shift_8x16_masked", COMPARE_MAX)) { | |
return shift_8x16_masked; | |
} else { | |
printf("Invalid function type; expects one of [shift_scalar|shift_16x8|shift_8x16_emulated]\n"); | |
exit(1); | |
} | |
} | |
int main(int argc, char *argv[]) { | |
if(argc <= 2) { | |
printf("Usage: %s [# of iterations to run body] [shift_scalar|shift_16x8|shift_8x16_emulated]\n", argv[0]); | |
exit(1); | |
} | |
int n = atoi(argv[1]); | |
ftype body = parse_function_from_flag(argv[2]); | |
body(n); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Target built when `make` is invoked without arguments.
.DEFAULT_GOAL := run

# Warning flags handed to $(CC) for both compiling and linking.
FLAGS := -Wall
FLAGS += -Wextra

# Command prefix used to profile each run (10 repetitions, -ddd detail level).
PERF := perf stat -r 10 -ddd

# Iteration count passed as the first argument of build/main.
NUM_ITERATIONS := 10000000
# for reference of automatic variables, https://www.gnu.org/software/make/manual/html_node/Automatic-Variables.html | |
# basic utilities | |
# `clean` and `run` name actions, not files: declare them phony so a stray
# file called `clean` or `run` cannot make them appear up to date.
.PHONY: clean run

# Remove every build product.
clean:
	rm -rf build

# Create the output directory.
build:
	mkdir -p build

# Profile each shift implementation in turn; $< expands to build/main.
run: build/main
	$(PERF) $< $(NUM_ITERATIONS) shift_scalar
	$(PERF) $< $(NUM_ITERATIONS) shift_16x8
	$(PERF) $< $(NUM_ITERATIONS) shift_8x16_emulated
	$(PERF) $< $(NUM_ITERATIONS) shift_8x16_constant
	$(PERF) $< $(NUM_ITERATIONS) shift_8x16_masked
# Assemble body.s. The build directory sits after `|` as an order-only
# prerequisite: it has to exist, but its timestamp (which changes whenever a
# file is created inside it) must not make the object look out of date.
build/body.o: body.s | build
	$(AS) $< -o $@

# Compile main.c with the flags in FLAGS.
build/main.o: main.c | build
	$(CC) $(FLAGS) $< -c -o $@

# Link the assembled body and the C driver into the benchmark binary.
build/main: build/body.o build/main.o
	$(CC) $(FLAGS) $^ -o $@
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment