Skip to content

Instantly share code, notes, and snippets.

@alexhsamuel
Last active April 27, 2016 14:45
Show Gist options
  • Save alexhsamuel/cfd6ec76f2792e241b1344f992dd6e5b to your computer and use it in GitHub Desktop.
Save alexhsamuel/cfd6ec76f2792e241b1344f992dd6e5b to your computer and use it in GitHub Desktop.
AVX2 256-bit math in action!
// compile with -O3 -march=core-avx2
// Writes z[k] = 2 * x[k] + y[k] for each of the first 65536 elements.
// All three arrays must hold at least 65536 ints.  The __restrict__
// qualifiers promise the compiler that the arrays do not overlap.
void
foo(
  int const* __restrict__ x,
  int const* __restrict__ y,
  int* __restrict__ z)
{
  int const count = 65536;  // fixed trip count
  for (int k = 0; k != count; ++k) {
    int const doubled = x[k] * 2;
    z[k] = doubled + y[k];
  }
}
## -----------------------------------------------------------------------
## void foo(int const* x, int const* y, int* z)   -- symbol __Z3fooPKiS0_Pi
## Compiler-generated x86-64 assembly (AT&T syntax, Mach-O) using AVX2.
## Computes z[i] = x[i] * 2 + y[i] for i = 0 .. 65535.
## ABI:   System V AMD64
## In:    rdi = x, rsi = y, rdx = z
## Out:   nothing in registers; results are stored through rdx
## Clobb: rax, ymm0-ymm3, flags
## Stack: saved rbp only (frame pointer); no locals
## -----------------------------------------------------------------------
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 11
.globl __Z3fooPKiS0_Pi
## Align the function entry to 2^4 = 16 bytes, padding with 0x90 (NOP).
.align 4, 0x90
__Z3fooPKiS0_Pi: ## @_Z3fooPKiS0_Pi
.cfi_startproc
## BB#0: ## %overflow.checked
## Standard frame-pointer prologue; the .cfi_* directives record where the
## caller's rbp and the CFA live so the frame can be unwound.
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
## rax = element index i, starting at 0 (xorl also clears the upper 32 bits).
xorl %eax, %eax
## Align the loop header to 16 bytes as well.
.align 4, 0x90
LBB0_1: ## %vector.body
## =>This Inner Loop Header: Depth=1
## Each iteration handles 32 ints: four 256-bit registers of 8 ints each.
## Addresses are base + rax*4 (4 = sizeof(int)) plus a 0/32/64/96-byte offset.
## Load x[i .. i+31] with unaligned loads, so no alignment is required of x.
vmovdqu (%rdi,%rax,4), %ymm0
vmovdqu 32(%rdi,%rax,4), %ymm1
vmovdqu 64(%rdi,%rax,4), %ymm2
vmovdqu 96(%rdi,%rax,4), %ymm3
## x * 2 is computed as x + x on each 32-bit lane.
vpaddd %ymm0, %ymm0, %ymm0
vpaddd %ymm1, %ymm1, %ymm1
vpaddd %ymm2, %ymm2, %ymm2
vpaddd %ymm3, %ymm3, %ymm3
## Add y[i .. i+31] directly from memory.
vpaddd (%rsi,%rax,4), %ymm0, %ymm0
vpaddd 32(%rsi,%rax,4), %ymm1, %ymm1
vpaddd 64(%rsi,%rax,4), %ymm2, %ymm2
vpaddd 96(%rsi,%rax,4), %ymm3, %ymm3
## Store the 32 results to z[i .. i+31] (unaligned stores).
vmovdqu %ymm0, (%rdx,%rax,4)
vmovdqu %ymm1, 32(%rdx,%rax,4)
vmovdqu %ymm2, 64(%rdx,%rax,4)
vmovdqu %ymm3, 96(%rdx,%rax,4)
## i += 32; loop until i == 65536.  65536 is a multiple of 32, so the loop
## runs exactly 2048 times and there is no scalar remainder loop.
addq $32, %rax
cmpq $65536, %rax ## imm = 0x10000
jne LBB0_1
## BB#2: ## %middle.block
popq %rbp
## Clear the upper halves of the ymm registers before returning, so later
## legacy-SSE code in the caller does not pay an AVX/SSE transition penalty.
vzeroupper
retq
.cfi_endproc
.subsections_via_symbols
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment