Skip to content

Instantly share code, notes, and snippets.

@krypt-n
Last active May 18, 2017 07:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save krypt-n/08b66c2aa430e2f4257e99c459939b5d to your computer and use it in GitHub Desktop.
Save krypt-n/08b66c2aa430e2f4257e99c459939b5d to your computer and use it in GitHub Desktop.
~ gcc --version
gcc (GCC) 6.3.1 20170306
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
~ gcc -march=native -Wall -Wextra -O3 test.c -o test
test.c: In function ‘main’:
test.c:5:14: warning: unused parameter ‘argc’ [-Wunused-parameter]
 int main(int argc, char** argv) {
              ^~~~
test.c:5:27: warning: unused parameter ‘argv’ [-Wunused-parameter]
 int main(int argc, char** argv) {
                           ^~~~
~ ./test
15
15
0
~ valgrind --tool=callgrind ./test
==19984== Callgrind, a call-graph generating cache profiler
==19984== Copyright (C) 2002-2015, and GNU GPL'd, by Josef Weidendorfer et al.
==19984== Using Valgrind-3.12.0 and LibVEX; rerun with -h for copyright info
==19984== Command: ./test
==19984==
==19984== For interactive control, run 'callgrind_control -h'.
15
0
0
==19984==
==19984== Events : Ir
==19984== Collected : 158231
==19984==
==19984== I refs: 158,231
~ valgrind --tool=memcheck ./test
==19988== Memcheck, a memory error detector
==19988== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.
==19988== Using Valgrind-3.12.0 and LibVEX; rerun with -h for copyright info
==19988== Command: ./test
==19988==
15
15
0
==19988==
==19988== HEAP SUMMARY:
==19988== in use at exit: 0 bytes in 0 blocks
==19988== total heap usage: 1 allocs, 1 frees, 1,024 bytes allocated
==19988==
==19988== All heap blocks were freed -- no leaks are possible
==19988==
==19988== For counts of detected and suppressed errors, rerun with: -v
==19988== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
#include <stdio.h>
#include <immintrin.h>
/*
 * Minimal reproducer: writes two vectors through _mm256_maskstore_epi64 into
 * the same scratch buffer and prints values that were read back between the
 * two stores.
 *
 * With the constants below p ends up as {0xf, 0xf, 0}, so the expected output
 * is 15, 15, 0. The native and memcheck runs in the accompanying transcript
 * print exactly that; the callgrind run prints 15, 0, 0.
 *
 * argc and argv are unused (hence the -Wunused-parameter warnings).
 * The function has no explicit return statement.
 */
int main(int argc, char** argv) {
// Store mask with every bit set, i.e. all four 64-bit lanes are selected.
const __m256i one = _mm256_set_epi64x(0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff);
// Values to print; p[2] is never assigned and keeps its zero initializer.
long long unsigned p[3] = {0};
// Scratch buffer that receives both masked stores (4 x 64 bit = 32 bytes).
long long unsigned tmp[4];
// _mm256_set_epi64x takes its lanes from highest to lowest, so lanes 0 and 2
// hold 0xf and lanes 1 and 3 hold 0.
__m256i B1 = _mm256_set_epi64x(0, 0xf, 0, 0xf);
// All-zero vector used to overwrite tmp afterwards.
__m256i B2 = _mm256_set_epi64x(0, 0, 0, 0);
// First store: tmp = {0xf, 0, 0xf, 0}.
_mm256_maskstore_epi64((long long int*)tmp, one, B1);
// Read back lanes 2 and 0 (both 0xf) before tmp is overwritten.
p[0] = tmp[2];
p[1] = tmp[0];
// Second store zeroes tmp; tmp is not read again after this point.
_mm256_maskstore_epi64((long long int*)tmp, one, B2);
for(int i = 0 ; i <3; ++i) { //Somehow it is important that this loop is not unrolled
// p[i] is truncated to int to match the %d conversion.
printf("%d\n", (int)p[i]);
}
}
# Compiler output for test.c (GNU as, AT&T syntax, x86-64; see .ident below).
# This listing keeps the print loop: three printf calls driven by a pointer
# that walks three 8-byte slots starting at -96(%rbp).
#
# Frame layout as used by the code below (offsets relative to %rbp):
#   -96 .. -73   three qwords that are printed (p[0..2] in test.c)
#   -64 .. -33   32-byte target of both vpmaskmovq stores (tmp in test.c)
# Register roles:
#   %ymm0  all-ones store mask
#   %ymm1  data vector (first .LC0, later zero)
#   %rbx   cursor over the three output qwords
#   %r12   one-past-the-end address for the cursor
.file "test.c"
.section .rodata.str1.1,"aMS",@progbits,1
# printf format string.
.LC1:
.string "%d\n"
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB4605:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
vpcmpeqd %ymm0, %ymm0, %ymm0 # ymm0 = all ones: mask selecting every lane
movq %rsp, %rbp
.cfi_def_cfa_register 6
# rbx and r12 are callee-saved and stay live across the printf calls.
# r10 is saved and restored but not otherwise used in this listing.
pushq %r12
pushq %r10
pushq %rbx
.cfi_offset 12, -24
.cfi_offset 10, -32
.cfi_offset 3, -40
leaq -96(%rbp), %rbx # rbx = address of the first output qword
leaq 24(%rbx), %r12 # r12 = rbx + 3*8, the loop end address
subq $72, %rsp
vmovdqa .LC0(%rip), %ymm1 # ymm1 = {15, 0, 15, 0} (lane 0 first)
movq $0, -80(%rbp) # third output qword = 0
vpmaskmovq %ymm1, %ymm0, -64(%rbp) # first masked store: buffer = .LC0
movq -48(%rbp), %rax # rax = buffer qword 2 (15 after the store above)
vpxor %xmm1, %xmm1, %xmm1 # zero ymm1 (VEX 128-bit op clears the upper half too)
movq %rax, -96(%rbp) # first output qword = buffer qword 2
movq -64(%rbp), %rax # rax = buffer qword 0, read BEFORE the second store
vpmaskmovq %ymm1, %ymm0, -64(%rbp) # second masked store: buffer = all zero
movq %rax, -88(%rbp) # second output qword = value read before the overwrite
vzeroupper # clear upper YMM halves before calling non-AVX code
# Print loop: one printf per 8-byte slot in [rbx, r12).
.L2:
movl (%rbx), %esi # low 32 bits of the current slot -> 2nd printf argument
movl $.LC1, %edi # format string (absolute 32-bit address, non-PIC)
xorl %eax, %eax # al = 0: no vector registers used by this variadic call
addq $8, %rbx # advance cursor; rbx is preserved by printf
call printf
cmpq %r12, %rbx
jne .L2
addq $72, %rsp
xorl %eax, %eax # return value 0
popq %rbx
popq %r10
popq %r12
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4605:
.size main, .-main
.section .rodata.cst32,"aM",@progbits,32
.align 32
# 32-byte aligned vector constant loaded with vmovdqa above.
# Memory order is lane 0 first: {15, 0, 15, 0}.
.LC0:
.quad 15
.quad 0
.quad 15
.quad 0
.ident "GCC: (GNU) 6.3.1 20170306"
.section .note.GNU-stack,"",@progbits
# Second compiler output for test.c (GNU as, AT&T syntax, x86-64).
# This listing has no loop: it performs the same two masked stores to a
# 32-byte buffer at -48(%rbp) and then calls printf exactly twice, first with
# the qword read from the buffer between the stores, then with constant 0.
# NOTE(review): only two values are printed here, whereas the loop in the
# source shown prints three - presumably this was built from a variant of
# test.c; confirm which source/flags produced it.
#
# Register roles:
#   %ymm0  all-ones store mask
#   %ymm1  data vector (first .LC0, later zero)
.file "test.c"
.section .rodata.str1.1,"aMS",@progbits,1
# printf format string.
.LC1:
.string "%d\n"
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB4605:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
vpcmpeqd %ymm0, %ymm0, %ymm0 # ymm0 = all ones: mask selecting every lane
# Arguments for the first printf are set up early; edi/eax are not touched
# again before the call.
movl $.LC1, %edi # format string (absolute 32-bit address, non-PIC)
xorl %eax, %eax # al = 0: no vector registers used by this variadic call
movq %rsp, %rbp
.cfi_def_cfa_register 6
# r10 is saved and restored but not otherwise used in this listing.
pushq %r10
subq $40, %rsp
.cfi_offset 10, -24
vmovdqa .LC0(%rip), %ymm1 # ymm1 = {15, 0, 15, 0} (lane 0 first)
vpmaskmovq %ymm1, %ymm0, -48(%rbp) # first masked store: buffer = .LC0
movq -48(%rbp), %rsi # rsi = buffer qword 0, read BEFORE the second store
vpxor %xmm1, %xmm1, %xmm1 # zero ymm1 (VEX 128-bit op clears the upper half too)
vpmaskmovq %ymm1, %ymm0, -48(%rbp) # second masked store: buffer = all zero
vzeroupper # clear upper YMM halves before calling non-AVX code
call printf # prints the value read between the two stores
xorl %esi, %esi # second value to print is the constant 0
movl $.LC1, %edi
xorl %eax, %eax # al = 0 again for the variadic call
call printf
addq $40, %rsp
xorl %eax, %eax # return value 0
popq %r10
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4605:
.size main, .-main
.section .rodata.cst32,"aM",@progbits,32
.align 32
# 32-byte aligned vector constant loaded with vmovdqa above.
# Memory order is lane 0 first: {15, 0, 15, 0}.
.LC0:
.quad 15
.quad 0
.quad 15
.quad 0
.ident "GCC: (GNU) 6.3.1 20170306"
.section .note.GNU-stack,"",@progbits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment