Skip to content

Instantly share code, notes, and snippets.

@oconnor663
Last active May 30, 2023 14:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save oconnor663/69176654f1db1bb96077d6ff4141a022 to your computer and use it in GitHub Desktop.
Save oconnor663/69176654f1db1bb96077d6ff4141a022 to your computer and use it in GitHub Desktop.

GCC 13.1.1 (Arch Linux) seems to misalign __m512i vectors on the stack when -fsanitize=address is enabled. repro.c (below in this Gist) is a minimized repro. Compile it like this:

gcc repro.c -g -mavx512f -fsanitize=address

When I execute it I get the following:

$ ./a.out
AddressSanitizer:DEADLYSIGNAL
=================================================================
==19063==ERROR: AddressSanitizer: SEGV on unknown address (pc 0x561c3e127292 bp 0x7ffc7738ae80 sp 0x7ffc7738ad80 T0)
==19063==The signal is caused by a READ memory access.
==19063==Hint: this fault was caused by a dereference of a high value address (see register values below).  Disassemble the provided pc to learn which register was used.
    #0 0x561c3e127292 in main /tmp/69176654f1db1bb96077d6ff4141a022/repro.c:4
    #1 0x7f632603984f  (/usr/lib/libc.so.6+0x2384f) (BuildId: 2f005a79cd1a8e385972f5a102f16adba414d75e)
    #2 0x7f6326039909 in __libc_start_main (/usr/lib/libc.so.6+0x23909) (BuildId: 2f005a79cd1a8e385972f5a102f16adba414d75e)
    #3 0x561c3e1270b4 in _start (/tmp/69176654f1db1bb96077d6ff4141a022/a.out+0x10b4) (BuildId: 52c240ffdc5530ccfe7bce9ebcc122ac12c8c4e9)

AddressSanitizer can not provide additional info.
SUMMARY: AddressSanitizer: SEGV /tmp/69176654f1db1bb96077d6ff4141a022/repro.c:4 in main
==19063==ABORTING

You can reproduce this in Docker, using the current version of archlinux:base-devel:

$ docker run --interactive --tty --rm archlinux:base-devel-20230521.0.152478
[root@c40aaea08a51 /]# gcc --version
gcc (GCC) 13.1.1 20230429
...
[root@c40aaea08a51 /]# cat << END > repro.c
#include <immintrin.h>
int main() { __m512i v = _mm512_set1_epi32(0); return *((int *)&v); }
END
[root@c40aaea08a51 /]# gcc repro.c -g -mavx512f -fsanitize=address
[root@c40aaea08a51 /]# ./a.out
AddressSanitizer:DEADLYSIGNAL
...

Running a.out under GDB shows that the immediate cause of the crash is a vmovdqa64 instruction with a misaligned argument:

(gdb) display /i $pc
3: x/i $pc
=> 0x555555555292 <main+265>:   vmovdqa64 %zmm0,-0x80(%rcx)
(gdb) p $rcx % 64
$2 = 32

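Looking at repro.s (below in this Gist), I think the misaligned value of rcx comes from the return value of __asan_stack_malloc_1. In repro.s, rcx is set to rbx + 160 and v is stored at rcx - 128 = rbx + 32, so v is 64-byte aligned only when the frame base rbx is 32 mod 64. That holds for the on-stack frame (rbx = rsp + 32, with rsp 64-byte aligned), but not once rbx is replaced by the pointer returned from __asan_stack_malloc_1: rcx % 64 == 32 implies that pointer was 0 mod 64.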

The real-world code that originally triggered this bug is: https://github.com/BLAKE3-team/BLAKE3/blob/76f9339312e1d52632a1cfb9df285c01911d99ce/c/blake3_avx512.c#L1078

#include <immintrin.h>
int main() { // Minimized repro; build: gcc repro.c -g -mavx512f -fsanitize=address (-fstack-protector-strong must also be in effect)
__m512i v = _mm512_set1_epi32(0); // 64-byte vector local with every 32-bit lane set to 0
// It doesn't really matter what we do next, as long as we convince the
// compiler to put v on the stack. Here we just read an int from it.
return *((int *)&v); // taking &v keeps v in memory; the int read is the first lane, so the intended exit status is 0
}
.file "repro.c"
.text
.Ltext0:
.file 0 "/var/tmp/gist" "repro.c"
.globl __asan_stack_malloc_1
.section .rodata
.LC0:
.string "1 32 64 3 v:4"
.text
.globl main
.type main, @function
# main: compiler-generated code (GCC 13.1.1, x86-64, AT&T syntax) for repro.c,
# carrying AddressSanitizer and stack-protector instrumentation. C equivalent:
#     int main() { __m512i v = _mm512_set1_epi32(0); return *((int *)&v); }
# Out: eax = the int read from the start of v.
#
# Register roles:
#   rbx  base of the instrumented frame that holds v: rsp+32 on the real
#        stack, or the non-null pointer returned by __asan_stack_malloc_1
#   r12  copy of the real-stack frame base (rsp+32); compared with rbx at
#        exit to tell which frame was used
#   rcx  rbx+160 up to the store at .L6; v is addressed as -128(%rcx) = rbx+32
#   rax  from the shadow setup until .L2/.L3: rbx >> 3, the shadow index
#
# Alignment note: v sits at frame base + 32, so it is 64-byte aligned only
# when the frame base is 32 (mod 64). The real-stack base (rsp+32 with rsp
# 64-byte aligned) satisfies that. The pointer returned by
# __asan_stack_malloc_1 is used as the base without any adjustment, so if it
# is 0 (mod 64) -- which is what "rcx % 64 == 32" in the GDB output implies,
# since rcx = rbx+160 -- v ends up at 32 (mod 64) and vmovdqa64 faults.
main:
.LASANPC4865:
.LFB4865:
.file 1 "repro.c"
.loc 1 3 12
.cfi_startproc
# Prologue: establish rbp, save the callee-saved r12 and rbx, then round rsp
# down to a 64-byte boundary and reserve 192 bytes (a multiple of 64, so rsp
# stays 64-byte aligned).
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-64, %rsp
subq $192, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
# rbx = r12 = rsp+32: frame base when the frame lives on the real stack.
leaq 32(%rsp), %rbx
movq %rbx, %r12
# If __asan_option_detect_stack_use_after_return is non-zero, call
# __asan_stack_malloc_1 with argument 128 (presumably the frame size in
# bytes). A null result keeps the real-stack frame.
cmpl $0, __asan_option_detect_stack_use_after_return(%rip)
je .L1
movl $128, %edi
call __asan_stack_malloc_1@PLT
testq %rax, %rax
je .L1
# NOTE(review): the returned pointer becomes the frame base as-is; nothing
# here forces it to be 32 (mod 64), which v's 64-byte alignment depends on.
movq %rax, %rbx
.L1:
# rcx = rbx+160; v is accessed below as -128(%rcx), i.e. rbx+32.
leaq 160(%rbx), %rax
movq %rax, %rcx
# Frame header at rbx: a magic word (hex 41B58AB3; presumably ASan's marker
# for a live frame), then the address of the descriptor string .LC0, then the
# address of this function (.LASANPC4865).
movq $1102416563, (%rbx)
leaq .LC0(%rip), %rax
movq %rax, 8(%rbx)
leaq .LASANPC4865(%rip), %rax
movq %rax, 16(%rbx)
# rax = rbx >> 3 indexes shadow memory at the fixed offset 2147450880
# (hex 7FFF8000); one shadow byte covers 8 frame bytes. Shadow bytes 0-3 are
# set to F1 and 12-15 to F3 (frame bytes 0-31 and 96-127; presumably the
# left and right redzones around v). Shadow bytes 4-11, which cover v, are
# not written here.
movq %rbx, %rax
shrq $3, %rax
movl $-235802127, 2147450880(%rax)
movl $-202116109, 2147450892(%rax)
.loc 1 3 12
# Stack protector: copy the canary from %fs:40 into 184(%rsp); it is compared
# again before returning. edx is then zeroed.
movq %fs:40, %rdx
movq %rdx, 184(%rsp)
xorl %edx, %edx
# Inlined _mm512_set1_epi32(0): spill the constant 0 to 28(%rsp) and
# broadcast it to every 32-bit lane of zmm0.
movl $0, 28(%rsp)
.LBB4:
.LBB5:
.file 2 "/usr/lib/gcc/x86_64-pc-linux-gnu/13.1.1/include/avx512fintrin.h"
.loc 2 4182 5
vpbroadcastd 28(%rsp), %zmm0
.LBE5:
.LBE4:
.loc 1 4 11 discriminator 2
# ASan check for the 64-byte store to v (rdx = &v = rcx-128).
# First byte: r8d = 1 when its shadow byte is non-zero and <= 0 (signed).
leaq -128(%rcx), %rdx
movq %rdx, %rsi
shrq $3, %rsi
addq $2147450880, %rsi
movzbl (%rsi), %esi
testb %sil, %sil
setne %dil
testb %sil, %sil
setle %sil
andl %esi, %edi
movl %edi, %r8d
# Last byte (rdi = rdx+63): esi = 1 when its shadow byte is non-zero and
# (address & 7) >= shadow byte (signed compare).
movl $64, %esi
subq $1, %rsi
leaq (%rdx,%rsi), %rdi
movq %rdi, %rsi
shrq $3, %rsi
addq $2147450880, %rsi
movzbl (%rsi), %esi
testb %sil, %sil
setne %r9b
andl $7, %edi
cmpb %sil, %dil
setge %sil
andl %r9d, %esi
# Either condition set => report the access: rdi = &v, esi = 64.
orl %r8d, %esi
testb %sil, %sil
je .L6
movl $64, %esi
movq %rdx, %rdi
call __asan_report_store_n@PLT
.L6:
# v = zmm0, using the alignment-requiring store form. This is the
# instruction the GDB output shows at the faulting pc; the ASan check above
# looks only at shadow bytes, not at the alignment of rcx-128.
vmovdqa64 %zmm0, -128(%rcx)
.loc 1 7 19
# rsi = rdx = &v for the 4-byte read. rcx is reused as scratch from here on.
leaq -128(%rcx), %rsi
.loc 1 7 10
# ASan check for the 4-byte load: report via __asan_report_load4 when the
# shadow byte is non-zero and ((address & 7) + 3) >= shadow byte (signed).
movq %rsi, %rdx
movq %rdx, %rcx
shrq $3, %rcx
addq $2147450880, %rcx
movzbl (%rcx), %ecx
testb %cl, %cl
setne %r8b
movq %rdx, %rdi
andl $7, %edi
addl $3, %edi
cmpb %cl, %dil
setge %cl
andl %r8d, %ecx
testb %cl, %cl
je .L7
movq %rdx, %rdi
call __asan_report_load4@PLT
.L7:
# edx = first int of v; this becomes the return value at .L9.
movl (%rsi), %edx
.loc 1 3 12
# Frame teardown. r12 == rbx means the real-stack frame was used.
cmpq %rbx, %r12
je .L2
# Runtime-provided frame: overwrite the header magic (hex 45E0360E), fill
# the 16 shadow bytes covering the 128-byte frame with F5 (-11 as a byte),
# then store a zero byte through the pointer held at 120(%rbx) -- presumably
# the runtime's in-use flag for this frame; TODO confirm against the ASan
# runtime sources.
movq $1172321806, (%rbx)
movl $-11, %ecx
vmovd %ecx, %xmm0
vpbroadcastb %xmm0, %xmm0
vmovdqu %xmm0, 2147450880(%rax)
movq 120(%rbx), %rax
movb $0, (%rax)
jmp .L3
.L2:
# Real-stack frame: zero the two shadow words written during setup.
movl $0, 2147450880(%rax)
movl $0, 2147450892(%rax)
.L3:
.loc 1 8 1
# Stack-protector check: the saved canary must still equal %fs:40, otherwise
# __stack_chk_fail is called.
movq 184(%rsp), %rax
subq %fs:40, %rax
je .L9
call __stack_chk_fail@PLT
.L9:
# Return the int read from v. Point rsp at the saved rbx (rbp-16), which
# undoes the realignment, then pop in reverse push order.
movl %edx, %eax
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4865:
.size main, .-main
# _sub_I_00099_0: compiler-generated hook taking no arguments. It calls
# __asan_init and then __asan_version_mismatch_check_v8, and its address is
# emitted into the .init_array.00099 section below (presumably so that it
# runs during start-up, before main).
.type _sub_I_00099_0, @function
_sub_I_00099_0:
.LFB4866:
.cfi_startproc
.loc 1 8 1
# Frame-pointer prologue; the push also leaves rsp 16-byte aligned for the
# two calls.
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
call __asan_init@PLT
call __asan_version_mismatch_check_v8@PLT
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4866:
.size _sub_I_00099_0, .-_sub_I_00099_0
# Pointer-sized, 8-byte-aligned entry holding the hook's address.
.section .init_array.00099,"aw"
.align 8
.quad _sub_I_00099_0
.text
.Letext0:
.section .debug_info,"",@progbits
.Ldebug_info0:
.long 0x131
.value 0x5
.byte 0x1
.byte 0x8
.long .Ldebug_abbrev0
.uleb128 0x3
.long .LASF17
.byte 0x1d
.long .LASF0
.long .LASF1
.quad .Ltext0
.quad .Letext0-.Ltext0
.long .Ldebug_line0
.uleb128 0x1
.byte 0x8
.byte 0x5
.long .LASF2
.uleb128 0x1
.byte 0x8
.byte 0x7
.long .LASF3
.uleb128 0x4
.byte 0x4
.byte 0x5
.string "int"
.uleb128 0x1
.byte 0x8
.byte 0x5
.long .LASF4
.uleb128 0x1
.byte 0x10
.byte 0x4
.long .LASF5
.uleb128 0x1
.byte 0x4
.byte 0x7
.long .LASF6
.uleb128 0x1
.byte 0x8
.byte 0x7
.long .LASF7
.uleb128 0x1
.byte 0x2
.byte 0x5
.long .LASF8
.uleb128 0x1
.byte 0x1
.byte 0x6
.long .LASF9
.uleb128 0x1
.byte 0x4
.byte 0x4
.long .LASF10
.uleb128 0x1
.byte 0x1
.byte 0x8
.long .LASF11
.uleb128 0x1
.byte 0x2
.byte 0x7
.long .LASF12
.uleb128 0x1
.byte 0x1
.byte 0x6
.long .LASF13
.uleb128 0x1
.byte 0x8
.byte 0x4
.long .LASF14
.uleb128 0x5
.long .LASF18
.byte 0x2
.byte 0x2a
.byte 0xd
.long 0x9c
.uleb128 0x6
.long 0x3c
.long 0xa8
.uleb128 0x2
.byte 0xf
.byte 0
.uleb128 0x1
.byte 0x2
.byte 0x4
.long .LASF15
.uleb128 0x1
.byte 0x2
.byte 0x4
.long .LASF16
.uleb128 0x7
.long .LASF19
.byte 0x1
.byte 0x3
.byte 0x5
.long 0x3c
.quad .LFB4865
.quad .LFE4865-.LFB4865
.uleb128 0x1
.byte 0x9c
.long 0x108
.uleb128 0x8
.string "v"
.byte 0x1
.byte 0x4
.byte 0xb
.long 0x108
.uleb128 0x3
.byte 0x72
.sleb128 -128
.uleb128 0x9
.long 0x118
.quad .LBB4
.quad .LBE4-.LBB4
.byte 0x1
.byte 0x4
.byte 0xf
.uleb128 0xa
.long 0x126
.uleb128 0x2
.byte 0x77
.sleb128 28
.byte 0
.byte 0
.uleb128 0xb
.long .LASF20
.long 0x43
.long 0x118
.uleb128 0x2
.byte 0x7
.byte 0
.uleb128 0xc
.long .LASF21
.byte 0x2
.value 0x1053
.byte 0x1
.long 0x108
.byte 0x3
.uleb128 0xd
.string "__A"
.byte 0x2
.value 0x1053
.byte 0x18
.long 0x3c
.byte 0
.byte 0
.section .debug_abbrev,"",@progbits
.Ldebug_abbrev0:
.uleb128 0x1
.uleb128 0x24
.byte 0
.uleb128 0xb
.uleb128 0xb
.uleb128 0x3e
.uleb128 0xb
.uleb128 0x3
.uleb128 0xe
.byte 0
.byte 0
.uleb128 0x2
.uleb128 0x21
.byte 0
.uleb128 0x2f
.uleb128 0xb
.byte 0
.byte 0
.uleb128 0x3
.uleb128 0x11
.byte 0x1
.uleb128 0x25
.uleb128 0xe
.uleb128 0x13
.uleb128 0xb
.uleb128 0x3
.uleb128 0x1f
.uleb128 0x1b
.uleb128 0x1f
.uleb128 0x11
.uleb128 0x1
.uleb128 0x12
.uleb128 0x7
.uleb128 0x10
.uleb128 0x17
.byte 0
.byte 0
.uleb128 0x4
.uleb128 0x24
.byte 0
.uleb128 0xb
.uleb128 0xb
.uleb128 0x3e
.uleb128 0xb
.uleb128 0x3
.uleb128 0x8
.byte 0
.byte 0
.uleb128 0x5
.uleb128 0x16
.byte 0
.uleb128 0x3
.uleb128 0xe
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0xb
.uleb128 0x39
.uleb128 0xb
.uleb128 0x49
.uleb128 0x13
.byte 0
.byte 0
.uleb128 0x6
.uleb128 0x1
.byte 0x1
.uleb128 0x2107
.uleb128 0x19
.uleb128 0x49
.uleb128 0x13
.uleb128 0x1
.uleb128 0x13
.byte 0
.byte 0
.uleb128 0x7
.uleb128 0x2e
.byte 0x1
.uleb128 0x3f
.uleb128 0x19
.uleb128 0x3
.uleb128 0xe
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0xb
.uleb128 0x39
.uleb128 0xb
.uleb128 0x49
.uleb128 0x13
.uleb128 0x11
.uleb128 0x1
.uleb128 0x12
.uleb128 0x7
.uleb128 0x40
.uleb128 0x18
.uleb128 0x7c
.uleb128 0x19
.uleb128 0x1
.uleb128 0x13
.byte 0
.byte 0
.uleb128 0x8
.uleb128 0x34
.byte 0
.uleb128 0x3
.uleb128 0x8
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0xb
.uleb128 0x39
.uleb128 0xb
.uleb128 0x49
.uleb128 0x13
.uleb128 0x2
.uleb128 0x18
.byte 0
.byte 0
.uleb128 0x9
.uleb128 0x1d
.byte 0x1
.uleb128 0x31
.uleb128 0x13
.uleb128 0x11
.uleb128 0x1
.uleb128 0x12
.uleb128 0x7
.uleb128 0x58
.uleb128 0xb
.uleb128 0x59
.uleb128 0xb
.uleb128 0x57
.uleb128 0xb
.byte 0
.byte 0
.uleb128 0xa
.uleb128 0x5
.byte 0
.uleb128 0x31
.uleb128 0x13
.uleb128 0x2
.uleb128 0x18
.byte 0
.byte 0
.uleb128 0xb
.uleb128 0x1
.byte 0x1
.uleb128 0x3
.uleb128 0xe
.uleb128 0x2107
.uleb128 0x19
.uleb128 0x49
.uleb128 0x13
.uleb128 0x1
.uleb128 0x13
.byte 0
.byte 0
.uleb128 0xc
.uleb128 0x2e
.byte 0x1
.uleb128 0x3f
.uleb128 0x19
.uleb128 0x3
.uleb128 0xe
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0x5
.uleb128 0x39
.uleb128 0xb
.uleb128 0x27
.uleb128 0x19
.uleb128 0x49
.uleb128 0x13
.uleb128 0x20
.uleb128 0xb
.uleb128 0x34
.uleb128 0x19
.byte 0
.byte 0
.uleb128 0xd
.uleb128 0x5
.byte 0
.uleb128 0x3
.uleb128 0x8
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0x5
.uleb128 0x39
.uleb128 0xb
.uleb128 0x49
.uleb128 0x13
.byte 0
.byte 0
.byte 0
.section .debug_aranges,"",@progbits
.long 0x2c
.value 0x2
.long .Ldebug_info0
.byte 0x8
.byte 0
.value 0
.value 0
.quad .Ltext0
.quad .Letext0-.Ltext0
.quad 0
.quad 0
.section .debug_line,"",@progbits
.Ldebug_line0:
.section .debug_str,"MS",@progbits,1
.LASF4:
.string "long long int"
.LASF12:
.string "short unsigned int"
.LASF6:
.string "unsigned int"
.LASF19:
.string "main"
.LASF3:
.string "long unsigned int"
.LASF7:
.string "long long unsigned int"
.LASF20:
.string "__m512i"
.LASF10:
.string "float"
.LASF15:
.string "_Float16"
.LASF11:
.string "unsigned char"
.LASF9:
.string "char"
.LASF2:
.string "long int"
.LASF14:
.string "double"
.LASF18:
.string "__v16si"
.LASF13:
.string "signed char"
.LASF5:
.string "long double"
.LASF16:
.string "__bf16"
.LASF17:
.string "GNU C17 13.1.1 20230429 -mavx512f -mtune=generic -march=x86-64 -g -fsanitize=address"
.LASF21:
.string "_mm512_set1_epi32"
.LASF8:
.string "short int"
.section .debug_line_str,"MS",@progbits,1
.LASF0:
.string "repro.c"
.LASF1:
.string "/var/tmp/gist"
.ident "GCC: (GNU) 13.1.1 20230429"
.section .note.GNU-stack,"",@progbits
@oconnor663
Copy link
Author

@oconnor663
Copy link
Author

oconnor663 commented May 30, 2023

Andrew Pinski pointed out that -fstack-protector-strong is required to reproduce. It turns out that Arch Linux sets that by default, which explains why this bug didn't immediately repro on other distros. Now I can see it on Godbolt: https://godbolt.org/z/47a695sWY

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment