Skip to content

Instantly share code, notes, and snippets.

@nikic
Created December 15, 2022 09:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nikic/507f6ee3276e66d76b0d4a0c2b9ad7ce to your computer and use it in GitHub Desktop.
commit 8b2c6e753c2a1b9c52645767a64652a9828ebdb7
Author: Nikita Popov <npopov@redhat.com>
Date: Thu Dec 15 10:31:22 2022 +0100
wip
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2e6321326954..8795b27ecf69 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -218,10 +218,10 @@ def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
- Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ Intrinsic<[], [], [IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
- Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ Intrinsic<[], [], [IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 723a38b2ecc6..030c28178dfc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -315,8 +315,8 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
let SchedRW = [];
let hasNoSchedulingInfo = 1;
let hasSideEffects = 1;
- let mayLoad = 0;
- let mayStore = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
let isConvergent = 1;
let FixedSize = 1;
let Size = 0;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index ed4dc5481ce8..347a0cd3eb9d 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1272,6 +1272,8 @@ def S_BARRIER : SOPP_Pseudo <"s_barrier", (ins), "",
let simm16 = 0;
let fixed_imm = 1;
let isConvergent = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
index 68f481316944..96e2aca25bc0 100644
--- a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
@@ -19,12 +19,12 @@ define amdgpu_kernel void @same_address_fence_merge_write2() #0 {
; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_barrier
; GCN-NEXT: s_mov_b32 s1, 0x3ff00000
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_barrier
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 48c4e0276edd..f9f92448957e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -16,9 +16,9 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT0-NEXT: v_not_b32_e32 v3, v0
; VARIANT0-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3
; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; VARIANT0-NEXT: s_barrier
-; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3
; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
@@ -37,8 +37,8 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT1-NEXT: v_not_b32_e32 v3, v0
; VARIANT1-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; VARIANT1-NEXT: s_barrier
; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3
+; VARIANT1-NEXT: s_barrier
; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT1-NEXT: s_waitcnt expcnt(0)
@@ -54,14 +54,14 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VARIANT2-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3]
+; VARIANT2-NEXT: s_waitcnt vmcnt(0)
+; VARIANT2-NEXT: s_barrier
; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4
; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VARIANT2-NEXT: v_mov_b32_e32 v3, s3
; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; VARIANT2-NEXT: s_waitcnt vmcnt(0)
-; VARIANT2-NEXT: s_barrier
; VARIANT2-NEXT: global_load_dword v0, v[0:1], off
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3]
@@ -74,13 +74,13 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VARIANT3-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3]
+; VARIANT3-NEXT: s_barrier
; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4
; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VARIANT3-NEXT: v_mov_b32_e32 v3, s3
; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; VARIANT3-NEXT: s_barrier
; VARIANT3-NEXT: global_load_dword v0, v[0:1], off
; VARIANT3-NEXT: s_waitcnt vmcnt(0)
; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index 27207ea94479..b57871feb07c 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -10,13 +10,13 @@ define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_barrier
; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 16, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_barrier
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -50,19 +50,18 @@ define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4
-; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_barrier
+; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: v_sub_i32_e32 v2, vcc, 28, v1
; SI-NEXT: ds_read_b32 v0, v0
; SI-NEXT: ds_read_b32 v3, v2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_mov_b32_e32 v2, 0
-; SI-NEXT: s_waitcnt lgkmcnt(1)
-; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; SI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:16
; SI-NEXT: s_endpgm
;
@@ -72,10 +71,10 @@ define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
; CI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4
-; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v1
-; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_barrier
+; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v1
+; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:3 offset1:7
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index ecf6dd1eb17e..d256a18762a3 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -153,9 +153,9 @@ bb:
}
; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
-; GCN: flat_load_{{dword|b32}}
; GFX8_9: s_waitcnt lgkmcnt(0){{$}}
-; GFX8_9: s_waitcnt vmcnt(0){{$}}
+; GCN: flat_load_{{dword|b32}}
+; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment