Skip to content

Instantly share code, notes, and snippets.

@JustinTArthur
Last active March 22, 2022 19:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JustinTArthur/3edfa2237854878ff4ae56cbd2861ada to your computer and use it in GitHub Desktop.
Save JustinTArthur/3edfa2237854878ff4ae56cbd2861ada to your computer and use it in GitHub Desktop.
Radeon R600 GPU Code for Signal Waiting
module &signal:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
// Sample Kernel From Michael Körber
//
// Signal-wait test kernel. Each pass of @loop atomically adds 1 to the
// 64-bit value addressed by %val, then waits on the signal %handle_in for
// the value 1, and repeats for as long as the value returned by the wait
// is not 1.
//
// Kernel arguments:
//   %val        - 64-bit address of the counter that is incremented.
//   %handle_in  - signal handle that is waited on.
//   %handle_out - signal handle; loaded into $d1 but not otherwise used.
prog kernel &__signal_test_kernel(
kernarg_u64 %val,
kernarg_sig64 %handle_in,
kernarg_sig64 %handle_out)
{
@__OpenCL_signal_test_kernel_entry:
// $d0 = input signal handle (the signal waited on in @loop).
ld_kernarg_align(8)_width(all)_sig64 $d0, [%handle_in];
// $d1 = output signal handle.
// NOTE(review): $d1 is overwritten by the signal_wait result below before
// it is ever read, so %handle_out is never used by this kernel - confirm
// that this is intended.
ld_kernarg_align(8)_width(all)_sig64 $d1, [%handle_out];
// load value pointer to d2
ld_kernarg_align(8)_width(all)_u64 $d2, [%val];
@loop:
// *val += 1 in the global segment; the "noret" form returns no value.
atomicnoret_add_global_scacq_wg_s64 [$d2], 1;
// Wait on signal $d0 for the value 1; the value reported by the wait is
// written to $d1.
// NOTE(review): the result is re-checked below, which suggests the wait
// may return before the condition holds - confirm against the HSAIL spec.
signal_wait_eq_rlx_s64_sig64 $d1, $d0, 1;
// $c1 = ($d1 != 1); branch back to @loop while the wait did not return 1.
cmp_ne_b1_s64 $c1, $d1, 1;
cbr_width(all)_b1 $c1, @loop;
ret;
};
// Disassembly listing of &__signal_test_kernel; each trailing comment gives
// the instruction address and its encoding words.
// This listing tracks loop membership per lane: lanes leave the polling loop
// and the outer loop by being removed from saved copies of the exec mask.
// NOTE(review): asic(VI)/type(CS) presumably mean a Volcanic Islands target
// and a compute shader - confirm against the tool that produced the dump.
//
// Register roles as used below:
//   s[0:1]   - 64-bit value loaded from offset 0x08 (base of the polled address)
//   s[2:3]   - 64-bit value loaded from offset 0x00 (address of the atomic add)
//   s[4:5]   - load base address on entry; afterwards the outer-loop lane mask
//   s[6:7]   - timestamp taken after the atomic add; later the "value != 1" mask
//   s[8:9]   - exec saved before the polling loop; later exec at the outer check
//   s[10:11] - lanes still inside the polling loop
//   s[12:13] - carry / current timestamp / elapsed time / "done" mask
//   s[14:15] - "value == 1" mask, then exec saved at the polling-loop check
asic(VI)
type(CS)
//
// amd_kernel_code_t for &__signal_test_kernel (000000000000 - 000000000100)
//
//
// &__signal_test_kernel:
//
// NOTE(review): the purpose of this m0 value is not visible in this listing.
s_mov_b32 m0, 0x00010000 // 000000000100: BEFC00FF 00010000
// Two 64-bit loads relative to s[4:5].
// NOTE(review): s[4:5] is presumably the kernarg segment pointer, making
// offset 0x00 %val and offset 0x08 %handle_in - confirm against the ABI.
s_load_dwordx2 s[0:1], s[4:5], 0x08 // 000000000108: C0060002 00000008
s_load_dwordx2 s[2:3], s[4:5], 0x00 // 000000000110: C0060082 00000000
// s[4:5] is reused from here on as the mask of lanes still in the outer loop.
s_mov_b64 s[4:5], exec // 000000000118: BE84017E
// ---- outer loop: atomic add, then poll -------------------------------------
label_0047:
// Wait for the scalar loads above before s[0:3] are read.
s_waitcnt lgkmcnt(0) // 00000000011C: BF8C007F
// v[0:1] = s[2:3], the address for the atomic add.
v_mov_b32 v0, s2 // 000000000120: 7E000202
v_mov_b32 v1, s3 // 000000000124: 7E020203
// v[2:3] = 1 << 0, i.e. the 64-bit constant 1.
v_lshlrev_b64 v[2:3], 0, 1 // 000000000128: D28F0002 00010280
// 64-bit atomic add of v[2:3] to the memory addressed by v[0:1].
flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] // 000000000130: DD880000 00000200
// s[6:7] = start timestamp for the polling timeout below.
s_memrealtime s[6:7] // 000000000138: C0940180 00000000
// s[8:9] = exec to restore after polling; s[10:11] = lanes still polling.
s_mov_b64 s[8:9], exec // 000000000140: BE88017E
s_mov_b64 s[10:11], exec // 000000000144: BE8A017E
s_nop 0x0000 // 000000000148: BF800000
s_nop 0x0000 // 00000000014C: BF800000
s_nop 0x0000 // 000000000150: BF800000
s_nop 0x0000 // 000000000154: BF800000
s_nop 0x0000 // 000000000158: BF800000
s_nop 0x0000 // 00000000015C: BF800000
// ---- polling loop -----------------------------------------------------------
label_0058:
// v[0:1] = s[0:1] + 8 as a 64-bit add; the carry travels through s[12:13].
// NOTE(review): offset 8 is presumably where the signal object keeps its
// value - confirm against the runtime's signal layout.
v_add_u32 v0, s[12:13], s0, 8 // 000000000160: D1190C00 00011000
v_mov_b32 v1, s1 // 000000000168: 7E020201
v_addc_u32 v1, vcc, v1, 0, s[12:13] // 00000000016C: D11C6A01 00310101
// v[2:3] = 64-bit value read from that address.
flat_load_dwordx2 v[2:3], v[0:1] glc // 000000000174: DC550000 02000000
// s[12:13] = current timestamp, then elapsed = current - start (64-bit).
s_memrealtime s[12:13] // 00000000017C: C0940300 00000000
s_waitcnt lgkmcnt(0) // 000000000184: BF8C007F
s_sub_u32 s12, s12, s6 // 000000000188: 808C060C
s_subb_u32 s13, s13, s7 // 00000000018C: 828D070D
// v[0:1] = 0xFFFF, the timeout threshold in s_memrealtime ticks.
v_mov_b32 v0, 0x0000ffff // 000000000190: 7E0002FF 0000FFFF
v_mov_b32 v1, 0 // 000000000198: 7E020280
// s[12:13] = lane mask of (elapsed >= 0xFFFF), i.e. the poll timed out.
v_cmp_ge_u64 s[12:13], s[12:13], v[0:1] // 00000000019C: D0EE000C 0002000C
// Wait for the flat load before v[2:3] is compared.
s_waitcnt vmcnt(0) // 0000000001A4: BF8C0F70
// s[14:15] = lane mask of (loaded value == 1).
v_cmp_eq_i64 s[14:15], v[2:3], 1 // 0000000001A8: D0E2000E 00010302
// s[12:13] = "done" mask: value matched OR timed out.
s_or_b64 s[12:13], s[14:15], s[12:13] // 0000000001B0: 878C0C0E
// s[14:15] = exec on entry to this check; exec = lanes that are not done.
s_mov_b64 s[14:15], exec // 0000000001B4: BE8E017E
s_andn2_b64 exec, s[14:15], s[12:13] // 0000000001B8: 89FE0C0E
// The branch target is the very next instruction, so this branch does not
// change control flow.
s_cbranch_execz label_0070 // 0000000001BC: BF880000
label_0070:
// exec = saved exec AND NOT (not-done lanes), i.e. the lanes that are done.
s_andn2_b64 exec, s[14:15], exec // 0000000001C0: 89FE7E0E
// No lane finished this round: keep polling.
s_cbranch_execz label_0074 // 0000000001C4: BF880002
// Remove the finished lanes from the still-polling mask.
// NOTE(review): per the GCN3 ISA, SCC is set when the result is non-zero, so
// the scc0 branch below leaves the loop once no lane is still polling.
s_andn2_b64 s[10:11], s[10:11], exec // 0000000001C8: 898A7E0A
s_cbranch_scc0 label_0077 // 0000000001CC: BF840003
label_0074:
// Continue with only the lanes that are still polling; sleep, then retry.
s_and_b64 exec, s[14:15], s[10:11] // 0000000001D0: 86FE0A0E
s_sleep 0x0001 // 0000000001D4: BF8E0001
s_branch label_0058 // 0000000001D8: BF82FFE1
// ---- after polling: decide whether to repeat the outer loop -----------------
label_0077:
// Restore the exec mask saved before the polling loop.
s_mov_b64 exec, s[8:9] // 0000000001DC: BEFE0108
// s[6:7] = lane mask of (value != 1): these lanes must run the loop again.
v_cmp_ne_i64 s[6:7], v[2:3], 1 // 0000000001E0: D0E50006 00010302
// exec = lanes whose value == 1, i.e. lanes leaving the outer loop.
s_mov_b64 s[8:9], exec // 0000000001E8: BE88017E
s_andn2_b64 exec, s[8:9], s[6:7] // 0000000001EC: 89FE0608
// No lane is leaving: go straight to the loop-back code.
s_cbranch_execz label_007F // 0000000001F0: BF880002
// Remove the leaving lanes from the outer-loop mask; when the mask becomes
// empty the scc0 branch ends the program.
s_andn2_b64 s[4:5], s[4:5], exec // 0000000001F4: 89847E04
s_cbranch_scc0 label_0081 // 0000000001F8: BF840002
label_007F:
// Re-enable only the lanes still in the outer loop and start another pass.
s_and_b64 exec, s[8:9], s[4:5] // 0000000001FC: 86FE0408
s_branch label_0047 // 000000000200: BF82FFC6
label_0081:
s_endpgm // 000000000204: BF810000
// Disassembly listing of &__signal_test_kernel; each trailing comment gives
// the instruction address and its encoding words.
// This listing never modifies exec: both loops are controlled by branches on
// vcc, so all active lanes stay in or leave each loop together.
// NOTE(review): asic(VI)/type(CS) presumably mean a Volcanic Islands target
// and a compute shader - confirm against the tool that produced the dump.
//
// Register roles as used below:
//   s[0:1] - first 64-bit value loaded (address of the atomic add)
//   s[2:3] - second 64-bit value loaded (base of the polled address)
//   s[4:5] - load base address on entry; afterwards the start timestamp
//   s[6:7] - polled address / current timestamp / elapsed / "value == 1" mask
asic(VI)
type(CS)
//
// amd_kernel_code_t for &__signal_test_kernel (000000000000 - 000000000100)
//
//
// &__signal_test_kernel:
//
// NOTE(review): the purpose of this m0 value is not visible in this listing.
s_mov_b32 m0, 0x00010000 // 000000000100: BEFC00FF 00010000
// One 128-bit load of s[0:3] from offset 0 relative to s[4:5].
// NOTE(review): s[4:5] is presumably the kernarg segment pointer, making
// s[0:1] %val and s[2:3] %handle_in - confirm against the ABI.
s_load_dwordx4 s[0:3], s[4:5], 0x00 // 000000000108: C00A0002 00000000
// ---- outer loop: atomic add, then poll -------------------------------------
label_0044:
// v[0:1] = 1 << 0, i.e. the 64-bit constant 1.
v_lshlrev_b64 v[0:1], 0, 1 // 000000000110: D28F0000 00010280
// Wait for the scalar load before s[0:3] are read.
s_waitcnt lgkmcnt(0) // 000000000118: BF8C007F
// v[2:3] = s[0:1], the address for the atomic add.
v_mov_b32 v2, s0 // 00000000011C: 7E040200
v_mov_b32 v3, s1 // 000000000120: 7E060201
// 64-bit atomic add of v[0:1] to the memory addressed by v[2:3].
flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] // 000000000124: DD880000 00000002
// s[4:5] = start timestamp for the polling timeout below.
s_memrealtime s[4:5] // 00000000012C: C0940100 00000000
s_nop 0x0000 // 000000000134: BF800000
s_nop 0x0000 // 000000000138: BF800000
s_nop 0x0000 // 00000000013C: BF800000
// ---- polling loop -----------------------------------------------------------
label_0050:
// s[6:7] = s[2:3] + 8 (64-bit add with carry), copied to v[0:1] as the
// address to poll.
// NOTE(review): offset 8 is presumably where the signal object keeps its
// value - confirm against the runtime's signal layout.
s_add_u32 s6, s2, 8 // 000000000140: 80068802
s_addc_u32 s7, s3, 0 // 000000000144: 82078003
v_mov_b32 v0, s6 // 000000000148: 7E000206
v_mov_b32 v1, s7 // 00000000014C: 7E020207
// v[2:3] = 64-bit value read from that address.
flat_load_dwordx2 v[2:3], v[0:1] glc // 000000000150: DC550000 02000000
// s[6:7] = current timestamp, then elapsed = current - start (64-bit).
s_memrealtime s[6:7] // 000000000158: C0940180 00000000
s_waitcnt lgkmcnt(0) // 000000000160: BF8C007F
s_sub_u32 s6, s6, s4 // 000000000164: 80860406
s_subb_u32 s7, s7, s5 // 000000000168: 82870507
// v[0:1] = 0xFFFF, the timeout threshold in s_memrealtime ticks.
v_mov_b32 v0, 0x0000ffff // 00000000016C: 7E0002FF 0000FFFF
v_mov_b32 v1, 0 // 000000000174: 7E020280
// vcc = lane mask of (elapsed >= 0xFFFF), i.e. the poll timed out.
v_cmp_ge_u64 vcc, s[6:7], v[0:1] // 000000000178: 7DDC0006
// Wait for the flat load before v[2:3] is compared.
s_waitcnt vmcnt(0) // 00000000017C: BF8C0F70
// s[6:7] = lane mask of (loaded value == 1).
v_cmp_eq_i64 s[6:7], v[2:3], 1 // 000000000180: D0E20006 00010302
// vcc = (value matched OR timed out), restricted to the active lanes.
s_or_b64 vcc, s[6:7], vcc // 000000000188: 87EA6A06
s_and_b64 vcc, vcc, exec // 00000000018C: 86EA7E6A
// Leave the polling loop as soon as any active lane is done; otherwise
// sleep and poll again.
s_cbranch_vccnz label_0067 // 000000000190: BF870002
s_sleep 0x0001 // 000000000194: BF8E0001
s_branch label_0050 // 000000000198: BF82FFE9
// ---- after polling: decide whether to repeat the outer loop -----------------
label_0067:
// v[0:1] = 1; vcc = lane mask of (loaded value == 1).
v_lshlrev_b64 v[0:1], 0, 1 // 00000000019C: D28F0000 00010280
v_cmp_eq_i64 vcc, v[2:3], v[0:1] // 0000000001A4: 7DC40102
// If no lane saw the value 1 (for example after a timeout), run the outer
// loop again; otherwise the program ends.
s_cbranch_vccz label_0044 // 0000000001A8: BF86FFD9
s_endpgm // 0000000001AC: BF810000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment