Skip to content

Instantly share code, notes, and snippets.

@masahi
Created October 10, 2017 15:22
Show Gist options
  • Save masahi/6c7f270240891bc7e1e82dd221e05903 to your computer and use it in GitHub Desktop.
Save masahi/6c7f270240891bc7e1e82dd221e05903 to your computer and use it in GitHub Desktop.
Output of `llc-5.0 -march=amdgcn -mcpu=gfx803 myadd_kernel.ll`:
; AMDGPU assembly in LLVM MC syntax; ';' starts a comment.
; The directives below select the code section and declare the HSA code
; object version (2.1) and the target ISA version (8.0.3, vendor "AMD",
; architecture "AMDGPU") that the rest of the file is written for.
.text
.hsa_code_object_version 2,1
.hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
; ---------------------------------------------------------------------------
; myadd__kernel0 -- HSA kernel: element-wise f32 add of two buffers.
;
; Per lane, with tid = v0 at entry and wg = s6 at entry:
;     i = wg + (tid << 7)
;     if (wg < n - (tid << 7))   arg0[i] = arg2[i] + arg1[i]
; arg0/arg1/arg2 are the 64-bit pointers read from offsets 0x0/0x8/0x10 of
; the block addressed by s[4:5]; n is the 32-bit value at offset 0x18.
;
; Register roles at entry, as requested by the descriptor below
; (user_sgpr_count = 6 = private segment buffer + kernarg segment pointer,
; then workgroup_id_x; enable_vgpr_workitem_id = 0):
;     s[4:5]  kernarg segment base
;     s6      work-group id X
;     v0      work-item id X
; NOTE(review): this register assignment follows the AMDGPU HSA initial
; register ordering, which is not spelled out in this file -- confirm
; against the LLVM AMDGPU usage guide.
; NOTE(review): with i = wg + tid*128, neighbouring values of v0 address
; elements 128 apart -- confirm this mapping is what the IR producer intends.
; ---------------------------------------------------------------------------
.globl myadd__kernel0 ; -- Begin function myadd__kernel0
.p2align 8
.type myadd__kernel0,@function
.amdgpu_hsa_kernel myadd__kernel0
myadd__kernel0: ; @myadd__kernel0
; Kernel descriptor: the assignments up to .end_amd_kernel_code_t fill in
; the amd_kernel_code_t header that precedes the instructions.
.amd_kernel_code_t
amd_code_version_major = 1
amd_code_version_minor = 1
amd_machine_kind = 1
amd_machine_version_major = 8
amd_machine_version_minor = 0
amd_machine_version_stepping = 3
; Byte offset from the start of this header to the first instruction.
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
; Block-encoded register counts; the plain counts are given further down by
; wavefront_sgpr_count and workitem_vgpr_count.
granulated_workitem_vgpr_count = 1
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
priv = 0
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
enable_sgpr_private_segment_wave_byte_offset = 0
; Six user SGPRs are requested; of the work-group ids only X is enabled.
user_sgpr_count = 6
enable_trap_handler = 1
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
enable_sgpr_workgroup_id_z = 0
enable_sgpr_workgroup_info = 0
enable_vgpr_workitem_id = 0
enable_exception_msb = 0
granulated_lds_size = 0
enable_exception = 0
; User SGPR contents: only the private segment buffer and the kernarg
; segment pointer are enabled.
enable_sgpr_private_segment_buffer = 1
enable_sgpr_dispatch_ptr = 0
enable_sgpr_queue_ptr = 0
enable_sgpr_kernarg_segment_ptr = 1
enable_sgpr_dispatch_id = 0
enable_sgpr_flat_scratch_init = 0
enable_sgpr_private_segment_size = 0
enable_sgpr_grid_workgroup_count_x = 0
enable_sgpr_grid_workgroup_count_y = 0
enable_sgpr_grid_workgroup_count_z = 0
enable_ordered_append_gds = 0
private_element_size = 1
is_ptr64 = 1
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
; No private (scratch), group (LDS) or GDS memory is requested.
workitem_private_segment_byte_size = 0
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
; 28 bytes of kernel arguments: three 8-byte pointers followed by one 4-byte
; integer, matching the load offsets 0x0, 0x8, 0x10 and 0x18 used below.
kernarg_segment_byte_size = 28
workgroup_fbarrier_count = 0
wavefront_sgpr_count = 12
workitem_vgpr_count = 7
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
reserved_sgpr_count = 0
debug_wavefront_private_segment_offset_sgpr = 0
debug_private_segment_buffer_sgpr = 0
; The three alignments and wavefront_size are stored as powers of two in the
; amd_kernel_code_t definition: 4 -> 16 bytes, 6 -> 64 work-items.
kernarg_segment_alignment = 4
group_segment_alignment = 4
private_segment_alignment = 4
wavefront_size = 6
call_convention = -1
runtime_loader_kernel_symbol = 0
.end_amd_kernel_code_t
; BB#0: ; %entry
; s0 = n, the 32-bit argument at offset 0x18.
s_load_dword s0, s[4:5], 0x18
; v0 = tid << 7
v_lshlrev_b32_e32 v0, 7, v0
; The scalar load must have completed before s0 is read.
s_waitcnt lgkmcnt(0)
; v1 = n - (tid << 7); vcc = (wg < v1), signed, one bit per lane.
v_sub_i32_e32 v1, vcc, s0, v0
v_cmp_lt_i32_e32 vcc, s6, v1
; Save exec in s[0:1] and narrow exec to the lanes that passed the test;
; the xor then leaves in s[0:1] the lanes that were active but failed it.
s_and_saveexec_b64 s[0:1], vcc
s_xor_b64 s[0:1], exec, s[0:1]
; mask branch BB0_2
; Skip the body when no lane passed the test.
s_cbranch_execz BB0_2
BB0_1: ; %if_then
; s[2:3] = arg0 and s[8:9] = arg1.
s_load_dwordx2 s[2:3], s[4:5], 0x0
s_load_dwordx2 s[8:9], s[4:5], 0x8
; v0 = i = wg + (tid << 7)
v_add_i32_e32 v0, vcc, s6, v0
; s[4:5] = arg2. This overwrites the base register pair; no later load
; uses it.
s_load_dwordx2 s[4:5], s[4:5], 0x10
; v[0:1] = i sign-extended to 64 bits, then shifted left by 2: the byte
; offset of 32-bit element i.
v_ashrrev_i32_e32 v1, 31, v0
v_lshlrev_b64 v[0:1], 2, v[0:1]
; All three pointer loads must have completed before they are used.
s_waitcnt lgkmcnt(0)
; Copy the high halves of arg1 and arg0 into VGPRs for the carry adds below.
v_mov_b32_e32 v5, s9
v_mov_b32_e32 v6, s3
; v[2:3] = arg2 + offset: the low add writes the carry to vcc and the high
; add consumes it.
v_add_i32_e32 v2, vcc, s4, v0
v_mov_b32_e32 v3, s5
v_addc_u32_e32 v3, vcc, v3, v1, vcc
; v[4:5] = arg1 + offset
v_add_i32_e32 v4, vcc, s8, v0
v_addc_u32_e32 v5, vcc, v5, v1, vcc
; v2 = arg2[i], v3 = arg1[i]; the destinations reuse address registers.
flat_load_dword v2, v[2:3]
flat_load_dword v3, v[4:5]
; v[0:1] = arg0 + offset (the store address).
v_add_i32_e32 v0, vcc, s2, v0
v_addc_u32_e32 v1, vcc, v6, v1, vcc
; Both flat loads must have returned before their results are read.
s_waitcnt vmcnt(0) lgkmcnt(0)
; arg0[i] = arg2[i] + arg1[i]
v_add_f32_e32 v2, v2, v3
flat_store_dword v[0:1], v2
BB0_2: ; %if_end
; OR the lanes saved in s[0:1] back into exec, undoing the narrowing above.
s_or_b64 exec, exec, s[0:1]
s_endpgm
.Lfunc_end0:
.size myadd__kernel0, .Lfunc_end0-myadd__kernel0
; -- End function
; Resource summary for myadd__kernel0. Every line after the section directive
; is an assembler comment, so nothing is emitted into the section here. The
; values restate descriptor fields of the kernel above (for example
; NumSgprs = wavefront_sgpr_count and NumVgprs = workitem_vgpr_count).
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 156
; NumSgprs: 12
; NumVgprs: 7
; ScratchSize: 0
; FloatMode: 192
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 12
; NumVGPRsForWavesPerEU: 7
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 1
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; ---------------------------------------------------------------------------
; myadd__kernel1 -- HSA kernel: adds the constant 1.0 to each f32 element.
;
; Per lane, with tid = v0 at entry and wg = s6 at entry:
;     i = wg + (tid << 7)
;     if (wg < n - (tid << 7))   arg0[i] = 1.0 + arg1[i]
; arg0/arg1 are the 64-bit pointers read from offsets 0x0/0x8 of the block
; addressed by s[4:5]; n is the 32-bit value at offset 0x10.
;
; Register roles at entry, as requested by the descriptor below
; (user_sgpr_count = 6 = private segment buffer + kernarg segment pointer,
; then workgroup_id_x; enable_vgpr_workitem_id = 0):
;     s[4:5]  kernarg segment base
;     s6      work-group id X
;     v0      work-item id X
; NOTE(review): this register assignment follows the AMDGPU HSA initial
; register ordering, which is not spelled out in this file -- confirm
; against the LLVM AMDGPU usage guide.
; ---------------------------------------------------------------------------
.text
.globl myadd__kernel1 ; -- Begin function myadd__kernel1
.p2align 8
.type myadd__kernel1,@function
.amdgpu_hsa_kernel myadd__kernel1
myadd__kernel1: ; @myadd__kernel1
; Kernel descriptor: the assignments up to .end_amd_kernel_code_t fill in
; the amd_kernel_code_t header that precedes the instructions.
.amd_kernel_code_t
amd_code_version_major = 1
amd_code_version_minor = 1
amd_machine_kind = 1
amd_machine_version_major = 8
amd_machine_version_minor = 0
amd_machine_version_stepping = 3
; Byte offset from the start of this header to the first instruction.
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
; Block-encoded register counts; the plain counts are given further down by
; wavefront_sgpr_count and workitem_vgpr_count.
granulated_workitem_vgpr_count = 1
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
priv = 0
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
enable_sgpr_private_segment_wave_byte_offset = 0
; Six user SGPRs are requested; of the work-group ids only X is enabled.
user_sgpr_count = 6
enable_trap_handler = 1
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
enable_sgpr_workgroup_id_z = 0
enable_sgpr_workgroup_info = 0
enable_vgpr_workitem_id = 0
enable_exception_msb = 0
granulated_lds_size = 0
enable_exception = 0
; User SGPR contents: only the private segment buffer and the kernarg
; segment pointer are enabled.
enable_sgpr_private_segment_buffer = 1
enable_sgpr_dispatch_ptr = 0
enable_sgpr_queue_ptr = 0
enable_sgpr_kernarg_segment_ptr = 1
enable_sgpr_dispatch_id = 0
enable_sgpr_flat_scratch_init = 0
enable_sgpr_private_segment_size = 0
enable_sgpr_grid_workgroup_count_x = 0
enable_sgpr_grid_workgroup_count_y = 0
enable_sgpr_grid_workgroup_count_z = 0
enable_ordered_append_gds = 0
private_element_size = 1
is_ptr64 = 1
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
; No private (scratch), group (LDS) or GDS memory is requested.
workitem_private_segment_byte_size = 0
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
; 20 bytes of kernel arguments: two 8-byte pointers followed by one 4-byte
; integer, matching the load offsets 0x0, 0x8 and 0x10 used below.
kernarg_segment_byte_size = 20
workgroup_fbarrier_count = 0
wavefront_sgpr_count = 9
workitem_vgpr_count = 5
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
reserved_sgpr_count = 0
debug_wavefront_private_segment_offset_sgpr = 0
debug_private_segment_buffer_sgpr = 0
; The three alignments and wavefront_size are stored as powers of two in the
; amd_kernel_code_t definition: 4 -> 16 bytes, 6 -> 64 work-items.
kernarg_segment_alignment = 4
group_segment_alignment = 4
private_segment_alignment = 4
wavefront_size = 6
call_convention = -1
runtime_loader_kernel_symbol = 0
.end_amd_kernel_code_t
; BB#0: ; %entry
; s0 = n, the 32-bit argument at offset 0x10.
s_load_dword s0, s[4:5], 0x10
; v0 = tid << 7
v_lshlrev_b32_e32 v0, 7, v0
; The scalar load must have completed before s0 is read.
s_waitcnt lgkmcnt(0)
; v1 = n - (tid << 7); vcc = (wg < v1), signed, one bit per lane.
v_sub_i32_e32 v1, vcc, s0, v0
v_cmp_lt_i32_e32 vcc, s6, v1
; Save exec in s[0:1] and narrow exec to the lanes that passed the test;
; the xor then leaves in s[0:1] the lanes that were active but failed it.
s_and_saveexec_b64 s[0:1], vcc
s_xor_b64 s[0:1], exec, s[0:1]
; mask branch BB1_2
; Skip the body when no lane passed the test.
s_cbranch_execz BB1_2
BB1_1: ; %if_then
; s[2:3] = arg0
s_load_dwordx2 s[2:3], s[4:5], 0x0
; v0 = i = wg + (tid << 7)
v_add_i32_e32 v0, vcc, s6, v0
; s[4:5] = arg1. This overwrites the base register pair; no later load
; uses it.
s_load_dwordx2 s[4:5], s[4:5], 0x8
; v[0:1] = i sign-extended to 64 bits, then shifted left by 2: the byte
; offset of 32-bit element i.
v_ashrrev_i32_e32 v1, 31, v0
v_lshlrev_b64 v[0:1], 2, v[0:1]
; Both pointer loads must have completed before they are used.
s_waitcnt lgkmcnt(0)
; Copy the high half of arg0 into a VGPR for the carry add below.
v_mov_b32_e32 v4, s3
; v[2:3] = arg1 + offset: the low add writes the carry to vcc and the high
; add consumes it.
v_add_i32_e32 v2, vcc, s4, v0
v_mov_b32_e32 v3, s5
v_addc_u32_e32 v3, vcc, v3, v1, vcc
; v2 = arg1[i]; the destination reuses the low address register.
flat_load_dword v2, v[2:3]
; v[0:1] = arg0 + offset (the store address).
v_add_i32_e32 v0, vcc, s2, v0
v_addc_u32_e32 v1, vcc, v4, v1, vcc
; The flat load must have returned before its result is read.
s_waitcnt vmcnt(0) lgkmcnt(0)
; arg0[i] = 1.0 + arg1[i]
v_add_f32_e32 v2, 1.0, v2
flat_store_dword v[0:1], v2
BB1_2: ; %if_end
; OR the lanes saved in s[0:1] back into exec, undoing the narrowing above.
s_or_b64 exec, exec, s[0:1]
s_endpgm
.Lfunc_end1:
.size myadd__kernel1, .Lfunc_end1-myadd__kernel1
; -- End function
; Resource summary for myadd__kernel1. Every line after the section directive
; is an assembler comment, so nothing is emitted into the section here. The
; values restate descriptor fields of the kernel above (for example
; NumSgprs = wavefront_sgpr_count and NumVgprs = workitem_vgpr_count).
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 128
; NumSgprs: 9
; NumVgprs: 5
; ScratchSize: 0
; FloatMode: 192
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 9
; NumVGPRsForWavesPerEU: 5
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 1
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; Code object metadata. The text between .amdgpu_code_object_metadata and
; .end_amdgpu_code_object_metadata is a YAML document, not assembler syntax,
; so no ';' comments are placed inside it. For each kernel it lists the
; arguments in order (Size/Align in bytes, ValueKind, ValueType) and a
; CodeProps map repeating descriptor values (KernargSegmentSize,
; WavefrontNumSGPRs, WorkitemNumVGPRs, the alignments and WavefrontSize).
; NOTE(review): the YAML nesting indentation is missing in this copy of the
; file; restore it from the original llc output before assembling.
; NOTE(review): the first argument of myadd__kernel1 is listed with
; ValueType I8, although that kernel stores a 32-bit float result through
; it -- presumably this reflects the pointer type in the input IR; confirm.
.section ".note.GNU-stack"
.amdgpu_code_object_metadata
---
Version: [ 1, 0 ]
Kernels:
- Name: myadd__kernel0
Args:
- Size: 8
Align: 8
ValueKind: GlobalBuffer
ValueType: F32
AddrSpaceQual: Global
- Size: 8
Align: 8
ValueKind: GlobalBuffer
ValueType: F32
AccQual: ReadOnly
AddrSpaceQual: Global
- Size: 8
Align: 8
ValueKind: GlobalBuffer
ValueType: F32
AccQual: ReadOnly
AddrSpaceQual: Global
- Size: 4
Align: 4
ValueKind: ByValue
ValueType: I32
CodeProps:
KernargSegmentSize: 28
WavefrontNumSGPRs: 12
WorkitemNumVGPRs: 7
KernargSegmentAlign: 4
GroupSegmentAlign: 4
PrivateSegmentAlign: 4
WavefrontSize: 6
- Name: myadd__kernel1
Args:
- Size: 8
Align: 8
ValueKind: GlobalBuffer
ValueType: I8
AddrSpaceQual: Global
- Size: 8
Align: 8
ValueKind: GlobalBuffer
ValueType: F32
AccQual: ReadOnly
AddrSpaceQual: Global
- Size: 4
Align: 4
ValueKind: ByValue
ValueType: I32
CodeProps:
KernargSegmentSize: 20
WavefrontNumSGPRs: 9
WorkitemNumVGPRs: 5
KernargSegmentAlign: 4
GroupSegmentAlign: 4
PrivateSegmentAlign: 4
WavefrontSize: 6
...
.end_amdgpu_code_object_metadata
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment