Skip to content

Instantly share code, notes, and snippets.

@dadeba
Created January 29, 2012 21:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dadeba/1700860 to your computer and use it in GitHub Desktop.
Save dadeba/1700860 to your computer and use it in GitHub Desktop.
Tahiti ISA for 4x4 DGEMM kernel Catalyst 12.1
shader main
asic(SI_ASIC)
type(PS)
// s_ps_state in s0
//
// Prologue: runs once before the accumulation loop at label_002E.
//   - keeps the incoming exec mask in s[44:45] and widens exec to whole quads
//   - builds the two base sample coordinates from the inputs in v2 / v3
//   - fetches the descriptors consumed later (s[12:15], s[16:23], s[24:27],
//     s[28:31], s[32:39], s[0:3], s[40:43])
//   - clears the 32 VGPRs that hold the sixteen f64 accumulators
// NOTE(review): v2/v3 are presumably the pixel x/y position, and s[2:3],
// s[8:9], s[10:11] table pointers set up by the driver - confirm against the
// pixel-shader input layout.
//
// Save the original exec mask; it is re-applied before every store and
// restored before the final export.
s_mov_b64 s[44:45], exec // 00000000: BEAC047E
// Whole-quad mode: enable every lane of any quad that has an active lane.
s_wqm_b64 exec, exec // 00000004: BEFE0A7E
// s[12:15] is the descriptor later passed to s_buffer_load_dword.
s_load_dwordx4 s[12:15], s[10:11], 0x00 // 00000008: C0860B00
// v0 = floor(v2), v1 = floor(v3). Both are read again after the loop.
v_floor_f32 v0, v2 // 0000000C: 7E004902
v_floor_f32 v1, v3 // 00000010: 7E024903
// v71 = 2 * v0 and v3 = 2 * v1: first coordinate of the samples taken with
// resource s[32:39] and resource s[16:23] respectively. v3 is overwritten.
v_mul_legacy_f32 v71, 2.0, v0 // 00000014: 0E8E00F4
v_mul_legacy_f32 v3, 2.0, v1 // 00000018: 0E0602F4
// Two 8-dword image resources are read through s[2:3] and four 4-dword
// sampler operands through s[8:9]. The load into s[0:3] replaces s0 and the
// s[2:3] pointer; neither is read as such afterwards.
s_load_dwordx8 s[16:23], s[2:3], 0x00 // 0000001C: C0C80300
s_load_dwordx4 s[24:27], s[8:9], 0x00 // 00000020: C08C0900
s_load_dwordx4 s[28:31], s[8:9], 0x04 // 00000024: C08E0904
s_load_dwordx8 s[32:39], s[2:3], 0x08 // 00000028: C0D00308
s_load_dwordx4 s[0:3], s[8:9], 0x08 // 0000002C: C0800908
s_load_dwordx4 s[40:43], s[8:9], 0x0c // 00000030: C094090C
// Loop counter seed: the loop adds 2.0 before its first use, so the first
// pass runs with v4 = 0.0.
v_mov_b32 v4, -2.0 // 00000034: 7E0802F5
// Clear accumulators v[73:82] (five f64 pairs).
v_mov_b32 v73, 0 // 00000038: 7E920280
v_mov_b32 v74, 0 // 0000003C: 7E940280
v_mov_b32 v75, 0 // 00000040: 7E960280
v_mov_b32 v76, 0 // 00000044: 7E980280
v_mov_b32 v77, 0 // 00000048: 7E9A0280
v_mov_b32 v78, 0 // 0000004C: 7E9C0280
v_mov_b32 v79, 0 // 00000050: 7E9E0280
v_mov_b32 v80, 0 // 00000054: 7EA00280
v_mov_b32 v81, 0 // 00000058: 7EA20280
v_mov_b32 v82, 0 // 0000005C: 7EA40280
// Clear accumulators v[15:36] (the remaining eleven f64 pairs).
v_mov_b32 v15, 0 // 00000060: 7E1E0280
v_mov_b32 v16, 0 // 00000064: 7E200280
v_mov_b32 v17, 0 // 00000068: 7E220280
v_mov_b32 v18, 0 // 0000006C: 7E240280
v_mov_b32 v19, 0 // 00000070: 7E260280
v_mov_b32 v20, 0 // 00000074: 7E280280
v_mov_b32 v21, 0 // 00000078: 7E2A0280
v_mov_b32 v22, 0 // 0000007C: 7E2C0280
v_mov_b32 v23, 0 // 00000080: 7E2E0280
v_mov_b32 v24, 0 // 00000084: 7E300280
v_mov_b32 v25, 0 // 00000088: 7E320280
v_mov_b32 v26, 0 // 0000008C: 7E340280
v_mov_b32 v27, 0 // 00000090: 7E360280
v_mov_b32 v28, 0 // 00000094: 7E380280
v_mov_b32 v29, 0 // 00000098: 7E3A0280
v_mov_b32 v30, 0 // 0000009C: 7E3C0280
v_mov_b32 v31, 0 // 000000A0: 7E3E0280
v_mov_b32 v32, 0 // 000000A4: 7E400280
v_mov_b32 v33, 0 // 000000A8: 7E420280
v_mov_b32 v34, 0 // 000000AC: 7E440280
v_mov_b32 v35, 0 // 000000B0: 7E460280
v_mov_b32 v36, 0 // 000000B4: 7E480280
// Accumulation loop. Each pass advances the counter v4 by 2.0, fetches two
// consecutive steps of operands with eight image samples (four 32-bit
// components per sample, consumed as two f64 values), and performs
// 2 x 16 f64 FMAs into the sixteen accumulators.
label_002E:
// Wait for every outstanding memory/scalar operation (all counters zero).
s_waitcnt 0x0000 // 000000B8: BF8C0000
v_add_f32 v4, 2.0, v4 // 000000BC: 060808F4
// Loop limit: dword 1 of the buffer described by s[12:15], re-read every
// pass. s8 is free to clobber because the s[8:9] loads were issued earlier.
s_buffer_load_dword s8, s[12:15], 0x01 // 000000C0: C2040D01
s_waitcnt lgkmcnt(0) // 000000C4: BF8C007F
v_mov_b32 v37, s8 // 000000C8: 7E4A0208
// Exit test is an exact float equality. vcc == 0 (no lane equal) continues
// into the body; otherwise control falls through to the branch to label_00A2.
// NOTE(review): v4 only takes the values 0.0, 2.0, 4.0, ... so the limit must
// be one of those for the loop to terminate - confirm the host guarantees it.
v_cmp_eq_f32 vcc, v4, v37 // 000000CC: 7C044B04
s_cbranch_vccz label_0036 // 000000D0: BF860001
s_branch label_00A2 // 000000D4: BF82006C
label_0036:
// Build the address operands for the seven samples that do not use v[3:...]
// directly. Each image_sample_o below reads three consecutive VGPRs:
// a packed texel offset, then the two coordinates (v3 or v71, and v4).
// NOTE(review): offsets 1 / 0x100 / 0x101 presumably mean +1 in the first
// coordinate, +1 in the second, and +1 in both - confirm against the SI
// image-offset packing.
v_mov_b32 v39, 1 // 000000D8: 7E4E0281
v_mov_b32 v40, v3 // 000000DC: 7E500303
v_mov_b32 v41, v4 // 000000E0: 7E520304
// v72 completes the (v71, v4) coordinate pair used by the plain sample
// from s[32:39].
v_mov_b32 v72, v4 // 000000E4: 7E900304
v_mov_b32 v44, 1 // 000000E8: 7E580281
v_mov_b32 v45, v71 // 000000EC: 7E5A0347
v_mov_b32 v46, v4 // 000000F0: 7E5C0304
v_mov_b32 v47, 0x00000100 // 000000F4: 7E5E02FF 00000100
v_mov_b32 v48, v3 // 000000FC: 7E600303
v_mov_b32 v49, v4 // 00000100: 7E620304
v_mov_b32 v50, 0x00000101 // 00000104: 7E6402FF 00000101
v_mov_b32 v51, v3 // 0000010C: 7E660303
v_mov_b32 v52, v4 // 00000110: 7E680304
v_mov_b32 v53, 0x00000100 // 00000114: 7E6A02FF 00000100
v_mov_b32 v54, v71 // 0000011C: 7E6C0347
v_mov_b32 v55, v4 // 00000120: 7E6E0304
v_mov_b32 v56, 0x00000101 // 00000124: 7E7002FF 00000101
v_mov_b32 v57, v71 // 0000012C: 7E720347
v_mov_b32 v58, v4 // 00000130: 7E740304
// Eight fetches, issued back to back. Several destinations overlap the
// address registers of the same or an earlier sample.
//   resource s[16:23], coords (v3, v4):  offsets none, 1, 0x100, 0x101
//       -> v[59:62], v[37:40], v[45:48], v[49:52]
//   resource s[32:39], coords (v71, v4): offsets none, 1, 0x100, 0x101
//       -> v[63:66], v[41:44], v[67:70], v[53:56]
image_sample v[59:62], v[3:6], s[16:23], s[24:27] dmask:0xf unorm // 00000134: F0801F00 00C43B03
image_sample_o v[37:40], v[39:42], s[16:23], s[28:31] dmask:0xf unorm // 0000013C: F0C01F00 00E42527
image_sample v[63:66], v[71:74], s[32:39], s[0:3] dmask:0xf unorm // 00000144: F0801F00 00083F47
image_sample_o v[41:44], v[44:47], s[32:39], s[40:43] dmask:0xf unorm // 0000014C: F0C01F00 0148292C
image_sample_o v[45:48], v[47:50], s[16:23], s[24:27] dmask:0xf unorm // 00000154: F0C01F00 00C42D2F
image_sample_o v[49:52], v[50:53], s[16:23], s[28:31] dmask:0xf unorm // 0000015C: F0C01F00 00E43132
image_sample_o v[67:70], v[53:56], s[32:39], s[0:3] dmask:0xf unorm // 00000164: F0C01F00 00084335
image_sample_o v[53:56], v[56:59], s[32:39], s[40:43] dmask:0xf unorm // 0000016C: F0C01F00 01483538
// First 4 x 4 product. Operands from s[16:23]: v[59:60], v[61:62],
// v[37:38], v[39:40]; operands from s[32:39]: v[63:64], v[65:66],
// v[41:42], v[43:44].
// The five accumulators kept in v[73:82] are written to the temporaries
// v[5:14] here and written back by the second product; the other eleven
// (v[15:36]) are updated in place.
//
// vmcnt(5): the first three samples (v[59:62], v[37:40], v[63:66]) are back.
s_waitcnt vmcnt(5) // 00000174: BF8C1F75
v_fma_f64 v[5:6], v[59:60], v[63:64], v[73:74] // 00000178: D2980005 05267F3B
v_fma_f64 v[7:8], v[59:60], v[65:66], v[75:76] // 00000180: D2980007 052E833B
// vmcnt(4): v[41:44] is back as well.
s_waitcnt vmcnt(4) // 00000188: BF8C1F74
v_fma_f64 v[9:10], v[59:60], v[41:42], v[77:78] // 0000018C: D2980009 0536533B
v_fma_f64 v[11:12], v[59:60], v[43:44], v[79:80] // 00000194: D298000B 053E573B
v_fma_f64 v[13:14], v[61:62], v[63:64], v[81:82] // 0000019C: D298000D 05467F3D
v_fma_f64 v[15:16], v[61:62], v[65:66], v[15:16] // 000001A4: D298000F 043E833D
v_fma_f64 v[17:18], v[61:62], v[41:42], v[17:18] // 000001AC: D2980011 0446533D
v_fma_f64 v[19:20], v[61:62], v[43:44], v[19:20] // 000001B4: D2980013 044E573D
v_fma_f64 v[21:22], v[37:38], v[63:64], v[21:22] // 000001BC: D2980015 04567F25
v_fma_f64 v[23:24], v[37:38], v[65:66], v[23:24] // 000001C4: D2980017 045E8325
v_fma_f64 v[25:26], v[37:38], v[41:42], v[25:26] // 000001CC: D2980019 04665325
v_fma_f64 v[27:28], v[37:38], v[43:44], v[27:28] // 000001D4: D298001B 046E5725
v_fma_f64 v[29:30], v[39:40], v[63:64], v[29:30] // 000001DC: D298001D 04767F27
v_fma_f64 v[31:32], v[39:40], v[65:66], v[31:32] // 000001E4: D298001F 047E8327
v_fma_f64 v[33:34], v[39:40], v[41:42], v[33:34] // 000001EC: D2980021 04865327
v_fma_f64 v[35:36], v[39:40], v[43:44], v[35:36] // 000001F4: D2980023 048E5727
// Second 4 x 4 product, using the samples fetched with offsets 0x100/0x101.
// Operands from s[16:23]: v[45:46], v[47:48], v[49:50], v[51:52]; operands
// from s[32:39]: v[67:68], v[69:70], v[53:54], v[55:56].
//
// vmcnt(1): everything except the last sample (v[53:56]) is back.
s_waitcnt vmcnt(1) // 000001FC: BF8C1F71
v_fma_f64 v[73:74], v[45:46], v[67:68], v[5:6] // 00000200: D2980049 0416872D
v_fma_f64 v[75:76], v[45:46], v[69:70], v[7:8] // 00000208: D298004B 041E8B2D
// vmcnt(0): v[53:56] is back.
s_waitcnt vmcnt(0) // 00000210: BF8C1F70
v_fma_f64 v[77:78], v[45:46], v[53:54], v[9:10] // 00000214: D298004D 04266B2D
v_fma_f64 v[79:80], v[45:46], v[55:56], v[11:12] // 0000021C: D298004F 042E6F2D
v_fma_f64 v[81:82], v[47:48], v[67:68], v[13:14] // 00000224: D2980051 0436872F
v_fma_f64 v[15:16], v[47:48], v[69:70], v[15:16] // 0000022C: D298000F 043E8B2F
v_fma_f64 v[17:18], v[47:48], v[53:54], v[17:18] // 00000234: D2980011 04466B2F
v_fma_f64 v[19:20], v[47:48], v[55:56], v[19:20] // 0000023C: D2980013 044E6F2F
v_fma_f64 v[21:22], v[49:50], v[67:68], v[21:22] // 00000244: D2980015 04568731
v_fma_f64 v[23:24], v[49:50], v[69:70], v[23:24] // 0000024C: D2980017 045E8B31
v_fma_f64 v[25:26], v[49:50], v[53:54], v[25:26] // 00000254: D2980019 04666B31
v_fma_f64 v[27:28], v[49:50], v[55:56], v[27:28] // 0000025C: D298001B 046E6F31
v_fma_f64 v[29:30], v[51:52], v[67:68], v[29:30] // 00000264: D298001D 04768733
v_fma_f64 v[31:32], v[51:52], v[69:70], v[31:32] // 0000026C: D298001F 047E8B33
v_fma_f64 v[33:34], v[51:52], v[53:54], v[33:34] // 00000274: D2980021 04866B33
v_fma_f64 v[35:36], v[51:52], v[55:56], v[35:36] // 0000027C: D2980023 048E6F33
s_branch label_002E // 00000284: BF82FF8C
// Epilogue, reached when the loop counter matches its limit.
// Computes four output offsets, reads the sixteen f64 values already stored
// there (eight 16-byte loads through s[4:7]), forms
//     result = s[10:11] * accumulator + s[8:9] * old_value
// for each of them, writes the results back to the same locations and ends
// the shader with a null export.
// NOTE(review): s[10:11] and s[8:9] are presumably the DGEMM alpha and beta
// scalars - confirm against the host code.
label_00A2:
// v1 still holds floor(v3) from the prologue; start the element index with
// 4.0 * v1.
v_mul_legacy_f32 v1, 4.0, v1 // 00000288: 0E0202F6
// Slide the eleven in-place accumulators from v[15:36] down to v[12:33].
// Copying in ascending order reads each source before it is overwritten.
// This frees v34-v36 for address registers.
v_mov_b32 v12, v15 // 0000028C: 7E18030F
v_mov_b32 v13, v16 // 00000290: 7E1A0310
v_mov_b32 v14, v17 // 00000294: 7E1C0311
v_mov_b32 v15, v18 // 00000298: 7E1E0312
v_mov_b32 v16, v19 // 0000029C: 7E200313
v_mov_b32 v17, v20 // 000002A0: 7E220314
v_mov_b32 v18, v21 // 000002A4: 7E240315
v_mov_b32 v19, v22 // 000002A8: 7E260316
v_mov_b32 v20, v23 // 000002AC: 7E280317
v_mov_b32 v21, v24 // 000002B0: 7E2A0318
v_mov_b32 v22, v25 // 000002B4: 7E2C0319
v_mov_b32 v23, v26 // 000002B8: 7E2E031A
v_mov_b32 v24, v27 // 000002BC: 7E30031B
v_mov_b32 v25, v28 // 000002C0: 7E32031C
v_mov_b32 v26, v29 // 000002C4: 7E34031D
v_mov_b32 v27, v30 // 000002C8: 7E36031E
v_mov_b32 v28, v31 // 000002CC: 7E38031F
v_mov_b32 v29, v32 // 000002D0: 7E3A0320
v_mov_b32 v30, v33 // 000002D4: 7E3C0321
v_mov_b32 v31, v34 // 000002D8: 7E3E0322
v_mov_b32 v32, v35 // 000002DC: 7E400323
v_mov_b32 v33, v36 // 000002E0: 7E420324
// Fetch two more 4-dword descriptors through s[10:11] (the second load
// overwrites that pointer) and dword 0 of the buffer s[12:15] (overwriting
// s12, part of its own descriptor).
s_load_dwordx4 s[0:3], s[10:11], 0x04 // 000002E4: C0800B04
s_load_dwordx4 s[8:11], s[10:11], 0x08 // 000002E8: C0840B08
s_buffer_load_dword s12, s[12:15], 0x00 // 000002EC: C2060D00
s_waitcnt lgkmcnt(0) // 000002F0: BF8C007F
// v0 = (int)(4.0 * floor(v3_in) * s12 + 2.0 * floor(v2_in)), where v2_in and
// v3_in are the shader's original v2 / v3 inputs.
v_mul_legacy_f32 v1, s12, v1 // 000002F4: 0E02020C
v_mac_legacy_f32 v1, 2.0, v0 // 000002F8: 0C0200F4
v_cvt_i32_f32 v0, v1 // 000002FC: 7E001101
// Replace each descriptor with the first four dwords of the buffer it
// describes: s0..s3 become four integer bases, and s[8:9] / s[10:11] the two
// f64 scale factors.
s_buffer_load_dwordx4 s[0:3], s[0:3], 0x00 // 00000300: C2800100
s_buffer_load_dwordx4 s[8:11], s[8:11], 0x00 // 00000304: C2840900
s_waitcnt lgkmcnt(0) // 00000308: BF8C007F
// Four element indices (base + v0), each shifted left by 4 to address
// 16-byte elements: v1, v34, v35, v0. The "+16" copies (v36, v53, v54, v71)
// address the following element and are used only by the stores; the loads
// use offset:16 instead.
v_add_i32 v1, vcc, s0, v0 // 0000030C: 4A020000
v_add_i32 v34, vcc, s1, v0 // 00000310: 4A440001
v_add_i32 v35, vcc, s2, v0 // 00000314: 4A460002
v_add_i32 v0, vcc, s3, v0 // 00000318: 4A000003
v_lshlrev_b32 v1, 4, v1 // 0000031C: 34020284
v_add_i32 v36, vcc, 16, v1 // 00000320: 4A480290
v_lshlrev_b32 v34, 4, v34 // 00000324: 34444484
tbuffer_load_format_xyzw v[37:40], v1, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000328: EBF31000 80012501
tbuffer_load_format_xyzw v[41:44], v1, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000330: EBF31010 80012901
tbuffer_load_format_xyzw v[45:48], v34, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000338: EBF31000 80012D22
tbuffer_load_format_xyzw v[49:52], v34, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000340: EBF31010 80013122
v_add_i32 v53, vcc, 16, v34 // 00000348: 4A6A4490
v_lshlrev_b32 v35, 4, v35 // 0000034C: 34464684
v_add_i32 v54, vcc, 16, v35 // 00000350: 4A6C4690
v_lshlrev_b32 v0, 4, v0 // 00000354: 34000084
// v53/v54 now hold store addresses, so this pair of loads lands in
// v[55:62] rather than continuing at v53.
tbuffer_load_format_xyzw v[55:58], v35, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000358: EBF31000 80013723
tbuffer_load_format_xyzw v[59:62], v35, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000360: EBF31010 80013B23
tbuffer_load_format_xyzw v[63:66], v0, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000368: EBF31000 80013F00
tbuffer_load_format_xyzw v[67:70], v0, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000370: EBF31010 80014300
v_add_i32 v71, vcc, 16, v0 // 00000378: 4A8E0090
// Scale the old values by s[8:9] in place. vmcnt steps from 7 down to 0 so
// each pair of multiplies starts as soon as its own load has returned.
s_waitcnt vmcnt(7) // 0000037C: BF8C1F77
v_mul_f64 v[37:38], v[37:38], s[8:9] // 00000380: D2CA0025 02001125
v_mul_f64 v[39:40], v[39:40], s[8:9] // 00000388: D2CA0027 02001127
s_waitcnt vmcnt(6) // 00000390: BF8C1F76
v_mul_f64 v[41:42], v[41:42], s[8:9] // 00000394: D2CA0029 02001129
v_mul_f64 v[43:44], v[43:44], s[8:9] // 0000039C: D2CA002B 0200112B
s_waitcnt vmcnt(5) // 000003A4: BF8C1F75
v_mul_f64 v[45:46], v[45:46], s[8:9] // 000003A8: D2CA002D 0200112D
v_mul_f64 v[47:48], v[47:48], s[8:9] // 000003B0: D2CA002F 0200112F
s_waitcnt vmcnt(4) // 000003B8: BF8C1F74
v_mul_f64 v[49:50], v[49:50], s[8:9] // 000003BC: D2CA0031 02001131
v_mul_f64 v[51:52], v[51:52], s[8:9] // 000003C4: D2CA0033 02001133
s_waitcnt vmcnt(3) // 000003CC: BF8C1F73
v_mul_f64 v[55:56], v[55:56], s[8:9] // 000003D0: D2CA0037 02001137
v_mul_f64 v[57:58], v[57:58], s[8:9] // 000003D8: D2CA0039 02001139
s_waitcnt vmcnt(2) // 000003E0: BF8C1F72
v_mul_f64 v[59:60], v[59:60], s[8:9] // 000003E4: D2CA003B 0200113B
v_mul_f64 v[61:62], v[61:62], s[8:9] // 000003EC: D2CA003D 0200113D
s_waitcnt vmcnt(1) // 000003F4: BF8C1F71
v_mul_f64 v[63:64], v[63:64], s[8:9] // 000003F8: D2CA003F 0200113F
v_mul_f64 v[65:66], v[65:66], s[8:9] // 00000400: D2CA0041 02001141
s_waitcnt vmcnt(0) // 00000408: BF8C1F70
v_mul_f64 v[67:68], v[67:68], s[8:9] // 0000040C: D2CA0043 02001143
v_mul_f64 v[69:70], v[69:70], s[8:9] // 00000414: D2CA0045 02001145
// result = s[10:11] * accumulator + scaled old value.
// Accumulators are v[73:82] (five pairs) followed by the relocated v[12:33]
// (eleven pairs). Each destination is written only after the accumulator
// that lived in those registers has been consumed.
v_fma_f64 v[2:3], s[10:11], v[73:74], v[37:38] // 0000041C: D2980002 0496920A
v_fma_f64 v[4:5], s[10:11], v[75:76], v[39:40] // 00000424: D2980004 049E960A
v_fma_f64 v[6:7], s[10:11], v[77:78], v[41:42] // 0000042C: D2980006 04A69A0A
v_fma_f64 v[8:9], s[10:11], v[79:80], v[43:44] // 00000434: D2980008 04AE9E0A
v_fma_f64 v[37:38], s[10:11], v[81:82], v[45:46] // 0000043C: D2980025 04B6A20A
v_fma_f64 v[39:40], s[10:11], v[12:13], v[47:48] // 00000444: D2980027 04BE180A
v_fma_f64 v[10:11], s[10:11], v[14:15], v[49:50] // 0000044C: D298000A 04C61C0A
v_fma_f64 v[12:13], s[10:11], v[16:17], v[51:52] // 00000454: D298000C 04CE200A
v_fma_f64 v[14:15], s[10:11], v[18:19], v[55:56] // 0000045C: D298000E 04DE240A
v_fma_f64 v[16:17], s[10:11], v[20:21], v[57:58] // 00000464: D2980010 04E6280A
v_fma_f64 v[18:19], s[10:11], v[22:23], v[59:60] // 0000046C: D2980012 04EE2C0A
v_fma_f64 v[20:21], s[10:11], v[24:25], v[61:62] // 00000474: D2980014 04F6300A
v_fma_f64 v[22:23], s[10:11], v[26:27], v[63:64] // 0000047C: D2980016 04FE340A
v_fma_f64 v[24:25], s[10:11], v[28:29], v[65:66] // 00000484: D2980018 0506380A
v_fma_f64 v[26:27], s[10:11], v[30:31], v[67:68] // 0000048C: D298001A 050E3C0A
v_fma_f64 v[28:29], s[10:11], v[32:33], v[69:70] // 00000494: D298001C 0516400A
// Write-back. Before every store exec is ANDed with the mask saved in
// s[44:45], so only lanes that were active on entry write memory. Whole-quad
// mode is re-enabled after each store except the last.
// Store targets: v1, v36 (= v1 + 16), v34, v53 (= v34 + 16), v35,
// v54 (= v35 + 16), v0, v71 (= v0 + 16).
s_and_b64 exec, exec, s[44:45] // 0000049C: 87FE2C7E
tbuffer_store_format_xyzw v[2:5], v1, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 000004A0: EBF71000 80010201
s_wqm_b64 exec, exec // 000004A8: BEFE0A7E
s_and_b64 exec, exec, s[44:45] // 000004AC: 87FE2C7E
tbuffer_store_format_xyzw v[6:9], v36, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 000004B0: EBF71000 80010624
s_wqm_b64 exec, exec // 000004B8: BEFE0A7E
s_and_b64 exec, exec, s[44:45] // 000004BC: 87FE2C7E
tbuffer_store_format_xyzw v[37:40], v34, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 000004C0: EBF71000 80012522
s_wqm_b64 exec, exec // 000004C8: BEFE0A7E
s_and_b64 exec, exec, s[44:45] // 000004CC: 87FE2C7E
tbuffer_store_format_xyzw v[10:13], v53, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 000004D0: EBF71000 80010A35
s_wqm_b64 exec, exec // 000004D8: BEFE0A7E
s_and_b64 exec, exec, s[44:45] // 000004DC: 87FE2C7E
tbuffer_store_format_xyzw v[14:17], v35, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 000004E0: EBF71000 80010E23
s_wqm_b64 exec, exec // 000004E8: BEFE0A7E
s_and_b64 exec, exec, s[44:45] // 000004EC: 87FE2C7E
tbuffer_store_format_xyzw v[18:21], v54, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 000004F0: EBF71000 80011236
s_wqm_b64 exec, exec // 000004F8: BEFE0A7E
s_and_b64 exec, exec, s[44:45] // 000004FC: 87FE2C7E
tbuffer_store_format_xyzw v[22:25], v0, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000500: EBF71000 80011600
s_wqm_b64 exec, exec // 00000508: BEFE0A7E
s_and_b64 exec, exec, s[44:45] // 0000050C: 87FE2C7E
tbuffer_store_format_xyzw v[26:29], v71, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000510: EBF71000 80011A47
// Restore the entry exec mask and finish with a null export ("done"),
// which writes no colour data; then end the program.
v_mov_b32 v0, 0 // 00000518: 7E000280
s_mov_b64 exec, s[44:45] // 0000051C: BEFE042C
v_nop // 00000520: 7E000000
v_nop // 00000524: 7E000000
exp null, off, off, off, off done vm // 00000528: F8001890 00000000
s_endpgm // 00000530: BF810000
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment