Skip to content

Instantly share code, notes, and snippets.

@dadeba
Created January 18, 2012 11:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dadeba/1632529 to your computer and use it in GitHub Desktop.
Save dadeba/1632529 to your computer and use it in GitHub Desktop.
Tahiti ISA for 4x4 DGEMM kernel generated from our IL DGEMM code
shader main
asic(SI_ASIC)
type(PS)
// Prologue of the pixel shader: saves the execution mask, derives two texture
// coordinates from v2/v3, fetches the resource descriptors and zeroes the
// accumulators used by the loop that follows.
// The trailing comment on every instruction is "<byte offset>: <encoding>".
// s_ps_state in s0
// Keep the incoming exec mask in s[44:45], then enable whole-quad mode. The saved
// mask is ANDed back into exec ahead of stores further down and restored before
// the final export.
s_mov_b64 s[44:45], exec // 00000000: BEAC047E
s_wqm_b64 exec, exec // 00000004: BEFE0A7E
// v0 = floor(v2), v1 = floor(v3).
// NOTE(review): v2/v3 are presumably the interpolated pixel x/y inputs - confirm
// against the PS input setup, which is not visible in this listing.
v_floor_f32 v0, v2 // 00000008: 7E004902
v_floor_f32 v1, v3 // 0000000C: 7E024903
// v55 = 2.0 * v0 and v3 = 2.0 * v1. In the loop body v3 is the first coordinate of
// the samples from image s[16:23] and v55 the first coordinate of the samples
// from image s[32:39].
v_mul_legacy_f32 v55, 2.0, v0 // 00000010: 0E6E00F4
v_mul_legacy_f32 v3, 2.0, v1 // 00000014: 0E0602F4
// Descriptor fetches. Roles are taken from how each range is used later on:
//   s[12:15] - descriptor read with s_buffer_load_dword (loop bound, output pitch)
//   s[16:23] - first image resource, sampled with samplers s[24:27] and s[28:31]
//   s[32:39] - second image resource, sampled with samplers s[0:3] and s[40:43]
// The load into s[0:3] overwrites the s[2:3] pointer; it is not used as a pointer
// again after the two s_load_dwordx8 instructions that read through it.
s_load_dwordx4 s[12:15], s[10:11], 0x00 // 00000018: C0860B00
s_load_dwordx8 s[16:23], s[2:3], 0x00 // 0000001C: C0C80300
s_load_dwordx4 s[24:27], s[8:9], 0x00 // 00000020: C08C0900
s_load_dwordx4 s[28:31], s[8:9], 0x04 // 00000024: C08E0904
s_load_dwordx8 s[32:39], s[2:3], 0x08 // 00000028: C0D00308
s_load_dwordx4 s[0:3], s[8:9], 0x08 // 0000002C: C0800908
s_load_dwordx4 s[40:43], s[8:9], 0x0c // 00000030: C094090C
// Loop counter. It starts at -2.0 because the loop head adds 2.0 before the first
// comparison, so the first iteration runs with v4 == 0.0.
v_mov_b32 v4, -2.0 // 00000034: 7E0802F5
// Zero the 16 double-precision accumulators (32 VGPRs, one 4x4 result tile).
// v[57:66] hold five of them as register pairs ...
v_mov_b32 v57, 0 // 00000038: 7E720280
v_mov_b32 v58, 0 // 0000003C: 7E740280
v_mov_b32 v59, 0 // 00000040: 7E760280
v_mov_b32 v60, 0 // 00000044: 7E780280
v_mov_b32 v61, 0 // 00000048: 7E7A0280
v_mov_b32 v62, 0 // 0000004C: 7E7C0280
v_mov_b32 v63, 0 // 00000050: 7E7E0280
v_mov_b32 v64, 0 // 00000054: 7E800280
v_mov_b32 v65, 0 // 00000058: 7E820280
v_mov_b32 v66, 0 // 0000005C: 7E840280
// ... and v[15:36] hold the remaining eleven.
v_mov_b32 v15, 0 // 00000060: 7E1E0280
v_mov_b32 v16, 0 // 00000064: 7E200280
v_mov_b32 v17, 0 // 00000068: 7E220280
v_mov_b32 v18, 0 // 0000006C: 7E240280
v_mov_b32 v19, 0 // 00000070: 7E260280
v_mov_b32 v20, 0 // 00000074: 7E280280
v_mov_b32 v21, 0 // 00000078: 7E2A0280
v_mov_b32 v22, 0 // 0000007C: 7E2C0280
v_mov_b32 v23, 0 // 00000080: 7E2E0280
v_mov_b32 v24, 0 // 00000084: 7E300280
v_mov_b32 v25, 0 // 00000088: 7E320280
v_mov_b32 v26, 0 // 0000008C: 7E340280
v_mov_b32 v27, 0 // 00000090: 7E360280
v_mov_b32 v28, 0 // 00000094: 7E380280
v_mov_b32 v29, 0 // 00000098: 7E3A0280
v_mov_b32 v30, 0 // 0000009C: 7E3C0280
v_mov_b32 v31, 0 // 000000A0: 7E3E0280
v_mov_b32 v32, 0 // 000000A4: 7E400280
v_mov_b32 v33, 0 // 000000A8: 7E420280
v_mov_b32 v34, 0 // 000000AC: 7E440280
v_mov_b32 v35, 0 // 000000B0: 7E460280
v_mov_b32 v36, 0 // 000000B4: 7E480280
// Loop head: advance the counter in v4 and compare it with the bound read from the
// buffer described by s[12:15]. Falls through to the body or branches to the epilogue.
label_002E:
// All-zero immediate: wait until no memory or export operation is outstanding.
// On first entry this covers the s_load_* descriptor fetches of the prologue.
s_waitcnt 0x0000 // 000000B8: BF8C0000
// v4 += 2.0
v_add_f32 v4, 2.0, v4 // 000000BC: 060808F4
// s8 = value at offset 0x01 of buffer s[12:15]; it is re-read on every iteration.
// NOTE(review): presumably the inner (K) dimension stored as a float - confirm
// with the host code that fills this buffer.
s_buffer_load_dword s8, s[12:15], 0x01 // 000000C0: C2040D01
s_waitcnt lgkmcnt(0) // 000000C4: BF8C007F
v_mov_b32 v37, s8 // 000000C8: 7E4A0208
// vcc = per-lane (v4 == v37). This is an exact float equality test on a counter
// that takes the values 0.0, 2.0, 4.0, ...; the loop ends only when the bound is
// hit exactly.
v_cmp_eq_f32 vcc, v4, v37 // 000000CC: 7C044B04
// vcc all zero (no lane reached the bound): skip the next branch and run the body.
s_cbranch_vccz label_0036 // 000000D0: BF860001
// Otherwise leave the loop.
s_branch label_00A0 // 000000D4: BF82006A
// Loop body: two rounds of "sample 4 values from each image, then accumulate their
// 4x4 outer product into the 16 double-precision accumulators" per iteration.
// Each image_sample returns four dwords, consumed below as two doubles.
// NOTE(review): image_sample_o takes a packed texel offset in its first address
// VGPR; with the usual SI packing (x in bits 5:0, y in bits 13:8) the constants
// 1, 0x100 and 0x101 mean (+1,0), (0,+1) and (+1,+1) - confirm against the ISA
// manual. Which image holds which input matrix cannot be told from this listing.
label_0036:
// Address operands for the first round: offset 1 followed by (v3, v4) for image
// s[16:23] and offset 1 followed by (v55, v4) for image s[32:39]; v56 = v4
// completes the (v55, v56) coordinate pair of the plain image_sample below.
v_mov_b32 v39, 1 // 000000D8: 7E4E0281
v_mov_b32 v40, v3 // 000000DC: 7E500303
v_mov_b32 v41, v4 // 000000E0: 7E520304
v_mov_b32 v56, v4 // 000000E4: 7E700304
v_mov_b32 v44, 1 // 000000E8: 7E580281
v_mov_b32 v45, v55 // 000000EC: 7E5A0337
v_mov_b32 v46, v4 // 000000F0: 7E5C0304
// Four fetches: v[47:50] and v[37:40] from the first image, v[51:54] and v[41:44]
// from the second image.
image_sample v[47:50], v[3:6], s[16:23], s[24:27] dmask:0xf unorm // 000000F4: F0801F00 00C42F03
image_sample_o v[37:40], v[39:42], s[16:23], s[28:31] dmask:0xf unorm // 000000FC: F0C01F00 00E42527
image_sample v[51:54], v[55:58], s[32:39], s[0:3] dmask:0xf unorm // 00000104: F0801F00 00083337
image_sample_o v[41:44], v[44:47], s[32:39], s[40:43] dmask:0xf unorm // 0000010C: F0C01F00 0148292C
// At most one fetch still in flight: the first three results are available, which
// is enough for the two FMAs that only need v[47:48] and v[51:54].
s_waitcnt vmcnt(1) // 00000114: BF8C1F71
// The first five accumulators are read from v[57:66] and the updated values are
// written to the temporaries v[5:14]; the second round moves them back.
v_fma_f64 v[5:6], v[47:48], v[51:52], v[57:58] // 00000118: D2980005 04E6672F
v_fma_f64 v[7:8], v[47:48], v[53:54], v[59:60] // 00000120: D2980007 04EE6B2F
// The last fetch (v[41:44]) must have returned before it is used.
s_waitcnt vmcnt(0) // 00000128: BF8C1F70
v_fma_f64 v[9:10], v[47:48], v[41:42], v[61:62] // 0000012C: D2980009 04F6532F
v_fma_f64 v[11:12], v[47:48], v[43:44], v[63:64] // 00000134: D298000B 04FE572F
v_fma_f64 v[13:14], v[49:50], v[51:52], v[65:66] // 0000013C: D298000D 05066731
// The remaining eleven accumulators are updated in place in v[15:36].
v_fma_f64 v[15:16], v[49:50], v[53:54], v[15:16] // 00000144: D298000F 043E6B31
v_fma_f64 v[17:18], v[49:50], v[41:42], v[17:18] // 0000014C: D2980011 04465331
v_fma_f64 v[19:20], v[49:50], v[43:44], v[19:20] // 00000154: D2980013 044E5731
v_fma_f64 v[21:22], v[37:38], v[51:52], v[21:22] // 0000015C: D2980015 04566725
v_fma_f64 v[23:24], v[37:38], v[53:54], v[23:24] // 00000164: D2980017 045E6B25
v_fma_f64 v[25:26], v[37:38], v[41:42], v[25:26] // 0000016C: D2980019 04665325
v_fma_f64 v[27:28], v[37:38], v[43:44], v[27:28] // 00000174: D298001B 046E5725
v_fma_f64 v[29:30], v[39:40], v[51:52], v[29:30] // 0000017C: D298001D 04766727
v_fma_f64 v[31:32], v[39:40], v[53:54], v[31:32] // 00000184: D298001F 047E6B27
v_fma_f64 v[33:34], v[39:40], v[41:42], v[33:34] // 0000018C: D2980021 04865327
v_fma_f64 v[35:36], v[39:40], v[43:44], v[35:36] // 00000194: D2980023 048E5727
// Address operands for the second round: the same (v3, v4) / (v55, v4) coordinates,
// now with packed offsets 0x100 and 0x101 in the leading VGPR of each operand.
v_mov_b32 v2, 0x00000100 // 0000019C: 7E0402FF 00000100
v_mov_b32 v40, 0x00000101 // 000001A4: 7E5002FF 00000101
v_mov_b32 v41, v3 // 000001AC: 7E520303
v_mov_b32 v42, v4 // 000001B0: 7E540304
v_mov_b32 v43, 0x00000100 // 000001B4: 7E5602FF 00000100
v_mov_b32 v44, v55 // 000001BC: 7E580337
v_mov_b32 v45, v4 // 000001C0: 7E5A0304
v_mov_b32 v46, 0x00000101 // 000001C4: 7E5C02FF 00000101
v_mov_b32 v47, v55 // 000001CC: 7E5E0337
v_mov_b32 v48, v4 // 000001D0: 7E600304
// Four fetches: v[49:52] and v[37:40] from the first image, v[41:44] and v[45:48]
// from the second image. Each destination overlaps the address range of a later
// fetch in this group, so the instruction order here must not be changed.
image_sample_o v[49:52], v[2:5], s[16:23], s[24:27] dmask:0xf unorm // 000001D4: F0C01F00 00C43102
image_sample_o v[37:40], v[40:43], s[16:23], s[28:31] dmask:0xf unorm // 000001DC: F0C01F00 00E42528
image_sample_o v[41:44], v[43:46], s[32:39], s[0:3] dmask:0xf unorm // 000001E4: F0C01F00 0008292B
image_sample_o v[45:48], v[46:49], s[32:39], s[40:43] dmask:0xf unorm // 000001EC: F0C01F00 01482D2E
// First three results are in; v[45:48] may still be in flight.
s_waitcnt vmcnt(1) // 000001F4: BF8C1F71
// The temporaries v[5:14] are folded back into the accumulators v[57:66].
v_fma_f64 v[57:58], v[49:50], v[41:42], v[5:6] // 000001F8: D2980039 04165331
v_fma_f64 v[59:60], v[49:50], v[43:44], v[7:8] // 00000200: D298003B 041E5731
s_waitcnt vmcnt(0) // 00000208: BF8C1F70
v_fma_f64 v[61:62], v[49:50], v[45:46], v[9:10] // 0000020C: D298003D 04265B31
v_fma_f64 v[63:64], v[49:50], v[47:48], v[11:12] // 00000214: D298003F 042E5F31
v_fma_f64 v[65:66], v[51:52], v[41:42], v[13:14] // 0000021C: D2980041 04365333
v_fma_f64 v[15:16], v[51:52], v[43:44], v[15:16] // 00000224: D298000F 043E5733
v_fma_f64 v[17:18], v[51:52], v[45:46], v[17:18] // 0000022C: D2980011 04465B33
v_fma_f64 v[19:20], v[51:52], v[47:48], v[19:20] // 00000234: D2980013 044E5F33
v_fma_f64 v[21:22], v[37:38], v[41:42], v[21:22] // 0000023C: D2980015 04565325
v_fma_f64 v[23:24], v[37:38], v[43:44], v[23:24] // 00000244: D2980017 045E5725
v_fma_f64 v[25:26], v[37:38], v[45:46], v[25:26] // 0000024C: D2980019 04665B25
v_fma_f64 v[27:28], v[37:38], v[47:48], v[27:28] // 00000254: D298001B 046E5F25
v_fma_f64 v[29:30], v[39:40], v[41:42], v[29:30] // 0000025C: D298001D 04765327
v_fma_f64 v[31:32], v[39:40], v[43:44], v[31:32] // 00000264: D298001F 047E5727
v_fma_f64 v[33:34], v[39:40], v[45:46], v[33:34] // 0000026C: D2980021 04865B27
v_fma_f64 v[35:36], v[39:40], v[47:48], v[35:36] // 00000274: D2980023 048E5F27
// Back to the loop head for the counter update and exit test.
s_branch label_002E // 0000027C: BF82FF8E
// Epilogue: combine the 16 accumulators with the values already present in the
// output buffer s[4:7] and write the results back, two doubles per 16-byte element.
// Each result is computed as  s[10:11] * accumulator + s[8:9] * previous value.
// NOTE(review): this matches the alpha*A*B + beta*C form of DGEMM with
// s[10:11] = alpha and s[8:9] = beta, but the listing itself does not name them.
// Every tbuffer_* instruction is kept on a single line together with its
// "<byte offset>: <encoding>" comment; instructions and encodings are as emitted
// by the disassembler.
label_00A0:
// v1 = 4.0 * floor(v3 at entry); scaled by the pitch in s12 further down.
v_mul_legacy_f32 v1, 4.0, v1 // 00000280: 0E0202F6
// Slide the eleven accumulators v[15:36] down by three registers to v[12:33].
// Every destination index is lower than its source, so the ascending order of
// these moves never overwrites a value that is still to be copied.
v_mov_b32 v12, v15 // 00000284: 7E18030F
v_mov_b32 v13, v16 // 00000288: 7E1A0310
v_mov_b32 v14, v17 // 0000028C: 7E1C0311
v_mov_b32 v15, v18 // 00000290: 7E1E0312
v_mov_b32 v16, v19 // 00000294: 7E200313
v_mov_b32 v17, v20 // 00000298: 7E220314
v_mov_b32 v18, v21 // 0000029C: 7E240315
v_mov_b32 v19, v22 // 000002A0: 7E260316
v_mov_b32 v20, v23 // 000002A4: 7E280317
v_mov_b32 v21, v24 // 000002A8: 7E2A0318
v_mov_b32 v22, v25 // 000002AC: 7E2C0319
v_mov_b32 v23, v26 // 000002B0: 7E2E031A
v_mov_b32 v24, v27 // 000002B4: 7E30031B
v_mov_b32 v25, v28 // 000002B8: 7E32031C
v_mov_b32 v26, v29 // 000002BC: 7E34031D
v_mov_b32 v27, v30 // 000002C0: 7E36031E
v_mov_b32 v28, v31 // 000002C4: 7E38031F
v_mov_b32 v29, v32 // 000002C8: 7E3A0320
v_mov_b32 v30, v33 // 000002CC: 7E3C0321
v_mov_b32 v31, v34 // 000002D0: 7E3E0322
v_mov_b32 v32, v35 // 000002D4: 7E400323
v_mov_b32 v33, v36 // 000002D8: 7E420324
// Two more descriptors through the pointer in s[10:11] (the second load replaces
// that pointer with its own result) and the dword at offset 0x00 of buffer s[12:15].
s_load_dwordx4 s[0:3], s[10:11], 0x04 // 000002DC: C0800B04
s_load_dwordx4 s[8:11], s[10:11], 0x08 // 000002E0: C0840B08
s_buffer_load_dword s12, s[12:15], 0x00 // 000002E4: C2060D00
s_waitcnt lgkmcnt(0) // 000002E8: BF8C007F
// v0 = int( s12 * 4.0 * floor(v3 at entry) + 2.0 * floor(v2 at entry) ):
// index of this lane's first output element, in 16-byte units.
v_mul_legacy_f32 v1, s12, v1 // 000002EC: 0E02020C
v_mac_legacy_f32 v1, 2.0, v0 // 000002F0: 0C0200F4
v_cvt_i32_f32 v0, v1 // 000002F4: 7E001101
// Replace each descriptor by the first four dwords of the buffer it describes:
// s0..s3 are four integer index offsets, s[8:9] and s[10:11] the two double factors.
s_buffer_load_dwordx4 s[0:3], s[0:3], 0x00 // 000002F8: C2800100
s_buffer_load_dwordx4 s[8:11], s[8:11], 0x00 // 000002FC: C2840900
s_waitcnt lgkmcnt(0) // 00000300: BF8C007F
// Four element indices (v1, v34, v35, v0), one per s0..s3 offset.
v_add_i32 v1, vcc, s0, v0 // 00000304: 4A020000
v_add_i32 v34, vcc, s1, v0 // 00000308: 4A440001
v_add_i32 v35, vcc, s2, v0 // 0000030C: 4A460002
v_add_i32 v0, vcc, s3, v0 // 00000310: 4A000003
// Index -> byte offset (x16).
v_lshlrev_b32 v1, 4, v1 // 00000314: 34020284
v_lshlrev_b32 v34, 4, v34 // 00000318: 34444484
// Read the previous contents of the first four output elements (v1, v1+16, v34, v34+16).
tbuffer_load_format_xyzw v[36:39], v1, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 0000031C: EBF31000 80012401
tbuffer_load_format_xyzw v[40:43], v1, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000324: EBF31010 80012801
tbuffer_load_format_xyzw v[44:47], v34, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 0000032C: EBF31000 80012C22
tbuffer_load_format_xyzw v[48:51], v34, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000334: EBF31010 80013022
v_lshlrev_b32 v35, 4, v35 // 0000033C: 34464684
v_lshlrev_b32 v0, 4, v0 // 00000340: 34000084
// Four loads issued, three may stay outstanding: the oldest (v[36:39]) is needed now.
s_waitcnt vmcnt(3) // 00000344: BF8C1F73
// Scale the previous values by s[8:9] as each load is consumed.
v_mul_f64 v[36:37], v[36:37], s[8:9] // 00000348: D2CA0024 02001124
v_mul_f64 v[38:39], v[38:39], s[8:9] // 00000350: D2CA0026 02001126
// Fifth element (v35), issued while the earlier loads are still being consumed.
tbuffer_load_format_xyzw v[52:55], v35, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000358: EBF31000 80013423
// The s_waitcnt limits step down as each older load is consumed in turn.
s_waitcnt vmcnt(3) // 00000360: BF8C1F73
v_mul_f64 v[40:41], v[40:41], s[8:9] // 00000364: D2CA0028 02001128
v_mul_f64 v[42:43], v[42:43], s[8:9] // 0000036C: D2CA002A 0200112A
s_waitcnt vmcnt(2) // 00000374: BF8C1F72
v_mul_f64 v[44:45], v[44:45], s[8:9] // 00000378: D2CA002C 0200112C
v_mul_f64 v[46:47], v[46:47], s[8:9] // 00000380: D2CA002E 0200112E
s_waitcnt vmcnt(1) // 00000388: BF8C1F71
v_mul_f64 v[48:49], v[48:49], s[8:9] // 0000038C: D2CA0030 02001130
v_mul_f64 v[50:51], v[50:51], s[8:9] // 00000394: D2CA0032 02001132
s_waitcnt vmcnt(0) // 0000039C: BF8C1F70
v_mul_f64 v[52:53], v[52:53], s[8:9] // 000003A0: D2CA0034 02001134
v_mul_f64 v[54:55], v[54:55], s[8:9] // 000003A8: D2CA0036 02001136
// First ten results into v[2:21]: s[10:11] * accumulator + scaled previous value.
// From v[12:13] onward the accumulator is overwritten in place by its result.
v_fma_f64 v[2:3], s[10:11], v[57:58], v[36:37] // 000003B0: D2980002 0492720A
v_fma_f64 v[4:5], s[10:11], v[59:60], v[38:39] // 000003B8: D2980004 049A760A
v_fma_f64 v[6:7], s[10:11], v[61:62], v[40:41] // 000003C0: D2980006 04A27A0A
v_fma_f64 v[8:9], s[10:11], v[63:64], v[42:43] // 000003C8: D2980008 04AA7E0A
v_fma_f64 v[10:11], s[10:11], v[65:66], v[44:45] // 000003D0: D298000A 04B2820A
v_fma_f64 v[12:13], s[10:11], v[12:13], v[46:47] // 000003D8: D298000C 04BA180A
v_fma_f64 v[14:15], s[10:11], v[14:15], v[48:49] // 000003E0: D298000E 04C21C0A
v_fma_f64 v[16:17], s[10:11], v[16:17], v[50:51] // 000003E8: D2980010 04CA200A
v_fma_f64 v[18:19], s[10:11], v[18:19], v[52:53] // 000003F0: D2980012 04D2240A
v_fma_f64 v[20:21], s[10:11], v[20:21], v[54:55] // 000003F8: D2980014 04DA280A
// Previous contents of the last three output elements (v35+16, v0, v0+16).
tbuffer_load_format_xyzw v[36:39], v35, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000400: EBF31010 80012423
tbuffer_load_format_xyzw v[40:43], v0, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000408: EBF31000 80012800
tbuffer_load_format_xyzw v[44:47], v0, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000410: EBF31010 80012C00
// Store the first five elements. exec is narrowed to the mask saved at shader
// entry (s[44:45]) for a store and widened to whole-quad mode again afterwards.
// NOTE(review): the stores at 0x428 and 0x440 are issued right after s_wqm_b64
// with no s_and_b64 in between, i.e. with the whole-quad mask still active -
// confirm that this is intended by the code generator.
s_and_b64 exec, exec, s[44:45] // 00000418: 87FE2C7E
tbuffer_store_format_xyzw v[2:5], v1, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 0000041C: EBF71000 80010201
s_wqm_b64 exec, exec // 00000424: BEFE0A7E
tbuffer_store_format_xyzw v[6:9], v1, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000428: EBF71010 80010601
s_and_b64 exec, exec, s[44:45] // 00000430: 87FE2C7E
tbuffer_store_format_xyzw v[10:13], v34, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000434: EBF71000 80010A22
s_wqm_b64 exec, exec // 0000043C: BEFE0A7E
tbuffer_store_format_xyzw v[14:17], v34, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 00000440: EBF71010 80010E22
s_and_b64 exec, exec, s[44:45] // 00000448: 87FE2C7E
tbuffer_store_format_xyzw v[18:21], v35, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 0000044C: EBF71000 80011223
s_wqm_b64 exec, exec // 00000454: BEFE0A7E
// Eight vector memory operations (3 loads, then 5 stores) have been issued since
// the last vmcnt(0). vmcnt steps 7 -> 6 -> 5 as each of the three loads is
// consumed; expcnt steps 4 -> 3 -> 2 -> 1 so that v[1:16] are only reused after
// the stores issued above have read their data out of those registers.
s_waitcnt vmcnt(7) & expcnt(4) // 00000458: BF8C1F47
v_mul_f64 v[1:2], v[36:37], s[8:9] // 0000045C: D2CA0001 02001124
v_mul_f64 v[3:4], v[38:39], s[8:9] // 00000464: D2CA0003 02001126
s_waitcnt vmcnt(6) & expcnt(3) // 0000046C: BF8C1F36
v_mul_f64 v[5:6], v[40:41], s[8:9] // 00000470: D2CA0005 02001128
v_mul_f64 v[7:8], v[42:43], s[8:9] // 00000478: D2CA0007 0200112A
s_waitcnt vmcnt(5) & expcnt(2) // 00000480: BF8C1F25
v_mul_f64 v[9:10], v[44:45], s[8:9] // 00000484: D2CA0009 0200112C
v_mul_f64 v[11:12], v[46:47], s[8:9] // 0000048C: D2CA000B 0200112E
// Last six results, from the accumulators still held in v[22:33].
v_fma_f64 v[1:2], s[10:11], v[22:23], v[1:2] // 00000494: D2980001 04062C0A
v_fma_f64 v[3:4], s[10:11], v[24:25], v[3:4] // 0000049C: D2980003 040E300A
s_waitcnt expcnt(1) // 000004A4: BF8C1F1F
v_fma_f64 v[13:14], s[10:11], v[26:27], v[5:6] // 000004A8: D298000D 0416340A
v_fma_f64 v[15:16], s[10:11], v[28:29], v[7:8] // 000004B0: D298000F 041E380A
v_fma_f64 v[5:6], s[10:11], v[30:31], v[9:10] // 000004B8: D2980005 04263C0A
v_fma_f64 v[7:8], s[10:11], v[32:33], v[11:12] // 000004C0: D2980007 042E400A
// Store the last three elements, each one under the saved entry mask.
s_and_b64 exec, exec, s[44:45] // 000004C8: 87FE2C7E
tbuffer_store_format_xyzw v[1:4], v35, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 000004CC: EBF71010 80010123
s_wqm_b64 exec, exec // 000004D4: BEFE0A7E
s_and_b64 exec, exec, s[44:45] // 000004D8: 87FE2C7E
tbuffer_store_format_xyzw v[13:16], v0, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 000004DC: EBF71000 80010D00
s_wqm_b64 exec, exec // 000004E4: BEFE0A7E
s_and_b64 exec, exec, s[44:45] // 000004E8: 87FE2C7E
tbuffer_store_format_xyzw v[5:8], v0, s[4:7], 0 offen offset:16 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] // 000004EC: EBF71010 80010500
// Shader exit: put back the exec mask saved at entry, issue a null export with
// the "done" and "vm" flags set (no colour output) and end the program.
v_mov_b32 v0, 0 // 000004F4: 7E000280
s_mov_b64 exec, s[44:45] // 000004F8: BEFE042C
s_waitcnt expcnt(2) // 000004FC: BF8C1F2F
v_nop // 00000500: 7E000000
v_nop // 00000504: 7E000000
exp null, off, off, off, off done vm // 00000508: F8001890 00000000
s_endpgm // 00000510: BF810000
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment