Skip to content

Instantly share code, notes, and snippets.

@Jokeren
Created February 28, 2023 03:29
Show Gist options
  • Save Jokeren/6da57bfcc8579931f0418917cafc4e12 to your computer and use it in GitHub Desktop.
bug.ptx
//
// Generated by LLVM NVPTX Back-End
//
.version 8.0
.target sm_80
.address_size 64
// .globl triton__0d1d2d3d
// Kernel triton__0d1d2d3d
//
// Per thread, this kernel handles 8 consecutive element indices
//   idx_k = (ctaid.x << 10) | ((tid.x << 3) & 1016) | k,   k = 0..7
// and for each of them:
//   1. loads a 32-bit value v_k from param_0 at element offset (idx_k % 12),
//   2. takes a 16-bit value h_k from param_1 at element offset idx_k
//      (all eight are fetched with a single 128-bit vector load),
//   3. forms the destination element offset
//        dst_k = v_k * 12 + (idx_k / 144) * 144 + ((idx_k / 12) % 12)
//   4. issues a predicated f16 atomic add of h_k to param_2[dst_k].
//
// Parameters:
//   param_0 (.u64)  base address read with 32-bit global loads
//   param_1 (.u64)  base address read as 16-bit elements
//   param_2 (.u64)  base address updated with 16-bit (f16) atomic adds
//   param_3 (.u32)  declared, but never loaded anywhere in this body
.visible .entry triton__0d1d2d3d(
.param .u64 triton__0d1d2d3d_param_0,
.param .u64 triton__0d1d2d3d_param_1,
.param .u64 triton__0d1d2d3d_param_2,
.param .u32 triton__0d1d2d3d_param_3
)
.maxntid 128, 1, 1
{
.reg .pred %p<18>;
.reg .b16 %rs<17>;
.reg .b32 %r<178>;
.reg .b64 %rd<38>;
// ---- Parameters and base index -------------------------------------------
// %rd18 = param_0, %rd19 = param_1, %rd20 = param_2.
// %r14  = tid.x * 8 (CTA-local element offset; reused later for the atomic
//         predicates).
// %r18  = (ctaid.x << 10) | (%r14 & 1016): first of this thread's 8 indices.
ld.param.u64 %rd18, [triton__0d1d2d3d_param_0];
ld.param.u64 %rd19, [triton__0d1d2d3d_param_1];
mov.u32 %r13, %tid.x;
shl.b32 %r14, %r13, 3;
ld.param.u64 %rd20, [triton__0d1d2d3d_param_2];
and.b32 %r15, %r14, 1016;
mov.u32 %r16, %ctaid.x;
shl.b32 %r17, %r16, 10;
or.b32 %r18, %r15, %r17;
// %r20..%r26 = idx_1..idx_7. The low three bits of %r18 are zero (1016 and
// the <<10 both clear them), so OR-ing in 1..7 is the same as adding 1..7.
or.b32 %r20, %r18, 1;
or.b32 %r21, %r18, 2;
or.b32 %r22, %r18, 3;
or.b32 %r23, %r18, 4;
or.b32 %r24, %r18, 5;
or.b32 %r25, %r18, 6;
or.b32 %r26, %r18, 7;
// ---- idx_k / 12 and idx_k % 12 for each of the 8 lanes ---------------------
// Signed division by the constant 12 without a divide instruction:
// multiply-high by 715827883 (= ceil(2^33 / 12)), arithmetic shift right by 1,
// then add the sign bit of the high product as a rounding correction.
// The remainder is idx_k - quotient * 12.
// lane 0: %r30 = idx_0 / 12, %r32 = idx_0 % 12
mul.hi.s32 %r27, %r18, 715827883;
shr.u32 %r28, %r27, 31;
shr.s32 %r29, %r27, 1;
add.s32 %r30, %r29, %r28;
mul.lo.s32 %r31, %r30, 12;
sub.s32 %r32, %r18, %r31;
// lane 1: %r36 = idx_1 / 12, %r38 = idx_1 % 12
mul.hi.s32 %r33, %r20, 715827883;
shr.u32 %r34, %r33, 31;
shr.s32 %r35, %r33, 1;
add.s32 %r36, %r35, %r34;
mul.lo.s32 %r37, %r36, 12;
sub.s32 %r38, %r20, %r37;
// lane 2: %r42 = idx_2 / 12, %r44 = idx_2 % 12
mul.hi.s32 %r39, %r21, 715827883;
shr.u32 %r40, %r39, 31;
shr.s32 %r41, %r39, 1;
add.s32 %r42, %r41, %r40;
mul.lo.s32 %r43, %r42, 12;
sub.s32 %r44, %r21, %r43;
// lane 3: %r48 = idx_3 / 12, %r50 = idx_3 % 12
mul.hi.s32 %r45, %r22, 715827883;
shr.u32 %r46, %r45, 31;
shr.s32 %r47, %r45, 1;
add.s32 %r48, %r47, %r46;
mul.lo.s32 %r49, %r48, 12;
sub.s32 %r50, %r22, %r49;
// lane 4: %r54 = idx_4 / 12, %r56 = idx_4 % 12
mul.hi.s32 %r51, %r23, 715827883;
shr.u32 %r52, %r51, 31;
shr.s32 %r53, %r51, 1;
add.s32 %r54, %r53, %r52;
mul.lo.s32 %r55, %r54, 12;
sub.s32 %r56, %r23, %r55;
// lane 5: %r60 = idx_5 / 12, %r62 = idx_5 % 12
mul.hi.s32 %r57, %r24, 715827883;
shr.u32 %r58, %r57, 31;
shr.s32 %r59, %r57, 1;
add.s32 %r60, %r59, %r58;
mul.lo.s32 %r61, %r60, 12;
sub.s32 %r62, %r24, %r61;
// lane 6: %r66 = idx_6 / 12, %r68 = idx_6 % 12
mul.hi.s32 %r63, %r25, 715827883;
shr.u32 %r64, %r63, 31;
shr.s32 %r65, %r63, 1;
add.s32 %r66, %r65, %r64;
mul.lo.s32 %r67, %r66, 12;
sub.s32 %r68, %r25, %r67;
// lane 7: %r72 = idx_7 / 12, %r74 = idx_7 % 12
mul.hi.s32 %r69, %r26, 715827883;
shr.u32 %r70, %r69, 31;
shr.s32 %r71, %r69, 1;
add.s32 %r72, %r71, %r70;
mul.lo.s32 %r73, %r72, 12;
sub.s32 %r74, %r26, %r73;
// ---- Addresses into param_0: %rd1..%rd8 = param_0 + (idx_k % 12) * 4 -------
mul.wide.s32 %rd21, %r32, 4;
add.s64 %rd1, %rd18, %rd21;
mul.wide.s32 %rd22, %r38, 4;
add.s64 %rd2, %rd18, %rd22;
mul.wide.s32 %rd23, %r44, 4;
add.s64 %rd3, %rd18, %rd23;
mul.wide.s32 %rd24, %r50, 4;
add.s64 %rd4, %rd18, %rd24;
mul.wide.s32 %rd25, %r56, 4;
add.s64 %rd5, %rd18, %rd25;
mul.wide.s32 %rd26, %r62, 4;
add.s64 %rd6, %rd18, %rd26;
mul.wide.s32 %rd27, %r68, 4;
add.s64 %rd7, %rd18, %rd27;
mul.wide.s32 %rd28, %r74, 4;
add.s64 %rd8, %rd18, %rd28;
// %p1 is set to constant true, so the loads below always execute; there is
// no bounds predicate on any of the reads.
mov.pred %p1, -1;
// %r1..%r8 = v_0..v_7, the 32-bit values gathered from param_0.
@%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
@%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ];
@%p1 ld.global.b32 { %r3 }, [ %rd3 + 0 ];
@%p1 ld.global.b32 { %r4 }, [ %rd4 + 0 ];
@%p1 ld.global.b32 { %r5 }, [ %rd5 + 0 ];
@%p1 ld.global.b32 { %r6 }, [ %rd6 + 0 ];
@%p1 ld.global.b32 { %r7 }, [ %rd7 + 0 ];
@%p1 ld.global.b32 { %r8 }, [ %rd8 + 0 ];
// ---- Load 8 x 16-bit values from param_1 with one 128-bit vector load ------
// %rd9 = param_1 + idx_0 * 2; %r9..%r12 each hold two packed 16-bit values.
mul.wide.s32 %rd29, %r18, 2;
add.s64 %rd9, %rd19, %rd29;
@%p1 ld.global.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd9 + 0 ];
// Unpack each 32-bit word: cvt.u16.u32 keeps the low half, and the
// mov.b32 {tmp, %rsN} unpack keeps the high half (tmp is a scoped scratch
// register that receives the low half and is discarded).
// Result: %rs2, %rs4, %rs6, ..., %rs16 = h_0..h_7 in element order.
cvt.u16.u32 %rs2, %r9;
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r9; }
cvt.u16.u32 %rs6, %r10;
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r10; }
cvt.u16.u32 %rs10, %r11;
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r11; }
cvt.u16.u32 %rs14, %r12;
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r12; }
// ---- (idx_k / 12) % 12 and idx_k / 144 for each lane -----------------------
// The first six instructions of each group repeat the divide-by-12 sequence
// on the earlier quotient. The last four divide idx_k by 144 using
// multiply-high by 954437177 (= ceil(2^37 / 144)), arithmetic shift right
// by 5, plus the sign-bit correction.
// lane 0: %r80 = (idx_0 / 12) % 12, %r84 = idx_0 / 144
mul.hi.s32 %r75, %r30, 715827883;
shr.u32 %r76, %r75, 31;
shr.s32 %r77, %r75, 1;
add.s32 %r78, %r77, %r76;
mul.lo.s32 %r79, %r78, 12;
sub.s32 %r80, %r30, %r79;
mul.hi.s32 %r81, %r18, 954437177;
shr.u32 %r82, %r81, 31;
shr.s32 %r83, %r81, 5;
add.s32 %r84, %r83, %r82;
// lane 1: %r90 = (idx_1 / 12) % 12, %r94 = idx_1 / 144
mul.hi.s32 %r85, %r36, 715827883;
shr.u32 %r86, %r85, 31;
shr.s32 %r87, %r85, 1;
add.s32 %r88, %r87, %r86;
mul.lo.s32 %r89, %r88, 12;
sub.s32 %r90, %r36, %r89;
mul.hi.s32 %r91, %r20, 954437177;
shr.u32 %r92, %r91, 31;
shr.s32 %r93, %r91, 5;
add.s32 %r94, %r93, %r92;
// lane 2: %r100 = (idx_2 / 12) % 12, %r104 = idx_2 / 144
mul.hi.s32 %r95, %r42, 715827883;
shr.u32 %r96, %r95, 31;
shr.s32 %r97, %r95, 1;
add.s32 %r98, %r97, %r96;
mul.lo.s32 %r99, %r98, 12;
sub.s32 %r100, %r42, %r99;
mul.hi.s32 %r101, %r21, 954437177;
shr.u32 %r102, %r101, 31;
shr.s32 %r103, %r101, 5;
add.s32 %r104, %r103, %r102;
// lane 3: %r110 = (idx_3 / 12) % 12, %r114 = idx_3 / 144
mul.hi.s32 %r105, %r48, 715827883;
shr.u32 %r106, %r105, 31;
shr.s32 %r107, %r105, 1;
add.s32 %r108, %r107, %r106;
mul.lo.s32 %r109, %r108, 12;
sub.s32 %r110, %r48, %r109;
mul.hi.s32 %r111, %r22, 954437177;
shr.u32 %r112, %r111, 31;
shr.s32 %r113, %r111, 5;
add.s32 %r114, %r113, %r112;
// lane 4: %r120 = (idx_4 / 12) % 12, %r124 = idx_4 / 144
mul.hi.s32 %r115, %r54, 715827883;
shr.u32 %r116, %r115, 31;
shr.s32 %r117, %r115, 1;
add.s32 %r118, %r117, %r116;
mul.lo.s32 %r119, %r118, 12;
sub.s32 %r120, %r54, %r119;
mul.hi.s32 %r121, %r23, 954437177;
shr.u32 %r122, %r121, 31;
shr.s32 %r123, %r121, 5;
add.s32 %r124, %r123, %r122;
// lane 5: %r130 = (idx_5 / 12) % 12, %r134 = idx_5 / 144
mul.hi.s32 %r125, %r60, 715827883;
shr.u32 %r126, %r125, 31;
shr.s32 %r127, %r125, 1;
add.s32 %r128, %r127, %r126;
mul.lo.s32 %r129, %r128, 12;
sub.s32 %r130, %r60, %r129;
mul.hi.s32 %r131, %r24, 954437177;
shr.u32 %r132, %r131, 31;
shr.s32 %r133, %r131, 5;
add.s32 %r134, %r133, %r132;
// lane 6: %r140 = (idx_6 / 12) % 12, %r144 = idx_6 / 144
mul.hi.s32 %r135, %r66, 715827883;
shr.u32 %r136, %r135, 31;
shr.s32 %r137, %r135, 1;
add.s32 %r138, %r137, %r136;
mul.lo.s32 %r139, %r138, 12;
sub.s32 %r140, %r66, %r139;
mul.hi.s32 %r141, %r25, 954437177;
shr.u32 %r142, %r141, 31;
shr.s32 %r143, %r141, 5;
add.s32 %r144, %r143, %r142;
// lane 7: %r150 = (idx_7 / 12) % 12, %r154 = idx_7 / 144
mul.hi.s32 %r145, %r72, 715827883;
shr.u32 %r146, %r145, 31;
shr.s32 %r147, %r145, 1;
add.s32 %r148, %r147, %r146;
mul.lo.s32 %r149, %r148, 12;
sub.s32 %r150, %r72, %r149;
mul.hi.s32 %r151, %r26, 954437177;
shr.u32 %r152, %r151, 31;
shr.s32 %r153, %r151, 5;
add.s32 %r154, %r153, %r152;
// ---- Destination element offsets -------------------------------------------
// For each lane (two mad.lo per lane):
//   dst_k = v_k * 12 + (idx_k / 144) * 144 + ((idx_k / 12) % 12)
// Results land in %r156, %r158, ..., %r170. v_k comes straight from memory
// and is not range-checked before being used as an index.
mad.lo.s32 %r155, %r84, 144, %r80;
mad.lo.s32 %r156, %r1, 12, %r155;
mad.lo.s32 %r157, %r94, 144, %r90;
mad.lo.s32 %r158, %r2, 12, %r157;
mad.lo.s32 %r159, %r104, 144, %r100;
mad.lo.s32 %r160, %r3, 12, %r159;
mad.lo.s32 %r161, %r114, 144, %r110;
mad.lo.s32 %r162, %r4, 12, %r161;
mad.lo.s32 %r163, %r124, 144, %r120;
mad.lo.s32 %r164, %r5, 12, %r163;
mad.lo.s32 %r165, %r134, 144, %r130;
mad.lo.s32 %r166, %r6, 12, %r165;
mad.lo.s32 %r167, %r144, 144, %r140;
mad.lo.s32 %r168, %r7, 12, %r167;
mad.lo.s32 %r169, %r154, 144, %r150;
mad.lo.s32 %r170, %r8, 12, %r169;
// %rd10..%rd17 = param_2 + dst_k * 2 (byte addresses of 16-bit elements).
mul.wide.s32 %rd30, %r156, 2;
add.s64 %rd10, %rd20, %rd30;
mul.wide.s32 %rd31, %r158, 2;
add.s64 %rd11, %rd20, %rd31;
mul.wide.s32 %rd32, %r160, 2;
add.s64 %rd12, %rd20, %rd32;
mul.wide.s32 %rd33, %r162, 2;
add.s64 %rd13, %rd20, %rd33;
mul.wide.s32 %rd34, %r164, 2;
add.s64 %rd14, %rd20, %rd34;
mul.wide.s32 %rd35, %r166, 2;
add.s64 %rd15, %rd20, %rd35;
mul.wide.s32 %rd36, %r168, 2;
add.s64 %rd16, %rd20, %rd36;
mul.wide.s32 %rd37, %r170, 2;
// ---- Predicated f16 atomic adds --------------------------------------------
// Each brace-delimited group computes a predicate and issues one
// atom.global.gpu.add.noftz.f16 to param_2[dst_k] with operand h_k.
// The values returned by the atomics (%rs1, %rs3, ..., %rs15) are not read
// again. The first group also contains the final address computation
// (%rd17), which is consumed by the last group.
//
// Predicate for lane k is ((tid.x << 3) | k) < 1. Given .maxntid 128 above,
// %r14 is in 0..1016, so %p10 is true only for tid.x == 0 and %p11..%p17
// are never true.
// NOTE(review): the predicates are derived from the CTA-local offset %r14,
// not from the global index %r18, so thread 0 of every CTA passes the lane-0
// test regardless of ctaid.x. param_3 is not consulted. Confirm against the
// originating Triton kernel whether this is the intended mask.
{add.s64 %rd17, %rd20, %rd37;
setp.lt.s32 %p10, %r14, 1;
@%p10 atom.global.gpu.add.noftz.f16 %rs1, [ %rd10 + 0 ], %rs2;}
{or.b32 %r171, %r14, 1;
setp.lt.s32 %p11, %r171, 1;
@%p11 atom.global.gpu.add.noftz.f16 %rs3, [ %rd11 + 0 ], %rs4;}
{or.b32 %r172, %r14, 2;
setp.lt.s32 %p12, %r172, 1;
@%p12 atom.global.gpu.add.noftz.f16 %rs5, [ %rd12 + 0 ], %rs6;}
{or.b32 %r173, %r14, 3;
setp.lt.s32 %p13, %r173, 1;
@%p13 atom.global.gpu.add.noftz.f16 %rs7, [ %rd13 + 0 ], %rs8;}
{or.b32 %r174, %r14, 4;
setp.lt.s32 %p14, %r174, 1;
@%p14 atom.global.gpu.add.noftz.f16 %rs9, [ %rd14 + 0 ], %rs10;}
{or.b32 %r175, %r14, 5;
setp.lt.s32 %p15, %r175, 1;
@%p15 atom.global.gpu.add.noftz.f16 %rs11, [ %rd15 + 0 ], %rs12;}
{or.b32 %r176, %r14, 6;
setp.lt.s32 %p16, %r176, 1;
@%p16 atom.global.gpu.add.noftz.f16 %rs13, [ %rd16 + 0 ], %rs14;}
{or.b32 %r177, %r14, 7;
setp.lt.s32 %p17, %r177, 1;
@%p17 atom.global.gpu.add.noftz.f16 %rs15, [ %rd17 + 0 ], %rs16;}
ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment