Skip to content

Instantly share code, notes, and snippets.

@abadams
Created October 20, 2017 21:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abadams/ff9c5a32ca2473760414c925048de100 to your computer and use it in GitHub Desktop.
Save abadams/ff9c5a32ca2473760414c925048de100 to your computer and use it in GitHub Desktop.
.version 4.0
.target sm_50
.address_size 64
// .globl kernel_out_s0_y_y___block_id_y // -- Begin function kernel_out_s0_y_y___block_id_y
// @kernel_out_s0_y_y___block_id_y
//
// Kernel entry point. Each thread keeps 48 f32 accumulators (8 groups x 3
// float pairs), runs 32 x 32 = 1024 multiply-accumulate steps over two input
// buffers, and finally stores the 48 sums to a third buffer.
//
// Parameters, as they are used in the body:
//   param_0 (u32): multiplied by 1024 and subtracted from the element index
//                  used to read the param_3 buffer.
//   param_1 (u32): multiplied by 1024 and subtracted from the element index
//                  used to read the param_4 buffer.
//   param_2 (u32): multiplied by 1024 and subtracted from the element index
//                  used to write the param_5 buffer.
//   param_3 (u64): address of an f32 buffer read with v2.f32 loads in the
//                  inner loop.
//   param_4 (u64): address of an f32 buffer read with scalar f32 loads at the
//                  top of the outer loop.
//   param_5 (u64): address of the f32 buffer written by the final stores.
//
// NOTE(review): the 1024-float stride and the "index - param * 1024" pattern
// look like three row-major buffers with a row length of 1024 floats and a
// per-buffer row-origin offset, i.e. a 1024-deep matrix-multiply tile --
// confirm against the pipeline that generated this kernel.
// NOTE(review): the lane shuffle in the inner loop walks source lanes 0..31
// and the param_4 index uses %tid.x directly, so this presumably expects a
// launch with 32 threads in x -- confirm at the launch site.
.visible .entry kernel_out_s0_y_y___block_id_y(
.param .u32 kernel_out_s0_y_y___block_id_y_param_0,
.param .u32 kernel_out_s0_y_y___block_id_y_param_1,
.param .u32 kernel_out_s0_y_y___block_id_y_param_2,
.param .u64 kernel_out_s0_y_y___block_id_y_param_3,
.param .u64 kernel_out_s0_y_y___block_id_y_param_4,
.param .u64 kernel_out_s0_y_y___block_id_y_param_5
)
{
// Virtual register declarations: predicates, f32 values, 32-bit and 64-bit
// integers.
.reg .pred %p<4>;
.reg .f32 %f<265>;
.reg .b32 %r<40>;
.reg .b64 %rd<13>;
// BB#0: // %entry
// Load the six parameters and convert the three buffer addresses from generic
// to global address space:
//   %rd1 = param_5 (store target), %rd2 = param_4, %rd3 = param_3.
ld.param.u32 %r13, [kernel_out_s0_y_y___block_id_y_param_2];
ld.param.u32 %r15, [kernel_out_s0_y_y___block_id_y_param_0];
ld.param.u64 %rd4, [kernel_out_s0_y_y___block_id_y_param_5];
cvta.to.global.u64 %rd1, %rd4;
ld.param.u32 %r16, [kernel_out_s0_y_y___block_id_y_param_1];
ld.param.u64 %rd5, [kernel_out_s0_y_y___block_id_y_param_4];
cvta.to.global.u64 %rd2, %rd5;
ld.param.u64 %rd6, [kernel_out_s0_y_y___block_id_y_param_3];
cvta.to.global.u64 %rd3, %rd6;
// Block and thread indices: %r17 = ctaid.y, %r18 = ctaid.x, %r19 = tid.x.
mov.u32 %r17, %ctaid.y;
mov.u32 %r18, %ctaid.x;
mov.u32 %r19, %tid.x;
// %r20 = 2 * tid.x: each thread works on pairs of adjacent floats.
shl.b32 %r20, %r19, 1;
// %r26 = (ctaid.x * 192 < 832) ? ctaid.x * 192 : 832, using an unsigned
// compare and a branch-free mask select (%r22 is all-ones or zero).
mul.lo.s32 %r21, %r18, 192;
setp.lt.u32 %p1, %r21, 832;
selp.s32 %r22, -1, 0, %p1;
not.b32 %r23, %r22;
and.b32 %r24, %r21, %r22;
and.b32 %r25, %r23, 832;
or.b32 %r26, %r25, %r24;
// %r1 = clamped block offset + 2 * tid.x. It is used both for the param_3
// reads and for the final param_5 writes.
add.s32 %r1, %r26, %r20;
// %r2 = ctaid.y * 8192 (kept for the output index as well).
// %r3 = (%r2 - param_1 * 1024) | tid.x. The left operand has its low 10 bits
// clear, so for tid.x < 1024 the OR behaves like an add.
shl.b32 %r27, %r16, 10;
shl.b32 %r2, %r17, 13;
sub.s32 %r28, %r2, %r27;
or.b32 %r3, %r28, %r19;
// %r36 = %r1 - param_0 * 1024: element index into the param_3 buffer for the
// first outer iteration (advanced by 32768 per outer iteration below).
shl.b32 %r29, %r15, 10;
sub.s32 %r36, %r1, %r29;
// Zero the 48 accumulators %f217..%f264; %r14 holds the constant 0 used to
// initialise both loop counters.
mov.f32 %f217, 0f00000000;
mov.u32 %r14, 0;
mov.f32 %f218, %f217;
mov.f32 %f219, %f217;
mov.f32 %f220, %f217;
mov.f32 %f221, %f217;
mov.f32 %f222, %f217;
mov.f32 %f223, %f217;
mov.f32 %f224, %f217;
mov.f32 %f225, %f217;
mov.f32 %f226, %f217;
mov.f32 %f227, %f217;
mov.f32 %f228, %f217;
mov.f32 %f229, %f217;
mov.f32 %f230, %f217;
mov.f32 %f231, %f217;
mov.f32 %f232, %f217;
mov.f32 %f233, %f217;
mov.f32 %f234, %f217;
mov.f32 %f235, %f217;
mov.f32 %f236, %f217;
mov.f32 %f237, %f217;
mov.f32 %f238, %f217;
mov.f32 %f239, %f217;
mov.f32 %f240, %f217;
mov.f32 %f241, %f217;
mov.f32 %f242, %f217;
mov.f32 %f243, %f217;
mov.f32 %f244, %f217;
mov.f32 %f245, %f217;
mov.f32 %f246, %f217;
mov.f32 %f247, %f217;
mov.f32 %f248, %f217;
mov.f32 %f249, %f217;
mov.f32 %f250, %f217;
mov.f32 %f251, %f217;
mov.f32 %f252, %f217;
mov.f32 %f253, %f217;
mov.f32 %f254, %f217;
mov.f32 %f255, %f217;
mov.f32 %f256, %f217;
mov.f32 %f257, %f217;
mov.f32 %f258, %f217;
mov.f32 %f259, %f217;
mov.f32 %f260, %f217;
mov.f32 %f261, %f217;
mov.f32 %f262, %f217;
mov.f32 %f263, %f217;
mov.f32 %f264, %f217;
// %r37 = outer loop counter, starting at 0.
mov.u32 %r37, %r14;
LBB0_1: // %"for prod.s1.r4$x.r5"
// =>This Loop Header: Depth=1
// Child Loop BB0_2 Depth 2
// Outer loop body: element index = %r3 + 32 * %r37 into the param_4 buffer.
// Load 8 floats spaced 4096 bytes (1024 floats) apart into %f49..%f56, using
// non-coherent (.nc) global loads.
shl.b32 %r31, %r37, 5;
add.s32 %r32, %r3, %r31;
mul.wide.s32 %rd7, %r32, 4;
add.s64 %rd8, %rd2, %rd7;
ld.global.nc.f32 %f49, [%rd8];
ld.global.nc.f32 %f50, [%rd8+4096];
ld.global.nc.f32 %f51, [%rd8+8192];
ld.global.nc.f32 %f52, [%rd8+12288];
ld.global.nc.f32 %f53, [%rd8+16384];
ld.global.nc.f32 %f54, [%rd8+20480];
ld.global.nc.f32 %f55, [%rd8+24576];
ld.global.nc.f32 %f56, [%rd8+28672];
// Reset inner-loop state: %r38 = running element index into the param_3
// buffer, %r39 = inner counter, which doubles as the shuffle source lane.
mov.u32 %r38, %r36;
mov.u32 %r39, %r14;
LBB0_2: // %"for prod.s1.r4$x.r6"
// Parent Loop BB0_1 Depth=1
// => This Inner Loop Header: Depth=2
// Inner loop body. Each shfl.idx fetches the copy of one of %f49..%f56 held
// by lane %r39, giving %f155 and %f158..%f164 (one value per group). These are
// multiplied into three float pairs loaded from the param_3 buffer at byte
// offsets 0, 256 and 512 from %rd10, accumulating with fused multiply-adds
// (round-to-nearest, flush-to-zero).
// NOTE(review): non-.sync shfl is deprecated in later PTX ISA versions in
// favour of shfl.sync; switching would also require raising the module's
// .version directive -- check before retargeting newer architectures.
shfl.idx.b32 %f155, %f49, %r39, 31;
mul.wide.s32 %rd9, %r38, 4;
add.s64 %rd10, %rd3, %rd9;
// First float pair (byte offset 0): {%f156, %f157}.
ld.global.nc.v2.f32 {%f156, %f157}, [%rd10];
fma.rn.ftz.f32 %f248, %f157, %f155, %f248;
fma.rn.ftz.f32 %f247, %f156, %f155, %f247;
shfl.idx.b32 %f158, %f50, %r39, 31;
fma.rn.ftz.f32 %f242, %f157, %f158, %f242;
fma.rn.ftz.f32 %f241, %f156, %f158, %f241;
shfl.idx.b32 %f159, %f51, %r39, 31;
fma.rn.ftz.f32 %f236, %f157, %f159, %f236;
fma.rn.ftz.f32 %f235, %f156, %f159, %f235;
shfl.idx.b32 %f160, %f52, %r39, 31;
fma.rn.ftz.f32 %f230, %f157, %f160, %f230;
fma.rn.ftz.f32 %f229, %f156, %f160, %f229;
shfl.idx.b32 %f161, %f53, %r39, 31;
fma.rn.ftz.f32 %f224, %f157, %f161, %f224;
fma.rn.ftz.f32 %f223, %f156, %f161, %f223;
shfl.idx.b32 %f162, %f54, %r39, 31;
fma.rn.ftz.f32 %f218, %f157, %f162, %f218;
fma.rn.ftz.f32 %f217, %f156, %f162, %f217;
shfl.idx.b32 %f163, %f55, %r39, 31;
fma.rn.ftz.f32 %f254, %f157, %f163, %f254;
fma.rn.ftz.f32 %f253, %f156, %f163, %f253;
shfl.idx.b32 %f164, %f56, %r39, 31;
fma.rn.ftz.f32 %f260, %f157, %f164, %f260;
fma.rn.ftz.f32 %f259, %f156, %f164, %f259;
// Second float pair (byte offset 256 = 64 floats): {%f165, %f166}.
ld.global.nc.v2.f32 {%f165, %f166}, [%rd10+256];
fma.rn.ftz.f32 %f246, %f155, %f166, %f246;
fma.rn.ftz.f32 %f245, %f155, %f165, %f245;
fma.rn.ftz.f32 %f240, %f158, %f166, %f240;
fma.rn.ftz.f32 %f239, %f158, %f165, %f239;
fma.rn.ftz.f32 %f234, %f159, %f166, %f234;
fma.rn.ftz.f32 %f233, %f159, %f165, %f233;
fma.rn.ftz.f32 %f228, %f160, %f166, %f228;
fma.rn.ftz.f32 %f227, %f160, %f165, %f227;
fma.rn.ftz.f32 %f222, %f161, %f166, %f222;
fma.rn.ftz.f32 %f221, %f161, %f165, %f221;
fma.rn.ftz.f32 %f250, %f162, %f166, %f250;
fma.rn.ftz.f32 %f249, %f162, %f165, %f249;
fma.rn.ftz.f32 %f256, %f166, %f163, %f256;
fma.rn.ftz.f32 %f255, %f165, %f163, %f255;
fma.rn.ftz.f32 %f262, %f166, %f164, %f262;
fma.rn.ftz.f32 %f261, %f165, %f164, %f261;
// Third float pair (byte offset 512 = 128 floats): {%f167, %f168}.
ld.global.nc.v2.f32 {%f167, %f168}, [%rd10+512];
fma.rn.ftz.f32 %f244, %f155, %f168, %f244;
fma.rn.ftz.f32 %f243, %f155, %f167, %f243;
fma.rn.ftz.f32 %f238, %f158, %f168, %f238;
fma.rn.ftz.f32 %f237, %f158, %f167, %f237;
fma.rn.ftz.f32 %f232, %f159, %f168, %f232;
fma.rn.ftz.f32 %f231, %f159, %f167, %f231;
fma.rn.ftz.f32 %f226, %f160, %f168, %f226;
fma.rn.ftz.f32 %f225, %f160, %f167, %f225;
fma.rn.ftz.f32 %f220, %f161, %f168, %f220;
fma.rn.ftz.f32 %f219, %f161, %f167, %f219;
fma.rn.ftz.f32 %f252, %f162, %f168, %f252;
fma.rn.ftz.f32 %f251, %f162, %f167, %f251;
fma.rn.ftz.f32 %f258, %f163, %f168, %f258;
fma.rn.ftz.f32 %f257, %f163, %f167, %f257;
fma.rn.ftz.f32 %f264, %f168, %f164, %f264;
fma.rn.ftz.f32 %f263, %f167, %f164, %f263;
// Next inner step: move to the next source lane and advance 1024 floats in
// the param_3 buffer; repeat until 32 lanes have been consumed.
add.s32 %r39, %r39, 1;
add.s32 %r38, %r38, 1024;
setp.ne.s32 %p2, %r39, 32;
@%p2 bra LBB0_2;
// BB#3: // %"end for prod.s1.r4$x.r6"
// in Loop: Header=BB0_1 Depth=1
// Next outer step: advance the param_3 start index by 32 * 1024 floats (the
// distance the inner loop just covered); repeat for 32 outer iterations.
add.s32 %r37, %r37, 1;
add.s32 %r36, %r36, 32768;
setp.ne.s32 %p3, %r37, 32;
@%p3 bra LBB0_1;
// BB#4: // %"consume prod"
// Output element index = ctaid.y * 8192 - param_2 * 1024 + %r1, converted to
// a byte address in the param_5 buffer.
shl.b32 %r33, %r13, 10;
sub.s32 %r34, %r2, %r33;
add.s32 %r35, %r34, %r1;
mul.wide.s32 %rd11, %r35, 4;
add.s64 %rd12, %rd1, %rd11;
// Write back the 48 sums: 8 groups spaced 4096 bytes apart, each with three
// float pairs at byte offsets +0, +256 and +512. The accumulator register
// numbers are not in monotonic order; they follow the FMA assignments in the
// inner loop above.
st.global.v2.f32 [%rd12], {%f247, %f248};
st.global.v2.f32 [%rd12+256], {%f245, %f246};
st.global.v2.f32 [%rd12+512], {%f243, %f244};
st.global.v2.f32 [%rd12+4096], {%f241, %f242};
st.global.v2.f32 [%rd12+4352], {%f239, %f240};
st.global.v2.f32 [%rd12+4608], {%f237, %f238};
st.global.v2.f32 [%rd12+8192], {%f235, %f236};
st.global.v2.f32 [%rd12+8448], {%f233, %f234};
st.global.v2.f32 [%rd12+8704], {%f231, %f232};
st.global.v2.f32 [%rd12+12288], {%f229, %f230};
st.global.v2.f32 [%rd12+12544], {%f227, %f228};
st.global.v2.f32 [%rd12+12800], {%f225, %f226};
st.global.v2.f32 [%rd12+16384], {%f223, %f224};
st.global.v2.f32 [%rd12+16640], {%f221, %f222};
st.global.v2.f32 [%rd12+16896], {%f219, %f220};
st.global.v2.f32 [%rd12+20480], {%f217, %f218};
st.global.v2.f32 [%rd12+20736], {%f249, %f250};
st.global.v2.f32 [%rd12+20992], {%f251, %f252};
st.global.v2.f32 [%rd12+24576], {%f253, %f254};
st.global.v2.f32 [%rd12+24832], {%f255, %f256};
st.global.v2.f32 [%rd12+25088], {%f257, %f258};
st.global.v2.f32 [%rd12+28672], {%f259, %f260};
st.global.v2.f32 [%rd12+28928], {%f261, %f262};
st.global.v2.f32 [%rd12+29184], {%f263, %f264};
ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment