Created
October 20, 2017 21:49
-
-
Save abadams/ff9c5a32ca2473760414c925048de100 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ---------------------------------------------------------------------------
// PTX module (ISA 4.0, target sm_50, 64-bit addresses) holding one kernel:
// kernel_out_s0_y_y___block_id_y.
//
// Per thread, the kernel keeps 48 f32 accumulators (8 rows x 6 columns),
// runs a 32 x 32 = 1024-step multiply-accumulate reduction into them and
// finally writes them out with 24 two-element vector stores.
//
// Index arithmetic, in units of f32 elements (all three buffers are
// addressed with a row pitch of 1024 elements):
//
//   col     = min(ctaid.x * 192, 832) + 2 * tid.x
//   rowbase = ctaid.y * 8192                      (8 rows of 1024 elements)
//   k       = 32 * ko + ki,  ko, ki in 0..31      (reduction index)
//   r in 0..7,  c in {0, 1, 64, 65, 128, 129}
//
//   param_4 buffer, read  : (rowbase - param_1 * 1024) + r * 1024 + k
//   param_3 buffer, read  : (col     - param_0 * 1024) + k * 1024 + c
//   param_5 buffer, write : (rowbase - param_2 * 1024) + col + r * 1024 + c
//
//   written[r][c] = sum over k of  param_4[r][k] * param_3[k][c]
//
// Each thread loads the param_4 elements for k = 32 * ko + tid.x only; the
// inner loop fetches the element for every ki with a warp shuffle that reads
// lane ki.
//
// Accumulator registers (even/odd element of each 2-wide column pair):
//   row   c = 0,1        c = 64,65      c = 128,129
//    0    %f247/%f248    %f245/%f246    %f243/%f244
//    1    %f241/%f242    %f239/%f240    %f237/%f238
//    2    %f235/%f236    %f233/%f234    %f231/%f232
//    3    %f229/%f230    %f227/%f228    %f225/%f226
//    4    %f223/%f224    %f221/%f222    %f219/%f220
//    5    %f217/%f218    %f249/%f250    %f251/%f252
//    6    %f253/%f254    %f255/%f256    %f257/%f258
//    7    %f259/%f260    %f261/%f262    %f263/%f264
//
// NOTE(review): symbol and label names ("prod.s1.r4$x", "block_id_y") look
//   like Halide output lowered through LLVM's NVPTX backend, and the access
//   pattern looks like a 1024-wide matrix multiply - confirm against the
//   generating pipeline.
// NOTE(review): the shuffle scheme treats tid.x as the warp lane id, so the
//   kernel presumably expects blockDim.x == 32 - confirm at the launch site.
// NOTE(review): param_0..param_2 are only ever multiplied by 1024 and
//   subtracted from an index; presumably they are each buffer's minimum
//   coordinate along its outer (1024-pitch) dimension - TODO confirm.
// ---------------------------------------------------------------------------
.version 4.0
.target sm_50
.address_size 64

    // .globl kernel_out_s0_y_y___block_id_y // -- Begin function kernel_out_s0_y_y___block_id_y
    // @kernel_out_s0_y_y___block_id_y
.visible .entry kernel_out_s0_y_y___block_id_y(
    .param .u32 kernel_out_s0_y_y___block_id_y_param_0,     // scaled by 1024, subtracted from param_3 index
    .param .u32 kernel_out_s0_y_y___block_id_y_param_1,     // scaled by 1024, subtracted from param_4 index
    .param .u32 kernel_out_s0_y_y___block_id_y_param_2,     // scaled by 1024, subtracted from param_5 index
    .param .u64 kernel_out_s0_y_y___block_id_y_param_3,     // f32 buffer, read with v2 loads in the inner loop
    .param .u64 kernel_out_s0_y_y___block_id_y_param_4,     // f32 buffer, read with scalar loads in the outer loop
    .param .u64 kernel_out_s0_y_y___block_id_y_param_5      // f32 buffer, written at the end
)
{
    .reg .pred %p<4>;
    .reg .f32 %f<265>;
    .reg .b32 %r<40>;
    .reg .b64 %rd<13>;

// BB#0: // %entry
    // Fetch the parameters and convert the three generic pointers to
    // global-space addresses: %rd1 = param_5, %rd2 = param_4, %rd3 = param_3.
    ld.param.u32 %r13, [kernel_out_s0_y_y___block_id_y_param_2];
    ld.param.u32 %r15, [kernel_out_s0_y_y___block_id_y_param_0];
    ld.param.u64 %rd4, [kernel_out_s0_y_y___block_id_y_param_5];
    cvta.to.global.u64 %rd1, %rd4;
    ld.param.u32 %r16, [kernel_out_s0_y_y___block_id_y_param_1];
    ld.param.u64 %rd5, [kernel_out_s0_y_y___block_id_y_param_4];
    cvta.to.global.u64 %rd2, %rd5;
    ld.param.u64 %rd6, [kernel_out_s0_y_y___block_id_y_param_3];
    cvta.to.global.u64 %rd3, %rd6;

    mov.u32 %r17, %ctaid.y;
    mov.u32 %r18, %ctaid.x;
    mov.u32 %r19, %tid.x;
    shl.b32 %r20, %r19, 1;                  // 2 * tid.x: each thread owns 2 adjacent columns
    mul.lo.s32 %r21, %r18, 192;             // ctaid.x * 192: each block spans 3 * 64 columns

    // Branch-free select: %r26 = (%r21 < 832, unsigned) ? %r21 : 832,
    // i.e. min(ctaid.x * 192, 832). 832 + 192 = 1024, so the column span
    // never passes element 1024 of a row.
    // NOTE(review): presumably the last block is shifted inwards and
    //   recomputes columns an earlier block already covers - confirm.
    setp.lt.u32 %p1, %r21, 832;
    selp.s32 %r22, -1, 0, %p1;              // all-ones mask when below the clamp, else 0
    not.b32 %r23, %r22;
    and.b32 %r24, %r21, %r22;
    and.b32 %r25, %r23, 832;
    or.b32 %r26, %r25, %r24;
    add.s32 %r1, %r26, %r20;                // %r1 = col

    shl.b32 %r27, %r16, 10;                 // param_1 * 1024
    shl.b32 %r2, %r17, 13;                  // %r2 = rowbase = ctaid.y * 8192
    sub.s32 %r28, %r2, %r27;
    // %r28 is a multiple of 1024, so OR equals ADD as long as tid.x < 1024.
    or.b32 %r3, %r28, %r19;                 // %r3 = param_4 index for ko = 0, r = 0
    shl.b32 %r29, %r15, 10;                 // param_0 * 1024
    sub.s32 %r36, %r1, %r29;                // %r36 = param_3 index for k = 0, c = 0

    // Clear the 48 accumulators %f217..%f264 to +0.0; %r14 is a constant 0
    // used to reset both loop counters.
    mov.f32 %f217, 0f00000000;
    mov.u32 %r14, 0;
    mov.f32 %f218, %f217;
    mov.f32 %f219, %f217;
    mov.f32 %f220, %f217;
    mov.f32 %f221, %f217;
    mov.f32 %f222, %f217;
    mov.f32 %f223, %f217;
    mov.f32 %f224, %f217;
    mov.f32 %f225, %f217;
    mov.f32 %f226, %f217;
    mov.f32 %f227, %f217;
    mov.f32 %f228, %f217;
    mov.f32 %f229, %f217;
    mov.f32 %f230, %f217;
    mov.f32 %f231, %f217;
    mov.f32 %f232, %f217;
    mov.f32 %f233, %f217;
    mov.f32 %f234, %f217;
    mov.f32 %f235, %f217;
    mov.f32 %f236, %f217;
    mov.f32 %f237, %f217;
    mov.f32 %f238, %f217;
    mov.f32 %f239, %f217;
    mov.f32 %f240, %f217;
    mov.f32 %f241, %f217;
    mov.f32 %f242, %f217;
    mov.f32 %f243, %f217;
    mov.f32 %f244, %f217;
    mov.f32 %f245, %f217;
    mov.f32 %f246, %f217;
    mov.f32 %f247, %f217;
    mov.f32 %f248, %f217;
    mov.f32 %f249, %f217;
    mov.f32 %f250, %f217;
    mov.f32 %f251, %f217;
    mov.f32 %f252, %f217;
    mov.f32 %f253, %f217;
    mov.f32 %f254, %f217;
    mov.f32 %f255, %f217;
    mov.f32 %f256, %f217;
    mov.f32 %f257, %f217;
    mov.f32 %f258, %f217;
    mov.f32 %f259, %f217;
    mov.f32 %f260, %f217;
    mov.f32 %f261, %f217;
    mov.f32 %f262, %f217;
    mov.f32 %f263, %f217;
    mov.f32 %f264, %f217;
    mov.u32 %r37, %r14;                     // ko = 0

LBB0_1: // %"for prod.s1.r4$x.r5"
        // =>This Loop Header: Depth=1
        // Child Loop BB0_2 Depth 2
    // Outer reduction step ko (%r37). Load this thread's element of 8
    // consecutive param_4 rows (4096 bytes = 1024 f32 apart) at
    // k = 32 * ko + tid.x, using non-coherent (.nc) global loads.
    shl.b32 %r31, %r37, 5;                  // 32 * ko
    add.s32 %r32, %r3, %r31;
    mul.wide.s32 %rd7, %r32, 4;             // signed 32 -> 64 bit widen, scaled to bytes
    add.s64 %rd8, %rd2, %rd7;
    ld.global.nc.f32 %f49, [%rd8];          // row 0
    ld.global.nc.f32 %f50, [%rd8+4096];     // row 1
    ld.global.nc.f32 %f51, [%rd8+8192];     // row 2
    ld.global.nc.f32 %f52, [%rd8+12288];    // row 3
    ld.global.nc.f32 %f53, [%rd8+16384];    // row 4
    ld.global.nc.f32 %f54, [%rd8+20480];    // row 5
    ld.global.nc.f32 %f55, [%rd8+24576];    // row 6
    ld.global.nc.f32 %f56, [%rd8+28672];    // row 7
    mov.u32 %r38, %r36;                     // param_3 index for ki = 0 of this ko
    mov.u32 %r39, %r14;                     // ki = 0

LBB0_2: // %"for prod.s1.r4$x.r6"
        // Parent Loop BB0_1 Depth=1
        // => This Inner Loop Header: Depth=2
    // Inner reduction step ki (%r39). Each shfl.idx reads the row value held
    // by lane ki; the last operand 31 is clamp = 31 with segment mask 0, so
    // the whole 32-lane warp forms one segment. The result is multiplied
    // into that row's six accumulators (see the table at the top of the file).

    // Columns c = 0,1 (byte offset 0); the shuffles are interleaved here.
    shfl.idx.b32 %f155, %f49, %r39, 31;     // row 0 value for this k
    mul.wide.s32 %rd9, %r38, 4;
    add.s64 %rd10, %rd3, %rd9;
    ld.global.nc.v2.f32 {%f156, %f157}, [%rd10];
    fma.rn.ftz.f32 %f248, %f157, %f155, %f248;
    fma.rn.ftz.f32 %f247, %f156, %f155, %f247;
    shfl.idx.b32 %f158, %f50, %r39, 31;     // row 1
    fma.rn.ftz.f32 %f242, %f157, %f158, %f242;
    fma.rn.ftz.f32 %f241, %f156, %f158, %f241;
    shfl.idx.b32 %f159, %f51, %r39, 31;     // row 2
    fma.rn.ftz.f32 %f236, %f157, %f159, %f236;
    fma.rn.ftz.f32 %f235, %f156, %f159, %f235;
    shfl.idx.b32 %f160, %f52, %r39, 31;     // row 3
    fma.rn.ftz.f32 %f230, %f157, %f160, %f230;
    fma.rn.ftz.f32 %f229, %f156, %f160, %f229;
    shfl.idx.b32 %f161, %f53, %r39, 31;     // row 4
    fma.rn.ftz.f32 %f224, %f157, %f161, %f224;
    fma.rn.ftz.f32 %f223, %f156, %f161, %f223;
    shfl.idx.b32 %f162, %f54, %r39, 31;     // row 5
    fma.rn.ftz.f32 %f218, %f157, %f162, %f218;
    fma.rn.ftz.f32 %f217, %f156, %f162, %f217;
    shfl.idx.b32 %f163, %f55, %r39, 31;     // row 6
    fma.rn.ftz.f32 %f254, %f157, %f163, %f254;
    fma.rn.ftz.f32 %f253, %f156, %f163, %f253;
    shfl.idx.b32 %f164, %f56, %r39, 31;     // row 7
    fma.rn.ftz.f32 %f260, %f157, %f164, %f260;
    fma.rn.ftz.f32 %f259, %f156, %f164, %f259;

    // Columns c = 64,65 (byte offset 256 = 64 f32).
    ld.global.nc.v2.f32 {%f165, %f166}, [%rd10+256];
    fma.rn.ftz.f32 %f246, %f155, %f166, %f246;
    fma.rn.ftz.f32 %f245, %f155, %f165, %f245;
    fma.rn.ftz.f32 %f240, %f158, %f166, %f240;
    fma.rn.ftz.f32 %f239, %f158, %f165, %f239;
    fma.rn.ftz.f32 %f234, %f159, %f166, %f234;
    fma.rn.ftz.f32 %f233, %f159, %f165, %f233;
    fma.rn.ftz.f32 %f228, %f160, %f166, %f228;
    fma.rn.ftz.f32 %f227, %f160, %f165, %f227;
    fma.rn.ftz.f32 %f222, %f161, %f166, %f222;
    fma.rn.ftz.f32 %f221, %f161, %f165, %f221;
    fma.rn.ftz.f32 %f250, %f162, %f166, %f250;
    fma.rn.ftz.f32 %f249, %f162, %f165, %f249;
    fma.rn.ftz.f32 %f256, %f166, %f163, %f256;
    fma.rn.ftz.f32 %f255, %f165, %f163, %f255;
    fma.rn.ftz.f32 %f262, %f166, %f164, %f262;
    fma.rn.ftz.f32 %f261, %f165, %f164, %f261;

    // Columns c = 128,129 (byte offset 512 = 128 f32).
    ld.global.nc.v2.f32 {%f167, %f168}, [%rd10+512];
    fma.rn.ftz.f32 %f244, %f155, %f168, %f244;
    fma.rn.ftz.f32 %f243, %f155, %f167, %f243;
    fma.rn.ftz.f32 %f238, %f158, %f168, %f238;
    fma.rn.ftz.f32 %f237, %f158, %f167, %f237;
    fma.rn.ftz.f32 %f232, %f159, %f168, %f232;
    fma.rn.ftz.f32 %f231, %f159, %f167, %f231;
    fma.rn.ftz.f32 %f226, %f160, %f168, %f226;
    fma.rn.ftz.f32 %f225, %f160, %f167, %f225;
    fma.rn.ftz.f32 %f220, %f161, %f168, %f220;
    fma.rn.ftz.f32 %f219, %f161, %f167, %f219;
    fma.rn.ftz.f32 %f252, %f162, %f168, %f252;
    fma.rn.ftz.f32 %f251, %f162, %f167, %f251;
    fma.rn.ftz.f32 %f258, %f163, %f168, %f258;
    fma.rn.ftz.f32 %f257, %f163, %f167, %f257;
    fma.rn.ftz.f32 %f264, %f168, %f164, %f264;
    fma.rn.ftz.f32 %f263, %f167, %f164, %f263;

    add.s32 %r39, %r39, 1;                  // ++ki
    add.s32 %r38, %r38, 1024;               // next param_3 row (k + 1)
    setp.ne.s32 %p2, %r39, 32;
    @%p2 bra LBB0_2;

// BB#3: // %"end for prod.s1.r4$x.r6"
         // in Loop: Header=BB0_1 Depth=1
    add.s32 %r37, %r37, 1;                  // ++ko
    add.s32 %r36, %r36, 32768;              // skip the 32 param_3 rows just consumed (32 * 1024)
    setp.ne.s32 %p3, %r37, 32;
    @%p3 bra LBB0_1;

// BB#4: // %"consume prod"
    // Write-back. Element index = (rowbase - param_2 * 1024) + col; rows are
    // 4096 bytes apart and the three column pairs sit at +0 / +256 / +512.
    shl.b32 %r33, %r13, 10;                 // param_2 * 1024
    sub.s32 %r34, %r2, %r33;
    add.s32 %r35, %r34, %r1;
    mul.wide.s32 %rd11, %r35, 4;
    add.s64 %rd12, %rd1, %rd11;
    st.global.v2.f32 [%rd12], {%f247, %f248};           // row 0
    st.global.v2.f32 [%rd12+256], {%f245, %f246};
    st.global.v2.f32 [%rd12+512], {%f243, %f244};
    st.global.v2.f32 [%rd12+4096], {%f241, %f242};      // row 1
    st.global.v2.f32 [%rd12+4352], {%f239, %f240};
    st.global.v2.f32 [%rd12+4608], {%f237, %f238};
    st.global.v2.f32 [%rd12+8192], {%f235, %f236};      // row 2
    st.global.v2.f32 [%rd12+8448], {%f233, %f234};
    st.global.v2.f32 [%rd12+8704], {%f231, %f232};
    st.global.v2.f32 [%rd12+12288], {%f229, %f230};     // row 3
    st.global.v2.f32 [%rd12+12544], {%f227, %f228};
    st.global.v2.f32 [%rd12+12800], {%f225, %f226};
    st.global.v2.f32 [%rd12+16384], {%f223, %f224};     // row 4
    st.global.v2.f32 [%rd12+16640], {%f221, %f222};
    st.global.v2.f32 [%rd12+16896], {%f219, %f220};
    st.global.v2.f32 [%rd12+20480], {%f217, %f218};     // row 5
    st.global.v2.f32 [%rd12+20736], {%f249, %f250};
    st.global.v2.f32 [%rd12+20992], {%f251, %f252};
    st.global.v2.f32 [%rd12+24576], {%f253, %f254};     // row 6
    st.global.v2.f32 [%rd12+24832], {%f255, %f256};
    st.global.v2.f32 [%rd12+25088], {%f257, %f258};
    st.global.v2.f32 [%rd12+28672], {%f259, %f260};     // row 7
    st.global.v2.f32 [%rd12+28928], {%f261, %f262};
    st.global.v2.f32 [%rd12+29184], {%f263, %f264};
    ret;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment