@scott-gray
Created June 9, 2020 17:42
// A case for making the compiler more threadIdx-aware in conditional code.
// Proposed solution:
// Walk the dependencies of any predicate gating a shfl.sync to look for threadIdx.
// Simulate all 1024 values of threadIdx with the full predicate expression to see if it's warp uniform.
// Or check whether only a single thread is active, for other optimizations (like the atomic add below).
// This can't be that complicated to do.
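// A host-side sketch (hypothetical helper, not actual compiler internals) of the
// brute-force check proposed above: evaluate a threadIdx-only predicate for every
// thread id of the block and test (a) per-warp uniformity and (b) whether at most
// a single thread is active.
struct PredicateInfo { bool warp_uniform; bool single_thread; };

template <typename Pred>
PredicateInfo analyze_predicate(Pred pred, unsigned block_dim)
{
    PredicateInfo info = { true, true };
    unsigned active = 0;
    for (unsigned warp = 0; warp * 32 < block_dim; warp++)
    {
        bool first = pred(warp * 32);
        for (unsigned lane = 0; lane < 32 && warp * 32 + lane < block_dim; lane++)
        {
            bool p = pred(warp * 32 + lane);
            if (p != first) info.warp_uniform = false;
            if (p) active++;
        }
    }
    info.single_thread = (active <= 1);
    return info;
}
// e.g. analyze_predicate([](unsigned tid) { return tid <  32; }, 1024) -> warp_uniform == true
//      analyze_predicate([](unsigned tid) { return tid == 0;  }, 1024) -> single_thread == true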
__device__ __forceinline__ float shfl_xor(float var, int laneMask)
{
    float ret;
    asm ("shfl.sync.bfly.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret) : "f"(var), "r"(laneMask));
    return ret;
}
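// For reference, the same wrapper written with the CUDA intrinsic instead of inline
// PTX; the divergence handling discussed below presumably applies either way, since
// both lower to shfl.sync.
__device__ __forceinline__ float shfl_xor_intrinsic(float var, int laneMask)
{
    return __shfl_xor_sync(0xffffffff, var, laneMask);
}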
__device__ __forceinline__ float sum4(float4 a) { return (a.x + a.y) + (a.z + a.w); }
__device__ __forceinline__ float cta_reduce_sum(float xsum)
{
    uint tid = threadIdx.x;

    // reduce across the warp
    // This shuffle also produces messy sass when cta_reduce_sum is called conditionally
    for (int i = 16; i > 0; i >>= 1)
        xsum += shfl_xor(xsum, i);

    // if the block is bigger than a warp, then reduce the warps
    if (blockDim.x > 32)
    {
        __shared__ float Share[32];
        float4* Share4 = (float4*)Share;

        // Init shared to zero if needed
        if (blockDim.x != 1024)
        {
            if (tid < 32)
                Share[tid] = 0.0f;
            __syncthreads();
        }

        // store 1 warp-reduced value to shared for each warp
        if ((tid & 31) == 0)
            Share[tid/32] = xsum;
        __syncthreads();

        // This is the problem code here:
        if (1)
        {
            // warp uniform shuffle.
            // compiler can't figure this out and generates messy branching code
            if (tid < 32)
            {
                // we could trim these shuffle ops depending on blockDim.x
                // (a trimmed variant is sketched after this function);
                // keep it simple for illustrative purposes
                xsum = Share[tid];
                for (int i = 16; i > 0; i >>= 1)
                    xsum += shfl_xor(xsum, i);
            }
        }
        else
        {
            // Alternative using shared that's at least clean:
            if (tid == 0)
            {
                xsum = 0.0f;
                #pragma unroll 1
                for (int j = 0, s = 0; j < blockDim.x; j += 256, s += 2)
                    xsum += sum4(Share4[s]) + sum4(Share4[s+1]);
                *Share = xsum;
            }
            __syncthreads();
            xsum = *Share;
        }
    }
    return xsum;
}
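// The trimmed cross-warp step mentioned inside cta_reduce_sum, as a separate sketch.
// It assumes blockDim.x is a power of two in [64, 1024], so blockDim.x/32 warp sums
// live in Share and the butterfly can start at half that count instead of a fixed 16.
__device__ __forceinline__ float cross_warp_reduce_trimmed(const float* Share, uint tid)
{
    float xsum = 0.0f;
    if (tid < 32)
    {
        xsum = Share[tid];
        for (int i = (int)(blockDim.x >> 6); i > 0; i >>= 1)
            xsum += shfl_xor(xsum, i);
    }
    return xsum;
}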
__device__ __forceinline__ uint store_partial(float* PartialSum, uint* PartialCnt, float partial)
{
    uint tid = threadIdx.x;
    __shared__ uint Share[1];

    // You can try swapping out the conditional here to confuse the compiler
    // if ((tid & 1023) == 0)
    if (tid == 0)
    {
        // store partial sum to global
        PartialSum += blockIdx.x;
        asm volatile ("st.relaxed.gpu.global.f32 [%0], %1;" :: "l"(PartialSum), "f"(partial) );

        // given the stg, atom and ldgs are "strong", I believe this isn't needed to ensure ordering?
        //__threadfence();

        // Count the number of stored partial sums.
        // Note: here is a case where the compiler IS aware that there is only 1 thread active.
        // Otherwise it would see the warp-uniform address and multiply the constant atomic add value by the number of active threads.
        // Try swapping out the conditional above to see this.
        uint count;
        asm volatile ("atom.relaxed.gpu.global.add.u32 %0, [%1], 1;" : "=r"(count): "l"(PartialCnt) );
        *Share = count + 1;
    }
    __syncthreads();
    return *Share;
}
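// Variant of store_partial using the standard intrinsics instead of inline PTX.
// This is the conventional pattern: __threadfence() orders the partial-sum store
// before the counter increment (whether the relaxed/strong PTX version above can
// drop the fence is exactly the open question in the comment there).
__device__ __forceinline__ uint store_partial_intrinsics(float* PartialSum, uint* PartialCnt, float partial)
{
    uint tid = threadIdx.x;
    __shared__ uint Share[1];
    if (tid == 0)
    {
        PartialSum[blockIdx.x] = partial;       // publish this block's partial sum
        __threadfence();                        // make it visible before bumping the counter
        uint count = atomicAdd(PartialCnt, 1);  // count the blocks that have finished
        *Share = count + 1;
    }
    __syncthreads();
    return *Share;
}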
// Do a tensor-wide sum-squared reduction deterministically
__global__ void sum_squared_reduce(float* SumSquared, float* PartialSum, uint* PartialCnt, const float* X, uint size)
{
    uint tid = threadIdx.x;
    uint bid = blockIdx.x;

    float sum_squared = 0.0f;

    // tile the reduction among blocks and compute partial sums
    #pragma unroll 1
    for (uint offset = bid*blockDim.x + tid; offset < size; offset += gridDim.x*blockDim.x)
    {
        float x = __ldg(X + offset);
        sum_squared += x*x;
    }

    // reduce within this cta
    sum_squared = cta_reduce_sum(sum_squared);

    // store partial sum to global and check if we're the last block
    uint partial_cnt = store_partial(PartialSum, PartialCnt, sum_squared);

    // Note that this conditional is guaranteed to be warp uniform (no tid involved) but the compiler is unable to deduce this.
    if (partial_cnt == gridDim.x)
    {
        // Last block completes the sum
        sum_squared = 0.0f;
        #pragma unroll 1
        for (uint offset = tid; offset < gridDim.x; offset += blockDim.x)
        {
            // Do these ldg's also need to be strong to ensure ordering?
            float partial;
            asm ("ld.relaxed.gpu.global.f32 %0, [%1];" : "=f"(partial): "l"(PartialSum + offset) );
            sum_squared += partial;
        }

        // Final cta reduction
        // This time there's no way to avoid nasty shfl.sync branching because we're inside the partial_cnt conditional
        sum_squared = cta_reduce_sum(sum_squared);

        // first thread outputs the final reduction
        if (tid == 0)
            *SumSquared = sum_squared;
    }
}
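// Host-side launch sketch (names and sizes are illustrative, not part of the
// original gist): PartialSum needs one float per block, and PartialCnt must be
// zeroed before each launch so the "last block" test against gridDim.x works.
void launch_sum_squared_reduce(float* d_SumSquared, float* d_PartialSum, uint* d_PartialCnt,
                               const float* d_X, uint size, cudaStream_t stream)
{
    const uint threads = 1024;
    const uint blocks  = 80;   // e.g. roughly one block per SM; any grid size works
                               // as long as d_PartialSum holds at least `blocks` floats

    cudaMemsetAsync(d_PartialCnt, 0, sizeof(uint), stream);
    sum_squared_reduce<<<blocks, threads, 0, stream>>>(
        d_SumSquared, d_PartialSum, d_PartialCnt, d_X, size);
}

// --- Below: SASS disassembly (sm_70) of sum_squared_reduce ---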
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM70)"
.elftype @"ET_EXEC"
//--------------------- .text._Z18sum_squared_reducePfS_PjPKfj --------------------------
.section .text._Z18sum_squared_reducePfS_PjPKfj,"ax",@progbits
.sectionflags @"SHF_BARRIERS=1"
.sectioninfo @"SHI_REGISTERS=14"
.align 128
.global _Z18sum_squared_reducePfS_PjPKfj
.type _Z18sum_squared_reducePfS_PjPKfj,@function
.size _Z18sum_squared_reducePfS_PjPKfj,(.L_56 - _Z18sum_squared_reducePfS_PjPKfj)
.other _Z18sum_squared_reducePfS_PjPKfj,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z18sum_squared_reducePfS_PjPKfj:
.text._Z18sum_squared_reducePfS_PjPKfj:
/*0000*/ MOV R1, c[0x0][0x28] ;
/*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
/*0020*/ S2R R3, SR_CTAID.X ;
/*0030*/ BMOV.32.CLEAR RZ, B0 ;
/*0040*/ BSSY B0, `(.L_1) ;
/*0050*/ IMAD.MOV.U32 R6, RZ, RZ, RZ ;
/*0060*/ S2R R0, SR_TID.X ;
/*0070*/ IMAD R2, R3, c[0x0][0x0], R0 ;
/*0080*/ ISETP.GE.U32.AND P0, PT, R2, c[0x0][0x180], PT ;
/*0090*/ @P0 BRA `(.L_2) ;
/*00a0*/ MOV R6, RZ ;
.L_3:
/*00b0*/ IMAD.MOV.U32 R5, RZ, RZ, 0x4 ;
/*00c0*/ IMAD.WIDE.U32 R4, R2, R5, c[0x0][0x178] ;
/*00d0*/ LDG.E.CONSTANT.SYS R5, [R4] ;
/*00e0*/ MOV R7, c[0x0][0xc] ;
/*00f0*/ IMAD R2, R7, c[0x0][0x0], R2 ;
/*0100*/ ISETP.GE.U32.AND P0, PT, R2, c[0x0][0x180], PT ;
/*0110*/ FFMA R6, R5, R5, R6 ;
/*0120*/ @!P0 BRA `(.L_3) ;
.L_2:
/*0130*/ BSYNC B0 ;
.L_1:
/*0140*/ SHFL.BFLY PT, R5, R6, 0x10, 0x1f ;
/*0150*/ IMAD.MOV.U32 R10, RZ, RZ, c[0x0][0x0] ;
/*0160*/ SHF.R.U32.HI R4, RZ, 0x3, R0 ;
/*0170*/ ISETP.GE.U32.AND P0, PT, R10, 0x21, PT ;
/*0180*/ LOP3.LUT R4, R4, 0x1ffffffc, RZ, 0xc0, !PT ;
/*0190*/ FADD R5, R5, R6 ;
/*01a0*/ SHFL.BFLY PT, R2, R5, 0x8, 0x1f ;
/*01b0*/ FADD R7, R5, R2 ;
/*01c0*/ SHFL.BFLY PT, R2, R7, 0x4, 0x1f ;
/*01d0*/ FADD R8, R7, R2 ;
/*01e0*/ SHFL.BFLY PT, R9, R8, 0x2, 0x1f ;
/*01f0*/ FADD R9, R8, R9 ;
/*0200*/ SHFL.BFLY PT, R2, R9, 0x1, 0x1f ;
/*0210*/ FADD R11, R9, R2 ;
/*0220*/ SHF.L.U32 R2, R0, 0x2, RZ ;
/*0230*/ @!P0 BRA `(.L_4) ;
/*0240*/ ISETP.NE.AND P2, PT, R10, 0x400, PT ;
/*0250*/ LOP3.LUT P1, RZ, R0, 0x1f, RZ, 0xc0, !PT ;
/*0260*/ @!P2 BRA `(.L_5) ;
/*0270*/ ISETP.GT.U32.AND P2, PT, R0, 0x1f, PT ;
/*0280*/ @!P2 STS [R2], RZ ;
/*0290*/ NOP ;
/*02a0*/ BAR.SYNC 0x0 ;
.L_5:
/*02b0*/ @!P1 STS [R4], R11 ;
/*02c0*/ NOP ;
/*02d0*/ BAR.SYNC 0x0 ;
/*02e0*/ ISETP.GT.U32.AND P1, PT, R0, 0x1f, PT ;
/*02f0*/ BMOV.32.CLEAR RZ, B0 ;
/*0300*/ BSSY B0, `(.L_4) ;
/*0310*/ @P1 BRA `(.L_6) ;
/*0320*/ LDS.U R5, [R2] ;
/*0330*/ BRA.DIV `(.L_7) ;
/*0340*/ SHFL.BFLY PT, R6, R5, 0x10, 0x1f ;
/*0350*/ FADD R6, R5, R6 ;
/*0360*/ SHFL.BFLY PT, R7, R6, 0x8, 0x1f ;
/*0370*/ FADD R7, R6, R7 ;
/*0380*/ SHFL.BFLY PT, R8, R7, 0x4, 0x1f ;
/*0390*/ FADD R8, R7, R8 ;
/*03a0*/ SHFL.BFLY PT, R9, R8, 0x2, 0x1f ;
/*03b0*/ FADD R9, R8, R9 ;
/*03c0*/ SHFL.BFLY PT, R10, R9, 0x1, 0x1f ;
.L_18:
/*03d0*/ FADD R11, R10, R9 ;
.L_6:
/*03e0*/ BSYNC B0 ;
.L_4:
/*03f0*/ ISETP.NE.AND P1, PT, R0, RZ, PT ;
/*0400*/ BMOV.32.CLEAR RZ, B0 ;
/*0410*/ BSSY B0, `(.L_8) ;
/*0420*/ @P1 BRA `(.L_9) ;
/*0430*/ IMAD.MOV.U32 R6, RZ, RZ, 0x4 ;
/*0440*/ MOV R5, 0x1 ;
/*0450*/ IMAD.MOV.U32 R8, RZ, RZ, c[0x0][0x170] ;
/*0460*/ MOV R9, c[0x0][0x174] ;
/*0470*/ IMAD.WIDE.U32 R6, R3, R6, c[0x0][0x168] ;
/*0480*/ STG.E.STRONG.GPU [R6], R11 ;
/*0490*/ ATOMG.E.ADD.STRONG.GPU PT, R8, [R8], R5 ;
/*04a0*/ IADD3 R3, R8, 0x1, RZ ;
/*04b0*/ STS [0x80], R3 ;
.L_9:
/*04c0*/ BSYNC B0 ;
.L_8:
/*04d0*/ WARPSYNC 0xffffffff ;
/*04e0*/ NOP ;
/*04f0*/ BAR.SYNC 0x0 ;
/*0500*/ LDS.U R3, [0x80] ;
/*0510*/ ISETP.NE.AND P2, PT, R3, c[0x0][0xc], PT ;
/*0520*/ @P2 EXIT ;
/*0530*/ ISETP.GT.U32.AND P2, PT, R0, c[0x0][0xc], PT ;
/*0540*/ BMOV.32.CLEAR RZ, B0 ;
/*0550*/ BSSY B0, `(.L_10) ;
/*0560*/ IMAD.MOV.U32 R3, RZ, RZ, RZ ;
/*0570*/ @P2 BRA `(.L_11) ;
/*0580*/ MOV R5, R0 ;
.L_12:
/*0590*/ IMAD.MOV.U32 R6, RZ, RZ, 0x4 ;
/*05a0*/ IMAD.WIDE.U32 R6, R5, R6, c[0x0][0x168] ;
/*05b0*/ LDG.E.STRONG.GPU R6, [R6] ;
/*05c0*/ IADD3 R5, R5, c[0x0][0x0], RZ ;
/*05d0*/ YIELD ;
/*05e0*/ ISETP.GT.U32.AND P2, PT, R5, c[0x0][0xc], PT ;
/*05f0*/ FADD R3, R6, R3 ;
/*0600*/ @!P2 BRA `(.L_12) ;
.L_11:
/*0610*/ BSYNC B0 ;
.L_10:
/*0620*/ BRA.DIV `(.L_13) ;
/*0630*/ SHFL.BFLY PT, R6, R3, 0x10, 0x1f ;
/*0640*/ FADD R6, R6, R3 ;
/*0650*/ SHFL.BFLY PT, R5, R6, 0x8, 0x1f ;
/*0660*/ FADD R5, R6, R5 ;
/*0670*/ SHFL.BFLY PT, R8, R5, 0x4, 0x1f ;
/*0680*/ FADD R8, R5, R8 ;
/*0690*/ SHFL.BFLY PT, R7, R8, 0x2, 0x1f ;
/*06a0*/ FADD R9, R8, R7 ;
/*06b0*/ SHFL.BFLY PT, R10, R9, 0x1, 0x1f ;
.L_19:
/*06c0*/ FADD R9, R10, R9 ;
/*06d0*/ @!P0 BRA `(.L_14) ;
/*06e0*/ MOV R3, c[0x0][0x0] ;
/*06f0*/ LOP3.LUT P0, RZ, R0, 0x1f, RZ, 0xc0, !PT ;
/*0700*/ ISETP.NE.AND P2, PT, R3, 0x400, PT ;
/*0710*/ @!P2 BRA `(.L_15) ;
/*0720*/ ISETP.GT.U32.AND P2, PT, R0, 0x1f, PT ;
/*0730*/ WARPSYNC 0xffffffff ;
/*0740*/ @!P2 STS [R2], RZ ;
/*0750*/ NOP ;
/*0760*/ BAR.SYNC 0x0 ;
.L_15:
/*0770*/ @!P0 STS [R4], R9 ;
/*0780*/ ISETP.GT.U32.AND P0, PT, R0, 0x1f, PT ;
/*0790*/ WARPSYNC 0xffffffff ;
/*07a0*/ BMOV.32.CLEAR RZ, B0 ;
/*07b0*/ BSSY B0, `(.L_14) ;
/*07c0*/ NOP ;
/*07d0*/ BAR.SYNC 0x0 ;
/*07e0*/ @P0 BRA `(.L_16) ;
/*07f0*/ LDS.U R2, [R2] ;
/*0800*/ BRA.DIV `(.L_17) ;
/*0810*/ SHFL.BFLY PT, R3, R2, 0x10, 0x1f ;
/*0820*/ FADD R3, R2, R3 ;
/*0830*/ SHFL.BFLY PT, R0, R3, 0x8, 0x1f ;
/*0840*/ FADD R0, R3, R0 ;
/*0850*/ SHFL.BFLY PT, R5, R0, 0x4, 0x1f ;
/*0860*/ FADD R5, R0, R5 ;
/*0870*/ SHFL.BFLY PT, R4, R5, 0x2, 0x1f ;
/*0880*/ FADD R4, R5, R4 ;
/*0890*/ SHFL.BFLY PT, R9, R4, 0x1, 0x1f ;
.L_20:
/*08a0*/ FADD R9, R9, R4 ;
.L_16:
/*08b0*/ BSYNC B0 ;
.L_14:
/*08c0*/ @P1 EXIT ;
/*08d0*/ MOV R2, c[0x0][0x160] ;
/*08e0*/ IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0x164] ;
/*08f0*/ STG.E.SYS [R2], R9 ;
/*0900*/ EXIT ;
.L_7:
/*0910*/ IMAD.MOV.U32 R9, RZ, RZ, R5 ;
/*0920*/ MOV R6, 0x10 ;
/*0930*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
/*0940*/ MOV R10, 0xffffffff ;
/*0950*/ MOV R8, 0x970 ;
/*0960*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0970*/ FADD R9, R5, R10 ;
/*0980*/ MOV R7, 0x1f ;
/*0990*/ IMAD.MOV.U32 R6, RZ, RZ, 0x8 ;
/*09a0*/ MOV R8, 0x9d0 ;
/*09b0*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
/*09c0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*09d0*/ FADD R9, R9, R10 ;
/*09e0*/ MOV R6, 0x4 ;
/*09f0*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
/*0a00*/ MOV R10, 0xffffffff ;
/*0a10*/ MOV R8, 0xa30 ;
/*0a20*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0a30*/ FADD R9, R9, R10 ;
/*0a40*/ MOV R7, 0x1f ;
/*0a50*/ IMAD.MOV.U32 R6, RZ, RZ, 0x2 ;
/*0a60*/ MOV R8, 0xa90 ;
/*0a70*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
/*0a80*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0a90*/ FADD R9, R9, R10 ;
/*0aa0*/ MOV R6, 0x1 ;
/*0ab0*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
/*0ac0*/ MOV R10, 0xffffffff ;
/*0ad0*/ MOV R8, 0xaf0 ;
/*0ae0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0af0*/ BRA `(.L_18) ;
.L_13:
/*0b00*/ MOV R9, R3 ;
/*0b10*/ IMAD.MOV.U32 R6, RZ, RZ, 0x10 ;
/*0b20*/ MOV R7, 0x1f ;
/*0b30*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
/*0b40*/ MOV R8, 0xb60 ;
/*0b50*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0b60*/ FADD R9, R3, R10 ;
/*0b70*/ MOV R6, 0x8 ;
/*0b80*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
/*0b90*/ MOV R10, 0xffffffff ;
/*0ba0*/ MOV R8, 0xbc0 ;
/*0bb0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0bc0*/ FADD R9, R9, R10 ;
/*0bd0*/ MOV R7, 0x1f ;
/*0be0*/ IMAD.MOV.U32 R6, RZ, RZ, 0x4 ;
/*0bf0*/ MOV R8, 0xc20 ;
/*0c00*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
/*0c10*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0c20*/ FADD R9, R9, R10 ;
/*0c30*/ MOV R6, 0x2 ;
/*0c40*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
/*0c50*/ MOV R10, 0xffffffff ;
/*0c60*/ MOV R8, 0xc80 ;
/*0c70*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0c80*/ FADD R9, R9, R10 ;
/*0c90*/ MOV R7, 0x1f ;
/*0ca0*/ IMAD.MOV.U32 R6, RZ, RZ, 0x1 ;
/*0cb0*/ MOV R8, 0xce0 ;
/*0cc0*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
/*0cd0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0ce0*/ BRA `(.L_19) ;
.L_17:
/*0cf0*/ IMAD.MOV.U32 R9, RZ, RZ, R2 ;
/*0d00*/ MOV R6, 0x10 ;
/*0d10*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
/*0d20*/ MOV R10, 0xffffffff ;
/*0d30*/ MOV R8, 0xd50 ;
/*0d40*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0d50*/ FADD R9, R2, R10 ;
/*0d60*/ MOV R7, 0x1f ;
/*0d70*/ IMAD.MOV.U32 R6, RZ, RZ, 0x8 ;
/*0d80*/ MOV R8, 0xdb0 ;
/*0d90*/ IMAD.MOV.U32 R10, RZ, RZ, -0x1 ;
/*0da0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0db0*/ FADD R9, R9, R10 ;
/*0dc0*/ MOV R6, 0x4 ;
/*0dd0*/ IMAD.MOV.U32 R7, RZ, RZ, 0x1f ;
/*0de0*/ MOV R10, 0xffffffff ;
/*0df0*/ MOV R8, 0xe10 ;
/*0e00*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0e10*/ FADD R9, R9, R10 ;
/*0e20*/ MOV R7, 0x1f ;
/*0e30*/ IMAD.MOV.U32 R6, RZ, RZ, 0x2 ;
/*0e40*/ MOV R10, 0xffffffff ;
/*0e50*/ MOV R8, 0xe70 ;
/*0e60*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0e70*/ FADD R4, R9, R10 ;
/*0e80*/ MOV R7, 0x1f ;
/*0e90*/ IMAD.MOV.U32 R6, RZ, RZ, 0x1 ;
/*0ea0*/ MOV R10, 0xffffffff ;
/*0eb0*/ IMAD.MOV.U32 R9, RZ, RZ, R4 ;
/*0ec0*/ MOV R8, 0xee0 ;
/*0ed0*/ CALL.REL.NOINC `($_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly) ;
/*0ee0*/ MOV R9, R10 ;
/*0ef0*/ BRA `(.L_20) ;
.weak $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly
.type $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly,@function
.size $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly,(.L_56 - $_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly)
$_Z18sum_squared_reducePfS_PjPKfj$__cuda_sm70_shflsync_bfly:
/*0f00*/ WARPSYNC R10 ;
/*0f10*/ SHFL.BFLY PT, R10, R9, R6, R7 ;
/*0f20*/ MOV R6, R8 ;
/*0f30*/ MOV R7, 0x0 ;
/*0f40*/ RET.REL.NODEC R6 `(_Z18sum_squared_reducePfS_PjPKfj) ;
.L_21:
/*0f50*/ BRA `(.L_21);
/*0f60*/ NOP;
/*0f70*/ NOP;
.L_56: