Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active December 10, 2015 08:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/4409111 to your computer and use it in GitHub Desktop.
Save allanmac/4409111 to your computer and use it in GitHub Desktop.
// Codegen probe: evaluates a fixed nine-step multiply/add chain per thread
// using the explicit round-to-nearest intrinsics __fmul_rn / __fadd_rn, so
// that each multiply and each add stays a separate instruction instead of
// being contracted into a fused multiply-add.
//
// values : device buffer, read and updated in place.
//          Thread t loads values[t] as the multiplier and values[2*t] as the
//          running value, then stores the final result back to values[2*t].
//
// Preconditions visible from the indexing:
//   * Only threadIdx.x is used, so every block touches the same elements;
//     the kernel is written for a single 1-D block.
//   * There is no bounds guard: the buffer must hold at least
//     2 * (blockDim.x - 1) + 1 floats.
//   * Thread k stores to values[2*k], which is the element thread 2*k loads
//     as its multiplier, and nothing synchronizes the two accesses.
//     NOTE(review): fine for inspecting generated code; do not rely on the
//     numeric output unless that aliasing is acceptable to the caller.
__global__
void fmuladdTest(float* const values)
{
    const unsigned int lane = threadIdx.x;
    const unsigned int slot = 2 * lane;

    // Multiplier is loaded first, then the accumulator seed.
    const float scale = values[lane];
    float acc = values[slot];

    // Each step is acc = round(round(acc * scale) + c): the product is
    // rounded on its own before the constant is added.
    acc = __fadd_rn(__fmul_rn(acc, scale), 0.73f);
    acc = __fadd_rn(__fmul_rn(acc, scale), 0.37f);
    acc = __fadd_rn(__fmul_rn(acc, scale), 0.24f);
    acc = __fadd_rn(__fmul_rn(acc, scale), 0.57f);
    acc = __fadd_rn(__fmul_rn(acc, scale), 0.58f);
    acc = __fadd_rn(__fmul_rn(acc, scale), 0.14f);
    acc = __fadd_rn(__fmul_rn(acc, scale), 0.74f);
    acc = __fadd_rn(__fmul_rn(acc, scale), 0.48f);
    acc = __fadd_rn(__fmul_rn(acc, scale), 0.21f);

    values[slot] = acc;
}
@allanmac
Copy link
Author

cuobjdump -sass fmuladd.cubin

code for sm_30
    Function : _Z7fmaTestPf
/*0008*/     /*0x10005de428004001*/     MOV R1, c [0x0] [0x44];
/*0010*/     /*0x84001c042c000000*/     S2R R0, SR_Tid_X;
/*0018*/     /*0x00011de428004005*/     MOV R4, c [0x0] [0x140];
/*0020*/     /*0x1000dc435000c000*/     IMUL.U32.U32.HI R3, R0, 0x4;
/*0028*/     /*0x04009c036000c000*/     SHL R2, R0, 0x1;
/*0030*/     /*0x10019c032009c000*/     IMAD.U32.U32 R6.CC, R0, 0x4, R4;
/*0038*/     /*0x10201c435000c000*/     IMUL.U32.U32.HI R0, R2, 0x4;
/*0048*/     /*0x1031dc4348004005*/     IADD.X R7, R3, c [0x0] [0x144];
/*0050*/     /*0x10209c032009c000*/     IMAD.U32.U32 R2.CC, R2, 0x4, R4;
/*0058*/     /*0x1000dc4348004005*/     IADD.X R3, R0, c [0x0] [0x144];
/*0060*/     /*0x00601c8584000000*/     LD.E R0, [R6];
/*0068*/     /*0x00211c8584000000*/     LD.E R4, [R2];
/*0070*/     /*0x00411c0058000000*/     FMUL R4, R4, R0;
/*0078*/     /*0x20411c0228fceb85*/     FADD32I R4, R4, 0x3f3ae148;
/*0088*/     /*0x00411c0058000000*/     FMUL R4, R4, R0;
/*0090*/     /*0x90411c0228faf5c2*/     FADD32I R4, R4, 0x3ebd70a4;
/*0098*/     /*0x00411c0058000000*/     FMUL R4, R4, R0;
/*00a0*/     /*0x3c411c0228f9d70a*/     FADD32I R4, R4, 0x3e75c28f;
/*00a8*/     /*0x00411c0058000000*/     FMUL R4, R4, R0;
/*00b0*/     /*0x14411c0228fc47ae*/     FADD32I R4, R4, 0x3f11eb85;
/*00b8*/     /*0x00411c0058000000*/     FMUL R4, R4, R0;
/*00c8*/     /*0x84411c0228fc51eb*/     FADD32I R4, R4, 0x3f147ae1;
/*00d0*/     /*0x00411c0058000000*/     FMUL R4, R4, R0;
/*00d8*/     /*0xa4411c0228f83d70*/     FADD32I R4, R4, 0x3e0f5c29;
/*00e0*/     /*0x00411c0058000000*/     FMUL R4, R4, R0;
/*00e8*/     /*0x90411c0228fcf5c2*/     FADD32I R4, R4, 0x3f3d70a4;
/*00f0*/     /*0x00411c0058000000*/     FMUL R4, R4, R0;
/*00f8*/     /*0x3c411c0228fbd70a*/     FADD32I R4, R4, 0x3ef5c28f;
/*0108*/     /*0x00401c0058000000*/     FMUL R0, R4, R0;
/*0110*/     /*0xf4001c0228f95c28*/     FADD32I R0, R0, 0x3e570a3d;
/*0118*/     /*0x00201c8594000000*/     ST.E [R2], R0;
/*0120*/     /*0x00001de780000000*/     EXIT;
/*0128*/     /*0xe0001de74003ffff*/     BRA 0x128;
/*0130*/     /*0x00001de440000000*/     NOP CC.T;
/*0138*/     /*0x00001de440000000*/     NOP CC.T;
    .............................

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment