Skip to content

Instantly share code, notes, and snippets.

@piquan
Last active March 25, 2017 05:56
Show Gist options
  • Save piquan/55453cd4abb0153978c42a605dbdc3a4 to your computer and use it in GitHub Desktop.
Save piquan/55453cd4abb0153978c42a605dbdc3a4 to your computer and use it in GitHub Desktop.
SASS disassembly of a CUDA kernel performing the 64-bit operation *c = a % b; compiled for compute capability 3.7
//--------------------- .text._Z4testmmPm --------------------------
// Kernel entry point. The mangled name _Z4testmmPm demangles to
// test(unsigned long, unsigned long, unsigned long*).
//
// Constant-bank words read by the code below:
//   c[0x0][0x140] / c[0x0][0x144]   low / high word of the dividend
//   c[0x0][0x148] / c[0x0][0x14c]   low / high word of the divisor
//   c[0x0][0x150] / c[0x0][0x154]   low / high word of the store address
// NOTE(review): presumably these are the three kernel parameters in
// declaration order -- confirm against the kernel source.
//
// Flow:
//   * If the high words of both dividend and divisor are zero, branch to
//     .L_1 and compute a 32-bit remainder inline.
//   * Otherwise load the operands into R4:R5 / R6:R7 and call the 64-bit
//     remainder subroutine, which returns its result in R4:R5.
//   * Both paths meet at .L_2, which stores R4:R5 through R2:R3 and exits.
//
// NOTE(review): the comments read ".CC" as "write the carry/condition
// code" and ".X" as "consume it"; confirm against the ISA reference.
.section .text._Z4testmmPm,"ax",@progbits
// Section info records a register count of 14 (the code uses R0..R13).
.sectioninfo @"SHI_REGISTERS=14"
.align 64
.global _Z4testmmPm
.type _Z4testmmPm,@function
// The size spans up to .L_28, so it also covers the remainder subroutine
// that follows the kernel body.
.size _Z4testmmPm,(.L_28 - _Z4testmmPm)
.other _Z4testmmPm,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z4testmmPm:
.text._Z4testmmPm:
// R1 is loaded here and never read again in this listing.
// NOTE(review): presumably the ABI stack-pointer setup -- confirm.
/*0008*/ MOV R1, c[0x0][0x44];
// --- Decide between the 32-bit fast path and the 64-bit subroutine ---
/*0010*/ MOV R0, c[0x0][0x144];  // R0 = dividend high word
/*0018*/ ISUB RZ.CC, RZ, RZ;  // 0 - 0, result discarded; only the CC write matters
/*0020*/ LOP.OR R0, R0, c[0x0][0x14c];  // R0 = dividend.hi | divisor.hi
// P0 = (R0 == 0), i.e. both high words are zero.
// NOTE(review): the .X form chains on the CC from the ISUB above; since
// that was 0 - 0 the net test appears to be a plain R0 == 0 -- confirm.
/*0028*/ ISETP.EQ.X.AND P0, PT, R0, RZ, PT;
// R2:R3 = 64-bit store address, loaded before the branch so it is
// available on both paths.
/*0030*/ MOV R2, c[0x0][0x150];
/*0038*/ MOV R3, c[0x0][0x154];
/*0048*/ @P0 BRA `(.L_1);  // both operands fit in 32 bits: take the fast path
// --- 64-bit path: marshal operands and call the remainder routine ---
/*0050*/ MOV R4, c[0x0][0x140];  // R4:R5 = dividend (lo:hi)
/*0058*/ MOV R5, c[0x0][0x144];
/*0060*/ MOV R7, c[0x0][0x14c];  // R6:R7 = divisor (lo:hi)
/*0068*/ MOV R6, c[0x0][0x148];
/*0070*/ CAL `($_Z4testmmPm$__cuda_sm20_rem_u64);  // result comes back in R4:R5
/*0078*/ BRA `(.L_2);
.L_1:
// --- 32-bit fast path: remainder via an approximate reciprocal ---
// Below, "b" is the divisor low word c[0x0][0x148] and "a" is the
// dividend low word c[0x0][0x140].
/*0088*/ I2F.F32.U32.RP R0, c[0x0] [0x148];  // R0 = (float)b, rounded toward +inf (.RP)
/*0090*/ MOV R5, RZ;  // high word of the 64-bit result is zero on this path
/*0098*/ MUFU.RCP R0, R0;  // R0 = approximate 1.0f / R0
// Integer add applied to the float's bit pattern.
// NOTE(review): looks like an exponent bias (about 2^32, minus 2 ulps) so
// the conversion below yields a fixed-point estimate of 2^32 / b that does
// not overshoot -- confirm.
/*00a0*/ IADD32I R0, R0, 0xffffffe;
/*00a8*/ F2I.FTZ.U32.F32.TRUNC R0, R0;  // R0 = truncated u32 reciprocal estimate
// One refinement step: R0 += hi32(R0 * -(R0 * b)).
/*00b0*/ IMUL.U32.U32 R4, R0, c[0x0][0x148];  // R4 = low 32 bits of estimate * b
/*00b8*/ I2I.S32.S32 R4, -R4;  // R4 = -R4
/*00c8*/ IMAD.U32.U32.HI R0, R0, R4, R0;  // R0 = hi32(R0 * R4) + R0
/*00d0*/ MOV R4, c[0x0][0x148];  // R4 = b
/*00d8*/ IMUL.U32.U32.HI R0, R0, c[0x0][0x140];  // R0 = hi32(estimate * a): quotient estimate
/*00e0*/ IMAD.U32.U32 R0, -R0, R4, c[0x0][0x140];  // R0 = a - quotient * b: remainder candidate
/*00e8*/ LOP.PASS_B R4, RZ, ~R4;  // R4 = ~b, used only by the final ICMP
// Up to two corrections: while the candidate is still >= b, subtract b.
/*00f0*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x148], PT;
/*00f8*/ @P0 ISUB R0, R0, c[0x0][0x148];
/*0108*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x148], PT;
/*0110*/ @P0 ISUB R0, R0, c[0x0][0x148];
// Final select between ~b (in R4) and the remainder (in R0), keyed on b.
// NOTE(review): presumably ICMP.EQ picks R4 when b == 0 and R0 otherwise,
// giving 0xffffffff for a zero divisor -- confirm ICMP operand semantics.
// NOTE(review): if so, a zero divisor yields 0x00000000ffffffff here, while
// the 64-bit routine yields all-ones in both words; division by zero is
// undefined at source level, so this is recorded only as an observation.
/*0118*/ ICMP.EQ.U32 R4, R4, R0, c[0x0][0x148];
.L_2:
// Common exit: 64-bit store of the register pair starting at R4 (R4:R5)
// to the address in the pair starting at R2 (R2:R3).
/*0120*/ ST.E.64 [R2], R4;
/*0128*/ EXIT;
// ---------------------------------------------------------------------
// $_Z4testmmPm$__cuda_sm20_rem_u64: 64-bit unsigned remainder helper.
//
// Register contract, as used by the single visible call site:
//   In:    R4:R5 = dividend (lo:hi), R6:R7 = divisor (lo:hi)
//   Out:   R4:R5 = remainder (lo:hi); both words are -1 when the divisor
//          is zero (see the final SELs)
//   Uses:  R0, R8-R13, P0, P1 and the carry/condition code
//   Keeps: R2:R3 are not touched here (the caller holds its store address there)
//
// Method: build a 64-bit fixed-point reciprocal estimate of the divisor
// from a float reciprocal, refine it twice, multiply by the dividend to
// get a quotient estimate, form dividend - quotient * divisor, then apply
// up to two conditional subtractions of the divisor.
//
// NOTE(review): the comments read ".CC" as "write carry" and ".X" as
// "add/subtract with the carry from the preceding .CC instruction";
// confirm against the ISA reference. The instruction order is
// carry-chain sensitive and must not be rearranged.
// ---------------------------------------------------------------------
.weak $_Z4testmmPm$__cuda_sm20_rem_u64
.type $_Z4testmmPm$__cuda_sm20_rem_u64,@function
.size $_Z4testmmPm$__cuda_sm20_rem_u64,(.L_28 - $_Z4testmmPm$__cuda_sm20_rem_u64)
$_Z4testmmPm$__cuda_sm20_rem_u64:
// --- Initial reciprocal estimate, ending in R8:R9 (lo:hi) ---
/*0130*/ I2F.F32.U64.RP R0, R6;  // R0 = (float)divisor (R6:R7), rounded toward +inf
/*0138*/ MUFU.RCP R0, R0;  // R0 = approximate 1.0f / R0
// Integer add applied to the float's bit pattern.
// NOTE(review): looks like an exponent bias (about 2^64, minus 2 ulps) so
// the U64 conversion yields an estimate of 2^64 / divisor -- confirm.
/*0148*/ IADD32I R0, R0, 0x1ffffffe;
/*0150*/ F2I.U64.F32.TRUNC R8, R0;  // R8:R9 = truncated 64-bit estimate
// --- Refinement pass 1: est += hi64(est * -(est * divisor)) ---
// First form R12:R13 = low 64 bits of -(est * divisor).
/*0158*/ IMUL.U32.U32.HI R11, R8, R6;  // hi32(est.lo * div.lo)
/*0160*/ IMAD.U32.U32 R0, R8, R7, R11;  // + est.lo * div.hi
/*0168*/ IMAD.U32.U32 R12.CC, -R8, R6, RZ;  // R12 = low word of -(est.lo * div.lo), sets carry
/*0170*/ IMAD.U32.U32 R0, R9, R6, R0;  // + est.hi * div.lo: R0 = high word of est * divisor
/*0178*/ IMUL.U32.U32.HI R11, R8, R12;  // first partial product of est * (R12:R13)
/*0188*/ IADD.X R13, -R0, RZ;  // R13 = -R0 with the carry from the R12 computation
// Accumulate the upper 64 bits of est * (R12:R13) on top of est itself.
/*0190*/ IMAD.U32.U32 R0.CC, R8, R13, R11;
/*0198*/ IMAD.U32.U32.HI.X R8.CC, R8, R13, R8;
/*01a0*/ IMAD.U32.U32.HI.X R10, R9, R13, R9;
/*01a8*/ IMAD.U32.U32 RZ.CC, R9, R12, R0;  // result discarded; only the carry is needed
/*01b0*/ IMAD.U32.U32.HI.X R0.CC, R9, R12, R8;
/*01b8*/ IADD.X R8, R10, RZ;
/*01c8*/ IMAD.U32.U32 R0.CC, R9, R13, R0;  // R0 = refined estimate, low word
// --- Refinement pass 2: same shape, estimate now in R0 (lo) / R8 (hi) ---
// The first multiply of pass 2 is interleaved before pass 1's last carry add.
/*01d0*/ IMUL.U32.U32.HI R9, R0, R6;  // hi32(est.lo * div.lo)
/*01d8*/ IADD.X R8, R8, RZ;  // R8 = refined estimate, high word (end of pass 1)
/*01e0*/ IMAD.U32.U32 R9, R0, R7, R9;  // + est.lo * div.hi
/*01e8*/ IMAD.U32.U32 R12.CC, -R0, R6, RZ;  // R12 = low word of -(est.lo * div.lo), sets carry
/*01f0*/ IMAD.U32.U32 R9, R8, R6, R9;  // + est.hi * div.lo: R9 = high word of est * divisor
/*01f8*/ IMUL.U32.U32.HI R11, R0, R12;
/*0208*/ IADD.X R13, -R9, RZ;  // R12:R13 = low 64 bits of -(est * divisor)
/*0210*/ IMAD.U32.U32 R9.CC, R0, R13, R11;
/*0218*/ IMAD.U32.U32.HI.X R11.CC, R0, R13, R0;
/*0220*/ IMAD.U32.U32.HI.X R0, R8, R13, R8;
/*0228*/ IMAD.U32.U32 RZ.CC, R8, R12, R9;  // result discarded; only the carry is needed
/*0230*/ IMAD.U32.U32.HI.X R9.CC, R8, R12, R11;
/*0238*/ IADD.X R10, R0, RZ;
/*0248*/ IMAD.U32.U32 R0.CC, R8, R13, R9;  // R0 = final estimate, low word
// --- Quotient estimate: q = hi64(est * dividend), ending in R0 (lo) / R8 (hi) ---
/*0250*/ IMUL.U32.U32.HI R9, R0, R4;  // hi32(est.lo * dividend.lo)
/*0258*/ IADD.X R8, R10, RZ;  // R8 = final estimate, high word (end of pass 2)
/*0260*/ IMAD.U32.U32 R9.CC, R0, R5, R9;
/*0268*/ IMAD.U32.U32.HI.X R11, R0, R5, RZ;
/*0270*/ IMAD.U32.U32 RZ.CC, R8, R4, R9;  // result discarded; only the carry is needed
/*0278*/ IMAD.U32.U32.HI.X R9.CC, R8, R4, R11;
/*0288*/ IMAD.U32.U32.HI.X R10, R8, R5, RZ;
/*0290*/ IMAD.U32.U32 R0.CC, R8, R5, R9;  // R0 = q, low word
// --- Remainder candidate: dividend - q * divisor, ending in R10 (lo) / R8 (hi) ---
/*0298*/ IMUL.U32.U32.HI R9, R0, R6;  // hi32(q.lo * div.lo)
/*02a0*/ IADD.X R8, R10, RZ;  // R8 = q, high word
/*02a8*/ IMAD.U32.U32 R10.CC, -R0, R6, R4;  // R10 = dividend.lo - q.lo * div.lo, sets carry
/*02b0*/ IMAD.U32.U32 R9, R0, R7, R9;  // + q.lo * div.hi
/*02b8*/ IMAD.U32.U32 R0, R8, R6, R9;  // + q.hi * div.lo: R0 = high word of q * divisor
/*02c8*/ ISUB.X R8, R5, R0;  // R8 = dividend.hi - R0, with the borrow from the low word
// --- Correction 1: if remainder >= divisor (64-bit compare), subtract it ---
/*02d0*/ ISUB RZ.CC, R10, R6;  // low-word compare; only the carry is kept
/*02d8*/ ISETP.GE.U32.X.AND P0, PT, R8, R7, PT;  // P0 = (R8:R10 >= R7:R6)
// P1 = (divisor == 0), evaluated as a 64-bit compare against zero;
// consumed only by the final SELs.
/*02e0*/ ISUB RZ.CC, R6, RZ;
/*02e8*/ ISETP.EQ.X.AND P1, PT, R7, RZ, PT;
/*02f0*/ @P0 ISUB R10.CC, R10, R6;
/*02f8*/ @P0 ISUB.X R8, R8, R7;
// --- Correction 2: same test again, low word chosen with SEL ---
/*0308*/ ISUB R4.CC, R10, R6;  // R4 = tentative low word after a second subtraction
/*0310*/ ISETP.GE.U32.X.AND P0, PT, R8, R7, PT;  // P0 = (remainder still >= divisor)
/*0318*/ SEL R0, R4, R10, P0;  // R0 = P0 ? R4 : R10
/*0320*/ @P0 ISUB.X R8, R8, R7;  // NOTE(review): relies on the carry from the ISUB at 0308 still being live -- confirm
// --- Result: the remainder, or -1 in both words when the divisor is zero ---
/*0328*/ SEL R4, R0, -0x1, !P1;  // R4 = !P1 ? R0 : -1
/*0330*/ SEL R5, R8, -0x1, !P1;  // R5 = !P1 ? R8 : -1
/*0338*/ RET;
// Branch-to-self placed after the RET; nothing else in this listing
// targets .L_3, so it is not reached by normal control flow.
.L_3:
/*0340*/ BRA `(.L_3);
// End-of-code label referenced by both .size directives in this listing.
.L_28:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment