Last active
March 25, 2017 05:56
-
-
Save piquan/55453cd4abb0153978c42a605dbdc3a4 to your computer and use it in GitHub Desktop.
CUDA compile of 64-bit *c = a % b; on compute capability 3.7
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Kernel entry _Z4testmmPm (Itanium mangling of test(unsigned long, unsigned long, unsigned long*)).
// Stores the 64-bit unsigned remainder of its first argument by its second through the
// pointer passed as its third argument.
//
// Constant-bank operands as they are used below:
//   c[0x0][0x140] / c[0x0][0x144] : dividend, low / high 32-bit word
//   c[0x0][0x148] / c[0x0][0x14c] : divisor,  low / high 32-bit word
//   c[0x0][0x150] / c[0x0][0x154] : destination address, low / high 32-bit word
//
// Two paths:
//   * both high words zero -> inline 32-bit remainder at .L_1
//   * otherwise            -> call $_Z4testmmPm$__cuda_sm20_rem_u64 (R4:R5 = dividend, R6:R7 = divisor)
// Both paths leave the result in R4:R5 and meet at .L_2, which performs the store.
//--------------------- .text._Z4testmmPm -------------------------- | |
.section .text._Z4testmmPm,"ax",@progbits | |
.sectioninfo @"SHI_REGISTERS=14" | |
.align 64 | |
.global _Z4testmmPm | |
.type _Z4testmmPm,@function | |
// The declared size runs to .L_28, so it also covers the rem_u64 helper that follows the kernel body.
.size _Z4testmmPm,(.L_28 - _Z4testmmPm) | |
.other _Z4testmmPm,@"STO_CUDA_ENTRY STV_DEFAULT" | |
_Z4testmmPm: | |
.text._Z4testmmPm: | |
// R1 is loaded here and never read again in this listing.
// NOTE(review): presumably the usual stack-pointer initialisation - confirm against the ABI docs.
/*0008*/ MOV R1, c[0x0][0x44]; | |
// P0 = ((dividend.hi | divisor.hi) == 0), i.e. both operands fit in 32 bits.
// The ISUB of RZ - RZ only writes the condition code that the .X compare consumes.
/*0010*/ MOV R0, c[0x0][0x144]; | |
/*0018*/ ISUB RZ.CC, RZ, RZ; | |
/*0020*/ LOP.OR R0, R0, c[0x0][0x14c]; | |
/*0028*/ ISETP.EQ.X.AND P0, PT, R0, RZ, PT; | |
// R2:R3 = destination address, loaded before the branch because both paths need it.
/*0030*/ MOV R2, c[0x0][0x150]; | |
/*0038*/ MOV R3, c[0x0][0x154]; | |
/*0048*/ @P0 BRA `(.L_1); | |
// 64-bit path: marshal R4:R5 = dividend and R6:R7 = divisor, then call the helper.
// The helper returns the remainder in R4:R5.
/*0050*/ MOV R4, c[0x0][0x140]; | |
/*0058*/ MOV R5, c[0x0][0x144]; | |
/*0060*/ MOV R7, c[0x0][0x14c]; | |
/*0068*/ MOV R6, c[0x0][0x148]; | |
/*0070*/ CAL `($_Z4testmmPm$__cuda_sm20_rem_u64); | |
/*0078*/ BRA `(.L_2); | |
// 32-bit fast path: remainder of dividend.lo by divisor.lo via a float reciprocal estimate.
.L_1: | |
// R0 = (float)divisor.lo, converted with the .RP rounding modifier.
/*0088*/ I2F.F32.U32.RP R0, c[0x0] [0x148]; | |
// High word of the 64-bit result is zero on this path.
/*0090*/ MOV R5, RZ; | |
/*0098*/ MUFU.RCP R0, R0; | |
// Integer add on the float's bit pattern: 0x0ffffffe = (32 << 23) - 2, so this raises the
// exponent field by 32 (scaling by 2^32) and backs the value off by two units in the last place.
// After truncation R0 holds an integer estimate of 2^32 / divisor.lo.
/*00a0*/ IADD32I R0, R0, 0xffffffe; | |
/*00a8*/ F2I.FTZ.U32.F32.TRUNC R0, R0; | |
// One refinement step: R4 = -(R0 * divisor.lo) (low 32 bits), then R0 += hi32(R0 * R4).
/*00b0*/ IMUL.U32.U32 R4, R0, c[0x0][0x148]; | |
/*00b8*/ I2I.S32.S32 R4, -R4; | |
/*00c8*/ IMAD.U32.U32.HI R0, R0, R4, R0; | |
/*00d0*/ MOV R4, c[0x0][0x148]; | |
// Quotient estimate q = hi32(R0 * dividend.lo); remainder estimate R0 = dividend.lo - q * divisor.lo.
/*00d8*/ IMUL.U32.U32.HI R0, R0, c[0x0][0x140]; | |
/*00e0*/ IMAD.U32.U32 R0, -R0, R4, c[0x0][0x140]; | |
// R4 = ~divisor.lo; it is only selected as the result when the divisor is zero (see ICMP below).
/*00e8*/ LOP.PASS_B R4, RZ, ~R4; | |
// Two correction rounds: while the remainder estimate is still >= divisor.lo, subtract divisor.lo.
/*00f0*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x148], PT; | |
/*00f8*/ @P0 ISUB R0, R0, c[0x0][0x148]; | |
/*0108*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x148], PT; | |
/*0110*/ @P0 ISUB R0, R0, c[0x0][0x148]; | |
// Final select on the third operand: R4 = (divisor.lo == 0) ? R4 (= ~0) : R0 (the remainder).
// NOTE(review): operand roles of ICMP are taken from general SASS knowledge - confirm.
// With a zero divisor this path therefore stores 0x00000000ffffffff (R5 was cleared above),
// whereas the 64-bit helper returns 0xffffffff in both words.
/*0118*/ ICMP.EQ.U32 R4, R4, R0, c[0x0][0x148]; | |
// Common exit: write the 64-bit result R4:R5 to the address in R2:R3 and terminate the thread.
.L_2: | |
/*0120*/ ST.E.64 [R2], R4; | |
/*0128*/ EXIT; | |
// $_Z4testmmPm$__cuda_sm20_rem_u64: unsigned 64-bit remainder helper reached via CAL from the kernel.
// In:     R4:R5 = dividend (lo:hi), R6:R7 = divisor (lo:hi)
// Out:    R4:R5 = dividend % divisor; both words are 0xffffffff when the divisor is zero
// Writes: R0, R4, R5, R8-R13, P0, P1 and the condition code. R6:R7 are only read.
//         No loads or stores appear in the routine.
//
// Method: build an integer estimate of 2^64 / divisor from a float reciprocal, refine it twice,
// multiply by the dividend to get a quotient estimate, form dividend - q * divisor, then apply up
// to two conditional subtractions of the divisor. Multi-word arithmetic is carried through the
// .CC (write carry) / .X (consume carry) modifiers. Instructions belonging to adjacent steps are
// interleaved, so the step comments below name the registers each step produces.
.weak $_Z4testmmPm$__cuda_sm20_rem_u64 | |
.type $_Z4testmmPm$__cuda_sm20_rem_u64,@function | |
.size $_Z4testmmPm$__cuda_sm20_rem_u64,(.L_28 - $_Z4testmmPm$__cuda_sm20_rem_u64) | |
$_Z4testmmPm$__cuda_sm20_rem_u64: | |
// Step 1 - initial reciprocal. R0 = (float)divisor (U64 source, .RP rounding modifier), then 1/R0.
/*0130*/ I2F.F32.U64.RP R0, R6; | |
/*0138*/ MUFU.RCP R0, R0; | |
// Integer add on the float's bit pattern: 0x1ffffffe = (64 << 23) - 2, i.e. exponent field +64
// (scaling by 2^64) minus two units in the last place. Truncated to U64 into R8:R9.
/*0148*/ IADD32I R0, R0, 0x1ffffffe; | |
/*0150*/ F2I.U64.F32.TRUNC R8, R0; | |
// Step 2 - first refinement of the estimate x = R8:R9.
//   R12:R13 = -(x * divisor), low 64 bits (R12 low word, R13 high word)
//   x      += high 64 bits of (x * R12:R13)
// Result: low word in R0 (written at /*01c8*/), high word in R8 (written at /*01d8*/).
/*0158*/ IMUL.U32.U32.HI R11, R8, R6; | |
/*0160*/ IMAD.U32.U32 R0, R8, R7, R11; | |
/*0168*/ IMAD.U32.U32 R12.CC, -R8, R6, RZ; | |
/*0170*/ IMAD.U32.U32 R0, R9, R6, R0; | |
/*0178*/ IMUL.U32.U32.HI R11, R8, R12; | |
/*0188*/ IADD.X R13, -R0, RZ; | |
/*0190*/ IMAD.U32.U32 R0.CC, R8, R13, R11; | |
/*0198*/ IMAD.U32.U32.HI.X R8.CC, R8, R13, R8; | |
/*01a0*/ IMAD.U32.U32.HI.X R10, R9, R13, R9; | |
// The product's low word is discarded (destination RZ); only its carry-out is kept.
/*01a8*/ IMAD.U32.U32 RZ.CC, R9, R12, R0; | |
/*01b0*/ IMAD.U32.U32.HI.X R0.CC, R9, R12, R8; | |
/*01b8*/ IADD.X R8, R10, RZ; | |
/*01c8*/ IMAD.U32.U32 R0.CC, R9, R13, R0; | |
// Step 3 - second refinement, same shape as step 2, now with x = R0 (low) : R8 (high).
// Its first instruction (/*01d0*/) is issued before step 2's final carry add (/*01d8*/).
// Result: low word in R0 (written at /*0248*/), high word in R8 (written at /*0258*/).
/*01d0*/ IMUL.U32.U32.HI R9, R0, R6; | |
/*01d8*/ IADD.X R8, R8, RZ; | |
/*01e0*/ IMAD.U32.U32 R9, R0, R7, R9; | |
/*01e8*/ IMAD.U32.U32 R12.CC, -R0, R6, RZ; | |
/*01f0*/ IMAD.U32.U32 R9, R8, R6, R9; | |
/*01f8*/ IMUL.U32.U32.HI R11, R0, R12; | |
/*0208*/ IADD.X R13, -R9, RZ; | |
/*0210*/ IMAD.U32.U32 R9.CC, R0, R13, R11; | |
/*0218*/ IMAD.U32.U32.HI.X R11.CC, R0, R13, R0; | |
/*0220*/ IMAD.U32.U32.HI.X R0, R8, R13, R8; | |
/*0228*/ IMAD.U32.U32 RZ.CC, R8, R12, R9; | |
/*0230*/ IMAD.U32.U32.HI.X R9.CC, R8, R12, R11; | |
/*0238*/ IADD.X R10, R0, RZ; | |
/*0248*/ IMAD.U32.U32 R0.CC, R8, R13, R9; | |
// Step 4 - quotient estimate q = high 64 bits of (x * dividend), with dividend = R4:R5.
// Result: low word in R0 (written at /*0290*/), high word in R8 (written at /*02a0*/).
/*0250*/ IMUL.U32.U32.HI R9, R0, R4; | |
/*0258*/ IADD.X R8, R10, RZ; | |
/*0260*/ IMAD.U32.U32 R9.CC, R0, R5, R9; | |
/*0268*/ IMAD.U32.U32.HI.X R11, R0, R5, RZ; | |
/*0270*/ IMAD.U32.U32 RZ.CC, R8, R4, R9; | |
/*0278*/ IMAD.U32.U32.HI.X R9.CC, R8, R4, R11; | |
/*0288*/ IMAD.U32.U32.HI.X R10, R8, R5, RZ; | |
/*0290*/ IMAD.U32.U32 R0.CC, R8, R5, R9; | |
// Step 5 - remainder estimate r = dividend - q * divisor (low 64 bits).
// Result: low word in R10, high word in R8 (R5 minus the high word of q * divisor, with borrow).
/*0298*/ IMUL.U32.U32.HI R9, R0, R6; | |
/*02a0*/ IADD.X R8, R10, RZ; | |
/*02a8*/ IMAD.U32.U32 R10.CC, -R0, R6, R4; | |
/*02b0*/ IMAD.U32.U32 R9, R0, R7, R9; | |
/*02b8*/ IMAD.U32.U32 R0, R8, R6, R9; | |
/*02c8*/ ISUB.X R8, R5, R0; | |
// Step 6 - first correction test: P0 = (r >= divisor) as an unsigned 64-bit compare.
// The low-word subtract only sets the condition code; the .X compare of the high words consumes it.
/*02d0*/ ISUB RZ.CC, R10, R6; | |
/*02d8*/ ISETP.GE.U32.X.AND P0, PT, R8, R7, PT; | |
// P1 = (divisor == 0), formed the same way from low and high words; used by the final selects.
/*02e0*/ ISUB RZ.CC, R6, RZ; | |
/*02e8*/ ISETP.EQ.X.AND P1, PT, R7, RZ, PT; | |
// If r >= divisor, subtract the divisor once (low word, then high word with borrow).
/*02f0*/ @P0 ISUB R10.CC, R10, R6; | |
/*02f8*/ @P0 ISUB.X R8, R8, R7; | |
// Step 7 - second correction: compute r.lo - divisor.lo speculatively into R4, retest
// r >= divisor into P0, then keep either the subtracted or the unsubtracted low word in R0
// and subtract the high word only when P0 is set.
/*0308*/ ISUB R4.CC, R10, R6; | |
/*0310*/ ISETP.GE.U32.X.AND P0, PT, R8, R7, PT; | |
/*0318*/ SEL R0, R4, R10, P0; | |
/*0320*/ @P0 ISUB.X R8, R8, R7; | |
// Step 8 - result selection: R4:R5 = remainder when the divisor is non-zero (!P1), else -1 in both words.
/*0328*/ SEL R4, R0, -0x1, !P1; | |
/*0330*/ SEL R5, R8, -0x1, !P1; | |
/*0338*/ RET; | |
// Branch-to-self placed after RET; nothing in this listing branches to .L_3.
// NOTE(review): presumably a guard/padding loop emitted by the toolchain - confirm.
.L_3: | |
/*0340*/ BRA `(.L_3); | |
// End-of-code label referenced by the .size directives of both the kernel and this helper.
.L_28: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment