// NOTE: the lines below are text captured from the gist web page, not PTX.
// Created
// May 21, 2021 02:59
// -
// -
// Save cheshire/2b1c002b61dec1a173163424602914f9 to your computer and use it in GitHub Desktop.
// This file has been truncated, but you can view the full file.
// This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters
//
// Generated by LLVM NVPTX Back-End
//
// Module-level directives: PTX ISA version 6.0, target architecture sm_70,
// 64-bit addresses.
.version 6.0
.target sm_70
.address_size 64
// .globl fusion_2287
// (the fusion_2287 entry itself is defined after the module-scope
// declarations below)

// Two 128-byte, 4-byte-aligned arrays in the .shared state space.
// fusion_2287, fusion_2285 and fusion_2286 do not reference them.
.shared .align 4 .b8 shared_cache_0[128];
.shared .align 4 .b8 shared_cache_01[128];

// 16 bytes of initialised, 16-byte-aligned global memory. Declared without
// .visible; only the first four bytes are non-zero.
// NOTE(review): the name suggests random-number-generator state; the code
// that consumes it is not visible in this part of the file -- confirm.
.global .align 16 .b8 rng_state[16] = {149, 35, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

// 8 bytes of initialised, externally visible global memory, 64-byte aligned.
// NOTE(review): read as a little-endian 64-bit integer these bytes encode 32;
// the element type is not declared here -- confirm against the consumer.
.visible .global .align 64 .b8 buffer_for_constant_217[8] = {32, 0, 0, 0, 0, 0, 0, 0};
// Statically sized byte arrays in the .shared state space, all 4-byte
// aligned. shared_cache_02 .. shared_cache_0145 are 128 bytes each;
// shared_cache_0146 and shared_cache_0147 are 4224 bytes each.
// fusion_2287, fusion_2285 and fusion_2286 do not reference any of them.
// NOTE(review): 4224 = 32 * 33 * 4, which would fit a 32x33 tile of 4-byte
// elements -- presumably a padded tile; confirm against the kernels that
// use these buffers.
.shared .align 4 .b8 shared_cache_02[128];
.shared .align 4 .b8 shared_cache_03[128];
.shared .align 4 .b8 shared_cache_04[128];
.shared .align 4 .b8 shared_cache_05[128];
.shared .align 4 .b8 shared_cache_06[128];
.shared .align 4 .b8 shared_cache_07[128];
.shared .align 4 .b8 shared_cache_08[128];
.shared .align 4 .b8 shared_cache_09[128];
.shared .align 4 .b8 shared_cache_010[128];
.shared .align 4 .b8 shared_cache_011[128];
.shared .align 4 .b8 shared_cache_012[128];
.shared .align 4 .b8 shared_cache_013[128];
.shared .align 4 .b8 shared_cache_014[128];
.shared .align 4 .b8 shared_cache_015[128];
.shared .align 4 .b8 shared_cache_016[128];
.shared .align 4 .b8 shared_cache_017[128];
.shared .align 4 .b8 shared_cache_018[128];
.shared .align 4 .b8 shared_cache_019[128];
.shared .align 4 .b8 shared_cache_020[128];
.shared .align 4 .b8 shared_cache_021[128];
.shared .align 4 .b8 shared_cache_022[128];
.shared .align 4 .b8 shared_cache_023[128];
.shared .align 4 .b8 shared_cache_024[128];
.shared .align 4 .b8 shared_cache_025[128];
.shared .align 4 .b8 shared_cache_026[128];
.shared .align 4 .b8 shared_cache_027[128];
.shared .align 4 .b8 shared_cache_028[128];
.shared .align 4 .b8 shared_cache_029[128];
.shared .align 4 .b8 shared_cache_030[128];
.shared .align 4 .b8 shared_cache_031[128];
.shared .align 4 .b8 shared_cache_032[128];
.shared .align 4 .b8 shared_cache_033[128];
.shared .align 4 .b8 shared_cache_034[128];
.shared .align 4 .b8 shared_cache_035[128];
.shared .align 4 .b8 shared_cache_036[128];
.shared .align 4 .b8 shared_cache_037[128];
.shared .align 4 .b8 shared_cache_038[128];
.shared .align 4 .b8 shared_cache_039[128];
.shared .align 4 .b8 shared_cache_040[128];
.shared .align 4 .b8 shared_cache_041[128];
.shared .align 4 .b8 shared_cache_042[128];
.shared .align 4 .b8 shared_cache_043[128];
.shared .align 4 .b8 shared_cache_044[128];
.shared .align 4 .b8 shared_cache_045[128];
.shared .align 4 .b8 shared_cache_046[128];
.shared .align 4 .b8 shared_cache_047[128];
.shared .align 4 .b8 shared_cache_048[128];
.shared .align 4 .b8 shared_cache_049[128];
.shared .align 4 .b8 shared_cache_050[128];
.shared .align 4 .b8 shared_cache_051[128];
.shared .align 4 .b8 shared_cache_052[128];
.shared .align 4 .b8 shared_cache_053[128];
.shared .align 4 .b8 shared_cache_054[128];
.shared .align 4 .b8 shared_cache_055[128];
.shared .align 4 .b8 shared_cache_056[128];
.shared .align 4 .b8 shared_cache_057[128];
.shared .align 4 .b8 shared_cache_058[128];
.shared .align 4 .b8 shared_cache_059[128];
.shared .align 4 .b8 shared_cache_060[128];
.shared .align 4 .b8 shared_cache_061[128];
.shared .align 4 .b8 shared_cache_062[128];
.shared .align 4 .b8 shared_cache_063[128];
.shared .align 4 .b8 shared_cache_064[128];
.shared .align 4 .b8 shared_cache_065[128];
.shared .align 4 .b8 shared_cache_066[128];
.shared .align 4 .b8 shared_cache_067[128];
.shared .align 4 .b8 shared_cache_068[128];
.shared .align 4 .b8 shared_cache_069[128];
.shared .align 4 .b8 shared_cache_070[128];
.shared .align 4 .b8 shared_cache_071[128];
.shared .align 4 .b8 shared_cache_072[128];
.shared .align 4 .b8 shared_cache_073[128];
.shared .align 4 .b8 shared_cache_074[128];
.shared .align 4 .b8 shared_cache_075[128];
.shared .align 4 .b8 shared_cache_076[128];
.shared .align 4 .b8 shared_cache_077[128];
.shared .align 4 .b8 shared_cache_078[128];
.shared .align 4 .b8 shared_cache_079[128];
.shared .align 4 .b8 shared_cache_080[128];
.shared .align 4 .b8 shared_cache_081[128];
.shared .align 4 .b8 shared_cache_082[128];
.shared .align 4 .b8 shared_cache_083[128];
.shared .align 4 .b8 shared_cache_084[128];
.shared .align 4 .b8 shared_cache_085[128];
.shared .align 4 .b8 shared_cache_086[128];
.shared .align 4 .b8 shared_cache_087[128];
.shared .align 4 .b8 shared_cache_088[128];
.shared .align 4 .b8 shared_cache_089[128];
.shared .align 4 .b8 shared_cache_090[128];
.shared .align 4 .b8 shared_cache_091[128];
.shared .align 4 .b8 shared_cache_092[128];
.shared .align 4 .b8 shared_cache_093[128];
.shared .align 4 .b8 shared_cache_094[128];
.shared .align 4 .b8 shared_cache_095[128];
.shared .align 4 .b8 shared_cache_096[128];
.shared .align 4 .b8 shared_cache_097[128];
.shared .align 4 .b8 shared_cache_098[128];
.shared .align 4 .b8 shared_cache_099[128];
.shared .align 4 .b8 shared_cache_0100[128];
.shared .align 4 .b8 shared_cache_0101[128];
.shared .align 4 .b8 shared_cache_0102[128];
.shared .align 4 .b8 shared_cache_0103[128];
.shared .align 4 .b8 shared_cache_0104[128];
.shared .align 4 .b8 shared_cache_0105[128];
.shared .align 4 .b8 shared_cache_0106[128];
.shared .align 4 .b8 shared_cache_0107[128];
.shared .align 4 .b8 shared_cache_0108[128];
.shared .align 4 .b8 shared_cache_0109[128];
.shared .align 4 .b8 shared_cache_0110[128];
.shared .align 4 .b8 shared_cache_0111[128];
.shared .align 4 .b8 shared_cache_0112[128];
.shared .align 4 .b8 shared_cache_0113[128];
.shared .align 4 .b8 shared_cache_0114[128];
.shared .align 4 .b8 shared_cache_0115[128];
.shared .align 4 .b8 shared_cache_0116[128];
.shared .align 4 .b8 shared_cache_0117[128];
.shared .align 4 .b8 shared_cache_0118[128];
.shared .align 4 .b8 shared_cache_0119[128];
.shared .align 4 .b8 shared_cache_0120[128];
.shared .align 4 .b8 shared_cache_0121[128];
.shared .align 4 .b8 shared_cache_0122[128];
.shared .align 4 .b8 shared_cache_0123[128];
.shared .align 4 .b8 shared_cache_0124[128];
.shared .align 4 .b8 shared_cache_0125[128];
.shared .align 4 .b8 shared_cache_0126[128];
.shared .align 4 .b8 shared_cache_0127[128];
.shared .align 4 .b8 shared_cache_0128[128];
.shared .align 4 .b8 shared_cache_0129[128];
.shared .align 4 .b8 shared_cache_0130[128];
.shared .align 4 .b8 shared_cache_0131[128];
.shared .align 4 .b8 shared_cache_0132[128];
.shared .align 4 .b8 shared_cache_0133[128];
.shared .align 4 .b8 shared_cache_0134[128];
.shared .align 4 .b8 shared_cache_0135[128];
.shared .align 4 .b8 shared_cache_0136[128];
.shared .align 4 .b8 shared_cache_0137[128];
.shared .align 4 .b8 shared_cache_0138[128];
.shared .align 4 .b8 shared_cache_0139[128];
.shared .align 4 .b8 shared_cache_0140[128];
.shared .align 4 .b8 shared_cache_0141[128];
.shared .align 4 .b8 shared_cache_0142[128];
.shared .align 4 .b8 shared_cache_0143[128];
.shared .align 4 .b8 shared_cache_0144[128];
.shared .align 4 .b8 shared_cache_0145[128];
.shared .align 4 .b8 shared_cache_0146[4224];
.shared .align 4 .b8 shared_cache_0147[4224];
// 2 bytes of initialised, externally visible global memory, 64-byte aligned.
// NOTE(review): the bytes {0, 252} form 0xFC00 as a little-endian 16-bit
// word, which is the bit pattern of -infinity in IEEE half precision --
// presumably an fp16 constant; confirm against the consumer.
.visible .global .align 64 .b8 buffer_for_constant_6681[2] = {0, 252};

// More .shared byte arrays. Sizes and alignments vary in this group:
// shared_cache_0150 is 64 bytes / 2-byte aligned, shared_cache_0153 is
// 256 bytes / 8-byte aligned, shared_cache_0158 is 4224 bytes, and the rest
// are 128 bytes / 4-byte aligned.
// fusion_2287, fusion_2285 and fusion_2286 do not reference any of them.
.shared .align 4 .b8 shared_cache_0148[128];
.shared .align 4 .b8 shared_cache_0149[128];
.shared .align 2 .b8 shared_cache_0150[64];
.shared .align 4 .b8 shared_cache_0151[128];
.shared .align 4 .b8 shared_cache_0152[128];
.shared .align 4 .b8 shared_cache_1[128];
.shared .align 8 .b8 shared_cache_0153[256];
.shared .align 4 .b8 shared_cache_0154[128];
.shared .align 4 .b8 shared_cache_1155[128];
.shared .align 4 .b8 shared_cache_2[128];
.shared .align 4 .b8 shared_cache_0156[128];
.shared .align 4 .b8 shared_cache_0157[128];
.shared .align 4 .b8 shared_cache_0158[4224];

// 2 bytes of externally visible global memory with no initialiser,
// 64-byte aligned.
.visible .global .align 64 .b8 buffer_for_constant_1375[2];
// .shared byte arrays, all 4-byte aligned. shared_cache_0161,
// shared_cache_2162 and shared_cache_3 are 8448 bytes each; every other
// array in this group is 128 bytes.
// fusion_2287, fusion_2285 and fusion_2286 do not reference any of them.
.shared .align 4 .b8 shared_cache_0159[128];
.shared .align 4 .b8 shared_cache_1160[128];
.shared .align 4 .b8 shared_cache_0161[8448];
.shared .align 4 .b8 shared_cache_2162[8448];
.shared .align 4 .b8 shared_cache_3[8448];
.shared .align 4 .b8 shared_cache_0163[128];
.shared .align 4 .b8 shared_cache_1164[128];
.shared .align 4 .b8 shared_cache_0165[128];
.shared .align 4 .b8 shared_cache_1166[128];
.shared .align 4 .b8 shared_cache_0167[128];
.shared .align 4 .b8 shared_cache_0168[128];
.shared .align 4 .b8 shared_cache_1169[128];
.shared .align 4 .b8 shared_cache_0170[128];
.shared .align 4 .b8 shared_cache_1171[128];
.shared .align 4 .b8 shared_cache_0172[128];
.shared .align 4 .b8 shared_cache_0173[128];
.shared .align 4 .b8 shared_cache_1174[128];
.shared .align 4 .b8 shared_cache_0175[128];
.shared .align 4 .b8 shared_cache_1176[128];
.shared .align 4 .b8 shared_cache_0177[128];
.shared .align 4 .b8 shared_cache_0178[128];
.shared .align 4 .b8 shared_cache_1179[128];
.shared .align 4 .b8 shared_cache_0180[128];
.shared .align 4 .b8 shared_cache_1181[128];
.shared .align 4 .b8 shared_cache_0182[128];
.shared .align 4 .b8 shared_cache_0183[128];
.shared .align 4 .b8 shared_cache_1184[128];
.shared .align 4 .b8 shared_cache_0185[128];
.shared .align 4 .b8 shared_cache_1186[128];
.shared .align 4 .b8 shared_cache_0187[128];
.shared .align 4 .b8 shared_cache_0188[128];
.shared .align 4 .b8 shared_cache_1189[128];
.shared .align 4 .b8 shared_cache_0190[128];
.shared .align 4 .b8 shared_cache_1191[128];
.shared .align 4 .b8 shared_cache_0192[128];
.shared .align 4 .b8 shared_cache_0193[128];
.shared .align 4 .b8 shared_cache_1194[128];
.shared .align 4 .b8 shared_cache_0195[128];
.shared .align 4 .b8 shared_cache_1196[128];
.shared .align 4 .b8 shared_cache_0197[128];
.shared .align 4 .b8 shared_cache_0198[128];
.shared .align 4 .b8 shared_cache_1199[128];
.shared .align 4 .b8 shared_cache_0200[128];
.shared .align 4 .b8 shared_cache_1201[128];
.shared .align 4 .b8 shared_cache_0202[128];
.shared .align 4 .b8 shared_cache_0203[128];
.shared .align 4 .b8 shared_cache_1204[128];
.shared .align 4 .b8 shared_cache_0205[128];
.shared .align 4 .b8 shared_cache_1206[128];
.shared .align 4 .b8 shared_cache_0207[128];
.shared .align 4 .b8 shared_cache_0208[128];
.shared .align 4 .b8 shared_cache_1209[128];
.shared .align 4 .b8 shared_cache_0210[128];
.shared .align 4 .b8 shared_cache_1211[128];
.shared .align 4 .b8 shared_cache_0212[128];
.shared .align 4 .b8 shared_cache_0213[128];
.shared .align 4 .b8 shared_cache_1214[128];
.shared .align 4 .b8 shared_cache_0215[128];
.shared .align 4 .b8 shared_cache_1216[128];
.shared .align 4 .b8 shared_cache_0217[128];
.shared .align 4 .b8 shared_cache_0218[128];
.shared .align 4 .b8 shared_cache_1219[128];
.shared .align 4 .b8 shared_cache_0220[128];
.shared .align 4 .b8 shared_cache_1221[128];
.shared .align 4 .b8 shared_cache_0222[128];
.shared .align 4 .b8 shared_cache_0223[128];
.shared .align 4 .b8 shared_cache_1224[128];
.shared .align 4 .b8 shared_cache_0225[128];
.shared .align 4 .b8 shared_cache_1226[128];
.shared .align 4 .b8 shared_cache_0227[128];
.shared .align 4 .b8 shared_cache_0228[128];
.shared .align 4 .b8 shared_cache_1229[128];
.shared .align 4 .b8 shared_cache_0230[128];
.shared .align 4 .b8 shared_cache_1231[128];
.shared .align 4 .b8 shared_cache_0232[128];
.shared .align 4 .b8 shared_cache_0233[128];
.shared .align 4 .b8 shared_cache_1234[128];
.shared .align 4 .b8 shared_cache_0235[128];
.shared .align 4 .b8 shared_cache_1236[128];
.shared .align 4 .b8 shared_cache_0237[128];
.shared .align 4 .b8 shared_cache_0238[128];
.shared .align 4 .b8 shared_cache_1239[128];
.shared .align 4 .b8 shared_cache_0240[128];
.shared .align 4 .b8 shared_cache_1241[128];
.shared .align 4 .b8 shared_cache_0242[128];
.shared .align 4 .b8 shared_cache_0243[128];
.shared .align 4 .b8 shared_cache_1244[128];
.shared .align 4 .b8 shared_cache_0245[128];
.shared .align 4 .b8 shared_cache_1246[128];
.shared .align 4 .b8 shared_cache_0247[128];
.shared .align 4 .b8 shared_cache_0248[128];
.shared .align 4 .b8 shared_cache_1249[128];
.shared .align 4 .b8 shared_cache_0250[128];
.shared .align 4 .b8 shared_cache_1251[128];
.shared .align 4 .b8 shared_cache_0252[128];
.shared .align 4 .b8 shared_cache_0253[128];
.shared .align 4 .b8 shared_cache_1254[128];
.shared .align 4 .b8 shared_cache_0255[128];
.shared .align 4 .b8 shared_cache_1256[128];
.shared .align 4 .b8 shared_cache_0257[128];
.shared .align 4 .b8 shared_cache_0258[128];
.shared .align 4 .b8 shared_cache_1259[128];
.shared .align 4 .b8 shared_cache_0260[128];
.shared .align 4 .b8 shared_cache_1261[128];
.shared .align 4 .b8 shared_cache_0262[128];
.shared .align 4 .b8 shared_cache_0263[128];
.shared .align 4 .b8 shared_cache_1264[128];
.shared .align 4 .b8 shared_cache_0265[128];
.shared .align 4 .b8 shared_cache_1266[128];
.shared .align 4 .b8 shared_cache_0267[128];
.shared .align 4 .b8 shared_cache_0268[128];
.shared .align 4 .b8 shared_cache_1269[128];
.shared .align 4 .b8 shared_cache_0270[128];
.shared .align 4 .b8 shared_cache_1271[128];
.shared .align 4 .b8 shared_cache_0272[128];
.shared .align 4 .b8 shared_cache_0273[128];
.shared .align 4 .b8 shared_cache_1274[128];
.shared .align 4 .b8 shared_cache_0275[128];
.shared .align 4 .b8 shared_cache_1276[128];
.shared .align 4 .b8 shared_cache_0277[128];
.shared .align 4 .b8 shared_cache_0278[128];
.shared .align 4 .b8 shared_cache_1279[128];
.shared .align 4 .b8 shared_cache_0280[128];
.shared .align 4 .b8 shared_cache_1281[128];
.shared .align 4 .b8 shared_cache_0282[128];
.shared .align 4 .b8 shared_cache_0283[128];
.shared .align 4 .b8 shared_cache_1284[128];
// .shared byte arrays of 4224 bytes each, 4-byte aligned
// (shared_cache_0285 .. shared_cache_1382).
// fusion_2287, fusion_2285 and fusion_2286 do not reference any of them.
// NOTE(review): 4224 = 32 * 33 * 4, which would fit a 32x33 tile of 4-byte
// elements -- presumably a padded tile; confirm against the kernels that
// use these buffers.
.shared .align 4 .b8 shared_cache_0285[4224];
.shared .align 4 .b8 shared_cache_1286[4224];
.shared .align 4 .b8 shared_cache_2287[4224];
.shared .align 4 .b8 shared_cache_3288[4224];
.shared .align 4 .b8 shared_cache_0289[4224];
.shared .align 4 .b8 shared_cache_1290[4224];
.shared .align 4 .b8 shared_cache_2291[4224];
.shared .align 4 .b8 shared_cache_3292[4224];
.shared .align 4 .b8 shared_cache_0293[4224];
.shared .align 4 .b8 shared_cache_1294[4224];
.shared .align 4 .b8 shared_cache_2295[4224];
.shared .align 4 .b8 shared_cache_3296[4224];
.shared .align 4 .b8 shared_cache_0297[4224];
.shared .align 4 .b8 shared_cache_1298[4224];
.shared .align 4 .b8 shared_cache_2299[4224];
.shared .align 4 .b8 shared_cache_3300[4224];
.shared .align 4 .b8 shared_cache_0301[4224];
.shared .align 4 .b8 shared_cache_1302[4224];
.shared .align 4 .b8 shared_cache_2303[4224];
.shared .align 4 .b8 shared_cache_3304[4224];
.shared .align 4 .b8 shared_cache_0305[4224];
.shared .align 4 .b8 shared_cache_1306[4224];
.shared .align 4 .b8 shared_cache_2307[4224];
.shared .align 4 .b8 shared_cache_3308[4224];
.shared .align 4 .b8 shared_cache_0309[4224];
.shared .align 4 .b8 shared_cache_1310[4224];
.shared .align 4 .b8 shared_cache_2311[4224];
.shared .align 4 .b8 shared_cache_3312[4224];
.shared .align 4 .b8 shared_cache_0313[4224];
.shared .align 4 .b8 shared_cache_1314[4224];
.shared .align 4 .b8 shared_cache_2315[4224];
.shared .align 4 .b8 shared_cache_3316[4224];
.shared .align 4 .b8 shared_cache_0317[4224];
.shared .align 4 .b8 shared_cache_1318[4224];
.shared .align 4 .b8 shared_cache_2319[4224];
.shared .align 4 .b8 shared_cache_3320[4224];
.shared .align 4 .b8 shared_cache_0321[4224];
.shared .align 4 .b8 shared_cache_1322[4224];
.shared .align 4 .b8 shared_cache_2323[4224];
.shared .align 4 .b8 shared_cache_3324[4224];
.shared .align 4 .b8 shared_cache_0325[4224];
.shared .align 4 .b8 shared_cache_1326[4224];
.shared .align 4 .b8 shared_cache_2327[4224];
.shared .align 4 .b8 shared_cache_3328[4224];
.shared .align 4 .b8 shared_cache_0329[4224];
.shared .align 4 .b8 shared_cache_1330[4224];
.shared .align 4 .b8 shared_cache_2331[4224];
.shared .align 4 .b8 shared_cache_3332[4224];
.shared .align 4 .b8 shared_cache_0333[4224];
.shared .align 4 .b8 shared_cache_1334[4224];
.shared .align 4 .b8 shared_cache_2335[4224];
.shared .align 4 .b8 shared_cache_3336[4224];
.shared .align 4 .b8 shared_cache_0337[4224];
.shared .align 4 .b8 shared_cache_1338[4224];
.shared .align 4 .b8 shared_cache_2339[4224];
.shared .align 4 .b8 shared_cache_3340[4224];
.shared .align 4 .b8 shared_cache_0341[4224];
.shared .align 4 .b8 shared_cache_1342[4224];
.shared .align 4 .b8 shared_cache_2343[4224];
.shared .align 4 .b8 shared_cache_3344[4224];
.shared .align 4 .b8 shared_cache_0345[4224];
.shared .align 4 .b8 shared_cache_1346[4224];
.shared .align 4 .b8 shared_cache_2347[4224];
.shared .align 4 .b8 shared_cache_3348[4224];
.shared .align 4 .b8 shared_cache_0349[4224];
.shared .align 4 .b8 shared_cache_1350[4224];
.shared .align 4 .b8 shared_cache_2351[4224];
.shared .align 4 .b8 shared_cache_3352[4224];
.shared .align 4 .b8 shared_cache_0353[4224];
.shared .align 4 .b8 shared_cache_1354[4224];
.shared .align 4 .b8 shared_cache_2355[4224];
.shared .align 4 .b8 shared_cache_3356[4224];
.shared .align 4 .b8 shared_cache_0357[4224];
.shared .align 4 .b8 shared_cache_1358[4224];
.shared .align 4 .b8 shared_cache_2359[4224];
.shared .align 4 .b8 shared_cache_3360[4224];
.shared .align 4 .b8 shared_cache_0361[4224];
.shared .align 4 .b8 shared_cache_1362[4224];
.shared .align 4 .b8 shared_cache_2363[4224];
.shared .align 4 .b8 shared_cache_3364[4224];
.shared .align 4 .b8 shared_cache_0365[4224];
.shared .align 4 .b8 shared_cache_1366[4224];
.shared .align 4 .b8 shared_cache_2367[4224];
.shared .align 4 .b8 shared_cache_3368[4224];
.shared .align 4 .b8 shared_cache_0369[4224];
.shared .align 4 .b8 shared_cache_1370[4224];
.shared .align 4 .b8 shared_cache_2371[4224];
.shared .align 4 .b8 shared_cache_3372[4224];
.shared .align 4 .b8 shared_cache_0373[4224];
.shared .align 4 .b8 shared_cache_1374[4224];
.shared .align 4 .b8 shared_cache_2375[4224];
.shared .align 4 .b8 shared_cache_3376[4224];
.shared .align 4 .b8 shared_cache_0377[4224];
.shared .align 4 .b8 shared_cache_1378[4224];
.shared .align 4 .b8 shared_cache_2379[4224];
.shared .align 4 .b8 shared_cache_3380[4224];
.shared .align 4 .b8 shared_cache_0381[4224];
.shared .align 4 .b8 shared_cache_1382[4224];

// 4 bytes of externally visible global memory with no initialiser,
// 64-byte aligned.
.visible .global .align 64 .b8 buffer_for_constant_519[4];
// .shared byte arrays, all 4-byte aligned, in a repeating layout: one
// 128-byte array followed by five 4224-byte arrays (shared_cache_0383 ..
// shared_cache_0526), closed by a final 128-byte array (shared_cache_0527).
// fusion_2287, fusion_2285 and fusion_2286 do not reference any of them.
.shared .align 4 .b8 shared_cache_0383[128];
.shared .align 4 .b8 shared_cache_0384[4224];
.shared .align 4 .b8 shared_cache_0385[4224];
.shared .align 4 .b8 shared_cache_0386[4224];
.shared .align 4 .b8 shared_cache_0387[4224];
.shared .align 4 .b8 shared_cache_0388[4224];
.shared .align 4 .b8 shared_cache_0389[128];
.shared .align 4 .b8 shared_cache_0390[4224];
.shared .align 4 .b8 shared_cache_0391[4224];
.shared .align 4 .b8 shared_cache_0392[4224];
.shared .align 4 .b8 shared_cache_0393[4224];
.shared .align 4 .b8 shared_cache_0394[4224];
.shared .align 4 .b8 shared_cache_0395[128];
.shared .align 4 .b8 shared_cache_0396[4224];
.shared .align 4 .b8 shared_cache_0397[4224];
.shared .align 4 .b8 shared_cache_0398[4224];
.shared .align 4 .b8 shared_cache_0399[4224];
.shared .align 4 .b8 shared_cache_0400[4224];
.shared .align 4 .b8 shared_cache_0401[128];
.shared .align 4 .b8 shared_cache_0402[4224];
.shared .align 4 .b8 shared_cache_0403[4224];
.shared .align 4 .b8 shared_cache_0404[4224];
.shared .align 4 .b8 shared_cache_0405[4224];
.shared .align 4 .b8 shared_cache_0406[4224];
.shared .align 4 .b8 shared_cache_0407[128];
.shared .align 4 .b8 shared_cache_0408[4224];
.shared .align 4 .b8 shared_cache_0409[4224];
.shared .align 4 .b8 shared_cache_0410[4224];
.shared .align 4 .b8 shared_cache_0411[4224];
.shared .align 4 .b8 shared_cache_0412[4224];
.shared .align 4 .b8 shared_cache_0413[128];
.shared .align 4 .b8 shared_cache_0414[4224];
.shared .align 4 .b8 shared_cache_0415[4224];
.shared .align 4 .b8 shared_cache_0416[4224];
.shared .align 4 .b8 shared_cache_0417[4224];
.shared .align 4 .b8 shared_cache_0418[4224];
.shared .align 4 .b8 shared_cache_0419[128];
.shared .align 4 .b8 shared_cache_0420[4224];
.shared .align 4 .b8 shared_cache_0421[4224];
.shared .align 4 .b8 shared_cache_0422[4224];
.shared .align 4 .b8 shared_cache_0423[4224];
.shared .align 4 .b8 shared_cache_0424[4224];
.shared .align 4 .b8 shared_cache_0425[128];
.shared .align 4 .b8 shared_cache_0426[4224];
.shared .align 4 .b8 shared_cache_0427[4224];
.shared .align 4 .b8 shared_cache_0428[4224];
.shared .align 4 .b8 shared_cache_0429[4224];
.shared .align 4 .b8 shared_cache_0430[4224];
.shared .align 4 .b8 shared_cache_0431[128];
.shared .align 4 .b8 shared_cache_0432[4224];
.shared .align 4 .b8 shared_cache_0433[4224];
.shared .align 4 .b8 shared_cache_0434[4224];
.shared .align 4 .b8 shared_cache_0435[4224];
.shared .align 4 .b8 shared_cache_0436[4224];
.shared .align 4 .b8 shared_cache_0437[128];
.shared .align 4 .b8 shared_cache_0438[4224];
.shared .align 4 .b8 shared_cache_0439[4224];
.shared .align 4 .b8 shared_cache_0440[4224];
.shared .align 4 .b8 shared_cache_0441[4224];
.shared .align 4 .b8 shared_cache_0442[4224];
.shared .align 4 .b8 shared_cache_0443[128];
.shared .align 4 .b8 shared_cache_0444[4224];
.shared .align 4 .b8 shared_cache_0445[4224];
.shared .align 4 .b8 shared_cache_0446[4224];
.shared .align 4 .b8 shared_cache_0447[4224];
.shared .align 4 .b8 shared_cache_0448[4224];
.shared .align 4 .b8 shared_cache_0449[128];
.shared .align 4 .b8 shared_cache_0450[4224];
.shared .align 4 .b8 shared_cache_0451[4224];
.shared .align 4 .b8 shared_cache_0452[4224];
.shared .align 4 .b8 shared_cache_0453[4224];
.shared .align 4 .b8 shared_cache_0454[4224];
.shared .align 4 .b8 shared_cache_0455[128];
.shared .align 4 .b8 shared_cache_0456[4224];
.shared .align 4 .b8 shared_cache_0457[4224];
.shared .align 4 .b8 shared_cache_0458[4224];
.shared .align 4 .b8 shared_cache_0459[4224];
.shared .align 4 .b8 shared_cache_0460[4224];
.shared .align 4 .b8 shared_cache_0461[128];
.shared .align 4 .b8 shared_cache_0462[4224];
.shared .align 4 .b8 shared_cache_0463[4224];
.shared .align 4 .b8 shared_cache_0464[4224];
.shared .align 4 .b8 shared_cache_0465[4224];
.shared .align 4 .b8 shared_cache_0466[4224];
.shared .align 4 .b8 shared_cache_0467[128];
.shared .align 4 .b8 shared_cache_0468[4224];
.shared .align 4 .b8 shared_cache_0469[4224];
.shared .align 4 .b8 shared_cache_0470[4224];
.shared .align 4 .b8 shared_cache_0471[4224];
.shared .align 4 .b8 shared_cache_0472[4224];
.shared .align 4 .b8 shared_cache_0473[128];
.shared .align 4 .b8 shared_cache_0474[4224];
.shared .align 4 .b8 shared_cache_0475[4224];
.shared .align 4 .b8 shared_cache_0476[4224];
.shared .align 4 .b8 shared_cache_0477[4224];
.shared .align 4 .b8 shared_cache_0478[4224];
.shared .align 4 .b8 shared_cache_0479[128];
.shared .align 4 .b8 shared_cache_0480[4224];
.shared .align 4 .b8 shared_cache_0481[4224];
.shared .align 4 .b8 shared_cache_0482[4224];
.shared .align 4 .b8 shared_cache_0483[4224];
.shared .align 4 .b8 shared_cache_0484[4224];
.shared .align 4 .b8 shared_cache_0485[128];
.shared .align 4 .b8 shared_cache_0486[4224];
.shared .align 4 .b8 shared_cache_0487[4224];
.shared .align 4 .b8 shared_cache_0488[4224];
.shared .align 4 .b8 shared_cache_0489[4224];
.shared .align 4 .b8 shared_cache_0490[4224];
.shared .align 4 .b8 shared_cache_0491[128];
.shared .align 4 .b8 shared_cache_0492[4224];
.shared .align 4 .b8 shared_cache_0493[4224];
.shared .align 4 .b8 shared_cache_0494[4224];
.shared .align 4 .b8 shared_cache_0495[4224];
.shared .align 4 .b8 shared_cache_0496[4224];
.shared .align 4 .b8 shared_cache_0497[128];
.shared .align 4 .b8 shared_cache_0498[4224];
.shared .align 4 .b8 shared_cache_0499[4224];
.shared .align 4 .b8 shared_cache_0500[4224];
.shared .align 4 .b8 shared_cache_0501[4224];
.shared .align 4 .b8 shared_cache_0502[4224];
.shared .align 4 .b8 shared_cache_0503[128];
.shared .align 4 .b8 shared_cache_0504[4224];
.shared .align 4 .b8 shared_cache_0505[4224];
.shared .align 4 .b8 shared_cache_0506[4224];
.shared .align 4 .b8 shared_cache_0507[4224];
.shared .align 4 .b8 shared_cache_0508[4224];
.shared .align 4 .b8 shared_cache_0509[128];
.shared .align 4 .b8 shared_cache_0510[4224];
.shared .align 4 .b8 shared_cache_0511[4224];
.shared .align 4 .b8 shared_cache_0512[4224];
.shared .align 4 .b8 shared_cache_0513[4224];
.shared .align 4 .b8 shared_cache_0514[4224];
.shared .align 4 .b8 shared_cache_0515[128];
.shared .align 4 .b8 shared_cache_0516[4224];
.shared .align 4 .b8 shared_cache_0517[4224];
.shared .align 4 .b8 shared_cache_0518[4224];
.shared .align 4 .b8 shared_cache_0519[4224];
.shared .align 4 .b8 shared_cache_0520[4224];
.shared .align 4 .b8 shared_cache_0521[128];
.shared .align 4 .b8 shared_cache_0522[4224];
.shared .align 4 .b8 shared_cache_0523[4224];
.shared .align 4 .b8 shared_cache_0524[4224];
.shared .align 4 .b8 shared_cache_0525[4224];
.shared .align 4 .b8 shared_cache_0526[4224];
.shared .align 4 .b8 shared_cache_0527[128];
// ---------------------------------------------------------------------------
// fusion_2287 -- kernel entry point, launched with exactly 256 x 1 x 1
// threads per CTA (.reqntid).
//
// Parameters (64-bit generic addresses, converted to global below):
//   param_0  output buffer, written as 16-bit elements
//   param_1  input buffer, read as 32-bit integers (non-coherent loads)
//   param_2  declared but never read in this body
//
// Each thread writes four consecutive 16-bit outputs starting at element
//   e = (ctaid.x << 10) | (tid.x << 2)
// and reads the single input word at index e >> 3 (eight outputs per input).
//   out[e + 0] = (in == 0) ? 0x3C00 : 0   -- even tid.x only, else 0
//   out[e + 1] = (in == 1) ? 0x3C00 : 0   -- even tid.x only, else 0
//   out[e + 2] = out[e + 3] = 0
// NOTE(review): 0x3C00 is the bit pattern of 1.0 in IEEE half precision; the
// stores are untyped (.b16), so the fp16 interpretation -- and the reading
// of this kernel as a one-hot encoding -- should be confirmed at the caller.
// ---------------------------------------------------------------------------
.visible .entry fusion_2287(
    .param .u64 fusion_2287_param_0,
    .param .u64 fusion_2287_param_1,
    .param .u64 fusion_2287_param_2
)
.reqntid 256, 1, 1
{
    .reg .pred %p<8>;
    .reg .b16 %h<10>;
    .reg .b32 %r<12>;
    .reg .b64 %rd<9>;

    // %rd1 = output base, %rd6 = input base (both as global addresses).
    ld.param.u64 %rd4, [fusion_2287_param_0];
    ld.param.u64 %rd5, [fusion_2287_param_1];
    cvta.to.global.u64 %rd6, %rd5;
    cvta.to.global.u64 %rd1, %rd4;

    // %r1 = first output element of this thread. tid.x << 2 is below 1024
    // for 256 threads, so the OR with ctaid.x << 10 acts as an addition.
    mov.u32 %r3, %ctaid.x;
    mov.u32 %r4, %tid.x;
    shl.b32 %r5, %r3, 10;
    shl.b32 %r6, %r4, 2;
    or.b32 %r1, %r6, %r5;

    // %r7 = input index (one input word per eight outputs).
    shr.u32 %r7, %r1, 3;

    // %r2 = ((tid.x << 2) | 1) & 5: equals 1 when tid.x is even, 5 when odd.
    or.b32 %r8, %r6, 1;
    and.b32 %r2, %r8, 5;

    // %p4 = (tid.x is even). The xor with a constant-false predicate is a
    // no-op left in by the code generator.
    and.b32 %r9, %r4, 1;
    setp.eq.b32 %p1, %r9, 1;
    mov.pred %p2, 0;
    xor.pred %p3, %p1, %p2;
    not.pred %p4, %p3;

    // %rd2 = address of the input word.
    mul.wide.u32 %rd7, %r7, 4;
    add.s64 %rd2, %rd6, %rd7;

    // Output 0 defaults to zero; even threads overwrite it below.
    mov.b16 %h8, 0x0000;
    @%p4 bra LBB0_3;
    bra.uni LBB0_1;
LBB0_3:
    // Even tid.x: 0x3C00 if the input word is 0, otherwise 0.
    ld.global.nc.u32 %r10, [%rd2];
    setp.eq.s32 %p5, %r10, 0;
    selp.b16 %h8, 0x3C00, 0x0000, %p5;
LBB0_1:
    // %rd3 = address of out[e]; store output 0.
    mul.wide.u32 %rd8, %r1, 2;
    add.s64 %rd3, %rd1, %rd8;
    st.global.b16 [%rd3], %h8;

    // Output 1 follows the same scheme; %p6 is again true for even tid.x.
    setp.eq.s32 %p6, %r2, 1;
    mov.b16 %h9, 0x0000;
    @%p6 bra LBB0_4;
    bra.uni LBB0_2;
LBB0_4:
    // Even tid.x: 0x3C00 if the input word is 1, otherwise 0.
    ld.global.nc.u32 %r11, [%rd2];
    setp.eq.s32 %p7, %r11, 1;
    selp.b16 %h9, 0x3C00, 0x0000, %p7;
LBB0_2:
    st.global.b16 [%rd3+2], %h9;

    // Outputs 2 and 3 are always zero; written with one 2 x 16-bit store.
    mov.b16 %h7, 0x0000;
    st.global.v2.b16 [%rd3+4], {%h7, %h7};
    ret;
}
// .globl fusion_2285
// ---------------------------------------------------------------------------
// fusion_2285 -- kernel entry point, launched with exactly 256 x 1 x 1
// threads per CTA (.reqntid).
//
// Parameters (64-bit generic addresses, converted to global below):
//   param_0  f32 table, addressed in rows of 4096 bytes (1024 f32 values)
//   param_1  output buffer, written as 16-bit elements
//   param_2  f32 input, read four values at a time
//   param_3  32-bit row indices, one per CTA (read at index ctaid.x)
//   param_4  declared but never read in this body
//
// Per thread, with e = (ctaid.x << 10) | (tid.x << 2) and
// row = clamp(param_3[ctaid.x], 0, 30521):
//   out[e + k] = f16(table[row][4*tid.x + k]) + f16(param_2[e + k]), k = 0..3
// Both operands are rounded to f16 first and the sum is done in f16.
// NOTE(review): the clamp bound implies the table has at least 30522 rows;
// this looks like a row gather plus an element-wise add -- confirm the
// intended semantics at the caller.
// ---------------------------------------------------------------------------
.visible .entry fusion_2285(
    .param .u64 fusion_2285_param_0,
    .param .u64 fusion_2285_param_1,
    .param .u64 fusion_2285_param_2,
    .param .u64 fusion_2285_param_3,
    .param .u64 fusion_2285_param_4
)
.reqntid 256, 1, 1
{
    .reg .b16 %h<13>;
    .reg .f32 %f<9>;
    .reg .b32 %r<11>;
    .reg .b64 %rd<23>;

    // %rd8 = table, %rd7 = output, %rd6 = f32 input, %rd3 = row indices.
    ld.param.u64 %rd1, [fusion_2285_param_0];
    ld.param.u64 %rd2, [fusion_2285_param_3];
    cvta.to.global.u64 %rd3, %rd2;
    ld.param.u64 %rd4, [fusion_2285_param_1];
    ld.param.u64 %rd5, [fusion_2285_param_2];
    cvta.to.global.u64 %rd6, %rd5;
    cvta.to.global.u64 %rd7, %rd4;
    cvta.to.global.u64 %rd8, %rd1;

    // %r4 = first column handled by this thread (4 * tid.x),
    // %r5 = flat element index e, %r6 / %r7 = columns 1 and 2.
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r2, %tid.x;
    shl.b32 %r3, %r1, 10;
    shl.b32 %r4, %r2, 2;
    or.b32 %r5, %r4, %r3;
    or.b32 %r6, %r4, 1;
    or.b32 %r7, %r4, 2;

    // Load this CTA's row index and clamp it to [0, 30521] as a signed
    // value, so the unsigned widening multiply below is safe.
    mul.wide.u32 %rd9, %r1, 4;
    add.s64 %rd10, %rd3, %rd9;
    ld.global.nc.u32 %r8, [%rd10];
    max.s32 %r9, %r8, 0;
    min.s32 %r10, %r9, 30521;

    // %rd12 = start of the selected table row, %rd14 = address of column 0.
    mul.wide.u32 %rd11, %r10, 4096;
    add.s64 %rd12, %rd8, %rd11;
    mul.wide.u32 %rd13, %r4, 4;
    add.s64 %rd14, %rd12, %rd13;

    // Column 0: table value rounded to f16.
    ld.global.nc.f32 %f1, [%rd14];
    cvt.rn.f16.f32 %h1, %f1;

    // Four consecutive f32 inputs in one vector load; e is a multiple of 4,
    // so the byte offset is a multiple of 16.
    mul.wide.u32 %rd15, %r5, 4;
    add.s64 %rd16, %rd6, %rd15;
    ld.global.nc.v4.f32 {%f2, %f3, %f4, %f5}, [%rd16];
    cvt.rn.f16.f32 %h2, %f2;
    add.rn.f16 %h3, %h1, %h2;

    // %rd18 = address of out[e].
    mul.wide.u32 %rd17, %r5, 2;
    add.s64 %rd18, %rd7, %rd17;

    // Column 1.
    mul.wide.u32 %rd19, %r6, 4;
    add.s64 %rd20, %rd12, %rd19;
    ld.global.nc.f32 %f6, [%rd20];
    cvt.rn.f16.f32 %h4, %f6;
    cvt.rn.f16.f32 %h5, %f3;
    add.rn.f16 %h6, %h4, %h5;

    // Column 2.
    mul.wide.u32 %rd21, %r7, 4;
    add.s64 %rd22, %rd12, %rd21;
    ld.global.nc.f32 %f7, [%rd22];
    cvt.rn.f16.f32 %h7, %f7;
    cvt.rn.f16.f32 %h8, %f4;
    add.rn.f16 %h9, %h7, %h8;

    // Column 3, addressed as column 0 + 12 bytes.
    ld.global.nc.f32 %f8, [%rd14+12];
    cvt.rn.f16.f32 %h10, %f8;
    cvt.rn.f16.f32 %h11, %f5;
    add.rn.f16 %h12, %h10, %h11;

    // Write the four f16 results with one vector store.
    st.global.v4.b16 [%rd18], {%h3, %h6, %h9, %h12};
    ret;
}
// .globl fusion_2286
// ---------------------------------------------------------------------------
// fusion_2286 -- kernel entry point, launched with exactly 256 x 1 x 1
// threads per CTA (.reqntid).
//
// Parameters (64-bit generic addresses, converted to global below):
//   param_0  output buffer, written as 16-bit elements
//   param_1  f32 input, addressed in rows of 4096 bytes (1024 f32 values)
//   param_2  declared but never read in this body
//
// Per thread, with e = (ctaid.x << 10) | (tid.x << 2):
//   ctaid.x <  2:  out[e + k] = f16(in[ctaid.x][4*tid.x + k]),  k = 0..3
//   ctaid.x >= 2:  out[e + k] = 0,                               k = 0..3
// The input is only read by the first two CTAs; all later CTAs write zeros.
// NOTE(review): this looks like an f32 -> f16 conversion of two input rows
// followed by zero padding -- confirm the intended shape at the caller.
// ---------------------------------------------------------------------------
.visible .entry fusion_2286(
    .param .u64 fusion_2286_param_0,
    .param .u64 fusion_2286_param_1,
    .param .u64 fusion_2286_param_2
)
.reqntid 256, 1, 1
{
    .reg .pred %p<4>;
    .reg .b16 %h<14>;
    .reg .f32 %f<5>;
    .reg .b32 %r<10>;
    .reg .b64 %rd<18>;

    // %rd1 = input base, %rd2 = output base.
    ld.param.u64 %rd6, [fusion_2286_param_0];
    ld.param.u64 %rd7, [fusion_2286_param_1];
    cvta.to.global.u64 %rd1, %rd7;
    cvta.to.global.u64 %rd2, %rd6;

    // %r7 = first column of this thread (4 * tid.x), %r1 = flat index e.
    mov.u32 %r4, %ctaid.x;
    mov.u32 %r5, %tid.x;
    shl.b32 %r6, %r4, 10;
    shl.b32 %r7, %r5, 2;
    or.b32 %r1, %r7, %r6;

    // %p1 = (ctaid.x < 2): this CTA maps onto real input data.
    setp.lt.u32 %p1, %r4, 2;
    cvt.u64.u32 %rd3, %r4;

    // %rd4 = address of in[ctaid.x][4 * tid.x].
    mul.wide.u32 %rd8, %r4, 4096;
    add.s64 %rd9, %rd1, %rd8;
    mul.wide.u32 %rd10, %r7, 4;
    add.s64 %rd4, %rd9, %rd10;

    // Element 0: zero unless the CTA is in range.
    mov.b16 %h11, 0x0000;
    @%p1 bra LBB2_5;
    bra.uni LBB2_1;
LBB2_5:
    ld.global.nc.f32 %f1, [%rd4];
    cvt.rn.f16.f32 %h11, %f1;
LBB2_1:
    // %p2 recomputes the same ctaid.x < 2 test from the widened copy in
    // %rd3 (as emitted by the code generator).
    cvt.u32.u64 %r8, %rd3;
    setp.lt.u32 %p2, %r8, 2;

    // %rd5 = address of out[e]; store element 0.
    mul.wide.u32 %rd11, %r1, 2;
    add.s64 %rd5, %rd2, %rd11;
    st.global.b16 [%rd5], %h11;
    @%p2 bra LBB2_6;
    bra.uni LBB2_2;
LBB2_6:
    // In range: convert and store element 1, and convert element 2 into
    // %h12 (stored at LBB2_3). %rd13 is the row start, recomputed here as
    // ctaid.x << 12.
    or.b32 %r2, %r7, 1;
    or.b32 %r3, %r7, 2;
    shl.b64 %rd12, %rd3, 12;
    add.s64 %rd13, %rd1, %rd12;
    mul.wide.u32 %rd14, %r2, 4;
    add.s64 %rd15, %rd13, %rd14;
    ld.global.nc.f32 %f2, [%rd15];
    cvt.rn.f16.f32 %h9, %f2;
    st.global.b16 [%rd5+2], %h9;
    mul.wide.u32 %rd16, %r3, 4;
    add.s64 %rd17, %rd13, %rd16;
    ld.global.nc.f32 %f3, [%rd17];
    cvt.rn.f16.f32 %h12, %f3;
    bra.uni LBB2_3;
LBB2_2:
    // Out of range: element 1 is zero, and %h12 carries zero on to
    // element 2.
    mov.b16 %h12, 0x0000;
    st.global.b16 [%rd5+2], %h12;
LBB2_3:
    // Element 2 (converted value or zero, depending on the path taken).
    st.global.b16 [%rd5+4], %h12;

    // Element 3: zero unless the CTA is in range.
    mov.b16 %h13, 0x0000;
    @%p2 bra LBB2_7;
    bra.uni LBB2_4;
LBB2_7:
    ld.global.nc.f32 %f4, [%rd4+12];
    cvt.rn.f16.f32 %h13, %f4;
LBB2_4:
    st.global.b16 [%rd5+6], %h13;
    ret;
}
// .globl fusion_2283 | |
// fusion_2283: each 64-thread block sums 1024 f16 values in f32 and
// atomically adds the block total to one f32 output slot.
//   param_0  f16 input; block b reads the 2048 bytes starting at byte b*2048
//   param_1  f32 output; the block total is added to element %ctaid.x
//   param_2  declared but never read by this kernel
.visible .entry fusion_2283( | |
.param .u64 fusion_2283_param_0, | |
.param .u64 fusion_2283_param_1, | |
.param .u64 fusion_2283_param_2 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
// 4-byte local slot; stage 2 uses it as a zero-filled stand-in for lanes
// that have no warp partial to read.
.local .align 4 .b8 __local_depot3[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<5>; | |
.reg .b16 %h<17>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<56>; | |
.reg .b32 %r<7>; | |
.reg .b64 %rd<22>; | |
mov.u64 %SPL, __local_depot3; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd5, [fusion_2283_param_0]; | |
cvta.to.global.u64 %rd8, %rd5; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r4, %ctaid.x; | |
// %rd13 = input + ctaid*2048 + tid*4 bytes: this thread's first f16 pair.
shl.b32 %r5, %r1, 1; | |
mul.wide.u32 %rd10, %r4, 2048; | |
add.s64 %rd11, %rd8, %rd10; | |
mul.wide.u32 %rd12, %r5, 2; | |
add.s64 %rd13, %rd11, %rd12; | |
// Eight packed 32-bit loads, 256 bytes apart. Each word is split into two
// f16 values that are widened to f32 and added to a running sum that starts
// at 0.0.
ld.global.nc.b32 %hh1, [%rd13]; | |
mov.b32 {%h1, %h2}, %hh1; | |
cvt.f32.f16 %f2, %h1; | |
add.rn.f32 %f3, %f2, 0f00000000; | |
cvt.f32.f16 %f4, %h2; | |
add.rn.f32 %f5, %f3, %f4; | |
ld.global.nc.b32 %hh2, [%rd13+256]; | |
mov.b32 {%h3, %h4}, %hh2; | |
cvt.f32.f16 %f6, %h3; | |
add.rn.f32 %f7, %f5, %f6; | |
cvt.f32.f16 %f8, %h4; | |
add.rn.f32 %f9, %f7, %f8; | |
ld.global.nc.b32 %hh3, [%rd13+512]; | |
mov.b32 {%h5, %h6}, %hh3; | |
cvt.f32.f16 %f10, %h5; | |
add.rn.f32 %f11, %f9, %f10; | |
cvt.f32.f16 %f12, %h6; | |
add.rn.f32 %f13, %f11, %f12; | |
ld.global.nc.b32 %hh4, [%rd13+768]; | |
mov.b32 {%h7, %h8}, %hh4; | |
cvt.f32.f16 %f14, %h7; | |
add.rn.f32 %f15, %f13, %f14; | |
cvt.f32.f16 %f16, %h8; | |
add.rn.f32 %f17, %f15, %f16; | |
ld.global.nc.b32 %hh5, [%rd13+1024]; | |
mov.b32 {%h9, %h10}, %hh5; | |
cvt.f32.f16 %f18, %h9; | |
add.rn.f32 %f19, %f17, %f18; | |
cvt.f32.f16 %f20, %h10; | |
add.rn.f32 %f21, %f19, %f20; | |
ld.global.nc.b32 %hh6, [%rd13+1280]; | |
mov.b32 {%h11, %h12}, %hh6; | |
cvt.f32.f16 %f22, %h11; | |
add.rn.f32 %f23, %f21, %f22; | |
cvt.f32.f16 %f24, %h12; | |
add.rn.f32 %f25, %f23, %f24; | |
ld.global.nc.b32 %hh7, [%rd13+1536]; | |
mov.b32 {%h13, %h14}, %hh7; | |
cvt.f32.f16 %f26, %h13; | |
add.rn.f32 %f27, %f25, %f26; | |
cvt.f32.f16 %f28, %h14; | |
add.rn.f32 %f29, %f27, %f28; | |
ld.global.nc.b32 %hh8, [%rd13+1792]; | |
mov.b32 {%h15, %h16}, %hh8; | |
cvt.f32.f16 %f30, %h15; | |
add.rn.f32 %f31, %f29, %f30; | |
cvt.f32.f16 %f32, %h16; | |
add.rn.f32 %f33, %f31, %f32; | |
// Stage 1: shuffle-down reduction within each warp (offsets 16, 8, 4, 2, 1,
// member mask -1). %r2 is the lane id (tid & 31).
and.b32 %r2, %r1, 31; | |
shfl.sync.down.b32 %f34, %f33, 16, 31, -1; | |
add.rn.f32 %f35, %f34, %f33; | |
shfl.sync.down.b32 %f36, %f35, 8, 31, -1; | |
add.rn.f32 %f37, %f36, %f35; | |
shfl.sync.down.b32 %f38, %f37, 4, 31, -1; | |
add.rn.f32 %f39, %f38, %f37; | |
shfl.sync.down.b32 %f40, %f39, 2, 31, -1; | |
add.rn.f32 %f41, %f40, %f39; | |
shfl.sync.down.b32 %f42, %f41, 1, 31, -1; | |
// %r3 is the warp index (tid >> 5); %p1 selects lane 0 of each warp.
shr.u32 %r3, %r1, 5; | |
setp.eq.s32 %p1, %r2, 0; | |
mov.u64 %rd16, shared_cache_0; | |
@%p1 bra LBB3_3; | |
bra.uni LBB3_1; | |
LBB3_3: | |
// Lane 0 performs the final add and stores its warp's partial sum in
// shared_cache_0[warp].
mul.wide.u32 %rd15, %r3, 4; | |
add.s64 %rd3, %rd16, %rd15; | |
add.rn.f32 %f1, %f42, %f41; | |
st.shared.f32 [%rd3], %f1; | |
LBB3_1: | |
// Both warp partials must be in shared memory before warp 0 combines them.
bar.sync 0; | |
setp.eq.s32 %p2, %r3, 0; | |
@%p2 bra LBB3_4; | |
bra.uni LBB3_2; | |
LBB3_4: | |
// Stage 2 (warp 0 only). The local slot is set to 0; threads 0 and 1 pick
// the generic address of shared_cache_0[lane], every other lane picks the
// local slot, so lanes without a partial contribute 0 to the sum.
add.u64 %rd9, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
mul.wide.u32 %rd17, %r2, 4; | |
add.s64 %rd4, %rd16, %rd17; | |
cvta.shared.u64 %rd19, %rd4; | |
mov.u32 %r6, 0; | |
st.local.u32 [%rd1], %r6; | |
setp.lt.u32 %p3, %r1, 2; | |
selp.b64 %rd21, %rd19, %rd9, %p3; | |
ld.f32 %f43, [%rd21]; | |
shfl.sync.down.b32 %f44, %f43, 16, 31, -1; | |
add.rn.f32 %f45, %f43, %f44; | |
shfl.sync.down.b32 %f46, %f45, 8, 31, -1; | |
add.rn.f32 %f47, %f45, %f46; | |
shfl.sync.down.b32 %f48, %f47, 4, 31, -1; | |
add.rn.f32 %f49, %f47, %f48; | |
shfl.sync.down.b32 %f50, %f49, 2, 31, -1; | |
add.rn.f32 %f51, %f49, %f50; | |
shfl.sync.down.b32 %f52, %f51, 1, 31, -1; | |
add.rn.f32 %f53, %f51, %f52; | |
// Every lane writes its result back through the address it read from;
// thread 0's write leaves the block total in shared_cache_0[0].
st.f32 [%rd21], %f53; | |
setp.ne.s32 %p4, %r1, 0; | |
@%p4 bra LBB3_2; | |
// Thread 0 only: add the block total to output[ctaid].
ld.param.u64 %rd6, [fusion_2283_param_1]; | |
cvta.to.global.u64 %rd7, %rd6; | |
mul.wide.u32 %rd14, %r4, 4; | |
add.s64 %rd2, %rd7, %rd14; | |
ld.shared.f32 %f54, [%rd4]; | |
atom.global.add.f32 %f55, [%rd2], %f54; | |
LBB3_2: | |
ret; | |
} | |
// .globl fusion_2281 | |
// fusion_2281: each 64-thread block accumulates sum((x - c)^2) over 1024 f16
// values x in f32, where c = param_2[ctaid] * 2^-10, and atomically adds the
// block total to one f32 output slot.
//   param_0  f16 input; block b reads the 2048 bytes starting at byte b*2048
//   param_1  f32 output; the block total is added to element %ctaid.x
//   param_2  f32 input, one value per block (element %ctaid.x)
//   param_3  declared but never read by this kernel
// NOTE(review): 2^-10 equals 1/1024, the number of values per block, so c is
// presumably the row mean computed from a row sum - confirm with the producer.
.visible .entry fusion_2281( | |
.param .u64 fusion_2281_param_0, | |
.param .u64 fusion_2281_param_1, | |
.param .u64 fusion_2281_param_2, | |
.param .u64 fusion_2281_param_3 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
// 4-byte local slot; stage 2 uses it as a zero-filled stand-in for lanes
// that have no warp partial to read.
.local .align 4 .b8 __local_depot4[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<5>; | |
.reg .b16 %h<17>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<90>; | |
.reg .b32 %r<7>; | |
.reg .b64 %rd<25>; | |
mov.u64 %SPL, __local_depot4; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd5, [fusion_2281_param_0]; | |
ld.param.u64 %rd6, [fusion_2281_param_2]; | |
cvta.to.global.u64 %rd7, %rd6; | |
cvta.to.global.u64 %rd10, %rd5; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r4, %ctaid.x; | |
// %rd15 = input + ctaid*2048 + tid*4 bytes: this thread's first f16 pair.
shl.b32 %r5, %r1, 1; | |
mul.wide.u32 %rd12, %r4, 2048; | |
add.s64 %rd13, %rd10, %rd12; | |
mul.wide.u32 %rd14, %r5, 2; | |
add.s64 %rd15, %rd13, %rd14; | |
ld.global.nc.b32 %hh1, [%rd15]; | |
mov.b32 {%h1, %h2}, %hh1; | |
cvt.f32.f16 %f2, %h1; | |
// %rd16 = ctaid*4 (byte offset of this block's f32 slot; reused for the
// output address at the end). %f4 = param_2[ctaid] * 2^-10.
mul.wide.u32 %rd16, %r4, 4; | |
add.s64 %rd17, %rd7, %rd16; | |
ld.global.nc.f32 %f3, [%rd17]; | |
mul.rn.f32 %f4, %f3, 0f3A800000; | |
// For each of the 16 values: subtract %f4, square, add to the running sum
// (which starts at 0.0). Loads are 256 bytes apart, two f16 values per word.
sub.rn.f32 %f5, %f2, %f4; | |
mul.rn.f32 %f6, %f5, %f5; | |
add.rn.f32 %f7, %f6, 0f00000000; | |
cvt.f32.f16 %f8, %h2; | |
sub.rn.f32 %f9, %f8, %f4; | |
mul.rn.f32 %f10, %f9, %f9; | |
add.rn.f32 %f11, %f7, %f10; | |
ld.global.nc.b32 %hh2, [%rd15+256]; | |
mov.b32 {%h3, %h4}, %hh2; | |
cvt.f32.f16 %f12, %h3; | |
sub.rn.f32 %f13, %f12, %f4; | |
mul.rn.f32 %f14, %f13, %f13; | |
add.rn.f32 %f15, %f11, %f14; | |
cvt.f32.f16 %f16, %h4; | |
sub.rn.f32 %f17, %f16, %f4; | |
mul.rn.f32 %f18, %f17, %f17; | |
add.rn.f32 %f19, %f15, %f18; | |
ld.global.nc.b32 %hh3, [%rd15+512]; | |
mov.b32 {%h5, %h6}, %hh3; | |
cvt.f32.f16 %f20, %h5; | |
sub.rn.f32 %f21, %f20, %f4; | |
mul.rn.f32 %f22, %f21, %f21; | |
add.rn.f32 %f23, %f19, %f22; | |
cvt.f32.f16 %f24, %h6; | |
sub.rn.f32 %f25, %f24, %f4; | |
mul.rn.f32 %f26, %f25, %f25; | |
add.rn.f32 %f27, %f23, %f26; | |
ld.global.nc.b32 %hh4, [%rd15+768]; | |
mov.b32 {%h7, %h8}, %hh4; | |
cvt.f32.f16 %f28, %h7; | |
sub.rn.f32 %f29, %f28, %f4; | |
mul.rn.f32 %f30, %f29, %f29; | |
add.rn.f32 %f31, %f27, %f30; | |
cvt.f32.f16 %f32, %h8; | |
sub.rn.f32 %f33, %f32, %f4; | |
mul.rn.f32 %f34, %f33, %f33; | |
add.rn.f32 %f35, %f31, %f34; | |
ld.global.nc.b32 %hh5, [%rd15+1024]; | |
mov.b32 {%h9, %h10}, %hh5; | |
cvt.f32.f16 %f36, %h9; | |
sub.rn.f32 %f37, %f36, %f4; | |
mul.rn.f32 %f38, %f37, %f37; | |
add.rn.f32 %f39, %f35, %f38; | |
cvt.f32.f16 %f40, %h10; | |
sub.rn.f32 %f41, %f40, %f4; | |
mul.rn.f32 %f42, %f41, %f41; | |
add.rn.f32 %f43, %f39, %f42; | |
ld.global.nc.b32 %hh6, [%rd15+1280]; | |
mov.b32 {%h11, %h12}, %hh6; | |
cvt.f32.f16 %f44, %h11; | |
sub.rn.f32 %f45, %f44, %f4; | |
mul.rn.f32 %f46, %f45, %f45; | |
add.rn.f32 %f47, %f43, %f46; | |
cvt.f32.f16 %f48, %h12; | |
sub.rn.f32 %f49, %f48, %f4; | |
mul.rn.f32 %f50, %f49, %f49; | |
add.rn.f32 %f51, %f47, %f50; | |
ld.global.nc.b32 %hh7, [%rd15+1536]; | |
mov.b32 {%h13, %h14}, %hh7; | |
cvt.f32.f16 %f52, %h13; | |
sub.rn.f32 %f53, %f52, %f4; | |
mul.rn.f32 %f54, %f53, %f53; | |
add.rn.f32 %f55, %f51, %f54; | |
cvt.f32.f16 %f56, %h14; | |
sub.rn.f32 %f57, %f56, %f4; | |
mul.rn.f32 %f58, %f57, %f57; | |
add.rn.f32 %f59, %f55, %f58; | |
ld.global.nc.b32 %hh8, [%rd15+1792]; | |
mov.b32 {%h15, %h16}, %hh8; | |
cvt.f32.f16 %f60, %h15; | |
sub.rn.f32 %f61, %f60, %f4; | |
mul.rn.f32 %f62, %f61, %f61; | |
add.rn.f32 %f63, %f59, %f62; | |
cvt.f32.f16 %f64, %h16; | |
sub.rn.f32 %f65, %f64, %f4; | |
mul.rn.f32 %f66, %f65, %f65; | |
add.rn.f32 %f67, %f63, %f66; | |
// Stage 1: shuffle-down reduction within each warp (offsets 16, 8, 4, 2, 1,
// member mask -1). %r2 is the lane id (tid & 31).
and.b32 %r2, %r1, 31; | |
shfl.sync.down.b32 %f68, %f67, 16, 31, -1; | |
add.rn.f32 %f69, %f68, %f67; | |
shfl.sync.down.b32 %f70, %f69, 8, 31, -1; | |
add.rn.f32 %f71, %f70, %f69; | |
shfl.sync.down.b32 %f72, %f71, 4, 31, -1; | |
add.rn.f32 %f73, %f72, %f71; | |
shfl.sync.down.b32 %f74, %f73, 2, 31, -1; | |
add.rn.f32 %f75, %f74, %f73; | |
shfl.sync.down.b32 %f76, %f75, 1, 31, -1; | |
// %r3 is the warp index (tid >> 5); %p1 selects lane 0 of each warp.
shr.u32 %r3, %r1, 5; | |
setp.eq.s32 %p1, %r2, 0; | |
mov.u64 %rd19, shared_cache_01; | |
@%p1 bra LBB4_3; | |
bra.uni LBB4_1; | |
LBB4_3: | |
// Lane 0 performs the final add and stores its warp's partial sum in
// shared_cache_01[warp].
mul.wide.u32 %rd18, %r3, 4; | |
add.s64 %rd3, %rd19, %rd18; | |
add.rn.f32 %f1, %f76, %f75; | |
st.shared.f32 [%rd3], %f1; | |
LBB4_1: | |
// Both warp partials must be in shared memory before warp 0 combines them.
bar.sync 0; | |
setp.eq.s32 %p2, %r3, 0; | |
@%p2 bra LBB4_4; | |
bra.uni LBB4_2; | |
LBB4_4: | |
// Stage 2 (warp 0 only). The local slot is set to 0; threads 0 and 1 pick
// the generic address of shared_cache_01[lane], every other lane picks the
// local slot, so lanes without a partial contribute 0 to the sum.
add.u64 %rd11, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
mul.wide.u32 %rd20, %r2, 4; | |
add.s64 %rd4, %rd19, %rd20; | |
cvta.shared.u64 %rd22, %rd4; | |
mov.u32 %r6, 0; | |
st.local.u32 [%rd1], %r6; | |
setp.lt.u32 %p3, %r1, 2; | |
selp.b64 %rd24, %rd22, %rd11, %p3; | |
ld.f32 %f77, [%rd24]; | |
shfl.sync.down.b32 %f78, %f77, 16, 31, -1; | |
add.rn.f32 %f79, %f77, %f78; | |
shfl.sync.down.b32 %f80, %f79, 8, 31, -1; | |
add.rn.f32 %f81, %f79, %f80; | |
shfl.sync.down.b32 %f82, %f81, 4, 31, -1; | |
add.rn.f32 %f83, %f81, %f82; | |
shfl.sync.down.b32 %f84, %f83, 2, 31, -1; | |
add.rn.f32 %f85, %f83, %f84; | |
shfl.sync.down.b32 %f86, %f85, 1, 31, -1; | |
add.rn.f32 %f87, %f85, %f86; | |
// Every lane writes its result back through the address it read from;
// thread 0's write leaves the block total in shared_cache_01[0].
st.f32 [%rd24], %f87; | |
setp.ne.s32 %p4, %r1, 0; | |
@%p4 bra LBB4_2; | |
// Thread 0 only: add the block total to output[ctaid] (%rd16 = ctaid*4).
ld.param.u64 %rd8, [fusion_2281_param_1]; | |
cvta.to.global.u64 %rd9, %rd8; | |
add.s64 %rd2, %rd9, %rd16; | |
ld.shared.f32 %f88, [%rd4]; | |
atom.global.add.f32 %f89, [%rd2], %f88; | |
LBB4_2: | |
ret; | |
} | |
// .globl rng_get_and_update_state | |
// rng_get_and_update_state: copies the current 128-bit value of the global
// rng_state to the buffer at param_0, then advances rng_state by 524288.
//   param_0  destination for the previous state (low word at +0, high at +8)
//   param_1  declared but never read by this kernel
// The update uses plain loads and stores, not atomics.
.visible .entry rng_get_and_update_state( | |
.param .u64 rng_get_and_update_state_param_0, | |
.param .u64 rng_get_and_update_state_param_1 | |
) | |
{ | |
.reg .pred %wrapped, %below_step, %carry; | |
.reg .b64 %out_arg, %out, %old_hi, %old_lo, %new_lo, %carry64, %new_hi; | |
ld.param.u64 %out_arg, [rng_get_and_update_state_param_0]; | |
cvta.to.global.u64 %out, %out_arg; | |
ld.global.u64 %old_hi, [rng_state+8]; | |
ld.global.u64 %old_lo, [rng_state]; | |
// 128-bit add: bump the low word, then carry into the high word when the
// low word wrapped around.
add.s64 %new_lo, %old_lo, 524288; | |
setp.lt.u64 %wrapped, %new_lo, %old_lo; | |
setp.lt.u64 %below_step, %new_lo, 524288; | |
or.pred %carry, %wrapped, %below_step; | |
selp.u64 %carry64, 1, 0, %carry; | |
add.s64 %new_hi, %old_hi, %carry64; | |
// Publish the advanced state, then hand the previous state to the caller.
st.global.u64 [rng_state], %new_lo; | |
st.global.u64 [rng_state+8], %new_hi; | |
st.global.u64 [%out+8], %old_hi; | |
st.global.u64 [%out], %old_lo; | |
ret; | |
} | |
// .globl fusion_2278 | |
// fusion_2278: per element, computes an affine transform of an f16 input in
// f32, narrows it to f16, scales it by 0x3C72 (about 1.111 in f16) and then
// either keeps it or replaces it with zero depending on a pseudo-random
// value. Each thread handles four consecutive elements (256 threads x 4 =
// 1024 elements per block).
//   param_0  f16 output, element ctaid*1024 + tid*4 + k
//   param_1  f16 input x, same element index
//   param_2  128-bit value read as two u64 words (seeds the random stream)
//   param_3  f32, one value per block: s = rsqrt(v * 2^-10 + 0f2B8CBCCC)
//   param_4  f32, one value per block: m = v * 2^-10
//   param_5  f32, one value per column (tid*4 + k): additive term b
//   param_6  f32, one value per column (tid*4 + k): multiplicative term g
//   param_7  declared but never read by this kernel
// Per element: y = (s*g)*x + (b - (s*g)*m); out = rand >= 0x2E66 ? f16(y)*0x3C72 : 0.
// NOTE(review): this has the shape of a layer normalisation followed by
// dropout (keep threshold about 0.1, rescale about 1/0.9) - confirm against
// the HLO that produced it.
.visible .entry fusion_2278( | |
.param .u64 fusion_2278_param_0, | |
.param .u64 fusion_2278_param_1, | |
.param .u64 fusion_2278_param_2, | |
.param .u64 fusion_2278_param_3, | |
.param .u64 fusion_2278_param_4, | |
.param .u64 fusion_2278_param_5, | |
.param .u64 fusion_2278_param_6, | |
.param .u64 fusion_2278_param_7 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<6>; | |
.reg .b16 %h<27>; | |
.reg .b32 %hh<3>; | |
.reg .f32 %f<47>; | |
.reg .b32 %r<31>; | |
.reg .b64 %rd<138>; | |
// Global pointers: %rd14=param_0, %rd13=param_1, %rd12=param_2,
// %rd11=param_3, %rd9=param_4, %rd6=param_5, %rd3=param_6.
ld.param.u64 %rd1, [fusion_2278_param_0]; | |
ld.param.u64 %rd2, [fusion_2278_param_6]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2278_param_1]; | |
ld.param.u64 %rd5, [fusion_2278_param_5]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2278_param_2]; | |
ld.param.u64 %rd8, [fusion_2278_param_4]; | |
cvta.to.global.u64 %rd9, %rd8; | |
ld.param.u64 %rd10, [fusion_2278_param_3]; | |
cvta.to.global.u64 %rd11, %rd10; | |
cvta.to.global.u64 %rd12, %rd7; | |
cvta.to.global.u64 %rd13, %rd4; | |
cvta.to.global.u64 %rd14, %rd1; | |
// %r5 = first element index of this thread; %r4, %r6, %r7 = column indices
// tid*4, tid*4+1, tid*4+2; %r8 = %r5 / 4 (one counter value per thread).
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
shr.u32 %r8, %r5, 2; | |
// Add the per-thread counter to the low word of the 128-bit value at
// param_2 and propagate the carry into the high word.
ld.global.nc.v2.u64 {%rd15, %rd16}, [%rd12]; | |
cvt.u64.u32 %rd17, %r8; | |
add.s64 %rd18, %rd15, %rd17; | |
setp.lt.u64 %p1, %rd18, %rd15; | |
// Mixing rounds: 32-bit halves are multiplied by 3528531795 (0xD2511F53)
// and 3449720151 (0xCD9E8D57), and the high/low product halves are xor-ed
// with the other lanes and with per-round constants.
// NOTE(review): these multipliers and the round constants match the
// Philox4x32 generator - confirm against the emitting compiler.
and.b64 %rd19, %rd18, 4294967295; | |
mul.lo.s64 %rd20, %rd19, 3528531795; | |
selp.u64 %rd21, 1, 0, %p1; | |
add.s64 %rd22, %rd16, %rd21; | |
xor.b64 %rd23, %rd22, %rd20; | |
shr.u64 %rd24, %rd23, 32; | |
mul.lo.s64 %rd25, %rd24, 3449720151; | |
shr.u64 %rd26, %rd25, 32; | |
and.b64 %rd27, %rd22, 4294967295; | |
mul.lo.s64 %rd28, %rd27, 3449720151; | |
and.b64 %rd29, %rd28, 4294967295; | |
xor.b64 %rd30, %rd29, %rd26; | |
xor.b64 %rd31, %rd30, 2654435769; | |
mul.lo.s64 %rd32, %rd31, 3528531795; | |
shr.u64 %rd33, %rd32, 32; | |
xor.b64 %rd34, %rd28, %rd18; | |
shr.u64 %rd35, %rd34, 32; | |
mul.lo.s64 %rd36, %rd35, 3528531795; | |
and.b64 %rd37, %rd36, 4294967295; | |
xor.b64 %rd38, %rd37, %rd33; | |
xor.b64 %rd39, %rd38, 1993301258; | |
mul.lo.s64 %rd40, %rd39, 3449720151; | |
shr.u64 %rd41, %rd40, 32; | |
shr.u64 %rd42, %rd36, 32; | |
and.b64 %rd43, %rd20, 4294967295; | |
xor.b64 %rd44, %rd43, %rd42; | |
xor.b64 %rd45, %rd44, 3144134277; | |
mul.lo.s64 %rd46, %rd45, 3449720151; | |
and.b64 %rd47, %rd46, 4294967295; | |
xor.b64 %rd48, %rd47, %rd41; | |
xor.b64 %rd49, %rd48, 3668340011; | |
mul.lo.s64 %rd50, %rd49, 3528531795; | |
shr.u64 %rd51, %rd50, 32; | |
shr.u64 %rd52, %rd46, 32; | |
and.b64 %rd53, %rd25, 4294967295; | |
xor.b64 %rd54, %rd53, %rd52; | |
xor.b64 %rd55, %rd54, 1013904242; | |
mul.lo.s64 %rd56, %rd55, 3528531795; | |
and.b64 %rd57, %rd56, 4294967295; | |
xor.b64 %rd58, %rd57, %rd51; | |
xor.b64 %rd59, %rd58, 3986602516; | |
mul.lo.s64 %rd60, %rd59, 3449720151; | |
shr.u64 %rd61, %rd60, 32; | |
shr.u64 %rd62, %rd56, 32; | |
and.b64 %rd63, %rd32, 4294967295; | |
xor.b64 %rd64, %rd63, %rd62; | |
xor.b64 %rd65, %rd64, 842468239; | |
mul.lo.s64 %rd66, %rd65, 3449720151; | |
and.b64 %rd67, %rd66, 4294967295; | |
xor.b64 %rd68, %rd67, %rd61; | |
xor.b64 %rd69, %rd68, 387276957; | |
mul.lo.s64 %rd70, %rd69, 3528531795; | |
shr.u64 %rd71, %rd70, 32; | |
shr.u64 %rd72, %rd66, 32; | |
and.b64 %rd73, %rd40, 4294967295; | |
xor.b64 %rd74, %rd73, %rd72; | |
xor.b64 %rd75, %rd74, 2027808484; | |
mul.lo.s64 %rd76, %rd75, 3528531795; | |
and.b64 %rd77, %rd76, 4294967295; | |
shr.u64 %rd78, %rd76, 32; | |
and.b64 %rd79, %rd50, 4294967295; | |
xor.b64 %rd80, %rd79, %rd78; | |
xor.b64 %rd81, %rd80, 2835769497; | |
mul.lo.s64 %rd82, %rd81, 3449720151; | |
and.b64 %rd83, %rd82, 4294967295; | |
shr.u64 %rd84, %rd82, 32; | |
and.b64 %rd85, %rd60, 4294967295; | |
xor.b64 %rd86, %rd85, %rd84; | |
xor.b64 %rd87, %rd86, 3041712726; | |
mul.lo.s64 %rd88, %rd87, 3528531795; | |
and.b64 %rd89, %rd88, 4294967295; | |
xor.b64 %rd90, %rd77, %rd71; | |
xor.b64 %rd91, %rd90, 1684936478; | |
mul.lo.s64 %rd92, %rd91, 3449720151; | |
shr.u64 %rd93, %rd92, 32; | |
xor.b64 %rd94, %rd83, %rd93; | |
xor.b64 %rd95, %rd94, 1401181199; | |
mul.lo.s64 %rd96, %rd95, 3528531795; | |
shr.u64 %rd97, %rd96, 32; | |
xor.b64 %rd98, %rd89, %rd97; | |
xor.b64 %rd99, %rd98, 3678237736; | |
mul.lo.s64 %rd100, %rd99, 3449720151; | |
shr.u64 %rd101, %rd100, 32; | |
// Random word for element 0: keep 23 bits (>> 9), convert to f32, scale by
// 2^-23 (0f34000000) into [0, 1), narrow to f16, and compare with 0x2E66
// (about 0.1 in f16). %p2 true means element 0 is kept.
cvt.u32.u64 %r9, %rd101; | |
shr.u64 %rd102, %rd88, 32; | |
xor.b64 %rd103, %rd102, %rd70; | |
cvt.u32.u64 %r10, %rd103; | |
xor.b32 %r11, %r10, 534103459; | |
mul.lo.s32 %r12, %r11, -845247145; | |
xor.b32 %r13, %r12, %r9; | |
shr.u32 %r14, %r13, 9; | |
xor.b32 %r15, %r14, 4716963; | |
cvt.rn.f32.u32 %f1, %r15; | |
mul.rn.f32 %f2, %f1, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f2; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p2, %h1, %h2; | |
// Load this thread's four f16 inputs (%h7..%h10 after the pack/unpack).
mul.wide.u32 %rd104, %r5, 2; | |
add.s64 %rd105, %rd13, %rd104; | |
ld.global.nc.v4.b16 {%h3, %h4, %h5, %h6}, [%rd105]; | |
mov.b32 %hh1, {%h5, %h6}; | |
mov.b32 %hh2, {%h3, %h4}; | |
mov.b32 {%h7, %h8}, %hh2; | |
mov.b32 {%h9, %h10}, %hh1; | |
// Element 0. %f7 = rsqrt(param_3[ctaid] * 2^-10 + 0f2B8CBCCC) and
// %f13 = param_4[ctaid] * 2^-10 are shared by all four elements.
cvt.f32.f16 %f3, %h7; | |
mul.wide.u32 %rd106, %r1, 4; | |
add.s64 %rd107, %rd11, %rd106; | |
ld.global.nc.f32 %f4, [%rd107]; | |
mul.rn.f32 %f5, %f4, 0f3A800000; | |
add.rn.f32 %f6, %f5, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f7, %f6; | |
mul.wide.u32 %rd108, %r4, 4; | |
add.s64 %rd109, %rd3, %rd108; | |
ld.global.nc.f32 %f8, [%rd109]; | |
mul.rn.f32 %f9, %f7, %f8; | |
mul.rn.f32 %f10, %f9, %f3; | |
add.s64 %rd110, %rd6, %rd108; | |
ld.global.nc.f32 %f11, [%rd110]; | |
add.s64 %rd111, %rd9, %rd106; | |
ld.global.nc.f32 %f12, [%rd111]; | |
mul.rn.f32 %f13, %f12, 0f3A800000; | |
mul.rn.f32 %f14, %f9, %f13; | |
sub.rn.f32 %f15, %f11, %f14; | |
add.rn.f32 %f16, %f10, %f15; | |
cvt.rn.f16.f32 %h11, %f16; | |
// Scale by 0x3C72 and zero the result when the random test failed.
mov.b16 %h12, 0x3C72; | |
mul.rn.f16 %h13, %h11, %h12; | |
selp.b16 %h14, %h13, 0x0000, %p2; | |
// %rd112 = output address for this thread's four results.
add.s64 %rd112, %rd14, %rd104; | |
// Random word and keep predicate (%p3) for element 1.
xor.b64 %rd113, %rd60, %rd84; | |
xor.b64 %rd114, %rd113, 3041712726; | |
mul.lo.s64 %rd115, %rd114, 3528531795; | |
xor.b64 %rd116, %rd97, %rd115; | |
cvt.u32.u64 %r16, %rd116; | |
xor.b32 %r17, %r16, -616729560; | |
mul.lo.s32 %r18, %r17, -845247145; | |
shr.u32 %r19, %r18, 9; | |
cvt.rn.f32.u32 %f17, %r19; | |
mul.rn.f32 %f18, %f17, 0f34000000; | |
cvt.rn.f16.f32 %h15, %f18; | |
setp.ge.f16 %p3, %h15, %h2; | |
// Element 1: same transform using column tid*4+1.
cvt.f32.f16 %f19, %h8; | |
mul.wide.u32 %rd117, %r6, 4; | |
add.s64 %rd118, %rd3, %rd117; | |
ld.global.nc.f32 %f20, [%rd118]; | |
mul.rn.f32 %f21, %f7, %f20; | |
mul.rn.f32 %f22, %f21, %f19; | |
add.s64 %rd119, %rd6, %rd117; | |
ld.global.nc.f32 %f23, [%rd119]; | |
mul.rn.f32 %f24, %f13, %f21; | |
sub.rn.f32 %f25, %f23, %f24; | |
add.rn.f32 %f26, %f22, %f25; | |
cvt.rn.f16.f32 %h16, %f26; | |
mul.rn.f16 %h17, %h16, %h12; | |
selp.b16 %h18, %h17, 0x0000, %p3; | |
// Random word and keep predicate (%p4) for element 2.
and.b64 %rd120, %rd92, 4294967295; | |
and.b64 %rd121, %rd70, 4294967295; | |
xor.b64 %rd122, %rd121, %rd102; | |
xor.b64 %rd123, %rd122, 534103459; | |
mul.lo.s64 %rd124, %rd123, 3449720151; | |
shr.u64 %rd125, %rd124, 32; | |
xor.b64 %rd126, %rd120, %rd125; | |
xor.b64 %rd127, %rd126, 4055616968; | |
mul.lo.s64 %rd128, %rd127, 3528531795; | |
shr.u64 %rd129, %rd128, 32; | |
cvt.u32.u64 %r20, %rd129; | |
xor.b64 %rd130, %rd93, %rd82; | |
cvt.u32.u64 %r21, %rd130; | |
xor.b32 %r22, %r21, 1401181199; | |
mul.lo.s32 %r23, %r22, -766435501; | |
xor.b32 %r24, %r23, %r20; | |
shr.u32 %r25, %r24, 9; | |
xor.b32 %r26, %r25, 4936337; | |
cvt.rn.f32.u32 %f27, %r26; | |
mul.rn.f32 %f28, %f27, 0f34000000; | |
cvt.rn.f16.f32 %h19, %f28; | |
setp.ge.f16 %p4, %h19, %h2; | |
// Element 2: same transform using column tid*4+2.
cvt.f32.f16 %f29, %h9; | |
mul.wide.u32 %rd131, %r7, 4; | |
add.s64 %rd132, %rd3, %rd131; | |
ld.global.nc.f32 %f30, [%rd132]; | |
mul.rn.f32 %f31, %f7, %f30; | |
mul.rn.f32 %f32, %f31, %f29; | |
add.s64 %rd133, %rd6, %rd131; | |
ld.global.nc.f32 %f33, [%rd133]; | |
mul.rn.f32 %f34, %f13, %f31; | |
sub.rn.f32 %f35, %f33, %f34; | |
add.rn.f32 %f36, %f32, %f35; | |
cvt.rn.f16.f32 %h20, %f36; | |
mul.rn.f16 %h21, %h20, %h12; | |
selp.b16 %h22, %h21, 0x0000, %p4; | |
// Random word and keep predicate (%p5) for element 3.
xor.b64 %rd134, %rd71, %rd76; | |
xor.b64 %rd135, %rd134, 1684936478; | |
mul.lo.s64 %rd136, %rd135, 3449720151; | |
xor.b64 %rd137, %rd125, %rd136; | |
cvt.u32.u64 %r27, %rd137; | |
xor.b32 %r28, %r27, -239350328; | |
mul.lo.s32 %r29, %r28, -766435501; | |
shr.u32 %r30, %r29, 9; | |
cvt.rn.f32.u32 %f37, %r30; | |
mul.rn.f32 %f38, %f37, 0f34000000; | |
cvt.rn.f16.f32 %h23, %f38; | |
setp.ge.f16 %p5, %h23, %h2; | |
// Element 3: column tid*4+3 is reached as +12 bytes from the column-0
// addresses %rd109 / %rd110.
cvt.f32.f16 %f39, %h10; | |
ld.global.nc.f32 %f40, [%rd109+12]; | |
mul.rn.f32 %f41, %f7, %f40; | |
mul.rn.f32 %f42, %f41, %f39; | |
ld.global.nc.f32 %f43, [%rd110+12]; | |
mul.rn.f32 %f44, %f13, %f41; | |
sub.rn.f32 %f45, %f43, %f44; | |
add.rn.f32 %f46, %f42, %f45; | |
cvt.rn.f16.f32 %h24, %f46; | |
mul.rn.f16 %h25, %h24, %h12; | |
selp.b16 %h26, %h25, 0x0000, %p5; | |
// One vector store writes all four f16 results.
st.global.v4.b16 [%rd112], {%h14, %h18, %h22, %h26}; | |
ret; | |
} | |
// .globl fusion_2710 | |
// fusion_2710: every thread loads four consecutive f32 values with one vector
// load, narrows each to f16 and writes them back with one vector store.
//   param_0  f32 source, read at element ctaid*1024 + tid*4
//   param_1  f16 destination, written at the same element index
//   param_2  declared but never read by this kernel
.visible .entry fusion_2710( | |
.param .u64 fusion_2710_param_0, | |
.param .u64 fusion_2710_param_1, | |
.param .u64 fusion_2710_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %hx, %hy, %hz, %hw; | |
.reg .f32 %fx, %fy, %fz, %fw; | |
.reg .b32 %blk, %thr, %blk_elem, %thr_elem, %elem; | |
.reg .b64 %src_arg, %dst_arg, %src, %dst, %src_bytes, %dst_bytes, %src_ptr, %dst_ptr; | |
ld.param.u64 %src_arg, [fusion_2710_param_0]; | |
ld.param.u64 %dst_arg, [fusion_2710_param_1]; | |
cvta.to.global.u64 %src, %src_arg; | |
cvta.to.global.u64 %dst, %dst_arg; | |
// elem = index of the first of this thread's four elements.
mov.u32 %blk, %ctaid.x; | |
mov.u32 %thr, %tid.x; | |
shl.b32 %blk_elem, %blk, 10; | |
shl.b32 %thr_elem, %thr, 2; | |
or.b32 %elem, %thr_elem, %blk_elem; | |
// Source elements are 4 bytes wide, destination elements 2 bytes wide.
mul.wide.u32 %src_bytes, %elem, 4; | |
add.s64 %src_ptr, %src, %src_bytes; | |
mul.wide.u32 %dst_bytes, %elem, 2; | |
add.s64 %dst_ptr, %dst, %dst_bytes; | |
ld.global.nc.v4.f32 {%fx, %fy, %fz, %fw}, [%src_ptr]; | |
cvt.rn.f16.f32 %hx, %fx; | |
cvt.rn.f16.f32 %hy, %fy; | |
cvt.rn.f16.f32 %hz, %fz; | |
cvt.rn.f16.f32 %hw, %fw; | |
st.global.v4.b16 [%dst_ptr], {%hx, %hy, %hz, %hw}; | |
ret; | |
} | |
// .globl fusion_2274 | |
// fusion_2274: every thread gathers four f16 values, adds to each an f32
// addend narrowed to f16, and stores the four f16 sums contiguously.
//   param_0  f16 destination, written at element ctaid*1024 + tid*4 (+0..3)
//   param_1  f16 source, read at element row*1024 + grp*64 + col
//   param_2  f32 addend, read at element grp*64 + col
//   param_3  declared but never read by this kernel
// where grp = ctaid >> 5, row = bits [6, 15) of the destination index and
// col = (tid*4 + k) & 63 for k = 0..3.
.visible .entry fusion_2274( | |
.param .u64 fusion_2274_param_0, | |
.param .u64 fusion_2274_param_1, | |
.param .u64 fusion_2274_param_2, | |
.param .u64 fusion_2274_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %src_h, %add_h, %sum0, %sum1, %sum2, %sum3; | |
.reg .f32 %add_f; | |
.reg .b32 %blk, %thr, %blk_elem, %thr_elem, %elem, %grp, %grp_base, %row, %col, %idx; | |
.reg .b64 %dst_arg, %src_arg, %add_arg, %dst, %src, %addend, %off, %src_row, %add_row, %ptr, %dst_ptr; | |
ld.param.u64 %dst_arg, [fusion_2274_param_0]; | |
ld.param.u64 %src_arg, [fusion_2274_param_1]; | |
ld.param.u64 %add_arg, [fusion_2274_param_2]; | |
cvta.to.global.u64 %dst, %dst_arg; | |
cvta.to.global.u64 %src, %src_arg; | |
cvta.to.global.u64 %addend, %add_arg; | |
// elem = destination index of the first of this thread's four elements.
mov.u32 %blk, %ctaid.x; | |
mov.u32 %thr, %tid.x; | |
shl.b32 %blk_elem, %blk, 10; | |
shl.b32 %thr_elem, %thr, 2; | |
or.b32 %elem, %thr_elem, %blk_elem; | |
// Index pieces shared by all four elements.
shr.u32 %grp, %blk, 5; | |
shl.b32 %grp_base, %grp, 6; | |
bfe.u32 %row, %elem, 6, 9; | |
// Base addresses: source row (2048 bytes per row), addend group (256 bytes
// per group) and destination (2 bytes per element).
mul.wide.u32 %off, %row, 2048; | |
add.s64 %src_row, %src, %off; | |
mul.wide.u32 %off, %grp, 256; | |
add.s64 %add_row, %addend, %off; | |
mul.wide.u32 %off, %elem, 2; | |
add.s64 %dst_ptr, %dst, %off; | |
// Element 0.
and.b32 %col, %thr_elem, 60; | |
or.b32 %idx, %col, %grp_base; | |
mul.wide.u32 %off, %idx, 2; | |
add.s64 %ptr, %src_row, %off; | |
ld.global.nc.b16 %src_h, [%ptr]; | |
mul.wide.u32 %off, %col, 4; | |
add.s64 %ptr, %add_row, %off; | |
ld.global.nc.f32 %add_f, [%ptr]; | |
cvt.rn.f16.f32 %add_h, %add_f; | |
add.rn.f16 %sum0, %src_h, %add_h; | |
// Element 1.
or.b32 %col, %thr_elem, 1; | |
and.b32 %col, %col, 61; | |
or.b32 %idx, %col, %grp_base; | |
mul.wide.u32 %off, %idx, 2; | |
add.s64 %ptr, %src_row, %off; | |
ld.global.nc.b16 %src_h, [%ptr]; | |
mul.wide.u32 %off, %col, 4; | |
add.s64 %ptr, %add_row, %off; | |
ld.global.nc.f32 %add_f, [%ptr]; | |
cvt.rn.f16.f32 %add_h, %add_f; | |
add.rn.f16 %sum1, %src_h, %add_h; | |
// Element 2.
or.b32 %col, %thr_elem, 2; | |
and.b32 %col, %col, 62; | |
or.b32 %idx, %col, %grp_base; | |
mul.wide.u32 %off, %idx, 2; | |
add.s64 %ptr, %src_row, %off; | |
ld.global.nc.b16 %src_h, [%ptr]; | |
mul.wide.u32 %off, %col, 4; | |
add.s64 %ptr, %add_row, %off; | |
ld.global.nc.f32 %add_f, [%ptr]; | |
cvt.rn.f16.f32 %add_h, %add_f; | |
add.rn.f16 %sum2, %src_h, %add_h; | |
// Element 3.
or.b32 %col, %thr_elem, 3; | |
and.b32 %col, %col, 63; | |
or.b32 %idx, %col, %grp_base; | |
mul.wide.u32 %off, %idx, 2; | |
add.s64 %ptr, %src_row, %off; | |
ld.global.nc.b16 %src_h, [%ptr]; | |
mul.wide.u32 %off, %col, 4; | |
add.s64 %ptr, %add_row, %off; | |
ld.global.nc.f32 %add_f, [%ptr]; | |
cvt.rn.f16.f32 %add_h, %add_f; | |
add.rn.f16 %sum3, %src_h, %add_h; | |
st.global.v4.b16 [%dst_ptr], {%sum0, %sum1, %sum2, %sum3}; | |
ret; | |
} | |
// .globl fusion_2711 | |
// fusion_2711: every thread loads four consecutive f32 values with one vector
// load, narrows each to f16 and writes them back with one vector store.
//   param_0  f32 source, read at element ctaid*1024 + tid*4
//   param_1  f16 destination, written at the same element index
//   param_2  declared but never read by this kernel
.visible .entry fusion_2711( | |
.param .u64 fusion_2711_param_0, | |
.param .u64 fusion_2711_param_1, | |
.param .u64 fusion_2711_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %hx, %hy, %hz, %hw; | |
.reg .f32 %fx, %fy, %fz, %fw; | |
.reg .b32 %blk, %thr, %blk_elem, %thr_elem, %elem; | |
.reg .b64 %src_arg, %dst_arg, %src, %dst, %src_bytes, %dst_bytes, %src_ptr, %dst_ptr; | |
ld.param.u64 %src_arg, [fusion_2711_param_0]; | |
ld.param.u64 %dst_arg, [fusion_2711_param_1]; | |
cvta.to.global.u64 %src, %src_arg; | |
cvta.to.global.u64 %dst, %dst_arg; | |
// elem = index of the first of this thread's four elements.
mov.u32 %blk, %ctaid.x; | |
mov.u32 %thr, %tid.x; | |
shl.b32 %blk_elem, %blk, 10; | |
shl.b32 %thr_elem, %thr, 2; | |
or.b32 %elem, %thr_elem, %blk_elem; | |
// Source elements are 4 bytes wide, destination elements 2 bytes wide.
mul.wide.u32 %src_bytes, %elem, 4; | |
add.s64 %src_ptr, %src, %src_bytes; | |
mul.wide.u32 %dst_bytes, %elem, 2; | |
add.s64 %dst_ptr, %dst, %dst_bytes; | |
ld.global.nc.v4.f32 {%fx, %fy, %fz, %fw}, [%src_ptr]; | |
cvt.rn.f16.f32 %hx, %fx; | |
cvt.rn.f16.f32 %hy, %fy; | |
cvt.rn.f16.f32 %hz, %fz; | |
cvt.rn.f16.f32 %hw, %fw; | |
st.global.v4.b16 [%dst_ptr], {%hx, %hy, %hz, %hw}; | |
ret; | |
} | |
// .globl fusion_2275 | |
// fusion_2275: every thread gathers four f16 values, adds to each an f32
// addend narrowed to f16, and stores the four f16 sums contiguously.
//   param_0  f16 destination, written at element ctaid*1024 + tid*4 (+0..3)
//   param_1  f16 source, read at element row*1024 + grp*64 + col
//   param_2  f32 addend, read at element grp*64 + col
//   param_3  declared but never read by this kernel
// where grp = ctaid >> 5, row = bits [6, 15) of the destination index and
// col = (tid*4 + k) & 63 for k = 0..3.
.visible .entry fusion_2275( | |
.param .u64 fusion_2275_param_0, | |
.param .u64 fusion_2275_param_1, | |
.param .u64 fusion_2275_param_2, | |
.param .u64 fusion_2275_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %src_h, %add_h, %sum0, %sum1, %sum2, %sum3; | |
.reg .f32 %add_f; | |
.reg .b32 %blk, %thr, %blk_elem, %thr_elem, %elem, %grp, %grp_base, %row, %col, %idx; | |
.reg .b64 %dst_arg, %src_arg, %add_arg, %dst, %src, %addend, %off, %src_row, %add_row, %ptr, %dst_ptr; | |
ld.param.u64 %dst_arg, [fusion_2275_param_0]; | |
ld.param.u64 %src_arg, [fusion_2275_param_1]; | |
ld.param.u64 %add_arg, [fusion_2275_param_2]; | |
cvta.to.global.u64 %dst, %dst_arg; | |
cvta.to.global.u64 %src, %src_arg; | |
cvta.to.global.u64 %addend, %add_arg; | |
// elem = destination index of the first of this thread's four elements.
mov.u32 %blk, %ctaid.x; | |
mov.u32 %thr, %tid.x; | |
shl.b32 %blk_elem, %blk, 10; | |
shl.b32 %thr_elem, %thr, 2; | |
or.b32 %elem, %thr_elem, %blk_elem; | |
// Index pieces shared by all four elements.
shr.u32 %grp, %blk, 5; | |
shl.b32 %grp_base, %grp, 6; | |
bfe.u32 %row, %elem, 6, 9; | |
// Base addresses: source row (2048 bytes per row), addend group (256 bytes
// per group) and destination (2 bytes per element).
mul.wide.u32 %off, %row, 2048; | |
add.s64 %src_row, %src, %off; | |
mul.wide.u32 %off, %grp, 256; | |
add.s64 %add_row, %addend, %off; | |
mul.wide.u32 %off, %elem, 2; | |
add.s64 %dst_ptr, %dst, %off; | |
// Element 0.
and.b32 %col, %thr_elem, 60; | |
or.b32 %idx, %col, %grp_base; | |
mul.wide.u32 %off, %idx, 2; | |
add.s64 %ptr, %src_row, %off; | |
ld.global.nc.b16 %src_h, [%ptr]; | |
mul.wide.u32 %off, %col, 4; | |
add.s64 %ptr, %add_row, %off; | |
ld.global.nc.f32 %add_f, [%ptr]; | |
cvt.rn.f16.f32 %add_h, %add_f; | |
add.rn.f16 %sum0, %src_h, %add_h; | |
// Element 1.
or.b32 %col, %thr_elem, 1; | |
and.b32 %col, %col, 61; | |
or.b32 %idx, %col, %grp_base; | |
mul.wide.u32 %off, %idx, 2; | |
add.s64 %ptr, %src_row, %off; | |
ld.global.nc.b16 %src_h, [%ptr]; | |
mul.wide.u32 %off, %col, 4; | |
add.s64 %ptr, %add_row, %off; | |
ld.global.nc.f32 %add_f, [%ptr]; | |
cvt.rn.f16.f32 %add_h, %add_f; | |
add.rn.f16 %sum1, %src_h, %add_h; | |
// Element 2.
or.b32 %col, %thr_elem, 2; | |
and.b32 %col, %col, 62; | |
or.b32 %idx, %col, %grp_base; | |
mul.wide.u32 %off, %idx, 2; | |
add.s64 %ptr, %src_row, %off; | |
ld.global.nc.b16 %src_h, [%ptr]; | |
mul.wide.u32 %off, %col, 4; | |
add.s64 %ptr, %add_row, %off; | |
ld.global.nc.f32 %add_f, [%ptr]; | |
cvt.rn.f16.f32 %add_h, %add_f; | |
add.rn.f16 %sum2, %src_h, %add_h; | |
// Element 3.
or.b32 %col, %thr_elem, 3; | |
and.b32 %col, %col, 63; | |
or.b32 %idx, %col, %grp_base; | |
mul.wide.u32 %off, %idx, 2; | |
add.s64 %ptr, %src_row, %off; | |
ld.global.nc.b16 %src_h, [%ptr]; | |
mul.wide.u32 %off, %col, 4; | |
add.s64 %ptr, %add_row, %off; | |
ld.global.nc.f32 %add_f, [%ptr]; | |
cvt.rn.f16.f32 %add_h, %add_f; | |
add.rn.f16 %sum3, %src_h, %add_h; | |
st.global.v4.b16 [%dst_ptr], {%sum0, %sum1, %sum2, %sum3}; | |
ret; | |
} | |
// .globl fusion_2271 | |
// fusion_2271: each 32-thread block computes the maximum over 512 values of
//     x[k] - (1.0 - f16(m[k])) * 0x70E2        (0x70E2 is 10000 in f16)
// and folds it into one f32 output slot with a compare-and-swap max.
//   param_0  f16 input x; block b reads 512 elements starting at b*512
//   param_1  f32 output; updated at element %ctaid.x
//   param_2  32-bit integer input m, indexed by column (0..511) only
//   param_3  declared but never read by this kernel
// NOTE(review): the shape (subtract 10000 where m is 0, then row max) looks
// like the masking and max step of an attention softmax - confirm upstream.
.visible .entry fusion_2271( | |
.param .u64 fusion_2271_param_0, | |
.param .u64 fusion_2271_param_1, | |
.param .u64 fusion_2271_param_2, | |
.param .u64 fusion_2271_param_3 | |
) | |
.reqntid 32, 1, 1 | |
{ | |
// 4-byte local slot; stage 2 fills it with -inf for lanes that have no
// partial result to read.
.local .align 4 .b8 __local_depot11[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<4>; | |
.reg .b16 %h<83>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<57>; | |
.reg .b32 %r<37>; | |
.reg .b64 %rd<37>; | |
mov.u64 %SPL, __local_depot11; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd4, [fusion_2271_param_0]; | |
ld.param.u64 %rd5, [fusion_2271_param_2]; | |
cvta.to.global.u64 %rd6, %rd5; | |
cvta.to.global.u64 %rd9, %rd4; | |
add.u64 %rd10, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r5, %ctaid.x; | |
// %r6 = tid*2 is this thread's first column; %rd12 addresses x at element
// ctaid*512 + tid*2, %rd14 addresses m at element tid*2.
shl.b32 %r6, %r1, 1; | |
shl.b32 %r7, %r5, 9; | |
or.b32 %r8, %r7, %r6; | |
mul.wide.u32 %rd11, %r8, 2; | |
add.s64 %rd12, %rd9, %rd11; | |
ld.global.nc.b32 %hh1, [%rd12]; | |
mov.b32 {%h1, %h2}, %hh1; | |
mul.wide.u32 %rd13, %r6, 4; | |
add.s64 %rd14, %rd6, %rd13; | |
ld.global.nc.v2.u32 {%r9, %r10}, [%rd14]; | |
// Per value: convert m to f16, form (1.0 - m) * 10000, subtract it from x,
// widen to f32 and fold into the running max, which starts at -inf
// (0fFF800000). %h4 = 1.0 (0x3C00), %h6 = 10000 (0x70E2).
cvt.rn.f16.s32 %h3, %r9; | |
mov.b16 %h4, 0x3C00; | |
sub.rn.f16 %h5, %h4, %h3; | |
mov.b16 %h6, 0x70E2; | |
mul.rn.f16 %h7, %h5, %h6; | |
sub.rn.f16 %h8, %h1, %h7; | |
cvt.f32.f16 %f2, %h8; | |
max.f32 %f3, %f2, 0fFF800000; | |
cvt.rn.f16.s32 %h9, %r10; | |
sub.rn.f16 %h10, %h4, %h9; | |
mul.rn.f16 %h11, %h10, %h6; | |
sub.rn.f16 %h12, %h2, %h11; | |
cvt.f32.f16 %f4, %h12; | |
max.f32 %f5, %f3, %f4; | |
// The remaining seven pairs repeat the pattern 64 columns apart: x at
// +128 bytes per step, m at column (tid*2 | 64*k) and the column after it.
or.b32 %r11, %r6, 64; | |
ld.global.nc.b32 %hh2, [%rd12+128]; | |
mov.b32 {%h13, %h14}, %hh2; | |
mul.wide.u32 %rd15, %r11, 4; | |
add.s64 %rd16, %rd6, %rd15; | |
ld.global.nc.u32 %r12, [%rd16]; | |
cvt.rn.f16.s32 %h15, %r12; | |
sub.rn.f16 %h16, %h4, %h15; | |
mul.rn.f16 %h17, %h16, %h6; | |
sub.rn.f16 %h18, %h13, %h17; | |
cvt.f32.f16 %f6, %h18; | |
max.f32 %f7, %f5, %f6; | |
ld.global.nc.u32 %r13, [%rd14+260]; | |
cvt.rn.f16.s32 %h19, %r13; | |
sub.rn.f16 %h20, %h4, %h19; | |
mul.rn.f16 %h21, %h20, %h6; | |
sub.rn.f16 %h22, %h14, %h21; | |
cvt.f32.f16 %f8, %h22; | |
max.f32 %f9, %f7, %f8; | |
or.b32 %r14, %r6, 128; | |
ld.global.nc.b32 %hh3, [%rd12+256]; | |
mov.b32 {%h23, %h24}, %hh3; | |
mul.wide.u32 %rd17, %r14, 4; | |
add.s64 %rd18, %rd6, %rd17; | |
ld.global.nc.u32 %r15, [%rd18]; | |
cvt.rn.f16.s32 %h25, %r15; | |
sub.rn.f16 %h26, %h4, %h25; | |
mul.rn.f16 %h27, %h26, %h6; | |
sub.rn.f16 %h28, %h23, %h27; | |
cvt.f32.f16 %f10, %h28; | |
max.f32 %f11, %f9, %f10; | |
ld.global.nc.u32 %r16, [%rd14+516]; | |
cvt.rn.f16.s32 %h29, %r16; | |
sub.rn.f16 %h30, %h4, %h29; | |
mul.rn.f16 %h31, %h30, %h6; | |
sub.rn.f16 %h32, %h24, %h31; | |
cvt.f32.f16 %f12, %h32; | |
max.f32 %f13, %f11, %f12; | |
or.b32 %r17, %r6, 192; | |
ld.global.nc.b32 %hh4, [%rd12+384]; | |
mov.b32 {%h33, %h34}, %hh4; | |
mul.wide.u32 %rd19, %r17, 4; | |
add.s64 %rd20, %rd6, %rd19; | |
ld.global.nc.u32 %r18, [%rd20]; | |
cvt.rn.f16.s32 %h35, %r18; | |
sub.rn.f16 %h36, %h4, %h35; | |
mul.rn.f16 %h37, %h36, %h6; | |
sub.rn.f16 %h38, %h33, %h37; | |
cvt.f32.f16 %f14, %h38; | |
max.f32 %f15, %f13, %f14; | |
ld.global.nc.u32 %r19, [%rd14+772]; | |
cvt.rn.f16.s32 %h39, %r19; | |
sub.rn.f16 %h40, %h4, %h39; | |
mul.rn.f16 %h41, %h40, %h6; | |
sub.rn.f16 %h42, %h34, %h41; | |
cvt.f32.f16 %f16, %h42; | |
max.f32 %f17, %f15, %f16; | |
or.b32 %r20, %r6, 256; | |
ld.global.nc.b32 %hh5, [%rd12+512]; | |
mov.b32 {%h43, %h44}, %hh5; | |
mul.wide.u32 %rd21, %r20, 4; | |
add.s64 %rd22, %rd6, %rd21; | |
ld.global.nc.u32 %r21, [%rd22]; | |
cvt.rn.f16.s32 %h45, %r21; | |
sub.rn.f16 %h46, %h4, %h45; | |
mul.rn.f16 %h47, %h46, %h6; | |
sub.rn.f16 %h48, %h43, %h47; | |
cvt.f32.f16 %f18, %h48; | |
max.f32 %f19, %f17, %f18; | |
ld.global.nc.u32 %r22, [%rd14+1028]; | |
cvt.rn.f16.s32 %h49, %r22; | |
sub.rn.f16 %h50, %h4, %h49; | |
mul.rn.f16 %h51, %h50, %h6; | |
sub.rn.f16 %h52, %h44, %h51; | |
cvt.f32.f16 %f20, %h52; | |
max.f32 %f21, %f19, %f20; | |
or.b32 %r23, %r6, 320; | |
ld.global.nc.b32 %hh6, [%rd12+640]; | |
mov.b32 {%h53, %h54}, %hh6; | |
mul.wide.u32 %rd23, %r23, 4; | |
add.s64 %rd24, %rd6, %rd23; | |
ld.global.nc.u32 %r24, [%rd24]; | |
cvt.rn.f16.s32 %h55, %r24; | |
sub.rn.f16 %h56, %h4, %h55; | |
mul.rn.f16 %h57, %h56, %h6; | |
sub.rn.f16 %h58, %h53, %h57; | |
cvt.f32.f16 %f22, %h58; | |
max.f32 %f23, %f21, %f22; | |
ld.global.nc.u32 %r25, [%rd14+1284]; | |
cvt.rn.f16.s32 %h59, %r25; | |
sub.rn.f16 %h60, %h4, %h59; | |
mul.rn.f16 %h61, %h60, %h6; | |
sub.rn.f16 %h62, %h54, %h61; | |
cvt.f32.f16 %f24, %h62; | |
max.f32 %f25, %f23, %f24; | |
or.b32 %r26, %r6, 384; | |
ld.global.nc.b32 %hh7, [%rd12+768]; | |
mov.b32 {%h63, %h64}, %hh7; | |
mul.wide.u32 %rd25, %r26, 4; | |
add.s64 %rd26, %rd6, %rd25; | |
ld.global.nc.u32 %r27, [%rd26]; | |
cvt.rn.f16.s32 %h65, %r27; | |
sub.rn.f16 %h66, %h4, %h65; | |
mul.rn.f16 %h67, %h66, %h6; | |
sub.rn.f16 %h68, %h63, %h67; | |
cvt.f32.f16 %f26, %h68; | |
max.f32 %f27, %f25, %f26; | |
ld.global.nc.u32 %r28, [%rd14+1540]; | |
cvt.rn.f16.s32 %h69, %r28; | |
sub.rn.f16 %h70, %h4, %h69; | |
mul.rn.f16 %h71, %h70, %h6; | |
sub.rn.f16 %h72, %h64, %h71; | |
cvt.f32.f16 %f28, %h72; | |
max.f32 %f29, %f27, %f28; | |
or.b32 %r29, %r6, 448; | |
ld.global.nc.b32 %hh8, [%rd12+896]; | |
mov.b32 {%h73, %h74}, %hh8; | |
mul.wide.u32 %rd27, %r29, 4; | |
add.s64 %rd28, %rd6, %rd27; | |
ld.global.nc.u32 %r30, [%rd28]; | |
cvt.rn.f16.s32 %h75, %r30; | |
sub.rn.f16 %h76, %h4, %h75; | |
mul.rn.f16 %h77, %h76, %h6; | |
sub.rn.f16 %h78, %h73, %h77; | |
cvt.f32.f16 %f30, %h78; | |
max.f32 %f31, %f29, %f30; | |
ld.global.nc.u32 %r31, [%rd14+1796]; | |
cvt.rn.f16.s32 %h79, %r31; | |
sub.rn.f16 %h80, %h4, %h79; | |
mul.rn.f16 %h81, %h80, %h6; | |
sub.rn.f16 %h82, %h74, %h81; | |
cvt.f32.f16 %f32, %h82; | |
max.f32 %f33, %f31, %f32; | |
// Stage 1: shuffle-down max reduction across the warp (offsets 16, 8, 4,
// 2, 1, member mask -1).
shfl.sync.down.b32 %f34, %f33, 16, 31, -1; | |
max.f32 %f35, %f33, %f34; | |
shfl.sync.down.b32 %f36, %f35, 8, 31, -1; | |
max.f32 %f37, %f35, %f36; | |
shfl.sync.down.b32 %f38, %f37, 4, 31, -1; | |
max.f32 %f39, %f37, %f38; | |
shfl.sync.down.b32 %f40, %f39, 2, 31, -1; | |
max.f32 %f41, %f39, %f40; | |
shfl.sync.down.b32 %f42, %f41, 1, 31, -1; | |
setp.eq.s32 %p1, %r1, 0; | |
@%p1 bra LBB11_3; | |
bra.uni LBB11_1; | |
LBB11_3: | |
// Thread 0 takes the final max and stores it in shared_cache_02[0].
max.f32 %f1, %f41, %f42; | |
st.shared.f32 [shared_cache_02], %f1; | |
LBB11_1: | |
bar.sync 0; | |
// Stage 2: the local slot is set to the bit pattern 0xFF800000 (-inf).
// Thread 0 selects the generic address of shared_cache_02[0]; all other
// threads select the local slot, so they contribute -inf to the max.
mul.wide.u32 %rd32, %r1, 4; | |
mov.u64 %rd33, shared_cache_02; | |
add.s64 %rd3, %rd33, %rd32; | |
cvta.shared.u64 %rd34, %rd3; | |
mov.u32 %r34, -8388608; | |
st.local.u32 [%rd1], %r34; | |
selp.b64 %rd36, %rd34, %rd10, %p1; | |
ld.f32 %f43, [%rd36]; | |
shfl.sync.down.b32 %f44, %f43, 16, 31, -1; | |
max.f32 %f45, %f43, %f44; | |
shfl.sync.down.b32 %f46, %f45, 8, 31, -1; | |
max.f32 %f47, %f45, %f46; | |
shfl.sync.down.b32 %f48, %f47, 4, 31, -1; | |
max.f32 %f49, %f47, %f48; | |
shfl.sync.down.b32 %f50, %f49, 2, 31, -1; | |
max.f32 %f51, %f49, %f50; | |
shfl.sync.down.b32 %f52, %f51, 1, 31, -1; | |
max.f32 %f53, %f51, %f52; | |
// Every thread writes its result back through the address it read from;
// thread 0's write leaves the block maximum in shared_cache_02[0].
st.f32 [%rd36], %f53; | |
@%p1 bra LBB11_4; | |
bra.uni LBB11_2; | |
LBB11_4: | |
// Thread 0 only. %rd2 = output + (ctaid >> 9)*2048 + (ctaid & 511)*4,
// which is the byte address of output element ctaid.
ld.param.u64 %rd7, [fusion_2271_param_1]; | |
shr.u32 %r33, %r5, 9; | |
cvta.to.global.u64 %rd8, %rd7; | |
and.b32 %r32, %r5, 511; | |
mul.wide.u32 %rd29, %r33, 2048; | |
add.s64 %rd30, %rd8, %rd29; | |
mul.wide.u32 %rd31, %r32, 4; | |
add.s64 %rd2, %rd30, %rd31; | |
ld.global.u32 %r36, [%rd2]; | |
LBB11_5: | |
// Float max via compare-and-swap: compute max(current, block max) and try
// to swap it in; if another writer changed the word (returned value differs
// from the expected one), retry with the value that was returned.
mov.b32 %f54, %r36; | |
ld.shared.f32 %f55, [%rd3]; | |
max.f32 %f56, %f54, %f55; | |
mov.b32 %r35, %f56; | |
atom.global.cas.b32 %r4, [%rd2], %r36, %r35; | |
setp.eq.s32 %p3, %r4, %r36; | |
mov.u32 %r36, %r4; | |
@%p3 bra LBB11_2; | |
bra.uni LBB11_5; | |
LBB11_2: | |
ret; | |
} | |
// .globl fusion_2269 | |
// fusion_2269: per-block sum of exp(adjusted_input - row_offset), added atomically to param_1. | |
// Launch shape: 32 threads per block (.reqntid 32,1,1). Block b covers the 512 f16 | |
// elements starting at element b*512 of param_0; thread t handles the element pairs | |
// at row positions 2*t + 64*k and 2*t + 64*k + 1 for k = 0..7 (16 elements per thread). | |
// param_0: f16 input, read two elements at a time as one 32-bit load. | |
// param_1: f32 accumulator; thread 0 does atom.global.add.f32 at element index b. | |
// param_2: f32 value indexed by b, subtracted from every element before the exp. | |
//          NOTE(review): presumably the row maximum from an earlier max-reduction - confirm. | |
// param_3: s32 values indexed by row position only (same values for every block). | |
//          NOTE(review): looks like a 0/1 mask - confirm against the producer. | |
// param_4: not read by this kernel. | |
// Per element: x = in - (1.0 - f16(m)) * 10000.0, computed in f16 (0x3C00 = 1.0, | |
// 0x70E2 = 10000.0), then widened to f32 and passed through an expanded expf sequence. | |
.visible .entry fusion_2269( | |
.param .u64 fusion_2269_param_0, | |
.param .u64 fusion_2269_param_1, | |
.param .u64 fusion_2269_param_2, | |
.param .u64 fusion_2269_param_3, | |
.param .u64 fusion_2269_param_4 | |
) | |
.reqntid 32, 1, 1 | |
{ | |
// 4-byte local slot; it is filled with 0.0 later and read by threads other than thread 0. | |
.local .align 4 .b8 __local_depot12[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<35>; | |
.reg .b16 %h<83>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<249>; | |
.reg .b32 %r<32>; | |
.reg .b64 %rd<41>; | |
mov.u64 %SPL, __local_depot12; | |
cvta.local.u64 %SP, %SPL; | |
// %rd11 = param_0 (f16 input), %rd6 = param_3 (s32 per-position), %rd9 = param_2 (f32 per-block). | |
ld.param.u64 %rd4, [fusion_2269_param_0]; | |
ld.param.u64 %rd5, [fusion_2269_param_3]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd8, [fusion_2269_param_2]; | |
cvta.to.global.u64 %rd9, %rd8; | |
cvta.to.global.u64 %rd11, %rd4; | |
// %rd12 = generic address of the local slot, %rd1 = its local-space address. | |
add.u64 %rd12, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
// %r3 = 2*tid.x (row position of the first pair), %r5 = ctaid.x*512 + 2*tid.x (input element index). | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r2, %ctaid.x; | |
shl.b32 %r3, %r1, 1; | |
shl.b32 %r4, %r2, 9; | |
or.b32 %r5, %r4, %r3; | |
mul.wide.u32 %rd13, %r5, 2; | |
add.s64 %rd14, %rd11, %rd13; | |
// Pair k = 0: two f16 inputs in one 32-bit load, two s32 values in one v2 load. | |
ld.global.nc.b32 %hh1, [%rd14]; | |
mov.b32 {%h1, %h2}, %hh1; | |
mul.wide.u32 %rd15, %r3, 4; | |
add.s64 %rd16, %rd6, %rd15; | |
ld.global.nc.v2.u32 {%r6, %r7}, [%rd16]; | |
// x = in - (1.0 - f16(m)) * 10000.0, all in f16; %h4 and %h6 are reused by every element below. | |
cvt.rn.f16.s32 %h3, %r6; | |
mov.b16 %h4, 0x3C00; | |
sub.rn.f16 %h5, %h4, %h3; | |
mov.b16 %h6, 0x70E2; | |
mul.rn.f16 %h7, %h5, %h6; | |
sub.rn.f16 %h8, %h1, %h7; | |
cvt.f32.f16 %f2, %h8; | |
// %f3 = param_2[ctaid.x]; d = x - %f3 is the exp argument for every element of this block. | |
mul.wide.u32 %rd17, %r2, 4; | |
add.s64 %rd18, %rd9, %rd17; | |
ld.global.nc.f32 %f3, [%rd18]; | |
sub.rn.f32 %f4, %f2, %f3; | |
// exp(d): n = trunc(d * log2(e)) (0f3FB8AA3B); result = 2^n * 2^((d - n*ln2_hi - n*ln2_lo) * log2(e)), | |
// with -ln2 split as 0fBF317200 + 0fB5BFBE8E. Forced to 0 when d < -105 and to +inf when d > 105. | |
mul.rn.f32 %f5, %f4, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f6, %f5; | |
add.rn.f32 %f7, %f6, 0f00000000; | |
ex2.approx.f32 %f8, %f7; | |
fma.rn.f32 %f9, %f6, 0fBF317200, %f4; | |
fma.rn.f32 %f10, %f6, 0fB5BFBE8E, %f9; | |
mul.rn.f32 %f11, %f10, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f12, %f11; | |
mul.rn.f32 %f13, %f8, %f12; | |
setp.lt.f32 %p1, %f4, 0fC2D20000; | |
setp.gt.f32 %p2, %f4, 0f42D20000; | |
// Adds 0.0 to the first term; NOTE(review): presumably the running sum's initial value - confirm. | |
add.rn.f32 %f14, %f13, 0f00000000; | |
selp.f32 %f15, 0f00000000, %f14, %p1; | |
selp.f32 %f16, 0f7F800000, %f15, %p2; | |
// Pair k = 0, second element (row position 2*t + 1); same sequence as above. | |
cvt.rn.f16.s32 %h9, %r7; | |
sub.rn.f16 %h10, %h4, %h9; | |
mul.rn.f16 %h11, %h10, %h6; | |
sub.rn.f16 %h12, %h2, %h11; | |
cvt.f32.f16 %f17, %h12; | |
sub.rn.f32 %f18, %f17, %f3; | |
mul.rn.f32 %f19, %f18, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f20, %f19; | |
add.rn.f32 %f21, %f20, 0f00000000; | |
ex2.approx.f32 %f22, %f21; | |
fma.rn.f32 %f23, %f20, 0fBF317200, %f18; | |
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23; | |
mul.rn.f32 %f25, %f24, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f26, %f25; | |
mul.rn.f32 %f27, %f22, %f26; | |
setp.lt.f32 %p3, %f18, 0fC2D20000; | |
selp.f32 %f28, 0f00000000, %f27, %p3; | |
setp.gt.f32 %p4, %f18, 0f42D20000; | |
selp.f32 %f29, 0f7F800000, %f28, %p4; | |
// Running per-thread sum starts here and is extended after every element. | |
add.rn.f32 %f30, %f16, %f29; | |
// Pair k = 1: row positions 2*t + 64 and 2*t + 65 (input byte offset +128). | |
or.b32 %r8, %r3, 64; | |
ld.global.nc.b32 %hh2, [%rd14+128]; | |
mov.b32 {%h13, %h14}, %hh2; | |
mul.wide.u32 %rd19, %r8, 4; | |
add.s64 %rd20, %rd6, %rd19; | |
ld.global.nc.u32 %r9, [%rd20]; | |
cvt.rn.f16.s32 %h15, %r9; | |
sub.rn.f16 %h16, %h4, %h15; | |
mul.rn.f16 %h17, %h16, %h6; | |
sub.rn.f16 %h18, %h13, %h17; | |
cvt.f32.f16 %f31, %h18; | |
sub.rn.f32 %f32, %f31, %f3; | |
mul.rn.f32 %f33, %f32, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f34, %f33; | |
add.rn.f32 %f35, %f34, 0f00000000; | |
ex2.approx.f32 %f36, %f35; | |
fma.rn.f32 %f37, %f34, 0fBF317200, %f32; | |
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37; | |
mul.rn.f32 %f39, %f38, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f40, %f39; | |
mul.rn.f32 %f41, %f36, %f40; | |
setp.lt.f32 %p5, %f32, 0fC2D20000; | |
selp.f32 %f42, 0f00000000, %f41, %p5; | |
setp.gt.f32 %p6, %f32, 0f42D20000; | |
selp.f32 %f43, 0f7F800000, %f42, %p6; | |
add.rn.f32 %f44, %f30, %f43; | |
// Odd element of the pair: its s32 value sits 256*k + 4 bytes past %rd16. | |
ld.global.nc.u32 %r10, [%rd16+260]; | |
cvt.rn.f16.s32 %h19, %r10; | |
sub.rn.f16 %h20, %h4, %h19; | |
mul.rn.f16 %h21, %h20, %h6; | |
sub.rn.f16 %h22, %h14, %h21; | |
cvt.f32.f16 %f45, %h22; | |
sub.rn.f32 %f46, %f45, %f3; | |
mul.rn.f32 %f47, %f46, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f48, %f47; | |
add.rn.f32 %f49, %f48, 0f00000000; | |
ex2.approx.f32 %f50, %f49; | |
fma.rn.f32 %f51, %f48, 0fBF317200, %f46; | |
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51; | |
mul.rn.f32 %f53, %f52, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f54, %f53; | |
mul.rn.f32 %f55, %f50, %f54; | |
setp.lt.f32 %p7, %f46, 0fC2D20000; | |
selp.f32 %f56, 0f00000000, %f55, %p7; | |
setp.gt.f32 %p8, %f46, 0f42D20000; | |
selp.f32 %f57, 0f7F800000, %f56, %p8; | |
add.rn.f32 %f58, %f44, %f57; | |
// Pair k = 2: row positions 2*t + 128 and 2*t + 129. | |
or.b32 %r11, %r3, 128; | |
ld.global.nc.b32 %hh3, [%rd14+256]; | |
mov.b32 {%h23, %h24}, %hh3; | |
mul.wide.u32 %rd21, %r11, 4; | |
add.s64 %rd22, %rd6, %rd21; | |
ld.global.nc.u32 %r12, [%rd22]; | |
cvt.rn.f16.s32 %h25, %r12; | |
sub.rn.f16 %h26, %h4, %h25; | |
mul.rn.f16 %h27, %h26, %h6; | |
sub.rn.f16 %h28, %h23, %h27; | |
cvt.f32.f16 %f59, %h28; | |
sub.rn.f32 %f60, %f59, %f3; | |
mul.rn.f32 %f61, %f60, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f62, %f61; | |
add.rn.f32 %f63, %f62, 0f00000000; | |
ex2.approx.f32 %f64, %f63; | |
fma.rn.f32 %f65, %f62, 0fBF317200, %f60; | |
fma.rn.f32 %f66, %f62, 0fB5BFBE8E, %f65; | |
mul.rn.f32 %f67, %f66, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f68, %f67; | |
mul.rn.f32 %f69, %f64, %f68; | |
setp.lt.f32 %p9, %f60, 0fC2D20000; | |
selp.f32 %f70, 0f00000000, %f69, %p9; | |
setp.gt.f32 %p10, %f60, 0f42D20000; | |
selp.f32 %f71, 0f7F800000, %f70, %p10; | |
add.rn.f32 %f72, %f58, %f71; | |
ld.global.nc.u32 %r13, [%rd16+516]; | |
cvt.rn.f16.s32 %h29, %r13; | |
sub.rn.f16 %h30, %h4, %h29; | |
mul.rn.f16 %h31, %h30, %h6; | |
sub.rn.f16 %h32, %h24, %h31; | |
cvt.f32.f16 %f73, %h32; | |
sub.rn.f32 %f74, %f73, %f3; | |
mul.rn.f32 %f75, %f74, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f76, %f75; | |
add.rn.f32 %f77, %f76, 0f00000000; | |
ex2.approx.f32 %f78, %f77; | |
fma.rn.f32 %f79, %f76, 0fBF317200, %f74; | |
fma.rn.f32 %f80, %f76, 0fB5BFBE8E, %f79; | |
mul.rn.f32 %f81, %f80, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f82, %f81; | |
mul.rn.f32 %f83, %f78, %f82; | |
setp.lt.f32 %p11, %f74, 0fC2D20000; | |
selp.f32 %f84, 0f00000000, %f83, %p11; | |
setp.gt.f32 %p12, %f74, 0f42D20000; | |
selp.f32 %f85, 0f7F800000, %f84, %p12; | |
add.rn.f32 %f86, %f72, %f85; | |
// Pair k = 3: row positions 2*t + 192 and 2*t + 193. | |
or.b32 %r14, %r3, 192; | |
ld.global.nc.b32 %hh4, [%rd14+384]; | |
mov.b32 {%h33, %h34}, %hh4; | |
mul.wide.u32 %rd23, %r14, 4; | |
add.s64 %rd24, %rd6, %rd23; | |
ld.global.nc.u32 %r15, [%rd24]; | |
cvt.rn.f16.s32 %h35, %r15; | |
sub.rn.f16 %h36, %h4, %h35; | |
mul.rn.f16 %h37, %h36, %h6; | |
sub.rn.f16 %h38, %h33, %h37; | |
cvt.f32.f16 %f87, %h38; | |
sub.rn.f32 %f88, %f87, %f3; | |
mul.rn.f32 %f89, %f88, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f90, %f89; | |
add.rn.f32 %f91, %f90, 0f00000000; | |
ex2.approx.f32 %f92, %f91; | |
fma.rn.f32 %f93, %f90, 0fBF317200, %f88; | |
fma.rn.f32 %f94, %f90, 0fB5BFBE8E, %f93; | |
mul.rn.f32 %f95, %f94, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f96, %f95; | |
mul.rn.f32 %f97, %f92, %f96; | |
setp.lt.f32 %p13, %f88, 0fC2D20000; | |
selp.f32 %f98, 0f00000000, %f97, %p13; | |
setp.gt.f32 %p14, %f88, 0f42D20000; | |
selp.f32 %f99, 0f7F800000, %f98, %p14; | |
add.rn.f32 %f100, %f86, %f99; | |
ld.global.nc.u32 %r16, [%rd16+772]; | |
cvt.rn.f16.s32 %h39, %r16; | |
sub.rn.f16 %h40, %h4, %h39; | |
mul.rn.f16 %h41, %h40, %h6; | |
sub.rn.f16 %h42, %h34, %h41; | |
cvt.f32.f16 %f101, %h42; | |
sub.rn.f32 %f102, %f101, %f3; | |
mul.rn.f32 %f103, %f102, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f104, %f103; | |
add.rn.f32 %f105, %f104, 0f00000000; | |
ex2.approx.f32 %f106, %f105; | |
fma.rn.f32 %f107, %f104, 0fBF317200, %f102; | |
fma.rn.f32 %f108, %f104, 0fB5BFBE8E, %f107; | |
mul.rn.f32 %f109, %f108, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f110, %f109; | |
mul.rn.f32 %f111, %f106, %f110; | |
setp.lt.f32 %p15, %f102, 0fC2D20000; | |
selp.f32 %f112, 0f00000000, %f111, %p15; | |
setp.gt.f32 %p16, %f102, 0f42D20000; | |
selp.f32 %f113, 0f7F800000, %f112, %p16; | |
add.rn.f32 %f114, %f100, %f113; | |
// Pair k = 4: row positions 2*t + 256 and 2*t + 257. | |
or.b32 %r17, %r3, 256; | |
ld.global.nc.b32 %hh5, [%rd14+512]; | |
mov.b32 {%h43, %h44}, %hh5; | |
mul.wide.u32 %rd25, %r17, 4; | |
add.s64 %rd26, %rd6, %rd25; | |
ld.global.nc.u32 %r18, [%rd26]; | |
cvt.rn.f16.s32 %h45, %r18; | |
sub.rn.f16 %h46, %h4, %h45; | |
mul.rn.f16 %h47, %h46, %h6; | |
sub.rn.f16 %h48, %h43, %h47; | |
cvt.f32.f16 %f115, %h48; | |
sub.rn.f32 %f116, %f115, %f3; | |
mul.rn.f32 %f117, %f116, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f118, %f117; | |
add.rn.f32 %f119, %f118, 0f00000000; | |
ex2.approx.f32 %f120, %f119; | |
fma.rn.f32 %f121, %f118, 0fBF317200, %f116; | |
fma.rn.f32 %f122, %f118, 0fB5BFBE8E, %f121; | |
mul.rn.f32 %f123, %f122, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f124, %f123; | |
mul.rn.f32 %f125, %f120, %f124; | |
setp.lt.f32 %p17, %f116, 0fC2D20000; | |
selp.f32 %f126, 0f00000000, %f125, %p17; | |
setp.gt.f32 %p18, %f116, 0f42D20000; | |
selp.f32 %f127, 0f7F800000, %f126, %p18; | |
add.rn.f32 %f128, %f114, %f127; | |
ld.global.nc.u32 %r19, [%rd16+1028]; | |
cvt.rn.f16.s32 %h49, %r19; | |
sub.rn.f16 %h50, %h4, %h49; | |
mul.rn.f16 %h51, %h50, %h6; | |
sub.rn.f16 %h52, %h44, %h51; | |
cvt.f32.f16 %f129, %h52; | |
sub.rn.f32 %f130, %f129, %f3; | |
mul.rn.f32 %f131, %f130, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f132, %f131; | |
add.rn.f32 %f133, %f132, 0f00000000; | |
ex2.approx.f32 %f134, %f133; | |
fma.rn.f32 %f135, %f132, 0fBF317200, %f130; | |
fma.rn.f32 %f136, %f132, 0fB5BFBE8E, %f135; | |
mul.rn.f32 %f137, %f136, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f138, %f137; | |
mul.rn.f32 %f139, %f134, %f138; | |
setp.lt.f32 %p19, %f130, 0fC2D20000; | |
selp.f32 %f140, 0f00000000, %f139, %p19; | |
setp.gt.f32 %p20, %f130, 0f42D20000; | |
selp.f32 %f141, 0f7F800000, %f140, %p20; | |
add.rn.f32 %f142, %f128, %f141; | |
// Pair k = 5: row positions 2*t + 320 and 2*t + 321. | |
or.b32 %r20, %r3, 320; | |
ld.global.nc.b32 %hh6, [%rd14+640]; | |
mov.b32 {%h53, %h54}, %hh6; | |
mul.wide.u32 %rd27, %r20, 4; | |
add.s64 %rd28, %rd6, %rd27; | |
ld.global.nc.u32 %r21, [%rd28]; | |
cvt.rn.f16.s32 %h55, %r21; | |
sub.rn.f16 %h56, %h4, %h55; | |
mul.rn.f16 %h57, %h56, %h6; | |
sub.rn.f16 %h58, %h53, %h57; | |
cvt.f32.f16 %f143, %h58; | |
sub.rn.f32 %f144, %f143, %f3; | |
mul.rn.f32 %f145, %f144, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f146, %f145; | |
add.rn.f32 %f147, %f146, 0f00000000; | |
ex2.approx.f32 %f148, %f147; | |
fma.rn.f32 %f149, %f146, 0fBF317200, %f144; | |
fma.rn.f32 %f150, %f146, 0fB5BFBE8E, %f149; | |
mul.rn.f32 %f151, %f150, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f152, %f151; | |
mul.rn.f32 %f153, %f148, %f152; | |
setp.lt.f32 %p21, %f144, 0fC2D20000; | |
selp.f32 %f154, 0f00000000, %f153, %p21; | |
setp.gt.f32 %p22, %f144, 0f42D20000; | |
selp.f32 %f155, 0f7F800000, %f154, %p22; | |
add.rn.f32 %f156, %f142, %f155; | |
ld.global.nc.u32 %r22, [%rd16+1284]; | |
cvt.rn.f16.s32 %h59, %r22; | |
sub.rn.f16 %h60, %h4, %h59; | |
mul.rn.f16 %h61, %h60, %h6; | |
sub.rn.f16 %h62, %h54, %h61; | |
cvt.f32.f16 %f157, %h62; | |
sub.rn.f32 %f158, %f157, %f3; | |
mul.rn.f32 %f159, %f158, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f160, %f159; | |
add.rn.f32 %f161, %f160, 0f00000000; | |
ex2.approx.f32 %f162, %f161; | |
fma.rn.f32 %f163, %f160, 0fBF317200, %f158; | |
fma.rn.f32 %f164, %f160, 0fB5BFBE8E, %f163; | |
mul.rn.f32 %f165, %f164, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f166, %f165; | |
mul.rn.f32 %f167, %f162, %f166; | |
setp.lt.f32 %p23, %f158, 0fC2D20000; | |
selp.f32 %f168, 0f00000000, %f167, %p23; | |
setp.gt.f32 %p24, %f158, 0f42D20000; | |
selp.f32 %f169, 0f7F800000, %f168, %p24; | |
add.rn.f32 %f170, %f156, %f169; | |
// Pair k = 6: row positions 2*t + 384 and 2*t + 385. | |
or.b32 %r23, %r3, 384; | |
ld.global.nc.b32 %hh7, [%rd14+768]; | |
mov.b32 {%h63, %h64}, %hh7; | |
mul.wide.u32 %rd29, %r23, 4; | |
add.s64 %rd30, %rd6, %rd29; | |
ld.global.nc.u32 %r24, [%rd30]; | |
cvt.rn.f16.s32 %h65, %r24; | |
sub.rn.f16 %h66, %h4, %h65; | |
mul.rn.f16 %h67, %h66, %h6; | |
sub.rn.f16 %h68, %h63, %h67; | |
cvt.f32.f16 %f171, %h68; | |
sub.rn.f32 %f172, %f171, %f3; | |
mul.rn.f32 %f173, %f172, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f174, %f173; | |
add.rn.f32 %f175, %f174, 0f00000000; | |
ex2.approx.f32 %f176, %f175; | |
fma.rn.f32 %f177, %f174, 0fBF317200, %f172; | |
fma.rn.f32 %f178, %f174, 0fB5BFBE8E, %f177; | |
mul.rn.f32 %f179, %f178, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f180, %f179; | |
mul.rn.f32 %f181, %f176, %f180; | |
setp.lt.f32 %p25, %f172, 0fC2D20000; | |
selp.f32 %f182, 0f00000000, %f181, %p25; | |
setp.gt.f32 %p26, %f172, 0f42D20000; | |
selp.f32 %f183, 0f7F800000, %f182, %p26; | |
add.rn.f32 %f184, %f170, %f183; | |
ld.global.nc.u32 %r25, [%rd16+1540]; | |
cvt.rn.f16.s32 %h69, %r25; | |
sub.rn.f16 %h70, %h4, %h69; | |
mul.rn.f16 %h71, %h70, %h6; | |
sub.rn.f16 %h72, %h64, %h71; | |
cvt.f32.f16 %f185, %h72; | |
sub.rn.f32 %f186, %f185, %f3; | |
mul.rn.f32 %f187, %f186, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f188, %f187; | |
add.rn.f32 %f189, %f188, 0f00000000; | |
ex2.approx.f32 %f190, %f189; | |
fma.rn.f32 %f191, %f188, 0fBF317200, %f186; | |
fma.rn.f32 %f192, %f188, 0fB5BFBE8E, %f191; | |
mul.rn.f32 %f193, %f192, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f194, %f193; | |
mul.rn.f32 %f195, %f190, %f194; | |
setp.lt.f32 %p27, %f186, 0fC2D20000; | |
selp.f32 %f196, 0f00000000, %f195, %p27; | |
setp.gt.f32 %p28, %f186, 0f42D20000; | |
selp.f32 %f197, 0f7F800000, %f196, %p28; | |
add.rn.f32 %f198, %f184, %f197; | |
// Pair k = 7: row positions 2*t + 448 and 2*t + 449. | |
or.b32 %r26, %r3, 448; | |
ld.global.nc.b32 %hh8, [%rd14+896]; | |
mov.b32 {%h73, %h74}, %hh8; | |
mul.wide.u32 %rd31, %r26, 4; | |
add.s64 %rd32, %rd6, %rd31; | |
ld.global.nc.u32 %r27, [%rd32]; | |
cvt.rn.f16.s32 %h75, %r27; | |
sub.rn.f16 %h76, %h4, %h75; | |
mul.rn.f16 %h77, %h76, %h6; | |
sub.rn.f16 %h78, %h73, %h77; | |
cvt.f32.f16 %f199, %h78; | |
sub.rn.f32 %f200, %f199, %f3; | |
mul.rn.f32 %f201, %f200, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f202, %f201; | |
add.rn.f32 %f203, %f202, 0f00000000; | |
ex2.approx.f32 %f204, %f203; | |
fma.rn.f32 %f205, %f202, 0fBF317200, %f200; | |
fma.rn.f32 %f206, %f202, 0fB5BFBE8E, %f205; | |
mul.rn.f32 %f207, %f206, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f208, %f207; | |
mul.rn.f32 %f209, %f204, %f208; | |
setp.lt.f32 %p29, %f200, 0fC2D20000; | |
selp.f32 %f210, 0f00000000, %f209, %p29; | |
setp.gt.f32 %p30, %f200, 0f42D20000; | |
selp.f32 %f211, 0f7F800000, %f210, %p30; | |
add.rn.f32 %f212, %f198, %f211; | |
ld.global.nc.u32 %r28, [%rd16+1796]; | |
cvt.rn.f16.s32 %h79, %r28; | |
sub.rn.f16 %h80, %h4, %h79; | |
mul.rn.f16 %h81, %h80, %h6; | |
sub.rn.f16 %h82, %h74, %h81; | |
cvt.f32.f16 %f213, %h82; | |
sub.rn.f32 %f214, %f213, %f3; | |
mul.rn.f32 %f215, %f214, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f216, %f215; | |
add.rn.f32 %f217, %f216, 0f00000000; | |
ex2.approx.f32 %f218, %f217; | |
fma.rn.f32 %f219, %f216, 0fBF317200, %f214; | |
fma.rn.f32 %f220, %f216, 0fB5BFBE8E, %f219; | |
mul.rn.f32 %f221, %f220, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f222, %f221; | |
mul.rn.f32 %f223, %f218, %f222; | |
setp.lt.f32 %p31, %f214, 0fC2D20000; | |
selp.f32 %f224, 0f00000000, %f223, %p31; | |
setp.gt.f32 %p32, %f214, 0f42D20000; | |
selp.f32 %f225, 0f7F800000, %f224, %p32; | |
// %f226 = this thread's sum over its 16 elements. | |
add.rn.f32 %f226, %f212, %f225; | |
// Stage 1: shuffle-down tree (distances 16, 8, 4, 2, 1, full member mask) adding partial sums. | |
shfl.sync.down.b32 %f227, %f226, 16, 31, -1; | |
add.rn.f32 %f228, %f227, %f226; | |
shfl.sync.down.b32 %f229, %f228, 8, 31, -1; | |
add.rn.f32 %f230, %f229, %f228; | |
shfl.sync.down.b32 %f231, %f230, 4, 31, -1; | |
add.rn.f32 %f232, %f231, %f230; | |
shfl.sync.down.b32 %f233, %f232, 2, 31, -1; | |
add.rn.f32 %f234, %f233, %f232; | |
shfl.sync.down.b32 %f235, %f234, 1, 31, -1; | |
// Only thread 0 performs the final add of the tree and publishes it to shared_cache_03[0]. | |
setp.eq.s32 %p33, %r1, 0; | |
@%p33 bra LBB12_3; | |
bra.uni LBB12_1; | |
LBB12_3: | |
add.rn.f32 %f1, %f235, %f234; | |
st.shared.f32 [shared_cache_03], %f1; | |
LBB12_1: | |
bar.sync 0; | |
// Stage 2: thread 0 reads back shared_cache_03[0]; every other thread reads the 0.0 | |
// it has just stored in its own local slot, so those threads contribute nothing to the sum. | |
mul.wide.u32 %rd36, %r1, 4; | |
mov.u64 %rd37, shared_cache_03; | |
add.s64 %rd3, %rd37, %rd36; | |
cvta.shared.u64 %rd38, %rd3; | |
mov.u32 %r31, 0; | |
st.local.u32 [%rd1], %r31; | |
// %rd40 is a generic address: the shared slot when tid.x == 0, otherwise the local slot. | |
selp.b64 %rd40, %rd38, %rd12, %p33; | |
ld.f32 %f236, [%rd40]; | |
shfl.sync.down.b32 %f237, %f236, 16, 31, -1; | |
add.rn.f32 %f238, %f236, %f237; | |
shfl.sync.down.b32 %f239, %f238, 8, 31, -1; | |
add.rn.f32 %f240, %f238, %f239; | |
shfl.sync.down.b32 %f241, %f240, 4, 31, -1; | |
add.rn.f32 %f242, %f240, %f241; | |
shfl.sync.down.b32 %f243, %f242, 2, 31, -1; | |
add.rn.f32 %f244, %f242, %f243; | |
shfl.sync.down.b32 %f245, %f244, 1, 31, -1; | |
add.rn.f32 %f246, %f244, %f245; | |
// Each thread writes its result back to the slot it read from (shared for thread 0). | |
st.f32 [%rd40], %f246; | |
@%p33 bra LBB12_4; | |
bra.uni LBB12_2; | |
LBB12_4: | |
// Thread 0 only: byte offset (b >> 9)*2048 + (b & 511)*4 equals 4*b, i.e. f32 element b of param_1. | |
ld.param.u64 %rd7, [fusion_2269_param_1]; | |
shr.u32 %r30, %r2, 9; | |
cvta.to.global.u64 %rd10, %rd7; | |
and.b32 %r29, %r2, 511; | |
mul.wide.u32 %rd33, %r30, 2048; | |
add.s64 %rd34, %rd10, %rd33; | |
mul.wide.u32 %rd35, %r29, 4; | |
add.s64 %rd2, %rd34, %rd35; | |
// Accumulate the block total; the value returned by the atomic (%f248) is not used. | |
ld.shared.f32 %f247, [%rd3]; | |
atom.global.add.f32 %f248, [%rd2], %f247; | |
LBB12_2: | |
ret; | |
} | |
// .globl fusion_2268 | |
// fusion_2268: elementwise exp(adjusted_input - row_offset) / row_divisor; f16 in, f32 out. | |
// Launch shape: 256 threads per block (.reqntid 256,1,1). Thread t of block b handles the | |
// 4 consecutive elements starting at e = b*1024 + 4*t. The row index is e >> 9 | |
// (rows of 512 elements) and the column of element j is (4*t + j) & 511. | |
// param_0: f16 input; 4 values read with one v4.b16 load at element e. | |
// param_1: f32 output; 4 values written with one v4.f32 store at element e. | |
// param_2: f32 per-row divisor (indexed by row). | |
//          NOTE(review): presumably the row sum of exponentials - confirm. | |
// param_3: f32 per-row offset subtracted before the exp (indexed by row). | |
// param_4: s32 per-column values (indexed by column). | |
//          NOTE(review): looks like a 0/1 mask - confirm against the producer. | |
// param_5: not read by this kernel. | |
// Per element: x = in - (1.0 - f16(m)) * 10000.0 in f16 (0x3C00 = 1.0, 0x70E2 = 10000.0), | |
// then out = exp(f32(x) - offset) / divisor using div.full.f32. | |
.visible .entry fusion_2268( | |
.param .u64 fusion_2268_param_0, | |
.param .u64 fusion_2268_param_1, | |
.param .u64 fusion_2268_param_2, | |
.param .u64 fusion_2268_param_3, | |
.param .u64 fusion_2268_param_4, | |
.param .u64 fusion_2268_param_5 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<9>; | |
.reg .b16 %h<27>; | |
.reg .b32 %hh<3>; | |
.reg .f32 %f<59>; | |
.reg .b32 %r<18>; | |
.reg .b64 %rd<26>; | |
// Global pointers: %rd10 = param_0, %rd9 = param_1, %rd8 = param_2, %rd6 = param_3, %rd3 = param_4. | |
ld.param.u64 %rd1, [fusion_2268_param_0]; | |
ld.param.u64 %rd2, [fusion_2268_param_4]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2268_param_1]; | |
ld.param.u64 %rd5, [fusion_2268_param_3]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2268_param_2]; | |
cvta.to.global.u64 %rd8, %rd7; | |
cvta.to.global.u64 %rd9, %rd4; | |
cvta.to.global.u64 %rd10, %rd1; | |
// %r5 = e = ctaid.x*1024 + 4*tid.x; %r9 = row; %r13, %r12, %r11, %r10 = columns of elements 0..3. | |
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
or.b32 %r8, %r4, 3; | |
shr.u32 %r9, %r5, 9; | |
and.b32 %r10, %r8, 511; | |
and.b32 %r11, %r7, 510; | |
and.b32 %r12, %r6, 509; | |
and.b32 %r13, %r4, 508; | |
mul.wide.u32 %rd11, %r5, 2; | |
add.s64 %rd12, %rd10, %rd11; | |
// Load 4 f16 inputs; after the pack/unpack moves, %h5..%h8 hold elements 0..3 in order. | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12]; | |
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
// Element 0: x = in - (1.0 - f16(m)) * 10000.0; %h10 and %h12 are reused by the other elements. | |
mul.wide.u32 %rd13, %r13, 4; | |
add.s64 %rd14, %rd3, %rd13; | |
ld.global.nc.u32 %r14, [%rd14]; | |
cvt.rn.f16.s32 %h9, %r14; | |
mov.b16 %h10, 0x3C00; | |
sub.rn.f16 %h11, %h10, %h9; | |
mov.b16 %h12, 0x70E2; | |
mul.rn.f16 %h13, %h11, %h12; | |
sub.rn.f16 %h14, %h5, %h13; | |
cvt.f32.f16 %f1, %h14; | |
// %f2 = param_3[row]; d = x - %f2. The same byte offset %rd15 is reused for param_2[row] below. | |
mul.wide.u32 %rd15, %r9, 4; | |
add.s64 %rd16, %rd6, %rd15; | |
ld.global.nc.f32 %f2, [%rd16]; | |
sub.rn.f32 %f3, %f1, %f2; | |
// exp(d): n = trunc(d * log2(e)) (0f3FB8AA3B); result = 2^n * 2^((d - n*ln2_hi - n*ln2_lo) * log2(e)), | |
// with -ln2 split as 0fBF317200 + 0fB5BFBE8E. Forced to 0 when d < -105 and to +inf when d > 105. | |
mul.rn.f32 %f4, %f3, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f5, %f4; | |
add.rn.f32 %f6, %f5, 0f00000000; | |
ex2.approx.f32 %f7, %f6; | |
fma.rn.f32 %f8, %f5, 0fBF317200, %f3; | |
fma.rn.f32 %f9, %f5, 0fB5BFBE8E, %f8; | |
mul.rn.f32 %f10, %f9, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f11, %f10; | |
mul.rn.f32 %f12, %f7, %f11; | |
setp.lt.f32 %p1, %f3, 0fC2D20000; | |
selp.f32 %f13, 0f00000000, %f12, %p1; | |
setp.gt.f32 %p2, %f3, 0f42D20000; | |
selp.f32 %f14, 0f7F800000, %f13, %p2; | |
// %f15 = param_2[row]; all four results of this thread are divided by it. | |
add.s64 %rd17, %rd8, %rd15; | |
ld.global.nc.f32 %f15, [%rd17]; | |
div.full.f32 %f16, %f14, %f15; | |
// %rd19 = address of output element e (used by the final vector store). | |
mul.wide.u32 %rd18, %r5, 4; | |
add.s64 %rd19, %rd9, %rd18; | |
// Element 1: same sequence with column %r12 and input %h6. | |
mul.wide.u32 %rd20, %r12, 4; | |
add.s64 %rd21, %rd3, %rd20; | |
ld.global.nc.u32 %r15, [%rd21]; | |
cvt.rn.f16.s32 %h15, %r15; | |
sub.rn.f16 %h16, %h10, %h15; | |
mul.rn.f16 %h17, %h16, %h12; | |
sub.rn.f16 %h18, %h6, %h17; | |
cvt.f32.f16 %f17, %h18; | |
sub.rn.f32 %f18, %f17, %f2; | |
mul.rn.f32 %f19, %f18, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f20, %f19; | |
add.rn.f32 %f21, %f20, 0f00000000; | |
ex2.approx.f32 %f22, %f21; | |
fma.rn.f32 %f23, %f20, 0fBF317200, %f18; | |
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23; | |
mul.rn.f32 %f25, %f24, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f26, %f25; | |
mul.rn.f32 %f27, %f22, %f26; | |
setp.lt.f32 %p3, %f18, 0fC2D20000; | |
selp.f32 %f28, 0f00000000, %f27, %p3; | |
setp.gt.f32 %p4, %f18, 0f42D20000; | |
selp.f32 %f29, 0f7F800000, %f28, %p4; | |
div.full.f32 %f30, %f29, %f15; | |
// Element 2: same sequence with column %r11 and input %h7. | |
mul.wide.u32 %rd22, %r11, 4; | |
add.s64 %rd23, %rd3, %rd22; | |
ld.global.nc.u32 %r16, [%rd23]; | |
cvt.rn.f16.s32 %h19, %r16; | |
sub.rn.f16 %h20, %h10, %h19; | |
mul.rn.f16 %h21, %h20, %h12; | |
sub.rn.f16 %h22, %h7, %h21; | |
cvt.f32.f16 %f31, %h22; | |
sub.rn.f32 %f32, %f31, %f2; | |
mul.rn.f32 %f33, %f32, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f34, %f33; | |
add.rn.f32 %f35, %f34, 0f00000000; | |
ex2.approx.f32 %f36, %f35; | |
fma.rn.f32 %f37, %f34, 0fBF317200, %f32; | |
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37; | |
mul.rn.f32 %f39, %f38, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f40, %f39; | |
mul.rn.f32 %f41, %f36, %f40; | |
setp.lt.f32 %p5, %f32, 0fC2D20000; | |
selp.f32 %f42, 0f00000000, %f41, %p5; | |
setp.gt.f32 %p6, %f32, 0f42D20000; | |
selp.f32 %f43, 0f7F800000, %f42, %p6; | |
div.full.f32 %f44, %f43, %f15; | |
// Element 3: same sequence with column %r10 and input %h8. | |
mul.wide.u32 %rd24, %r10, 4; | |
add.s64 %rd25, %rd3, %rd24; | |
ld.global.nc.u32 %r17, [%rd25]; | |
cvt.rn.f16.s32 %h23, %r17; | |
sub.rn.f16 %h24, %h10, %h23; | |
mul.rn.f16 %h25, %h24, %h12; | |
sub.rn.f16 %h26, %h8, %h25; | |
cvt.f32.f16 %f45, %h26; | |
sub.rn.f32 %f46, %f45, %f2; | |
mul.rn.f32 %f47, %f46, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f48, %f47; | |
add.rn.f32 %f49, %f48, 0f00000000; | |
ex2.approx.f32 %f50, %f49; | |
fma.rn.f32 %f51, %f48, 0fBF317200, %f46; | |
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51; | |
mul.rn.f32 %f53, %f52, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f54, %f53; | |
mul.rn.f32 %f55, %f50, %f54; | |
setp.lt.f32 %p7, %f46, 0fC2D20000; | |
selp.f32 %f56, 0f00000000, %f55, %p7; | |
setp.gt.f32 %p8, %f46, 0f42D20000; | |
selp.f32 %f57, 0f7F800000, %f56, %p8; | |
div.full.f32 %f58, %f57, %f15; | |
// Write the 4 results with a single vector store. | |
st.global.v4.f32 [%rd19], {%f16, %f30, %f44, %f58}; | |
ret; | |
} | |
// .globl rng_get_and_update_state_3 | |
// rng_get_and_update_state_3: copies the current 128-bit value of the module-level | |
// rng_state (two u64 words: low at +0, high at +8) to the buffer named by param_0, | |
// and advances rng_state by 4194304 (2^22) with the carry propagated into the high word. | |
// param_0: destination for the pre-increment state, 16 bytes (low word at +0, high at +8). | |
// param_1: not read by this kernel. | |
// The update is a plain load/add/store sequence with no atomics. | |
// NOTE(review): this assumes a single-thread launch - confirm at the call site. | |
.visible .entry rng_get_and_update_state_3( | |
.param .u64 rng_get_and_update_state_3_param_0, | |
.param .u64 rng_get_and_update_state_3_param_1 | |
) | |
{ | |
.reg .pred %wrapped, %below_step; | |
.reg .b64 %out_generic, %out, %old_lo, %old_hi, %new_lo, %new_hi, %carry_wrap, %carry; | |
// Resolve the destination pointer into the global address space. | |
ld.param.u64 %out_generic, [rng_get_and_update_state_3_param_0]; | |
cvta.to.global.u64 %out, %out_generic; | |
// Snapshot the current state. | |
ld.global.u64 %old_lo, [rng_state]; | |
ld.global.u64 %old_hi, [rng_state+8]; | |
// Low word += 2^22. Carry is 1 if the sum wrapped below the old low word | |
// or is smaller than the increment itself. | |
add.s64 %new_lo, %old_lo, 4194304; | |
setp.lt.u64 %wrapped, %new_lo, %old_lo; | |
selp.u64 %carry_wrap, 1, 0, %wrapped; | |
setp.lt.u64 %below_step, %new_lo, 4194304; | |
selp.b64 %carry, 1, %carry_wrap, %below_step; | |
add.s64 %new_hi, %old_hi, %carry; | |
// Publish the advanced state first, then hand the old state to the caller. | |
st.global.u64 [rng_state], %new_lo; | |
st.global.u64 [rng_state+8], %new_hi; | |
st.global.u64 [%out+8], %old_hi; | |
st.global.u64 [%out], %old_lo; | |
ret; | |
} | |
// .globl fusion_2267 | |
// fusion_2267: per thread, derives 4 pseudo-random 32-bit words from a 128-bit base value | |
// plus the global thread index, and uses them to zero or rescale 4 f32 inputs, writing f16. | |
// Launch shape: 256 threads per block (.reqntid 256,1,1). Thread t of block b handles the | |
// 4 consecutive elements starting at e = b*1024 + 4*t. | |
// param_0: f16 output; 4 values written with one v4.b16 store at element e. | |
// param_1: f32 input; 4 values read with one v4.f32 load at element e. | |
// param_2: two u64 words {low, high} read with one v2.u64 load; base value for the generator. | |
//          NOTE(review): presumably the state emitted by an rng_get_and_update_state kernel - confirm. | |
// param_3: not read by this kernel. | |
// Per element: u = f16(f32(word >> 9) * 2^-23); out = (u >= 0x2E66) ? f16(in) * 0x3C72 : 0, | |
// where 0x2E66 is ~0.1 and 0x3C72 is ~1.111 in f16. | |
// NOTE(review): the multipliers 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) and the xor | |
// constants 2654435769 (0x9E3779B9) / 3144134277 (0xBB67AE85) match the published Philox4x32 | |
// constants, and the keep/rescale step looks like dropout with rate 0.1 - confirm. | |
.visible .entry fusion_2267( | |
.param .u64 fusion_2267_param_0, | |
.param .u64 fusion_2267_param_1, | |
.param .u64 fusion_2267_param_2, | |
.param .u64 fusion_2267_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<6>; | |
.reg .b16 %h<19>; | |
.reg .f32 %f<13>; | |
.reg .b32 %r<29>; | |
.reg .b64 %rd<119>; | |
// Global pointers: %rd6 = param_0 (output), %rd5 = param_1 (input), %rd3 = param_2 (base value). | |
ld.param.u64 %rd1, [fusion_2267_param_0]; | |
ld.param.u64 %rd2, [fusion_2267_param_2]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2267_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// %r5 = e = ctaid.x*1024 + 4*tid.x. | |
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
// 128-bit add: {%rd14, %rd10} = {%rd8, %rd7} + (e >> 2), where e >> 2 is the global thread index; | |
// %p1 detects wrap-around of the low word and feeds the carry into the high word. | |
ld.global.nc.v2.u64 {%rd7, %rd8}, [%rd3]; | |
shr.u32 %r6, %r5, 2; | |
cvt.u64.u32 %rd9, %r6; | |
add.s64 %rd10, %rd7, %rd9; | |
setp.lt.u64 %p1, %rd10, %rd7; | |
and.b64 %rd11, %rd10, 4294967295; | |
mul.lo.s64 %rd12, %rd11, 3528531795; | |
selp.u64 %rd13, 1, 0, %p1; | |
add.s64 %rd14, %rd8, %rd13; | |
// Mixing rounds: each step multiplies a 32-bit lane (held in a 64-bit register) by one of the | |
// two multipliers, splits the 64-bit product with 'shr 32' (high half) / 'and 0xFFFFFFFF' | |
// (low half), and xors halves from different lanes with a per-round constant. | |
xor.b64 %rd15, %rd14, %rd12; | |
shr.u64 %rd16, %rd15, 32; | |
mul.lo.s64 %rd17, %rd16, 3449720151; | |
shr.u64 %rd18, %rd17, 32; | |
and.b64 %rd19, %rd14, 4294967295; | |
mul.lo.s64 %rd20, %rd19, 3449720151; | |
and.b64 %rd21, %rd20, 4294967295; | |
xor.b64 %rd22, %rd21, %rd18; | |
xor.b64 %rd23, %rd22, 2654435769; | |
mul.lo.s64 %rd24, %rd23, 3528531795; | |
shr.u64 %rd25, %rd24, 32; | |
xor.b64 %rd26, %rd20, %rd10; | |
shr.u64 %rd27, %rd26, 32; | |
mul.lo.s64 %rd28, %rd27, 3528531795; | |
and.b64 %rd29, %rd28, 4294967295; | |
xor.b64 %rd30, %rd29, %rd25; | |
xor.b64 %rd31, %rd30, 1993301258; | |
mul.lo.s64 %rd32, %rd31, 3449720151; | |
shr.u64 %rd33, %rd32, 32; | |
shr.u64 %rd34, %rd28, 32; | |
and.b64 %rd35, %rd12, 4294967295; | |
xor.b64 %rd36, %rd35, %rd34; | |
xor.b64 %rd37, %rd36, 3144134277; | |
mul.lo.s64 %rd38, %rd37, 3449720151; | |
and.b64 %rd39, %rd38, 4294967295; | |
xor.b64 %rd40, %rd39, %rd33; | |
xor.b64 %rd41, %rd40, 3668340011; | |
mul.lo.s64 %rd42, %rd41, 3528531795; | |
shr.u64 %rd43, %rd42, 32; | |
shr.u64 %rd44, %rd38, 32; | |
and.b64 %rd45, %rd17, 4294967295; | |
xor.b64 %rd46, %rd45, %rd44; | |
xor.b64 %rd47, %rd46, 1013904242; | |
mul.lo.s64 %rd48, %rd47, 3528531795; | |
and.b64 %rd49, %rd48, 4294967295; | |
xor.b64 %rd50, %rd49, %rd43; | |
xor.b64 %rd51, %rd50, 3986602516; | |
mul.lo.s64 %rd52, %rd51, 3449720151; | |
shr.u64 %rd53, %rd52, 32; | |
shr.u64 %rd54, %rd48, 32; | |
and.b64 %rd55, %rd24, 4294967295; | |
xor.b64 %rd56, %rd55, %rd54; | |
xor.b64 %rd57, %rd56, 842468239; | |
mul.lo.s64 %rd58, %rd57, 3449720151; | |
and.b64 %rd59, %rd58, 4294967295; | |
xor.b64 %rd60, %rd59, %rd53; | |
xor.b64 %rd61, %rd60, 387276957; | |
mul.lo.s64 %rd62, %rd61, 3528531795; | |
shr.u64 %rd63, %rd62, 32; | |
shr.u64 %rd64, %rd58, 32; | |
and.b64 %rd65, %rd32, 4294967295; | |
xor.b64 %rd66, %rd65, %rd64; | |
xor.b64 %rd67, %rd66, 2027808484; | |
mul.lo.s64 %rd68, %rd67, 3528531795; | |
and.b64 %rd69, %rd68, 4294967295; | |
xor.b64 %rd70, %rd69, %rd63; | |
xor.b64 %rd71, %rd70, 1684936478; | |
mul.lo.s64 %rd72, %rd71, 3449720151; | |
shr.u64 %rd73, %rd72, 32; | |
shr.u64 %rd74, %rd68, 32; | |
and.b64 %rd75, %rd42, 4294967295; | |
xor.b64 %rd76, %rd75, %rd74; | |
xor.b64 %rd77, %rd76, 2835769497; | |
mul.lo.s64 %rd78, %rd77, 3449720151; | |
and.b64 %rd79, %rd78, 4294967295; | |
xor.b64 %rd80, %rd79, %rd73; | |
xor.b64 %rd81, %rd80, 1401181199; | |
mul.lo.s64 %rd82, %rd81, 3528531795; | |
shr.u64 %rd83, %rd82, 32; | |
shr.u64 %rd84, %rd78, 32; | |
and.b64 %rd85, %rd52, 4294967295; | |
xor.b64 %rd86, %rd85, %rd84; | |
xor.b64 %rd87, %rd86, 3041712726; | |
mul.lo.s64 %rd88, %rd87, 3528531795; | |
and.b64 %rd89, %rd88, 4294967295; | |
xor.b64 %rd90, %rd89, %rd83; | |
xor.b64 %rd91, %rd90, 3678237736; | |
mul.lo.s64 %rd92, %rd91, 3449720151; | |
// Random word for element 0, finished in 32-bit arithmetic (the negative immediates are the | |
// same multipliers reinterpreted as signed 32-bit values). | |
shr.u64 %rd93, %rd92, 32; | |
cvt.u32.u64 %r7, %rd93; | |
shr.u64 %rd94, %rd88, 32; | |
xor.b64 %rd95, %rd94, %rd62; | |
cvt.u32.u64 %r8, %rd95; | |
xor.b32 %r9, %r8, 534103459; | |
mul.lo.s32 %r10, %r9, -845247145; | |
xor.b32 %r11, %r10, %r7; | |
// Keep the top 23 bits, scale by 2^-23 (0f34000000), round to f16, and compare with 0x2E66. | |
shr.u32 %r12, %r11, 9; | |
xor.b32 %r13, %r12, 4716963; | |
cvt.rn.f32.u32 %f1, %r13; | |
mul.rn.f32 %f2, %f1, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f2; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p2, %h1, %h2; | |
// Load the 4 f32 inputs; element 0 is converted to f16, scaled by 0x3C72, or replaced by 0. | |
mul.wide.u32 %rd96, %r5, 4; | |
add.s64 %rd97, %rd5, %rd96; | |
ld.global.nc.v4.f32 {%f3, %f4, %f5, %f6}, [%rd97]; | |
cvt.rn.f16.f32 %h3, %f3; | |
mov.b16 %h4, 0x3C72; | |
mul.rn.f16 %h5, %h3, %h4; | |
selp.b16 %h6, %h5, 0x0000, %p2; | |
// %rd99 = address of output element e (used by the final vector store). | |
mul.wide.u32 %rd98, %r5, 2; | |
add.s64 %rd99, %rd6, %rd98; | |
// Random word and keep/scale decision for element 1. | |
xor.b64 %rd100, %rd84, %rd52; | |
xor.b64 %rd101, %rd100, 3041712726; | |
mul.lo.s64 %rd102, %rd101, 3528531795; | |
xor.b64 %rd103, %rd83, %rd102; | |
cvt.u32.u64 %r14, %rd103; | |
xor.b32 %r15, %r14, -616729560; | |
mul.lo.s32 %r16, %r15, -845247145; | |
shr.u32 %r17, %r16, 9; | |
cvt.rn.f32.u32 %f7, %r17; | |
mul.rn.f32 %f8, %f7, 0f34000000; | |
cvt.rn.f16.f32 %h7, %f8; | |
setp.ge.f16 %p3, %h7, %h2; | |
cvt.rn.f16.f32 %h8, %f4; | |
mul.rn.f16 %h9, %h8, %h4; | |
selp.b16 %h10, %h9, 0x0000, %p3; | |
// Random word and keep/scale decision for element 2. | |
and.b64 %rd104, %rd62, 4294967295; | |
xor.b64 %rd105, %rd104, %rd94; | |
xor.b64 %rd106, %rd105, 534103459; | |
mul.lo.s64 %rd107, %rd106, 3449720151; | |
shr.u64 %rd108, %rd107, 32; | |
and.b64 %rd109, %rd72, 4294967295; | |
xor.b64 %rd110, %rd109, %rd108; | |
xor.b64 %rd111, %rd110, 4055616968; | |
mul.lo.s64 %rd112, %rd111, 3528531795; | |
shr.u64 %rd113, %rd112, 32; | |
cvt.u32.u64 %r18, %rd113; | |
xor.b64 %rd114, %rd73, %rd78; | |
cvt.u32.u64 %r19, %rd114; | |
xor.b32 %r20, %r19, 1401181199; | |
mul.lo.s32 %r21, %r20, -766435501; | |
xor.b32 %r22, %r21, %r18; | |
shr.u32 %r23, %r22, 9; | |
xor.b32 %r24, %r23, 4936337; | |
cvt.rn.f32.u32 %f9, %r24; | |
mul.rn.f32 %f10, %f9, 0f34000000; | |
cvt.rn.f16.f32 %h11, %f10; | |
setp.ge.f16 %p4, %h11, %h2; | |
cvt.rn.f16.f32 %h12, %f5; | |
mul.rn.f16 %h13, %h12, %h4; | |
selp.b16 %h14, %h13, 0x0000, %p4; | |
// Random word and keep/scale decision for element 3. | |
xor.b64 %rd115, %rd63, %rd68; | |
xor.b64 %rd116, %rd115, 1684936478; | |
mul.lo.s64 %rd117, %rd116, 3449720151; | |
xor.b64 %rd118, %rd108, %rd117; | |
cvt.u32.u64 %r25, %rd118; | |
xor.b32 %r26, %r25, -239350328; | |
mul.lo.s32 %r27, %r26, -766435501; | |
shr.u32 %r28, %r27, 9; | |
cvt.rn.f32.u32 %f11, %r28; | |
mul.rn.f32 %f12, %f11, 0f34000000; | |
cvt.rn.f16.f32 %h15, %f12; | |
setp.ge.f16 %p5, %h15, %h2; | |
cvt.rn.f16.f32 %h16, %f6; | |
mul.rn.f16 %h17, %h16, %h4; | |
selp.b16 %h18, %h17, 0x0000, %p5; | |
// Write the 4 f16 results with a single vector store. | |
st.global.v4.b16 [%rd99], {%h6, %h10, %h14, %h18}; | |
ret; | |
} | |
// .globl fusion_2709 | |
// fusion_2709: converts f32 values to f16 (round-to-nearest-even), 4 elements per thread. | |
// Launch shape: 256 threads per block (.reqntid 256,1,1). Thread t of block b handles the | |
// 4 consecutive elements starting at e = b*1024 + 4*t. | |
// param_0: f32 source; 4 values read with one v4.f32 load at element e. | |
// param_1: f16 destination; 4 values written with one v4.b16 store at element e. | |
// param_2: not read by this kernel. | |
.visible .entry fusion_2709( | |
.param .u64 fusion_2709_param_0, | |
.param .u64 fusion_2709_param_1, | |
.param .u64 fusion_2709_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %half<4>; | |
.reg .f32 %wide<4>; | |
.reg .b32 %cta, %thr, %cta_base, %thr_base, %elem; | |
.reg .b64 %src_generic, %dst_generic, %src, %dst, %src_off, %dst_off, %src_addr, %dst_addr; | |
// Resolve both buffers into the global address space. | |
ld.param.u64 %src_generic, [fusion_2709_param_0]; | |
cvta.to.global.u64 %src, %src_generic; | |
ld.param.u64 %dst_generic, [fusion_2709_param_1]; | |
cvta.to.global.u64 %dst, %dst_generic; | |
// elem = ctaid.x*1024 + tid.x*4: index of this thread's first element. | |
mov.u32 %cta, %ctaid.x; | |
shl.b32 %cta_base, %cta, 10; | |
mov.u32 %thr, %tid.x; | |
shl.b32 %thr_base, %thr, 2; | |
or.b32 %elem, %thr_base, %cta_base; | |
// Source is 4 bytes per element, destination 2 bytes per element. | |
mul.wide.u32 %src_off, %elem, 4; | |
add.s64 %src_addr, %src, %src_off; | |
mul.wide.u32 %dst_off, %elem, 2; | |
add.s64 %dst_addr, %dst, %dst_off; | |
// Load 4 f32 values, narrow each one, and store the 4 f16 results together. | |
ld.global.nc.v4.f32 {%wide0, %wide1, %wide2, %wide3}, [%src_addr]; | |
cvt.rn.f16.f32 %half0, %wide0; | |
cvt.rn.f16.f32 %half1, %wide1; | |
cvt.rn.f16.f32 %half2, %wide2; | |
cvt.rn.f16.f32 %half3, %wide3; | |
st.global.v4.b16 [%dst_addr], {%half0, %half1, %half2, %half3}; | |
ret; | |
} | |
// .globl fusion_2265 | |
// fusion_2265: each thread produces four consecutive f16 outputs.
//   i      = (ctaid.x << 10) | (tid.x << 2)        first output element of this thread
//   c_k    = ((tid.x << 2) & 60) + k, k = 0..3     column inside a group of 64
//   a_k    = 16-bit value read from param_0 at byte offset
//            ((i >> 6) & 511)*2048 + (((ctaid.x >> 5) << 6) | c_k)*2
//   b_k    = f32 read from param_2 at byte offset (ctaid.x >> 5)*256 + c_k*4,
//            rounded to f16 (cvt.rn)
//   out_k  = a_k + b_k (f16 add), stored to param_1 as one 4 x b16 vector at byte offset i*2
// param_3 is declared but never read in this kernel.
.visible .entry fusion_2265( | |
.param .u64 fusion_2265_param_0, | |
.param .u64 fusion_2265_param_1, | |
.param .u64 fusion_2265_param_2, | |
.param .u64 fusion_2265_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<13>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<20>; | |
.reg .b64 %rd<29>; | |
// %rd6 = param_0 (f16 source), %rd5 = param_1 (destination), %rd3 = param_2 (f32 source).
ld.param.u64 %rd1, [fusion_2265_param_0]; | |
ld.param.u64 %rd2, [fusion_2265_param_2]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2265_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// %r5 = i; %r6/%r9/%r11/%r13 = c_0..c_3; %r7 = ctaid.x >> 5.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
and.b32 %r6, %r4, 60; | |
shr.u32 %r7, %r1, 5; | |
or.b32 %r8, %r4, 1; | |
and.b32 %r9, %r8, 61; | |
or.b32 %r10, %r4, 2; | |
and.b32 %r11, %r10, 62; | |
or.b32 %r12, %r4, 3; | |
and.b32 %r13, %r12, 63; | |
// %r14 = bits [6,15) of i; it selects a 2048-byte row of param_0 (%rd8).
bfe.u32 %r14, %r5, 6, 9; | |
shl.b32 %r15, %r7, 6; | |
or.b32 %r16, %r6, %r15; | |
mul.wide.u32 %rd7, %r14, 2048; | |
add.s64 %rd8, %rd6, %rd7; | |
mul.wide.u32 %rd9, %r16, 2; | |
add.s64 %rd10, %rd8, %rd9; | |
// Element 0: a_0 + f16(b_0).
ld.global.nc.b16 %h1, [%rd10]; | |
// %rd12 = param_2 + (ctaid.x >> 5)*256: base of the 64 f32 values used by this block.
mul.wide.u32 %rd11, %r7, 256; | |
add.s64 %rd12, %rd3, %rd11; | |
mul.wide.u32 %rd13, %r6, 4; | |
add.s64 %rd14, %rd12, %rd13; | |
ld.global.nc.f32 %f1, [%rd14]; | |
cvt.rn.f16.f32 %h2, %f1; | |
add.rn.f16 %h3, %h1, %h2; | |
// %rd16 = destination address param_1 + i*2.
mul.wide.u32 %rd15, %r5, 2; | |
add.s64 %rd16, %rd5, %rd15; | |
// Element 1.
or.b32 %r17, %r9, %r15; | |
mul.wide.u32 %rd17, %r17, 2; | |
add.s64 %rd18, %rd8, %rd17; | |
ld.global.nc.b16 %h4, [%rd18]; | |
mul.wide.u32 %rd19, %r9, 4; | |
add.s64 %rd20, %rd12, %rd19; | |
ld.global.nc.f32 %f2, [%rd20]; | |
cvt.rn.f16.f32 %h5, %f2; | |
add.rn.f16 %h6, %h4, %h5; | |
// Element 2.
or.b32 %r18, %r11, %r15; | |
mul.wide.u32 %rd21, %r18, 2; | |
add.s64 %rd22, %rd8, %rd21; | |
ld.global.nc.b16 %h7, [%rd22]; | |
mul.wide.u32 %rd23, %r11, 4; | |
add.s64 %rd24, %rd12, %rd23; | |
ld.global.nc.f32 %f3, [%rd24]; | |
cvt.rn.f16.f32 %h8, %f3; | |
add.rn.f16 %h9, %h7, %h8; | |
// Element 3.
or.b32 %r19, %r13, %r15; | |
mul.wide.u32 %rd25, %r19, 2; | |
add.s64 %rd26, %rd8, %rd25; | |
ld.global.nc.b16 %h10, [%rd26]; | |
mul.wide.u32 %rd27, %r13, 4; | |
add.s64 %rd28, %rd12, %rd27; | |
ld.global.nc.f32 %f4, [%rd28]; | |
cvt.rn.f16.f32 %h11, %f4; | |
add.rn.f16 %h12, %h10, %h11; | |
// Store the four sums as one vector.
st.global.v4.b16 [%rd16], {%h3, %h6, %h9, %h12}; | |
ret; | |
} | |
// .globl fusion_2264 | |
// fusion_2264: each thread gathers four 16-bit values from param_1 and stores
// them contiguously to param_0.
//   elem = (ctaid.x << 10) | (tid.x << 2)             destination byte offset is elem*2
//   row  = param_1 + (tid.x >> 4)*65536 + ctaid.x*128 (byte address)
//   col  = (tid.x << 2) & 60
//   value k (k = 0..3) is the 16-bit word at row + (col + k)*2
// param_2 is declared but never read in this kernel.
.visible .entry fusion_2264( | |
.param .u64 fusion_2264_param_0, | |
.param .u64 fusion_2264_param_1, | |
.param .u64 fusion_2264_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %val<4>; | |
.reg .b32 %blk, %thr, %thr4, %blk_base, %elem, %slab, %col0, %thr4p1, %col1, %thr4p2, %col2; | |
.reg .b64 %dst_arg, %src_arg, %dst, %src, %slab_off, %slab_ptr, %blk_off, %row_ptr; | |
.reg .b64 %col0_off, %addr0, %col1_off, %addr1, %col2_off, %addr2, %dst_off, %dst_ptr; | |
mov.u32 %blk, %ctaid.x; | |
mov.u32 %thr, %tid.x; | |
shl.b32 %thr4, %thr, 2; | |
// Row base inside param_1.
ld.param.u64 %src_arg, [fusion_2264_param_1]; | |
cvta.to.global.u64 %src, %src_arg; | |
shr.u32 %slab, %thr, 4; | |
mul.wide.u32 %slab_off, %slab, 65536; | |
add.s64 %slab_ptr, %src, %slab_off; | |
mul.wide.u32 %blk_off, %blk, 128; | |
add.s64 %row_ptr, %slab_ptr, %blk_off; | |
// Value 0 at column col.
and.b32 %col0, %thr4, 60; | |
mul.wide.u32 %col0_off, %col0, 2; | |
add.s64 %addr0, %row_ptr, %col0_off; | |
ld.global.nc.b16 %val0, [%addr0]; | |
// Value 1 at column col + 1.
or.b32 %thr4p1, %thr4, 1; | |
and.b32 %col1, %thr4p1, 61; | |
mul.wide.u32 %col1_off, %col1, 2; | |
add.s64 %addr1, %row_ptr, %col1_off; | |
ld.global.nc.b16 %val1, [%addr1]; | |
// Value 2 at column col + 2.
or.b32 %thr4p2, %thr4, 2; | |
and.b32 %col2, %thr4p2, 62; | |
mul.wide.u32 %col2_off, %col2, 2; | |
add.s64 %addr2, %row_ptr, %col2_off; | |
ld.global.nc.b16 %val2, [%addr2]; | |
// Value 3 sits 6 bytes (three 16-bit words) after value 0.
ld.global.nc.b16 %val3, [%addr0+6]; | |
// Destination address: param_0 + elem * 2 bytes.
ld.param.u64 %dst_arg, [fusion_2264_param_0]; | |
cvta.to.global.u64 %dst, %dst_arg; | |
shl.b32 %blk_base, %blk, 10; | |
or.b32 %elem, %thr4, %blk_base; | |
mul.wide.u32 %dst_off, %elem, 2; | |
add.s64 %dst_ptr, %dst, %dst_off; | |
st.global.v4.b16 [%dst_ptr], {%val0, %val1, %val2, %val3}; | |
ret; | |
} | |
// .globl fusion_2708 | |
// fusion_2708: each thread narrows four consecutive f32 values to f16.
//   elem = (ctaid.x << 10) | (tid.x << 2)
//   reads  4 x f32 from param_0 at byte offset elem*4 (one non-coherent vector load)
//   writes 4 x f16 to   param_1 at byte offset elem*2 (one vector store)
// Conversions use cvt.rn (round to nearest even).
// param_2 is declared but never read in this kernel.
.visible .entry fusion_2708( | |
.param .u64 fusion_2708_param_0, | |
.param .u64 fusion_2708_param_1, | |
.param .u64 fusion_2708_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .f32 %vin<4>; | |
.reg .b16 %vout<4>; | |
.reg .b32 %blk, %thr, %blk_base, %thr_base, %elem; | |
.reg .b64 %src_arg, %dst_arg, %src, %dst, %src_off, %dst_off, %src_ptr, %dst_ptr; | |
// Index of the first element handled by this thread.
mov.u32 %thr, %tid.x; | |
mov.u32 %blk, %ctaid.x; | |
shl.b32 %thr_base, %thr, 2; | |
shl.b32 %blk_base, %blk, 10; | |
or.b32 %elem, %thr_base, %blk_base; | |
// Source address: param_0 + elem * 4 bytes.
ld.param.u64 %src_arg, [fusion_2708_param_0]; | |
cvta.to.global.u64 %src, %src_arg; | |
mul.wide.u32 %src_off, %elem, 4; | |
add.s64 %src_ptr, %src, %src_off; | |
// Destination address: param_1 + elem * 2 bytes.
ld.param.u64 %dst_arg, [fusion_2708_param_1]; | |
cvta.to.global.u64 %dst, %dst_arg; | |
mul.wide.u32 %dst_off, %elem, 2; | |
add.s64 %dst_ptr, %dst, %dst_off; | |
// Load, narrow, store.
ld.global.nc.v4.f32 {%vin0, %vin1, %vin2, %vin3}, [%src_ptr]; | |
cvt.rn.f16.f32 %vout0, %vin0; | |
cvt.rn.f16.f32 %vout1, %vin1; | |
cvt.rn.f16.f32 %vout2, %vin2; | |
cvt.rn.f16.f32 %vout3, %vin3; | |
st.global.v4.b16 [%dst_ptr], {%vout0, %vout1, %vout2, %vout3}; | |
ret; | |
} | |
// .globl rng_get_and_update_state_1 | |
// rng_get_and_update_state_1: copies the current 16-byte rng_state to the
// buffer at param_0, then advances rng_state by 524288.
// rng_state is treated as a 128-bit counter: the u64 at offset 0 is the low
// word and the u64 at offset 8 receives the carry out of the low-word add.
// The update is a plain load/add/store sequence (no atomics).
// param_1 is declared but never read in this kernel.
.visible .entry rng_get_and_update_state_1( | |
.param .u64 rng_get_and_update_state_1_param_0, | |
.param .u64 rng_get_and_update_state_1_param_1 | |
) | |
{ | |
.reg .pred %wrapped, %below_step; | |
.reg .b64 %out_arg, %out, %old_hi, %old_lo, %new_lo, %new_hi, %carry_wrap, %carry; | |
// Snapshot the current state.
ld.global.u64 %old_hi, [rng_state+8]; | |
ld.global.u64 %old_lo, [rng_state]; | |
// Advance the low word; carry is 1 if either overflow test fires.
add.s64 %new_lo, %old_lo, 524288; | |
setp.gt.u64 %wrapped, %old_lo, %new_lo; | |
setp.lt.u64 %below_step, %new_lo, 524288; | |
selp.u64 %carry_wrap, 1, 0, %wrapped; | |
selp.b64 %carry, 1, %carry_wrap, %below_step; | |
add.s64 %new_hi, %old_hi, %carry; | |
// Publish the advanced state.
st.global.u64 [rng_state], %new_lo; | |
st.global.u64 [rng_state+8], %new_hi; | |
// Hand the pre-update state to the caller's buffer.
ld.param.u64 %out_arg, [rng_get_and_update_state_1_param_0]; | |
cvta.to.global.u64 %out, %out_arg; | |
st.global.u64 [%out+8], %old_hi; | |
st.global.u64 [%out], %old_lo; | |
ret; | |
} | |
// .globl fusion_2263 | |
// fusion_2263: each thread produces four consecutive f16 outputs at element
// index i = (ctaid.x << 10) | (tid.x << 2).
//   param_0 : destination; 4 x f16 stored at byte offset i*2
//   param_1 : f16 source; 4 values loaded at byte offset i*2 (%h5..%h8)
//   param_2 : f16 source; 4 values loaded at byte offset i*2 (%h15..%h18)
//   param_3 : two u64 words loaded as one vector (%rd13 low, %rd14 high)
//   param_4 : f32 source read at index tid.x*4 + k, k = 0..3
//   param_5 : declared but never read in this kernel
// For each k a 23-bit integer is derived from the mixing chain below, scaled
// by 0f34000000 (2^-23) in f32, rounded to f16 and compared with %h10:
//   out[k] = param_1[k] + (u_k >= %h10 ? (param_2[k] + f16(param_4[..])) * %h21 : 0)
// %h10 = 0x2E66 (about 0.1 as f16) and %h21 = 0x3C72 (about 1.111 as f16).
// NOTE(review): this looks like dropout with drop probability 0.1 and 1/0.9
// rescaling fused with a bias add and a residual add - confirm against the
// producing HLO.
// NOTE(review): the multipliers 3528531795 (0xD2511F53) and 3449720151
// (0xCD9E8D57) and the xor constants 2654435769 (0x9E3779B9) and 3144134277
// (0xBB67AE85) match the published Philox4x32 constants; presumably the chain
// is an unrolled Philox evaluation - confirm against the generator.
.visible .entry fusion_2263( | |
.param .u64 fusion_2263_param_0, | |
.param .u64 fusion_2263_param_1, | |
.param .u64 fusion_2263_param_2, | |
.param .u64 fusion_2263_param_3, | |
.param .u64 fusion_2263_param_4, | |
.param .u64 fusion_2263_param_5 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<6>; | |
.reg .b16 %h<43>; | |
.reg .b32 %hh<5>; | |
.reg .f32 %f<13>; | |
.reg .b32 %r<31>; | |
.reg .b64 %rd<129>; | |
// %rd10 = param_0, %rd9 = param_1, %rd8 = param_2, %rd6 = param_3, %rd3 = param_4.
ld.param.u64 %rd1, [fusion_2263_param_0]; | |
ld.param.u64 %rd2, [fusion_2263_param_4]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2263_param_1]; | |
ld.param.u64 %rd5, [fusion_2263_param_3]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2263_param_2]; | |
cvta.to.global.u64 %rd8, %rd7; | |
cvta.to.global.u64 %rd9, %rd4; | |
cvta.to.global.u64 %rd10, %rd1; | |
// %r5 = i; %r4 = tid.x*4; %r6 = tid.x*4 + 1; %r7 = tid.x*4 + 2.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
// %rd11 = i*2 is reused as the byte offset into param_1, param_2 and param_0.
mul.wide.u32 %rd11, %r5, 2; | |
add.s64 %rd12, %rd9, %rd11; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12]; | |
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
// 128-bit add: (%rd14:%rd13) + (i >> 2), where i >> 2 = ctaid.x*256 + tid.x.
// %p1 is the carry out of the low word.
shr.u32 %r8, %r5, 2; | |
ld.global.nc.v2.u64 {%rd13, %rd14}, [%rd6]; | |
cvt.u64.u32 %rd15, %r8; | |
add.s64 %rd16, %rd13, %rd15; | |
setp.lt.u64 %p1, %rd16, %rd13; | |
// Mixing chain: 32-bit halves are multiplied by the two constants as 64-bit
// products; "shr 32" takes the high half and "and 4294967295" the low half.
and.b64 %rd17, %rd16, 4294967295; | |
mul.lo.s64 %rd18, %rd17, 3528531795; | |
selp.u64 %rd19, 1, 0, %p1; | |
add.s64 %rd20, %rd14, %rd19; | |
xor.b64 %rd21, %rd20, %rd18; | |
shr.u64 %rd22, %rd21, 32; | |
mul.lo.s64 %rd23, %rd22, 3449720151; | |
shr.u64 %rd24, %rd23, 32; | |
and.b64 %rd25, %rd20, 4294967295; | |
mul.lo.s64 %rd26, %rd25, 3449720151; | |
and.b64 %rd27, %rd26, 4294967295; | |
xor.b64 %rd28, %rd27, %rd24; | |
xor.b64 %rd29, %rd28, 2654435769; | |
mul.lo.s64 %rd30, %rd29, 3528531795; | |
shr.u64 %rd31, %rd30, 32; | |
xor.b64 %rd32, %rd26, %rd16; | |
shr.u64 %rd33, %rd32, 32; | |
mul.lo.s64 %rd34, %rd33, 3528531795; | |
and.b64 %rd35, %rd34, 4294967295; | |
xor.b64 %rd36, %rd35, %rd31; | |
xor.b64 %rd37, %rd36, 1993301258; | |
mul.lo.s64 %rd38, %rd37, 3449720151; | |
shr.u64 %rd39, %rd38, 32; | |
shr.u64 %rd40, %rd34, 32; | |
and.b64 %rd41, %rd18, 4294967295; | |
xor.b64 %rd42, %rd41, %rd40; | |
xor.b64 %rd43, %rd42, 3144134277; | |
mul.lo.s64 %rd44, %rd43, 3449720151; | |
and.b64 %rd45, %rd44, 4294967295; | |
xor.b64 %rd46, %rd45, %rd39; | |
xor.b64 %rd47, %rd46, 3668340011; | |
mul.lo.s64 %rd48, %rd47, 3528531795; | |
shr.u64 %rd49, %rd48, 32; | |
shr.u64 %rd50, %rd44, 32; | |
and.b64 %rd51, %rd23, 4294967295; | |
xor.b64 %rd52, %rd51, %rd50; | |
xor.b64 %rd53, %rd52, 1013904242; | |
mul.lo.s64 %rd54, %rd53, 3528531795; | |
and.b64 %rd55, %rd54, 4294967295; | |
xor.b64 %rd56, %rd55, %rd49; | |
xor.b64 %rd57, %rd56, 3986602516; | |
mul.lo.s64 %rd58, %rd57, 3449720151; | |
shr.u64 %rd59, %rd58, 32; | |
shr.u64 %rd60, %rd54, 32; | |
and.b64 %rd61, %rd30, 4294967295; | |
xor.b64 %rd62, %rd61, %rd60; | |
xor.b64 %rd63, %rd62, 842468239; | |
mul.lo.s64 %rd64, %rd63, 3449720151; | |
and.b64 %rd65, %rd64, 4294967295; | |
xor.b64 %rd66, %rd65, %rd59; | |
xor.b64 %rd67, %rd66, 387276957; | |
mul.lo.s64 %rd68, %rd67, 3528531795; | |
shr.u64 %rd69, %rd68, 32; | |
shr.u64 %rd70, %rd64, 32; | |
and.b64 %rd71, %rd38, 4294967295; | |
xor.b64 %rd72, %rd71, %rd70; | |
xor.b64 %rd73, %rd72, 2027808484; | |
mul.lo.s64 %rd74, %rd73, 3528531795; | |
and.b64 %rd75, %rd74, 4294967295; | |
shr.u64 %rd76, %rd74, 32; | |
and.b64 %rd77, %rd48, 4294967295; | |
xor.b64 %rd78, %rd77, %rd76; | |
xor.b64 %rd79, %rd78, 2835769497; | |
mul.lo.s64 %rd80, %rd79, 3449720151; | |
and.b64 %rd81, %rd80, 4294967295; | |
shr.u64 %rd82, %rd80, 32; | |
and.b64 %rd83, %rd58, 4294967295; | |
xor.b64 %rd84, %rd83, %rd82; | |
xor.b64 %rd85, %rd84, 3041712726; | |
mul.lo.s64 %rd86, %rd85, 3528531795; | |
and.b64 %rd87, %rd86, 4294967295; | |
xor.b64 %rd88, %rd75, %rd69; | |
xor.b64 %rd89, %rd88, 1684936478; | |
mul.lo.s64 %rd90, %rd89, 3449720151; | |
shr.u64 %rd91, %rd90, 32; | |
xor.b64 %rd92, %rd81, %rd91; | |
xor.b64 %rd93, %rd92, 1401181199; | |
mul.lo.s64 %rd94, %rd93, 3528531795; | |
shr.u64 %rd95, %rd94, 32; | |
xor.b64 %rd96, %rd87, %rd95; | |
xor.b64 %rd97, %rd96, 3678237736; | |
mul.lo.s64 %rd98, %rd97, 3449720151; | |
shr.u64 %rd99, %rd98, 32; | |
// Element 0: 32-bit mix value -> 23-bit integer -> f32 * 2^-23 -> f16 (%h9).
cvt.u32.u64 %r9, %rd99; | |
shr.u64 %rd100, %rd86, 32; | |
xor.b64 %rd101, %rd100, %rd68; | |
cvt.u32.u64 %r10, %rd101; | |
xor.b32 %r11, %r10, 534103459; | |
mul.lo.s32 %r12, %r11, -845247145; | |
xor.b32 %r13, %r12, %r9; | |
shr.u32 %r14, %r13, 9; | |
xor.b32 %r15, %r14, 4716963; | |
cvt.rn.f32.u32 %f1, %r15; | |
mul.rn.f32 %f2, %f1, 0f34000000; | |
cvt.rn.f16.f32 %h9, %f2; | |
// %h10 is the f16 threshold shared by all four comparisons.
mov.b16 %h10, 0x2E66; | |
setp.ge.f16 %p2, %h9, %h10; | |
// Second f16 operand vector, from param_2 at the same byte offset i*2.
add.s64 %rd102, %rd8, %rd11; | |
ld.global.nc.v4.b16 {%h11, %h12, %h13, %h14}, [%rd102]; | |
mov.b32 %hh3, {%h13, %h14}; | |
mov.b32 %hh4, {%h11, %h12}; | |
mov.b32 {%h15, %h16}, %hh4; | |
mov.b32 {%h17, %h18}, %hh3; | |
// %rd104 = param_4 + tid.x*16: address of this thread's first f32 value.
mul.wide.u32 %rd103, %r4, 4; | |
add.s64 %rd104, %rd3, %rd103; | |
ld.global.nc.f32 %f3, [%rd104]; | |
cvt.rn.f16.f32 %h19, %f3; | |
add.rn.f16 %h20, %h15, %h19; | |
// %h21 is the f16 scale shared by all four elements.
mov.b16 %h21, 0x3C72; | |
mul.rn.f16 %h22, %h20, %h21; | |
// Select the scaled value when %p2 holds, otherwise zero; then add param_1[0].
selp.b16 %h23, %h22, 0x0000, %p2; | |
add.rn.f16 %h24, %h5, %h23; | |
// %rd105 = destination address param_0 + i*2.
add.s64 %rd105, %rd10, %rd11; | |
// Element 1.
xor.b64 %rd106, %rd58, %rd82; | |
xor.b64 %rd107, %rd106, 3041712726; | |
mul.lo.s64 %rd108, %rd107, 3528531795; | |
xor.b64 %rd109, %rd95, %rd108; | |
cvt.u32.u64 %r16, %rd109; | |
xor.b32 %r17, %r16, -616729560; | |
mul.lo.s32 %r18, %r17, -845247145; | |
shr.u32 %r19, %r18, 9; | |
cvt.rn.f32.u32 %f4, %r19; | |
mul.rn.f32 %f5, %f4, 0f34000000; | |
cvt.rn.f16.f32 %h25, %f5; | |
setp.ge.f16 %p3, %h25, %h10; | |
mul.wide.u32 %rd110, %r6, 4; | |
add.s64 %rd111, %rd3, %rd110; | |
ld.global.nc.f32 %f6, [%rd111]; | |
cvt.rn.f16.f32 %h26, %f6; | |
add.rn.f16 %h27, %h16, %h26; | |
mul.rn.f16 %h28, %h27, %h21; | |
selp.b16 %h29, %h28, 0x0000, %p3; | |
add.rn.f16 %h30, %h6, %h29; | |
// Element 2.
and.b64 %rd112, %rd90, 4294967295; | |
and.b64 %rd113, %rd68, 4294967295; | |
xor.b64 %rd114, %rd113, %rd100; | |
xor.b64 %rd115, %rd114, 534103459; | |
mul.lo.s64 %rd116, %rd115, 3449720151; | |
shr.u64 %rd117, %rd116, 32; | |
xor.b64 %rd118, %rd112, %rd117; | |
xor.b64 %rd119, %rd118, 4055616968; | |
mul.lo.s64 %rd120, %rd119, 3528531795; | |
shr.u64 %rd121, %rd120, 32; | |
cvt.u32.u64 %r20, %rd121; | |
xor.b64 %rd122, %rd91, %rd80; | |
cvt.u32.u64 %r21, %rd122; | |
xor.b32 %r22, %r21, 1401181199; | |
mul.lo.s32 %r23, %r22, -766435501; | |
xor.b32 %r24, %r23, %r20; | |
shr.u32 %r25, %r24, 9; | |
xor.b32 %r26, %r25, 4936337; | |
cvt.rn.f32.u32 %f7, %r26; | |
mul.rn.f32 %f8, %f7, 0f34000000; | |
cvt.rn.f16.f32 %h31, %f8; | |
setp.ge.f16 %p4, %h31, %h10; | |
mul.wide.u32 %rd123, %r7, 4; | |
add.s64 %rd124, %rd3, %rd123; | |
ld.global.nc.f32 %f9, [%rd124]; | |
cvt.rn.f16.f32 %h32, %f9; | |
add.rn.f16 %h33, %h17, %h32; | |
mul.rn.f16 %h34, %h33, %h21; | |
selp.b16 %h35, %h34, 0x0000, %p4; | |
add.rn.f16 %h36, %h7, %h35; | |
// Element 3.
xor.b64 %rd125, %rd69, %rd74; | |
xor.b64 %rd126, %rd125, 1684936478; | |
mul.lo.s64 %rd127, %rd126, 3449720151; | |
xor.b64 %rd128, %rd117, %rd127; | |
cvt.u32.u64 %r27, %rd128; | |
xor.b32 %r28, %r27, -239350328; | |
mul.lo.s32 %r29, %r28, -766435501; | |
shr.u32 %r30, %r29, 9; | |
cvt.rn.f32.u32 %f10, %r30; | |
mul.rn.f32 %f11, %f10, 0f34000000; | |
cvt.rn.f16.f32 %h37, %f11; | |
setp.ge.f16 %p5, %h37, %h10; | |
// The fourth f32 value sits 12 bytes after the first one.
ld.global.nc.f32 %f12, [%rd104+12]; | |
cvt.rn.f16.f32 %h38, %f12; | |
add.rn.f16 %h39, %h18, %h38; | |
mul.rn.f16 %h40, %h39, %h21; | |
selp.b16 %h41, %h40, 0x0000, %p5; | |
add.rn.f16 %h42, %h8, %h41; | |
// Store the four results as one vector.
st.global.v4.b16 [%rd105], {%h24, %h30, %h36, %h42}; | |
ret; | |
} | |
// .globl fusion_2262 | |
// fusion_2262: per-block sum of f16 values, accumulated in f32 and atomically
// added into param_1[ctaid.x].
//   - The block reads 2048 bytes of param_0 starting at byte offset ctaid.x*2048.
//     Each of the 64 threads (.reqntid 64) loads eight 32-bit words (two f16
//     each) at byte offsets tid.x*4 + 256*j, j = 0..7, and sums them in f32.
//   - Stage 1: shfl.sync.down reduction inside each warp (offsets 16, 8, 4, 2, 1);
//     lane 0 of each warp writes its partial sum to shared_cache_04[tid.x >> 5].
//   - Stage 2 (after bar.sync): warp 0 reduces the per-warp partials and thread 0
//     issues atom.global.add.f32 on param_1 + ctaid.x*4.
// param_2 is declared but never read in this kernel.
.visible .entry fusion_2262( | |
.param .u64 fusion_2262_param_0, | |
.param .u64 fusion_2262_param_1, | |
.param .u64 fusion_2262_param_2 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
// 4-byte local slot; stage 2 fills it with zero for lanes that have no partial sum.
.local .align 4 .b8 __local_depot22[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<5>; | |
.reg .b16 %h<17>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<56>; | |
.reg .b32 %r<7>; | |
.reg .b64 %rd<22>; | |
// %SPL = local-space address of the slot, %SP = its generic address.
mov.u64 %SPL, __local_depot22; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd5, [fusion_2262_param_0]; | |
cvta.to.global.u64 %rd8, %rd5; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r4, %ctaid.x; | |
// %rd13 = param_0 + ctaid.x*2048 + tid.x*4.
shl.b32 %r5, %r1, 1; | |
mul.wide.u32 %rd10, %r4, 2048; | |
add.s64 %rd11, %rd8, %rd10; | |
mul.wide.u32 %rd12, %r5, 2; | |
add.s64 %rd13, %rd11, %rd12; | |
// Serial accumulation: unpack each 32-bit word into two f16, widen to f32 and
// add in load order, starting from 0.0.
ld.global.nc.b32 %hh1, [%rd13]; | |
mov.b32 {%h1, %h2}, %hh1; | |
cvt.f32.f16 %f2, %h1; | |
add.rn.f32 %f3, %f2, 0f00000000; | |
cvt.f32.f16 %f4, %h2; | |
add.rn.f32 %f5, %f3, %f4; | |
ld.global.nc.b32 %hh2, [%rd13+256]; | |
mov.b32 {%h3, %h4}, %hh2; | |
cvt.f32.f16 %f6, %h3; | |
add.rn.f32 %f7, %f5, %f6; | |
cvt.f32.f16 %f8, %h4; | |
add.rn.f32 %f9, %f7, %f8; | |
ld.global.nc.b32 %hh3, [%rd13+512]; | |
mov.b32 {%h5, %h6}, %hh3; | |
cvt.f32.f16 %f10, %h5; | |
add.rn.f32 %f11, %f9, %f10; | |
cvt.f32.f16 %f12, %h6; | |
add.rn.f32 %f13, %f11, %f12; | |
ld.global.nc.b32 %hh4, [%rd13+768]; | |
mov.b32 {%h7, %h8}, %hh4; | |
cvt.f32.f16 %f14, %h7; | |
add.rn.f32 %f15, %f13, %f14; | |
cvt.f32.f16 %f16, %h8; | |
add.rn.f32 %f17, %f15, %f16; | |
ld.global.nc.b32 %hh5, [%rd13+1024]; | |
mov.b32 {%h9, %h10}, %hh5; | |
cvt.f32.f16 %f18, %h9; | |
add.rn.f32 %f19, %f17, %f18; | |
cvt.f32.f16 %f20, %h10; | |
add.rn.f32 %f21, %f19, %f20; | |
ld.global.nc.b32 %hh6, [%rd13+1280]; | |
mov.b32 {%h11, %h12}, %hh6; | |
cvt.f32.f16 %f22, %h11; | |
add.rn.f32 %f23, %f21, %f22; | |
cvt.f32.f16 %f24, %h12; | |
add.rn.f32 %f25, %f23, %f24; | |
ld.global.nc.b32 %hh7, [%rd13+1536]; | |
mov.b32 {%h13, %h14}, %hh7; | |
cvt.f32.f16 %f26, %h13; | |
add.rn.f32 %f27, %f25, %f26; | |
cvt.f32.f16 %f28, %h14; | |
add.rn.f32 %f29, %f27, %f28; | |
ld.global.nc.b32 %hh8, [%rd13+1792]; | |
mov.b32 {%h15, %h16}, %hh8; | |
cvt.f32.f16 %f30, %h15; | |
add.rn.f32 %f31, %f29, %f30; | |
cvt.f32.f16 %f32, %h16; | |
add.rn.f32 %f33, %f31, %f32; | |
// Stage 1: warp-level reduction. %r2 = lane (tid.x & 31), %r3 = warp (tid.x >> 5).
// The last add of the chain (%f42 + %f41) is deferred to the lane-0 branch below.
and.b32 %r2, %r1, 31; | |
shfl.sync.down.b32 %f34, %f33, 16, 31, -1; | |
add.rn.f32 %f35, %f34, %f33; | |
shfl.sync.down.b32 %f36, %f35, 8, 31, -1; | |
add.rn.f32 %f37, %f36, %f35; | |
shfl.sync.down.b32 %f38, %f37, 4, 31, -1; | |
add.rn.f32 %f39, %f38, %f37; | |
shfl.sync.down.b32 %f40, %f39, 2, 31, -1; | |
add.rn.f32 %f41, %f40, %f39; | |
shfl.sync.down.b32 %f42, %f41, 1, 31, -1; | |
shr.u32 %r3, %r1, 5; | |
setp.eq.s32 %p1, %r2, 0; | |
mov.u64 %rd16, shared_cache_04; | |
@%p1 bra LBB22_3; | |
bra.uni LBB22_1; | |
LBB22_3: | |
// Lane 0 only: publish this warp's sum to shared_cache_04[warp].
mul.wide.u32 %rd15, %r3, 4; | |
add.s64 %rd3, %rd16, %rd15; | |
add.rn.f32 %f1, %f42, %f41; | |
st.shared.f32 [%rd3], %f1; | |
LBB22_1: | |
// All threads wait until the per-warp partial sums are in shared memory.
bar.sync 0; | |
// Only warp 0 continues into stage 2; every other thread returns.
setp.eq.s32 %p2, %r3, 0; | |
@%p2 bra LBB22_4; | |
bra.uni LBB22_2; | |
LBB22_4: | |
// %rd9 = generic address of the local slot, %rd4 = shared_cache_04 + lane*4.
add.u64 %rd9, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
mul.wide.u32 %rd17, %r2, 4; | |
add.s64 %rd4, %rd16, %rd17; | |
cvta.shared.u64 %rd19, %rd4; | |
// Zero the local slot, then pick the load address: threads 0 and 1 read the
// shared entry for their lane, every other lane reads the zeroed local slot.
mov.u32 %r6, 0; | |
st.local.u32 [%rd1], %r6; | |
setp.lt.u32 %p3, %r1, 2; | |
selp.b64 %rd21, %rd19, %rd9, %p3; | |
ld.f32 %f43, [%rd21]; | |
// Stage 2: second shuffle reduction across warp 0.
shfl.sync.down.b32 %f44, %f43, 16, 31, -1; | |
add.rn.f32 %f45, %f43, %f44; | |
shfl.sync.down.b32 %f46, %f45, 8, 31, -1; | |
add.rn.f32 %f47, %f45, %f46; | |
shfl.sync.down.b32 %f48, %f47, 4, 31, -1; | |
add.rn.f32 %f49, %f47, %f48; | |
shfl.sync.down.b32 %f50, %f49, 2, 31, -1; | |
add.rn.f32 %f51, %f49, %f50; | |
shfl.sync.down.b32 %f52, %f51, 1, 31, -1; | |
add.rn.f32 %f53, %f51, %f52; | |
// Each lane writes its result back to the address it loaded from; for thread 0
// that is shared_cache_04[0], which is read again below.
st.f32 [%rd21], %f53; | |
setp.ne.s32 %p4, %r1, 0; | |
@%p4 bra LBB22_2; | |
// Thread 0 only: atomically add the block total into param_1[ctaid.x].
ld.param.u64 %rd6, [fusion_2262_param_1]; | |
cvta.to.global.u64 %rd7, %rd6; | |
mul.wide.u32 %rd14, %r4, 4; | |
add.s64 %rd2, %rd7, %rd14; | |
ld.shared.f32 %f54, [%rd4]; | |
atom.global.add.f32 %f55, [%rd2], %f54; | |
LBB22_2: | |
ret; | |
} | |
// .globl fusion_2259 | |
// fusion_2259: per-block sum of squared deviations, accumulated in f32 and
// atomically added into param_1[ctaid.x].
//   - m = param_2[ctaid.x] * 0f3A800000 (0f3A800000 is 2^-10, i.e. 1/1024).
//   - The block reads 2048 bytes of param_0 starting at byte offset ctaid.x*2048.
//     Each of the 64 threads (.reqntid 64) loads eight 32-bit words (two f16
//     each) at byte offsets tid.x*4 + 256*j, j = 0..7, and sums (x - m)^2 in f32.
//   - Stage 1: shfl.sync.down reduction inside each warp; lane 0 of each warp
//     writes its partial sum to shared_cache_05[tid.x >> 5].
//   - Stage 2 (after bar.sync): warp 0 reduces the per-warp partials and thread 0
//     issues atom.global.add.f32 on param_1 + ctaid.x*4.
// param_3 is declared but never read in this kernel.
// NOTE(review): presumably param_2 holds a per-row sum over 1024 elements so
// that m is the row mean - confirm against the producer of param_2.
.visible .entry fusion_2259( | |
.param .u64 fusion_2259_param_0, | |
.param .u64 fusion_2259_param_1, | |
.param .u64 fusion_2259_param_2, | |
.param .u64 fusion_2259_param_3 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
// 4-byte local slot; stage 2 fills it with zero for lanes that have no partial sum.
.local .align 4 .b8 __local_depot23[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<5>; | |
.reg .b16 %h<17>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<90>; | |
.reg .b32 %r<7>; | |
.reg .b64 %rd<25>; | |
// %SPL = local-space address of the slot, %SP = its generic address.
mov.u64 %SPL, __local_depot23; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd5, [fusion_2259_param_0]; | |
ld.param.u64 %rd6, [fusion_2259_param_2]; | |
cvta.to.global.u64 %rd7, %rd6; | |
cvta.to.global.u64 %rd10, %rd5; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r4, %ctaid.x; | |
// %rd15 = param_0 + ctaid.x*2048 + tid.x*4.
shl.b32 %r5, %r1, 1; | |
mul.wide.u32 %rd12, %r4, 2048; | |
add.s64 %rd13, %rd10, %rd12; | |
mul.wide.u32 %rd14, %r5, 2; | |
add.s64 %rd15, %rd13, %rd14; | |
ld.global.nc.b32 %hh1, [%rd15]; | |
mov.b32 {%h1, %h2}, %hh1; | |
cvt.f32.f16 %f2, %h1; | |
// %rd16 = ctaid.x*4 is reused later as the byte offset into param_1.
mul.wide.u32 %rd16, %r4, 4; | |
add.s64 %rd17, %rd7, %rd16; | |
// %f4 = m, the value subtracted from every element of this block.
ld.global.nc.f32 %f3, [%rd17]; | |
mul.rn.f32 %f4, %f3, 0f3A800000; | |
// Serial accumulation of (x - m)^2 in load order, starting from 0.0.
sub.rn.f32 %f5, %f2, %f4; | |
mul.rn.f32 %f6, %f5, %f5; | |
add.rn.f32 %f7, %f6, 0f00000000; | |
cvt.f32.f16 %f8, %h2; | |
sub.rn.f32 %f9, %f8, %f4; | |
mul.rn.f32 %f10, %f9, %f9; | |
add.rn.f32 %f11, %f7, %f10; | |
ld.global.nc.b32 %hh2, [%rd15+256]; | |
mov.b32 {%h3, %h4}, %hh2; | |
cvt.f32.f16 %f12, %h3; | |
sub.rn.f32 %f13, %f12, %f4; | |
mul.rn.f32 %f14, %f13, %f13; | |
add.rn.f32 %f15, %f11, %f14; | |
cvt.f32.f16 %f16, %h4; | |
sub.rn.f32 %f17, %f16, %f4; | |
mul.rn.f32 %f18, %f17, %f17; | |
add.rn.f32 %f19, %f15, %f18; | |
ld.global.nc.b32 %hh3, [%rd15+512]; | |
mov.b32 {%h5, %h6}, %hh3; | |
cvt.f32.f16 %f20, %h5; | |
sub.rn.f32 %f21, %f20, %f4; | |
mul.rn.f32 %f22, %f21, %f21; | |
add.rn.f32 %f23, %f19, %f22; | |
cvt.f32.f16 %f24, %h6; | |
sub.rn.f32 %f25, %f24, %f4; | |
mul.rn.f32 %f26, %f25, %f25; | |
add.rn.f32 %f27, %f23, %f26; | |
ld.global.nc.b32 %hh4, [%rd15+768]; | |
mov.b32 {%h7, %h8}, %hh4; | |
cvt.f32.f16 %f28, %h7; | |
sub.rn.f32 %f29, %f28, %f4; | |
mul.rn.f32 %f30, %f29, %f29; | |
add.rn.f32 %f31, %f27, %f30; | |
cvt.f32.f16 %f32, %h8; | |
sub.rn.f32 %f33, %f32, %f4; | |
mul.rn.f32 %f34, %f33, %f33; | |
add.rn.f32 %f35, %f31, %f34; | |
ld.global.nc.b32 %hh5, [%rd15+1024]; | |
mov.b32 {%h9, %h10}, %hh5; | |
cvt.f32.f16 %f36, %h9; | |
sub.rn.f32 %f37, %f36, %f4; | |
mul.rn.f32 %f38, %f37, %f37; | |
add.rn.f32 %f39, %f35, %f38; | |
cvt.f32.f16 %f40, %h10; | |
sub.rn.f32 %f41, %f40, %f4; | |
mul.rn.f32 %f42, %f41, %f41; | |
add.rn.f32 %f43, %f39, %f42; | |
ld.global.nc.b32 %hh6, [%rd15+1280]; | |
mov.b32 {%h11, %h12}, %hh6; | |
cvt.f32.f16 %f44, %h11; | |
sub.rn.f32 %f45, %f44, %f4; | |
mul.rn.f32 %f46, %f45, %f45; | |
add.rn.f32 %f47, %f43, %f46; | |
cvt.f32.f16 %f48, %h12; | |
sub.rn.f32 %f49, %f48, %f4; | |
mul.rn.f32 %f50, %f49, %f49; | |
add.rn.f32 %f51, %f47, %f50; | |
ld.global.nc.b32 %hh7, [%rd15+1536]; | |
mov.b32 {%h13, %h14}, %hh7; | |
cvt.f32.f16 %f52, %h13; | |
sub.rn.f32 %f53, %f52, %f4; | |
mul.rn.f32 %f54, %f53, %f53; | |
add.rn.f32 %f55, %f51, %f54; | |
cvt.f32.f16 %f56, %h14; | |
sub.rn.f32 %f57, %f56, %f4; | |
mul.rn.f32 %f58, %f57, %f57; | |
add.rn.f32 %f59, %f55, %f58; | |
ld.global.nc.b32 %hh8, [%rd15+1792]; | |
mov.b32 {%h15, %h16}, %hh8; | |
cvt.f32.f16 %f60, %h15; | |
sub.rn.f32 %f61, %f60, %f4; | |
mul.rn.f32 %f62, %f61, %f61; | |
add.rn.f32 %f63, %f59, %f62; | |
cvt.f32.f16 %f64, %h16; | |
sub.rn.f32 %f65, %f64, %f4; | |
mul.rn.f32 %f66, %f65, %f65; | |
add.rn.f32 %f67, %f63, %f66; | |
// Stage 1: warp-level reduction. %r2 = lane (tid.x & 31), %r3 = warp (tid.x >> 5).
// The last add of the chain (%f76 + %f75) is deferred to the lane-0 branch below.
and.b32 %r2, %r1, 31; | |
shfl.sync.down.b32 %f68, %f67, 16, 31, -1; | |
add.rn.f32 %f69, %f68, %f67; | |
shfl.sync.down.b32 %f70, %f69, 8, 31, -1; | |
add.rn.f32 %f71, %f70, %f69; | |
shfl.sync.down.b32 %f72, %f71, 4, 31, -1; | |
add.rn.f32 %f73, %f72, %f71; | |
shfl.sync.down.b32 %f74, %f73, 2, 31, -1; | |
add.rn.f32 %f75, %f74, %f73; | |
shfl.sync.down.b32 %f76, %f75, 1, 31, -1; | |
shr.u32 %r3, %r1, 5; | |
setp.eq.s32 %p1, %r2, 0; | |
mov.u64 %rd19, shared_cache_05; | |
@%p1 bra LBB23_3; | |
bra.uni LBB23_1; | |
LBB23_3: | |
// Lane 0 only: publish this warp's sum to shared_cache_05[warp].
mul.wide.u32 %rd18, %r3, 4; | |
add.s64 %rd3, %rd19, %rd18; | |
add.rn.f32 %f1, %f76, %f75; | |
st.shared.f32 [%rd3], %f1; | |
LBB23_1: | |
// All threads wait until the per-warp partial sums are in shared memory.
bar.sync 0; | |
// Only warp 0 continues into stage 2; every other thread returns.
setp.eq.s32 %p2, %r3, 0; | |
@%p2 bra LBB23_4; | |
bra.uni LBB23_2; | |
LBB23_4: | |
// %rd11 = generic address of the local slot, %rd4 = shared_cache_05 + lane*4.
add.u64 %rd11, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
mul.wide.u32 %rd20, %r2, 4; | |
add.s64 %rd4, %rd19, %rd20; | |
cvta.shared.u64 %rd22, %rd4; | |
// Zero the local slot, then pick the load address: threads 0 and 1 read the
// shared entry for their lane, every other lane reads the zeroed local slot.
mov.u32 %r6, 0; | |
st.local.u32 [%rd1], %r6; | |
setp.lt.u32 %p3, %r1, 2; | |
selp.b64 %rd24, %rd22, %rd11, %p3; | |
ld.f32 %f77, [%rd24]; | |
// Stage 2: second shuffle reduction across warp 0.
shfl.sync.down.b32 %f78, %f77, 16, 31, -1; | |
add.rn.f32 %f79, %f77, %f78; | |
shfl.sync.down.b32 %f80, %f79, 8, 31, -1; | |
add.rn.f32 %f81, %f79, %f80; | |
shfl.sync.down.b32 %f82, %f81, 4, 31, -1; | |
add.rn.f32 %f83, %f81, %f82; | |
shfl.sync.down.b32 %f84, %f83, 2, 31, -1; | |
add.rn.f32 %f85, %f83, %f84; | |
shfl.sync.down.b32 %f86, %f85, 1, 31, -1; | |
add.rn.f32 %f87, %f85, %f86; | |
// Each lane writes its result back to the address it loaded from; for thread 0
// that is shared_cache_05[0], which is read again below.
st.f32 [%rd24], %f87; | |
setp.ne.s32 %p4, %r1, 0; | |
@%p4 bra LBB23_2; | |
// Thread 0 only: atomically add the block total into param_1[ctaid.x].
ld.param.u64 %rd8, [fusion_2259_param_1]; | |
cvta.to.global.u64 %rd9, %rd8; | |
add.s64 %rd2, %rd9, %rd16; | |
ld.shared.f32 %f88, [%rd4]; | |
atom.global.add.f32 %f89, [%rd2], %f88; | |
LBB23_2: | |
ret; | |
} | |
// .globl fusion_2255 | |
// fusion_2255: each thread produces four consecutive f16 outputs at element
// index i = (ctaid.x << 10) | (tid.x << 2).
//   param_0 : f16 source x; 4 values loaded at byte offset i*2
//   param_1 : destination; 4 x f16 stored at byte offset i*2
//   param_2 : f32, one value per block read at index ctaid.x (%f2)
//   param_3 : f32, one value per block read at index ctaid.x (%f10)
//   param_4 : f32 read at index j = tid.x*4 + k, k = 0..3
//   param_5 : f32 read at index j = tid.x*4 + k, k = 0..3
//   param_6 : declared but never read in this kernel
// With s = rsqrt.approx(param_2[ctaid.x] * 2^-10 + 0f2B8CBCCC) and
//      m = param_3[ctaid.x] * 2^-10   (0f3A800000 is 2^-10),
// each output is computed in f32 and rounded to f16:
//   out[k] = (s * param_5[j]) * x[k] + (param_4[j] - (s * param_5[j]) * m)
// 0f2B8CBCCC is approximately 1.0e-12.
// NOTE(review): this has the shape of a layer-norm style normalize/scale/shift
// (param_2 ~ sum of squared deviations, param_3 ~ row sum, param_5 ~ scale,
// param_4 ~ shift, over rows of 1024 elements) - confirm against the
// producing HLO.
.visible .entry fusion_2255( | |
.param .u64 fusion_2255_param_0, | |
.param .u64 fusion_2255_param_1, | |
.param .u64 fusion_2255_param_2, | |
.param .u64 fusion_2255_param_3, | |
.param .u64 fusion_2255_param_4, | |
.param .u64 fusion_2255_param_5, | |
.param .u64 fusion_2255_param_6 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<13>; | |
.reg .b32 %hh<3>; | |
.reg .f32 %f<39>; | |
.reg .b32 %r<8>; | |
.reg .b64 %rd<28>; | |
// %rd12 = param_0, %rd11 = param_1, %rd10 = param_2, %rd9 = param_3,
// %rd6 = param_4, %rd3 = param_5.
ld.param.u64 %rd1, [fusion_2255_param_0]; | |
ld.param.u64 %rd2, [fusion_2255_param_5]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2255_param_1]; | |
ld.param.u64 %rd5, [fusion_2255_param_4]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2255_param_2]; | |
ld.param.u64 %rd8, [fusion_2255_param_3]; | |
cvta.to.global.u64 %rd9, %rd8; | |
cvta.to.global.u64 %rd10, %rd7; | |
cvta.to.global.u64 %rd11, %rd4; | |
cvta.to.global.u64 %rd12, %rd1; | |
// %r5 = i; %r4 = tid.x*4; %r6 = tid.x*4 + 1; %r7 = tid.x*4 + 2.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
// Load the four f16 inputs x[0..3] (%h5..%h8).
mul.wide.u32 %rd13, %r5, 2; | |
add.s64 %rd14, %rd12, %rd13; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd14]; | |
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
// Element 0. %f5 = s is shared by all four elements.
cvt.f32.f16 %f1, %h5; | |
mul.wide.u32 %rd15, %r1, 4; | |
add.s64 %rd16, %rd10, %rd15; | |
ld.global.nc.f32 %f2, [%rd16]; | |
mul.rn.f32 %f3, %f2, 0f3A800000; | |
add.rn.f32 %f4, %f3, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f5, %f4; | |
// %rd18 / %rd19 = addresses of this thread's first param_5 / param_4 value.
mul.wide.u32 %rd17, %r4, 4; | |
add.s64 %rd18, %rd3, %rd17; | |
ld.global.nc.f32 %f6, [%rd18]; | |
mul.rn.f32 %f7, %f5, %f6; | |
mul.rn.f32 %f8, %f7, %f1; | |
add.s64 %rd19, %rd6, %rd17; | |
ld.global.nc.f32 %f9, [%rd19]; | |
// %f11 = m is shared by all four elements.
add.s64 %rd20, %rd9, %rd15; | |
ld.global.nc.f32 %f10, [%rd20]; | |
mul.rn.f32 %f11, %f10, 0f3A800000; | |
mul.rn.f32 %f12, %f7, %f11; | |
sub.rn.f32 %f13, %f9, %f12; | |
add.rn.f32 %f14, %f8, %f13; | |
cvt.rn.f16.f32 %h9, %f14; | |
// %rd21 = destination address param_1 + i*2.
add.s64 %rd21, %rd11, %rd13; | |
// Element 1.
cvt.f32.f16 %f15, %h6; | |
mul.wide.u32 %rd22, %r6, 4; | |
add.s64 %rd23, %rd3, %rd22; | |
ld.global.nc.f32 %f16, [%rd23]; | |
mul.rn.f32 %f17, %f5, %f16; | |
mul.rn.f32 %f18, %f17, %f15; | |
add.s64 %rd24, %rd6, %rd22; | |
ld.global.nc.f32 %f19, [%rd24]; | |
mul.rn.f32 %f20, %f11, %f17; | |
sub.rn.f32 %f21, %f19, %f20; | |
add.rn.f32 %f22, %f18, %f21; | |
cvt.rn.f16.f32 %h10, %f22; | |
// Element 2.
cvt.f32.f16 %f23, %h7; | |
mul.wide.u32 %rd25, %r7, 4; | |
add.s64 %rd26, %rd3, %rd25; | |
ld.global.nc.f32 %f24, [%rd26]; | |
mul.rn.f32 %f25, %f5, %f24; | |
mul.rn.f32 %f26, %f25, %f23; | |
add.s64 %rd27, %rd6, %rd25; | |
ld.global.nc.f32 %f27, [%rd27]; | |
mul.rn.f32 %f28, %f11, %f25; | |
sub.rn.f32 %f29, %f27, %f28; | |
add.rn.f32 %f30, %f26, %f29; | |
cvt.rn.f16.f32 %h11, %f30; | |
// Element 3: its param_5 / param_4 values sit 12 bytes after the first ones.
cvt.f32.f16 %f31, %h8; | |
ld.global.nc.f32 %f32, [%rd18+12]; | |
mul.rn.f32 %f33, %f5, %f32; | |
mul.rn.f32 %f34, %f33, %f31; | |
ld.global.nc.f32 %f35, [%rd19+12]; | |
mul.rn.f32 %f36, %f11, %f33; | |
sub.rn.f32 %f37, %f35, %f36; | |
add.rn.f32 %f38, %f34, %f37; | |
cvt.rn.f16.f32 %h12, %f38; | |
// Store the four results as one vector.
st.global.v4.b16 [%rd21], {%h9, %h10, %h11, %h12}; | |
ret; | |
} | |
// .globl convert_1393 | |
// convert_1393: converts f32 values read from param_1 into f16 values written
// to param_0 (cvt.rn, round to nearest even), four elements per vector access.
//   e = (ctaid.x << 9) | (tid.x << 2)      base element index of this thread
//   Chunk c handles elements e + c*655360 .. e + c*655360 + 3, i.e. source
//   byte offset c*2621440 and destination byte offset c*1310720 from the
//   thread's base addresses.
//   Chunks 0..4 are unconditional. Chunks 5 and 6 are skipped when their first
//   element index is greater than 4194303.
// param_2 is declared but never read in this kernel.
.visible .entry convert_1393( | |
.param .u64 convert_1393_param_0, | |
.param .u64 convert_1393_param_1, | |
.param .u64 convert_1393_param_2 | |
) | |
.reqntid 128, 1, 1 | |
{ | |
.reg .pred %p<3>; | |
.reg .b16 %h<29>; | |
.reg .f32 %f<29>; | |
.reg .b32 %r<9>; | |
.reg .b64 %rd<9>; | |
// %rd6 = param_0 (f16 destination), %rd5 = param_1 (f32 source).
ld.param.u64 %rd3, [convert_1393_param_0]; | |
ld.param.u64 %rd4, [convert_1393_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd3; | |
// %r5 = e; %rd1 = source base address (e*4 bytes in), %rd2 = destination base (e*2 bytes in).
mov.u32 %r3, %ctaid.x; | |
mov.u32 %r4, %tid.x; | |
shl.b32 %r1, %r3, 9; | |
shl.b32 %r2, %r4, 2; | |
or.b32 %r5, %r1, %r2; | |
mul.wide.u32 %rd7, %r5, 4; | |
add.s64 %rd1, %rd5, %rd7; | |
// Chunk 0.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; | |
cvt.rn.f16.f32 %h1, %f1; | |
mul.wide.u32 %rd8, %r5, 2; | |
add.s64 %rd2, %rd6, %rd8; | |
cvt.rn.f16.f32 %h2, %f2; | |
cvt.rn.f16.f32 %h3, %f3; | |
cvt.rn.f16.f32 %h4, %f4; | |
st.global.v4.b16 [%rd2], {%h1, %h2, %h3, %h4}; | |
// Chunk 1.
ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+2621440]; | |
cvt.rn.f16.f32 %h5, %f5; | |
cvt.rn.f16.f32 %h6, %f6; | |
cvt.rn.f16.f32 %h7, %f7; | |
cvt.rn.f16.f32 %h8, %f8; | |
st.global.v4.b16 [%rd2+1310720], {%h5, %h6, %h7, %h8}; | |
// Chunk 2.
ld.global.nc.v4.f32 {%f9, %f10, %f11, %f12}, [%rd1+5242880]; | |
cvt.rn.f16.f32 %h9, %f9; | |
cvt.rn.f16.f32 %h10, %f10; | |
cvt.rn.f16.f32 %h11, %f11; | |
cvt.rn.f16.f32 %h12, %f12; | |
st.global.v4.b16 [%rd2+2621440], {%h9, %h10, %h11, %h12}; | |
// Chunk 3.
ld.global.nc.v4.f32 {%f13, %f14, %f15, %f16}, [%rd1+7864320]; | |
cvt.rn.f16.f32 %h13, %f13; | |
cvt.rn.f16.f32 %h14, %f14; | |
cvt.rn.f16.f32 %h15, %f15; | |
cvt.rn.f16.f32 %h16, %f16; | |
st.global.v4.b16 [%rd2+3932160], {%h13, %h14, %h15, %h16}; | |
// Chunk 4.
ld.global.nc.v4.f32 {%f17, %f18, %f19, %f20}, [%rd1+10485760]; | |
cvt.rn.f16.f32 %h17, %f17; | |
cvt.rn.f16.f32 %h18, %f18; | |
cvt.rn.f16.f32 %h19, %f19; | |
cvt.rn.f16.f32 %h20, %f20; | |
st.global.v4.b16 [%rd2+5242880], {%h17, %h18, %h19, %h20}; | |
// Chunk 5 (guarded): skip when e + 5*655360 is past the last valid index 4194303.
add.s32 %r6, %r5, 3276800; | |
setp.gt.u32 %p1, %r6, 4194303; | |
@%p1 bra LBB25_2; | |
ld.global.nc.v4.f32 {%f21, %f22, %f23, %f24}, [%rd1+13107200]; | |
cvt.rn.f16.f32 %h21, %f21; | |
cvt.rn.f16.f32 %h22, %f22; | |
cvt.rn.f16.f32 %h23, %f23; | |
cvt.rn.f16.f32 %h24, %f24; | |
st.global.v4.b16 [%rd2+6553600], {%h21, %h22, %h23, %h24}; | |
LBB25_2: | |
// Chunk 6 (guarded): the index is rebuilt as ((ctaid.x << 9) + 6*655360) | (tid.x << 2).
add.s32 %r7, %r1, 3932160; | |
or.b32 %r8, %r7, %r2; | |
setp.gt.u32 %p2, %r8, 4194303; | |
@%p2 bra LBB25_4; | |
ld.global.nc.v4.f32 {%f25, %f26, %f27, %f28}, [%rd1+15728640]; | |
cvt.rn.f16.f32 %h25, %f25; | |
cvt.rn.f16.f32 %h26, %f26; | |
cvt.rn.f16.f32 %h27, %f27; | |
cvt.rn.f16.f32 %h28, %f28; | |
st.global.v4.b16 [%rd2+7864320], {%h25, %h26, %h27, %h28}; | |
LBB25_4: | |
ret; | |
} | |
// .globl fusion_2250 | |
// fusion_2250: elementwise kernel, 256 threads per block (.reqntid 256,1,1).
// Each thread processes 4 consecutive 16-bit elements starting at element index
//   i0 = (ctaid.x << 10) | (tid.x << 2).
//   param_0: base of the f16 output buffer (one st.global.v4.b16 per thread)
//   param_1: base of the f16 input buffer (one ld.global.nc.v4.b16 per thread)
//   param_2: base of an f32 buffer, indexed by the element index masked to 12 bits
//   param_3: declared but never loaded in this kernel
// Per element, with x = in[i] + f16(aux[i & 4095]) (f16 add), computed in f32:
//   u   = 0.7978846 * (x + 0.044715 * x^3)
//   t   = u            if |u| < 0.0004
//       = +/-1         if |u| >= 20 (sign taken from the sign bit of u)
//       = P(c) / Q(c)  otherwise, with c = clamp(u, -9, 9)
//   out = f16(0.5 * (1 + t) * x)
// NOTE(review): this has the shape of the tanh-approximation GELU, and P/Q looks like
// a rational tanh approximation -- confirm against the HLO that produced this fusion.
.visible .entry fusion_2250( | |
.param .u64 fusion_2250_param_0, | |
.param .u64 fusion_2250_param_1, | |
.param .u64 fusion_2250_param_2, | |
.param .u64 fusion_2250_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<21>; | |
.reg .b16 %h<21>; | |
.reg .b32 %hh<3>; | |
.reg .f32 %f<150>; | |
.reg .b32 %r<25>; | |
.reg .b64 %rd<18>; | |
// Load the buffer pointers and convert them to global-space addresses:
// %rd6 = output (param_0), %rd5 = f16 input (param_1), %rd3 = f32 input (param_2).
ld.param.u64 %rd1, [fusion_2250_param_0]; | |
ld.param.u64 %rd2, [fusion_2250_param_2]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2250_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// %r5 = element index of lane 0; %r6..%r8 = lanes 1..3 (low two bits of %r5 are zero).
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r5, 1; | |
or.b32 %r7, %r5, 2; | |
or.b32 %r8, %r5, 3; | |
// Indices into the param_2 buffer: each lane's element index reduced to its low 12 bits.
and.b32 %r9, %r8, 4095; | |
and.b32 %r10, %r7, 4094; | |
and.b32 %r11, %r6, 4093; | |
and.b32 %r12, %r5, 4092; | |
// %rd7 = byte offset of lane 0 in the 16-bit buffers; load the four f16 inputs at once.
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd5, %rd7; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd8]; | |
// Pack/unpack round trip: afterwards %h5..%h8 hold the same values as %h1..%h4.
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
// ---- Lane 0 ----
// x (%f2) = f32( %h5 + f16(param_2[%r12]) ), the addition being done in f16.
mul.wide.u32 %rd9, %r12, 4; | |
add.s64 %rd10, %rd3, %rd9; | |
ld.global.nc.f32 %f1, [%rd10]; | |
cvt.rn.f16.f32 %h9, %f1; | |
add.rn.f16 %h10, %h5, %h9; | |
cvt.f32.f16 %f2, %h10; | |
// u (%f7) = 0f3F4C422A (~0.7978846) * (x + 0f3D372713 (~0.044715) * x^3)
mul.rn.f32 %f3, %f2, %f2; | |
mul.rn.f32 %f4, %f3, %f2; | |
mul.rn.f32 %f5, %f4, 0f3D372713; | |
add.rn.f32 %f6, %f5, %f2; | |
mul.rn.f32 %f7, %f6, 0f3F4C422A; | |
// %p1 = |u| < 0f39D1B717 (~0.0004): tiny inputs bypass the rational function below.
abs.f32 %f8, %f7; | |
setp.lt.f32 %p1, %f8, 0f39D1B717; | |
// c (%f10) = u clamped to [-9.0, 9.0]; c2 (%f11) = c*c.
setp.lt.f32 %p2, %f7, 0fC1100000; | |
selp.f32 %f9, 0fC1100000, %f7, %p2; | |
setp.gt.f32 %p3, %f9, 0f41100000; | |
selp.f32 %f10, 0f41100000, %f9, %p3; | |
mul.rn.f32 %f11, %f10, %f10; | |
// Numerator P (%f25): c times a degree-6 polynomial in c2, evaluated in Horner form.
// %f13 is a polynomial coefficient that lanes 1..3 reuse, so it must stay defined here.
mul.rn.f32 %f12, %f11, 0f259F25C0; | |
mov.f32 %f13, 0f2A61337E; | |
sub.rn.f32 %f14, %f13, %f12; | |
mul.rn.f32 %f15, %f11, %f14; | |
add.rn.f32 %f16, %f15, 0fAEBD37FF; | |
mul.rn.f32 %f17, %f11, %f16; | |
add.rn.f32 %f18, %f17, 0f335C0041; | |
mul.rn.f32 %f19, %f11, %f18; | |
add.rn.f32 %f20, %f19, 0f3779434A; | |
mul.rn.f32 %f21, %f11, %f20; | |
add.rn.f32 %f22, %f21, 0f3A270DED; | |
mul.rn.f32 %f23, %f11, %f22; | |
add.rn.f32 %f24, %f23, 0f3BA059DC; | |
mul.rn.f32 %f25, %f10, %f24; | |
// Denominator Q (%f31): degree-3 polynomial in c2, Horner form.
mul.rn.f32 %f26, %f11, 0f35A0D3D8; | |
add.rn.f32 %f27, %f26, 0f38F895D6; | |
mul.rn.f32 %f28, %f11, %f27; | |
add.rn.f32 %f29, %f28, 0f3B14AA05; | |
mul.rn.f32 %f30, %f11, %f29; | |
add.rn.f32 %f31, %f30, 0f3BA059DD; | |
// t = P/Q, replaced by u itself when |u| is tiny (%p1).
div.full.f32 %f32, %f25, %f31; | |
selp.f32 %f33, %f7, %f32, %p1; | |
// Saturation value: -1.0 if the sign bit of u is set, +1.0 otherwise.
mov.b32 %r13, %f7; | |
shr.u32 %r14, %r13, 31; | |
and.b32 %r15, %r14, 1; | |
setp.eq.b32 %p4, %r15, 1; | |
selp.f32 %f34, 0fBF800000, 0f3F800000, %p4; | |
// Keep t when |u| < 20.0 or the compare is unordered (ltu); otherwise use +/-1.
setp.ltu.f32 %p5, %f8, 0f41A00000; | |
selp.f32 %f35, %f33, %f34, %p5; | |
// result (%h11) = f16( (t + 1.0) * 0.5 * x )
add.rn.f32 %f36, %f35, 0f3F800000; | |
mul.rn.f32 %f37, %f36, 0f3F000000; | |
mul.rn.f32 %f38, %f37, %f2; | |
cvt.rn.f16.f32 %h11, %f38; | |
// %rd11 = output address for this thread's four results (same byte offset as the input).
add.s64 %rd11, %rd6, %rd7; | |
// ---- Lane 1 ---- same sequence as lane 0 on %h6 and param_2[%r11]; result in %h14.
mul.wide.u32 %rd12, %r11, 4; | |
add.s64 %rd13, %rd3, %rd12; | |
ld.global.nc.f32 %f39, [%rd13]; | |
cvt.rn.f16.f32 %h12, %f39; | |
add.rn.f16 %h13, %h6, %h12; | |
cvt.f32.f16 %f40, %h13; | |
mul.rn.f32 %f41, %f40, %f40; | |
mul.rn.f32 %f42, %f41, %f40; | |
mul.rn.f32 %f43, %f42, 0f3D372713; | |
add.rn.f32 %f44, %f43, %f40; | |
mul.rn.f32 %f45, %f44, 0f3F4C422A; | |
abs.f32 %f46, %f45; | |
setp.lt.f32 %p6, %f46, 0f39D1B717; | |
setp.lt.f32 %p7, %f45, 0fC1100000; | |
selp.f32 %f47, 0fC1100000, %f45, %p7; | |
setp.gt.f32 %p8, %f47, 0f41100000; | |
selp.f32 %f48, 0f41100000, %f47, %p8; | |
mul.rn.f32 %f49, %f48, %f48; | |
mul.rn.f32 %f50, %f49, 0f259F25C0; | |
// Reuses the coefficient register %f13 set up in lane 0.
sub.rn.f32 %f51, %f13, %f50; | |
mul.rn.f32 %f52, %f49, %f51; | |
add.rn.f32 %f53, %f52, 0fAEBD37FF; | |
mul.rn.f32 %f54, %f49, %f53; | |
add.rn.f32 %f55, %f54, 0f335C0041; | |
mul.rn.f32 %f56, %f49, %f55; | |
add.rn.f32 %f57, %f56, 0f3779434A; | |
mul.rn.f32 %f58, %f49, %f57; | |
add.rn.f32 %f59, %f58, 0f3A270DED; | |
mul.rn.f32 %f60, %f49, %f59; | |
add.rn.f32 %f61, %f60, 0f3BA059DC; | |
mul.rn.f32 %f62, %f48, %f61; | |
mul.rn.f32 %f63, %f49, 0f35A0D3D8; | |
add.rn.f32 %f64, %f63, 0f38F895D6; | |
mul.rn.f32 %f65, %f49, %f64; | |
add.rn.f32 %f66, %f65, 0f3B14AA05; | |
mul.rn.f32 %f67, %f49, %f66; | |
add.rn.f32 %f68, %f67, 0f3BA059DD; | |
div.full.f32 %f69, %f62, %f68; | |
selp.f32 %f70, %f45, %f69, %p6; | |
mov.b32 %r16, %f45; | |
shr.u32 %r17, %r16, 31; | |
and.b32 %r18, %r17, 1; | |
setp.eq.b32 %p9, %r18, 1; | |
selp.f32 %f71, 0fBF800000, 0f3F800000, %p9; | |
setp.ltu.f32 %p10, %f46, 0f41A00000; | |
selp.f32 %f72, %f70, %f71, %p10; | |
add.rn.f32 %f73, %f72, 0f3F800000; | |
mul.rn.f32 %f74, %f73, 0f3F000000; | |
mul.rn.f32 %f75, %f74, %f40; | |
cvt.rn.f16.f32 %h14, %f75; | |
// ---- Lane 2 ---- same sequence on %h7 and param_2[%r10]; result in %h17.
mul.wide.u32 %rd14, %r10, 4; | |
add.s64 %rd15, %rd3, %rd14; | |
ld.global.nc.f32 %f76, [%rd15]; | |
cvt.rn.f16.f32 %h15, %f76; | |
add.rn.f16 %h16, %h7, %h15; | |
cvt.f32.f16 %f77, %h16; | |
mul.rn.f32 %f78, %f77, %f77; | |
mul.rn.f32 %f79, %f78, %f77; | |
mul.rn.f32 %f80, %f79, 0f3D372713; | |
add.rn.f32 %f81, %f80, %f77; | |
mul.rn.f32 %f82, %f81, 0f3F4C422A; | |
abs.f32 %f83, %f82; | |
setp.lt.f32 %p11, %f83, 0f39D1B717; | |
setp.lt.f32 %p12, %f82, 0fC1100000; | |
selp.f32 %f84, 0fC1100000, %f82, %p12; | |
setp.gt.f32 %p13, %f84, 0f41100000; | |
selp.f32 %f85, 0f41100000, %f84, %p13; | |
mul.rn.f32 %f86, %f85, %f85; | |
mul.rn.f32 %f87, %f86, 0f259F25C0; | |
sub.rn.f32 %f88, %f13, %f87; | |
mul.rn.f32 %f89, %f86, %f88; | |
add.rn.f32 %f90, %f89, 0fAEBD37FF; | |
mul.rn.f32 %f91, %f86, %f90; | |
add.rn.f32 %f92, %f91, 0f335C0041; | |
mul.rn.f32 %f93, %f86, %f92; | |
add.rn.f32 %f94, %f93, 0f3779434A; | |
mul.rn.f32 %f95, %f86, %f94; | |
add.rn.f32 %f96, %f95, 0f3A270DED; | |
mul.rn.f32 %f97, %f86, %f96; | |
add.rn.f32 %f98, %f97, 0f3BA059DC; | |
mul.rn.f32 %f99, %f85, %f98; | |
mul.rn.f32 %f100, %f86, 0f35A0D3D8; | |
add.rn.f32 %f101, %f100, 0f38F895D6; | |
mul.rn.f32 %f102, %f86, %f101; | |
add.rn.f32 %f103, %f102, 0f3B14AA05; | |
mul.rn.f32 %f104, %f86, %f103; | |
add.rn.f32 %f105, %f104, 0f3BA059DD; | |
div.full.f32 %f106, %f99, %f105; | |
selp.f32 %f107, %f82, %f106, %p11; | |
mov.b32 %r19, %f82; | |
shr.u32 %r20, %r19, 31; | |
and.b32 %r21, %r20, 1; | |
setp.eq.b32 %p14, %r21, 1; | |
selp.f32 %f108, 0fBF800000, 0f3F800000, %p14; | |
setp.ltu.f32 %p15, %f83, 0f41A00000; | |
selp.f32 %f109, %f107, %f108, %p15; | |
add.rn.f32 %f110, %f109, 0f3F800000; | |
mul.rn.f32 %f111, %f110, 0f3F000000; | |
mul.rn.f32 %f112, %f111, %f77; | |
cvt.rn.f16.f32 %h17, %f112; | |
// ---- Lane 3 ---- same sequence on %h8 and param_2[%r9]; result in %h20.
mul.wide.u32 %rd16, %r9, 4; | |
add.s64 %rd17, %rd3, %rd16; | |
ld.global.nc.f32 %f113, [%rd17]; | |
cvt.rn.f16.f32 %h18, %f113; | |
add.rn.f16 %h19, %h8, %h18; | |
cvt.f32.f16 %f114, %h19; | |
mul.rn.f32 %f115, %f114, %f114; | |
mul.rn.f32 %f116, %f115, %f114; | |
mul.rn.f32 %f117, %f116, 0f3D372713; | |
add.rn.f32 %f118, %f117, %f114; | |
mul.rn.f32 %f119, %f118, 0f3F4C422A; | |
abs.f32 %f120, %f119; | |
setp.lt.f32 %p16, %f120, 0f39D1B717; | |
setp.lt.f32 %p17, %f119, 0fC1100000; | |
selp.f32 %f121, 0fC1100000, %f119, %p17; | |
setp.gt.f32 %p18, %f121, 0f41100000; | |
selp.f32 %f122, 0f41100000, %f121, %p18; | |
mul.rn.f32 %f123, %f122, %f122; | |
mul.rn.f32 %f124, %f123, 0f259F25C0; | |
sub.rn.f32 %f125, %f13, %f124; | |
mul.rn.f32 %f126, %f123, %f125; | |
add.rn.f32 %f127, %f126, 0fAEBD37FF; | |
mul.rn.f32 %f128, %f123, %f127; | |
add.rn.f32 %f129, %f128, 0f335C0041; | |
mul.rn.f32 %f130, %f123, %f129; | |
add.rn.f32 %f131, %f130, 0f3779434A; | |
mul.rn.f32 %f132, %f123, %f131; | |
add.rn.f32 %f133, %f132, 0f3A270DED; | |
mul.rn.f32 %f134, %f123, %f133; | |
add.rn.f32 %f135, %f134, 0f3BA059DC; | |
mul.rn.f32 %f136, %f122, %f135; | |
mul.rn.f32 %f137, %f123, 0f35A0D3D8; | |
add.rn.f32 %f138, %f137, 0f38F895D6; | |
mul.rn.f32 %f139, %f123, %f138; | |
add.rn.f32 %f140, %f139, 0f3B14AA05; | |
mul.rn.f32 %f141, %f123, %f140; | |
add.rn.f32 %f142, %f141, 0f3BA059DD; | |
div.full.f32 %f143, %f136, %f142; | |
selp.f32 %f144, %f119, %f143, %p16; | |
mov.b32 %r22, %f119; | |
shr.u32 %r23, %r22, 31; | |
and.b32 %r24, %r23, 1; | |
setp.eq.b32 %p19, %r24, 1; | |
selp.f32 %f145, 0fBF800000, 0f3F800000, %p19; | |
setp.ltu.f32 %p20, %f120, 0f41A00000; | |
selp.f32 %f146, %f144, %f145, %p20; | |
add.rn.f32 %f147, %f146, 0f3F800000; | |
mul.rn.f32 %f148, %f147, 0f3F000000; | |
mul.rn.f32 %f149, %f148, %f114; | |
cvt.rn.f16.f32 %h20, %f149; | |
// Write the four f16 results for lanes 0..3 with a single vector store.
st.global.v4.b16 [%rd11], {%h11, %h14, %h17, %h20}; | |
ret; | |
} | |
// .globl convert_1395 | |
// convert_1395: f32 -> f16 conversion kernel, 128 threads per block (.reqntid 128,1,1).
//   param_0: base of the f16 output buffer
//   param_1: base of the f32 input buffer
//   param_2: declared but never loaded in this kernel
// Each thread converts 4 consecutive elements starting at element index
//   e = (ctaid.x << 9) | (tid.x << 2)
// and repeats this at up to seven positions spaced 655360 elements apart
// (2621440 bytes of f32 input / 1310720 bytes of f16 output per step).
// The first five positions are unconditional; the last two are skipped when their
// element index exceeds 4194303.
.visible .entry convert_1395( | |
.param .u64 convert_1395_param_0, | |
.param .u64 convert_1395_param_1, | |
.param .u64 convert_1395_param_2 | |
) | |
.reqntid 128, 1, 1 | |
{ | |
.reg .pred %p<3>; | |
.reg .b16 %h<29>; | |
.reg .f32 %f<29>; | |
.reg .b32 %r<9>; | |
.reg .b64 %rd<9>; | |
// %rd6 = output base (param_0), %rd5 = input base (param_1), both as global addresses.
ld.param.u64 %rd3, [convert_1395_param_0]; | |
ld.param.u64 %rd4, [convert_1395_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd3; | |
// %r1 = block part of the index, %r2 = thread part, %r5 = e (first element of this thread).
mov.u32 %r3, %ctaid.x; | |
mov.u32 %r4, %tid.x; | |
shl.b32 %r1, %r3, 9; | |
shl.b32 %r2, %r4, 2; | |
or.b32 %r5, %r1, %r2; | |
// %rd1 = address of input element e (4 bytes each).
mul.wide.u32 %rd7, %r5, 4; | |
add.s64 %rd1, %rd5, %rd7; | |
// Position 0: load 4 floats, round each to f16 (round-to-nearest), store 4 halves.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; | |
cvt.rn.f16.f32 %h1, %f1; | |
// %rd2 = address of output element e (2 bytes each).
mul.wide.u32 %rd8, %r5, 2; | |
add.s64 %rd2, %rd6, %rd8; | |
cvt.rn.f16.f32 %h2, %f2; | |
cvt.rn.f16.f32 %h3, %f3; | |
cvt.rn.f16.f32 %h4, %f4; | |
st.global.v4.b16 [%rd2], {%h1, %h2, %h3, %h4}; | |
// Position 1: e + 655360.
ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+2621440]; | |
cvt.rn.f16.f32 %h5, %f5; | |
cvt.rn.f16.f32 %h6, %f6; | |
cvt.rn.f16.f32 %h7, %f7; | |
cvt.rn.f16.f32 %h8, %f8; | |
st.global.v4.b16 [%rd2+1310720], {%h5, %h6, %h7, %h8}; | |
// Position 2: e + 2*655360.
ld.global.nc.v4.f32 {%f9, %f10, %f11, %f12}, [%rd1+5242880]; | |
cvt.rn.f16.f32 %h9, %f9; | |
cvt.rn.f16.f32 %h10, %f10; | |
cvt.rn.f16.f32 %h11, %f11; | |
cvt.rn.f16.f32 %h12, %f12; | |
st.global.v4.b16 [%rd2+2621440], {%h9, %h10, %h11, %h12}; | |
// Position 3: e + 3*655360.
ld.global.nc.v4.f32 {%f13, %f14, %f15, %f16}, [%rd1+7864320]; | |
cvt.rn.f16.f32 %h13, %f13; | |
cvt.rn.f16.f32 %h14, %f14; | |
cvt.rn.f16.f32 %h15, %f15; | |
cvt.rn.f16.f32 %h16, %f16; | |
st.global.v4.b16 [%rd2+3932160], {%h13, %h14, %h15, %h16}; | |
// Position 4: e + 4*655360.
ld.global.nc.v4.f32 {%f17, %f18, %f19, %f20}, [%rd1+10485760]; | |
cvt.rn.f16.f32 %h17, %f17; | |
cvt.rn.f16.f32 %h18, %f18; | |
cvt.rn.f16.f32 %h19, %f19; | |
cvt.rn.f16.f32 %h20, %f20; | |
st.global.v4.b16 [%rd2+5242880], {%h17, %h18, %h19, %h20}; | |
// Position 5 (e + 5*655360 = e + 3276800): skipped if that index is > 4194303.
add.s32 %r6, %r5, 3276800; | |
setp.gt.u32 %p1, %r6, 4194303; | |
@%p1 bra LBB27_2; | |
ld.global.nc.v4.f32 {%f21, %f22, %f23, %f24}, [%rd1+13107200]; | |
cvt.rn.f16.f32 %h21, %f21; | |
cvt.rn.f16.f32 %h22, %f22; | |
cvt.rn.f16.f32 %h23, %f23; | |
cvt.rn.f16.f32 %h24, %f24; | |
st.global.v4.b16 [%rd2+6553600], {%h21, %h22, %h23, %h24}; | |
LBB27_2: | |
// Position 6 (6*655360 = 3932160): the index is rebuilt from the block part plus the
// offset, then OR-ed with the thread part; skipped if it is > 4194303.
add.s32 %r7, %r1, 3932160; | |
or.b32 %r8, %r7, %r2; | |
setp.gt.u32 %p2, %r8, 4194303; | |
@%p2 bra LBB27_4; | |
ld.global.nc.v4.f32 {%f25, %f26, %f27, %f28}, [%rd1+15728640]; | |
cvt.rn.f16.f32 %h25, %f25; | |
cvt.rn.f16.f32 %h26, %f26; | |
cvt.rn.f16.f32 %h27, %f27; | |
cvt.rn.f16.f32 %h28, %f28; | |
st.global.v4.b16 [%rd2+7864320], {%h25, %h26, %h27, %h28}; | |
LBB27_4: | |
ret; | |
} | |
// .globl rng_get_and_update_state_2 | |
// rng_get_and_update_state_2: copies the current 16-byte global `rng_state` into the
// buffer addressed by param_0, then advances `rng_state` by 524288, treating it as a
// 128-bit counter whose low 64 bits live at +0 and high 64 bits at +8.
//   param_0: generic address of a 16-byte output buffer; receives the state as it was
//            BEFORE the increment
//   param_1: declared but never loaded in this kernel
// The update is a plain load/add/store with no atomics and no thread-index guard.
// NOTE(review): presumably launched with a single thread -- confirm at the launch site.
.visible .entry rng_get_and_update_state_2( | |
.param .u64 rng_get_and_update_state_2_param_0, | |
.param .u64 rng_get_and_update_state_2_param_1 | |
) | |
{ | |
.reg .pred %wrap_vs_old, %wrap_vs_step; | |
.reg .b64 %out_generic, %out_global; | |
.reg .b64 %state_hi, %state_lo, %next_lo, %next_hi; | |
.reg .b64 %carry_old, %carry; | |
// Resolve the output buffer to a global-space address.
ld.param.u64 %out_generic, [rng_get_and_update_state_2_param_0]; | |
cvta.to.global.u64 %out_global, %out_generic; | |
// Read the current 128-bit state (high word first, as in the generated code).
ld.global.u64 %state_hi, [rng_state+8]; | |
ld.global.u64 %state_lo, [rng_state]; | |
// Low word += 524288.
add.s64 %next_lo, %state_lo, 524288; | |
// Carry out of the low word: the unsigned sum wrapped if it is smaller than either
// addend. Both tests are kept, exactly as emitted by the compiler.
setp.lt.u64 %wrap_vs_step, %next_lo, 524288; | |
setp.lt.u64 %wrap_vs_old, %next_lo, %state_lo; | |
selp.u64 %carry_old, 1, 0, %wrap_vs_old; | |
selp.b64 %carry, 1, %carry_old, %wrap_vs_step; | |
// High word += carry.
add.s64 %next_hi, %state_hi, %carry; | |
// Publish the advanced state, then hand the previous state to the caller's buffer.
st.global.u64 [rng_state], %next_lo; | |
st.global.u64 [rng_state+8], %next_hi; | |
st.global.u64 [%out_global+8], %state_hi; | |
st.global.u64 [%out_global], %state_lo; | |
ret; | |
} | |
// .globl fusion_2248 | |
.visible .entry fusion_2248( | |
.param .u64 fusion_2248_param_0, | |
.param .u64 fusion_2248_param_1, | |
.param .u64 fusion_2248_param_2, | |
.param .u64 fusion_2248_param_3, | |
.param .u64 fusion_2248_param_4, | |
.param .u64 fusion_2248_param_5, | |
.param .u64 fusion_2248_param_6, | |
.param .u64 fusion_2248_param_7, | |
.param .u64 fusion_2248_param_8, | |
.param .u64 fusion_2248_param_9 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
.local .align 4 .b8 __local_depot29[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<75>; | |
.reg .b16 %h<145>; | |
.reg .f32 %f<254>; | |
.reg .b32 %r<350>; | |
.reg .b64 %rd<2739>; | |
mov.u64 %SPL, __local_depot29; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd463, [fusion_2248_param_0]; | |
ld.param.u64 %rd464, [fusion_2248_param_8]; | |
cvta.to.global.u64 %rd1, %rd464; | |
ld.param.u64 %rd465, [fusion_2248_param_1]; | |
ld.param.u64 %rd466, [fusion_2248_param_7]; | |
cvta.to.global.u64 %rd2, %rd466; | |
ld.param.u64 %rd467, [fusion_2248_param_2]; | |
ld.param.u64 %rd468, [fusion_2248_param_6]; | |
cvta.to.global.u64 %rd3, %rd468; | |
ld.param.u64 %rd470, [fusion_2248_param_5]; | |
cvta.to.global.u64 %rd4, %rd470; | |
ld.param.u64 %rd471, [fusion_2248_param_4]; | |
cvta.to.global.u64 %rd5, %rd471; | |
cvta.to.global.u64 %rd7, %rd467; | |
cvta.to.global.u64 %rd8, %rd465; | |
cvta.to.global.u64 %rd9, %rd463; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r2, %ctaid.x; | |
shl.b32 %r3, %r1, 1; | |
shl.b32 %r4, %r2, 10; | |
or.b32 %r48, %r4, %r3; | |
shr.u32 %r49, %r48, 2; | |
and.b32 %r5, %r1, 1; | |
setp.eq.s32 %p1, %r5, 0; | |
ld.global.nc.u64 %rd11, [%rd7]; | |
cvt.u64.u32 %rd473, %r49; | |
add.s64 %rd12, %rd11, %rd473; | |
setp.lt.u64 %p69, %rd12, %rd11; | |
and.b64 %rd2384, %rd12, 4294967295; | |
@%p1 bra LBB29_1; | |
bra.uni LBB29_4; | |
LBB29_1: | |
mul.lo.s64 %rd2446, %rd2384, 3528531795; | |
ld.global.nc.u64 %rd2461, [%rd7+8]; | |
selp.u64 %rd516, 1, 0, %p69; | |
add.s64 %rd517, %rd2461, %rd516; | |
xor.b64 %rd518, %rd517, %rd2446; | |
shr.u64 %rd519, %rd518, 32; | |
mul.lo.s64 %rd2449, %rd519, 3449720151; | |
shr.u64 %rd520, %rd2449, 32; | |
and.b64 %rd521, %rd517, 4294967295; | |
mul.lo.s64 %rd522, %rd521, 3449720151; | |
and.b64 %rd523, %rd522, 4294967295; | |
xor.b64 %rd524, %rd523, %rd520; | |
xor.b64 %rd525, %rd524, 2654435769; | |
mul.lo.s64 %rd2452, %rd525, 3528531795; | |
xor.b64 %rd2442, %rd522, %rd12; | |
mov.u32 %r312, -1879881855; | |
mov.u32 %r311, -845247145; | |
mov.u32 %r310, 534103459; | |
mov.u64 %rd2460, 3678237736; | |
mov.u64 %rd2459, 3041712726; | |
mov.u64 %rd2458, 1401181199; | |
mov.u64 %rd2457, 2835769497; | |
mov.u64 %rd2456, 1684936478; | |
mov.u64 %rd2455, 2027808484; | |
mov.u64 %rd2454, 387276957; | |
mov.u64 %rd2453, 842468239; | |
mov.u64 %rd2451, 3986602516; | |
mov.u64 %rd2450, 1013904242; | |
mov.u64 %rd2448, 3668340011; | |
mov.u64 %rd2447, 3144134277; | |
mov.u64 %rd2445, 3449720151; | |
mov.u64 %rd2444, 1993301258; | |
mov.u64 %rd2443, 3528531795; | |
bra.uni LBB29_5; | |
LBB29_4: | |
mov.u32 %r311, -766435501; | |
mov.u64 %rd2459, 1684936478; | |
mov.u64 %rd2458, 534103459; | |
mov.u64 %rd2457, 387276957; | |
mov.u64 %rd2456, 3041712726; | |
mov.u64 %rd2455, 3986602516; | |
mov.u64 %rd2454, 2835769497; | |
mov.u64 %rd2453, 3668340011; | |
mov.u64 %rd2451, 2027808484; | |
mov.u64 %rd2450, 1993301258; | |
mov.u64 %rd2448, 842468239; | |
mov.u64 %rd2447, 2654435769; | |
mov.u64 %rd2445, 3528531795; | |
mov.u64 %rd2444, 1013904242; | |
mov.u64 %rd2443, 3449720151; | |
mov.u32 %r312, -1767562579; | |
mov.u32 %r310, 1401181199; | |
mov.u64 %rd2460, 4055616968; | |
ld.global.nc.u64 %rd2461, [%rd7+8]; | |
selp.u64 %rd489, 1, 0, %p69; | |
add.s64 %rd490, %rd2461, %rd489; | |
and.b64 %rd491, %rd490, 4294967295; | |
mul.lo.s64 %rd2446, %rd491, 3449720151; | |
xor.b64 %rd492, %rd2446, %rd12; | |
shr.u64 %rd493, %rd492, 32; | |
mul.lo.s64 %rd2449, %rd493, 3528531795; | |
shr.u64 %rd494, %rd2449, 32; | |
mul.lo.s64 %rd496, %rd2384, 3528531795; | |
and.b64 %rd497, %rd496, 4294967295; | |
xor.b64 %rd498, %rd497, %rd494; | |
xor.b64 %rd499, %rd498, 3144134277; | |
mul.lo.s64 %rd2452, %rd499, 3449720151; | |
xor.b64 %rd2442, %rd490, %rd496; | |
LBB29_5: | |
shr.u64 %rd526, %rd2452, 32; | |
shr.u64 %rd527, %rd2442, 32; | |
mul.lo.s64 %rd528, %rd527, %rd2443; | |
and.b64 %rd529, %rd528, 4294967295; | |
xor.b64 %rd530, %rd529, %rd526; | |
xor.b64 %rd531, %rd530, %rd2444; | |
mul.lo.s64 %rd532, %rd531, %rd2445; | |
shr.u64 %rd533, %rd532, 32; | |
shr.u64 %rd534, %rd528, 32; | |
and.b64 %rd535, %rd2446, 4294967295; | |
xor.b64 %rd536, %rd535, %rd534; | |
xor.b64 %rd537, %rd536, %rd2447; | |
mul.lo.s64 %rd538, %rd537, %rd2445; | |
and.b64 %rd539, %rd538, 4294967295; | |
xor.b64 %rd540, %rd539, %rd533; | |
xor.b64 %rd541, %rd540, %rd2448; | |
mul.lo.s64 %rd542, %rd541, %rd2443; | |
shr.u64 %rd543, %rd542, 32; | |
shr.u64 %rd544, %rd538, 32; | |
and.b64 %rd545, %rd2449, 4294967295; | |
xor.b64 %rd546, %rd545, %rd544; | |
xor.b64 %rd547, %rd546, %rd2450; | |
mul.lo.s64 %rd548, %rd547, %rd2443; | |
and.b64 %rd549, %rd548, 4294967295; | |
xor.b64 %rd550, %rd549, %rd543; | |
xor.b64 %rd551, %rd550, %rd2451; | |
mul.lo.s64 %rd552, %rd551, %rd2445; | |
shr.u64 %rd553, %rd552, 32; | |
shr.u64 %rd554, %rd548, 32; | |
and.b64 %rd555, %rd2452, 4294967295; | |
xor.b64 %rd556, %rd555, %rd554; | |
xor.b64 %rd557, %rd556, %rd2453; | |
mul.lo.s64 %rd558, %rd557, %rd2445; | |
and.b64 %rd559, %rd558, 4294967295; | |
xor.b64 %rd560, %rd559, %rd553; | |
xor.b64 %rd561, %rd560, %rd2454; | |
mul.lo.s64 %rd562, %rd561, %rd2443; | |
shr.u64 %rd563, %rd562, 32; | |
shr.u64 %rd564, %rd558, 32; | |
and.b64 %rd565, %rd532, 4294967295; | |
xor.b64 %rd566, %rd565, %rd564; | |
xor.b64 %rd567, %rd566, %rd2455; | |
mul.lo.s64 %rd568, %rd567, %rd2443; | |
and.b64 %rd569, %rd568, 4294967295; | |
xor.b64 %rd570, %rd569, %rd563; | |
xor.b64 %rd571, %rd570, %rd2456; | |
mul.lo.s64 %rd572, %rd571, %rd2445; | |
shr.u64 %rd573, %rd572, 32; | |
shr.u64 %rd574, %rd568, 32; | |
and.b64 %rd575, %rd542, 4294967295; | |
xor.b64 %rd576, %rd575, %rd574; | |
xor.b64 %rd577, %rd576, %rd2457; | |
mul.lo.s64 %rd578, %rd577, %rd2445; | |
and.b64 %rd579, %rd578, 4294967295; | |
xor.b64 %rd580, %rd579, %rd573; | |
xor.b64 %rd581, %rd580, %rd2458; | |
mul.lo.s64 %rd582, %rd581, %rd2443; | |
shr.u64 %rd583, %rd582, 32; | |
shr.u64 %rd584, %rd578, 32; | |
and.b64 %rd585, %rd552, 4294967295; | |
xor.b64 %rd586, %rd585, %rd584; | |
xor.b64 %rd587, %rd586, %rd2459; | |
mul.lo.s64 %rd588, %rd587, %rd2443; | |
and.b64 %rd589, %rd588, 4294967295; | |
xor.b64 %rd590, %rd589, %rd583; | |
xor.b64 %rd591, %rd590, %rd2460; | |
mul.lo.s64 %rd592, %rd591, %rd2445; | |
shr.u64 %rd593, %rd592, 32; | |
cvt.u32.u64 %r56, %rd593; | |
shr.u64 %rd594, %rd588, 32; | |
xor.b64 %rd595, %rd594, %rd562; | |
cvt.u32.u64 %r57, %rd595; | |
xor.b32 %r58, %r310, %r57; | |
mul.lo.s32 %r59, %r58, %r311; | |
xor.b32 %r60, %r59, %r56; | |
xor.b32 %r61, %r60, %r312; | |
shr.u32 %r62, %r61, 9; | |
cvt.rn.f32.u32 %f19, %r62; | |
mul.rn.f32 %f20, %f19, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f20; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p4, %h1, %h2; | |
mul.wide.u32 %rd596, %r2, 2048; | |
add.s64 %rd597, %rd9, %rd596; | |
mul.wide.u32 %rd598, %r3, 2; | |
add.s64 %rd44, %rd597, %rd598; | |
ld.global.nc.b16 %h3, [%rd44]; | |
mul.wide.u32 %rd599, %r3, 4; | |
add.s64 %rd45, %rd1, %rd599; | |
ld.global.nc.f32 %f21, [%rd45]; | |
cvt.rn.f16.f32 %h4, %f21; | |
add.rn.f16 %h5, %h3, %h4; | |
mov.b16 %h6, 0x3C72; | |
mul.rn.f16 %h7, %h5, %h6; | |
selp.b16 %h8, %h7, 0x0000, %p4; | |
cvt.f32.f16 %f22, %h8; | |
add.s64 %rd600, %rd8, %rd596; | |
add.s64 %rd46, %rd600, %rd598; | |
ld.global.nc.b16 %h9, [%rd46]; | |
cvt.f32.f16 %f23, %h9; | |
mul.wide.u32 %rd601, %r2, 4; | |
add.s64 %rd602, %rd5, %rd601; | |
ld.global.nc.f32 %f24, [%rd602]; | |
mul.rn.f32 %f25, %f24, 0f3A800000; | |
add.rn.f32 %f26, %f25, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f1, %f26; | |
add.s64 %rd47, %rd2, %rd599; | |
ld.global.nc.f32 %f27, [%rd47]; | |
mul.rn.f32 %f28, %f1, %f27; | |
mul.rn.f32 %f29, %f28, %f23; | |
add.s64 %rd48, %rd3, %rd599; | |
ld.global.nc.f32 %f30, [%rd48]; | |
add.s64 %rd603, %rd4, %rd601; | |
ld.global.nc.f32 %f31, [%rd603]; | |
mul.rn.f32 %f2, %f31, 0f3A800000; | |
mul.rn.f32 %f32, %f28, %f2; | |
sub.rn.f32 %f33, %f30, %f32; | |
add.rn.f32 %f34, %f29, %f33; | |
add.rn.f32 %f35, %f34, %f22; | |
add.rn.f32 %f3, %f35, 0f00000000; | |
or.b32 %r63, %r3, 1; | |
and.b32 %r64, %r63, 3; | |
setp.ne.s32 %p5, %r64, 1; | |
@%p5 bra LBB29_7; | |
mul.lo.s64 %rd2466, %rd2384, 3528531795; | |
selp.u64 %rd644, 1, 0, %p69; | |
add.s64 %rd645, %rd2461, %rd644; | |
xor.b64 %rd646, %rd645, %rd2466; | |
shr.u64 %rd647, %rd646, 32; | |
mul.lo.s64 %rd2469, %rd647, 3449720151; | |
shr.u64 %rd648, %rd2469, 32; | |
and.b64 %rd649, %rd645, 4294967295; | |
mul.lo.s64 %rd650, %rd649, 3449720151; | |
and.b64 %rd651, %rd650, 4294967295; | |
xor.b64 %rd652, %rd651, %rd648; | |
xor.b64 %rd653, %rd652, 2654435769; | |
mul.lo.s64 %rd2472, %rd653, 3528531795; | |
xor.b64 %rd2462, %rd650, %rd12; | |
mov.u32 %r314, -845247145; | |
mov.u32 %r313, -616729560; | |
mov.u64 %rd2479, 3041712726; | |
mov.u64 %rd2478, 1401181199; | |
mov.u64 %rd2477, 2835769497; | |
mov.u64 %rd2476, 1684936478; | |
mov.u64 %rd2475, 2027808484; | |
mov.u64 %rd2474, 387276957; | |
mov.u64 %rd2473, 842468239; | |
mov.u64 %rd2471, 3986602516; | |
mov.u64 %rd2470, 1013904242; | |
mov.u64 %rd2468, 3668340011; | |
mov.u64 %rd2467, 3144134277; | |
mov.u64 %rd2465, 3449720151; | |
mov.u64 %rd2464, 1993301258; | |
mov.u64 %rd2463, 3528531795; | |
bra.uni LBB29_8; | |
LBB29_7: | |
mov.u32 %r313, -239350328; | |
selp.u64 %rd618, 1, 0, %p69; | |
add.s64 %rd619, %rd2461, %rd618; | |
and.b64 %rd620, %rd619, 4294967295; | |
mul.lo.s64 %rd2466, %rd620, 3449720151; | |
xor.b64 %rd621, %rd2466, %rd12; | |
shr.u64 %rd622, %rd621, 32; | |
mul.lo.s64 %rd2469, %rd622, 3528531795; | |
shr.u64 %rd623, %rd2469, 32; | |
mul.lo.s64 %rd625, %rd2384, 3528531795; | |
and.b64 %rd626, %rd625, 4294967295; | |
xor.b64 %rd627, %rd626, %rd623; | |
xor.b64 %rd628, %rd627, 3144134277; | |
mul.lo.s64 %rd2472, %rd628, 3449720151; | |
xor.b64 %rd2462, %rd619, %rd625; | |
mov.u32 %r314, -766435501; | |
mov.u64 %rd2479, 1684936478; | |
mov.u64 %rd2478, 534103459; | |
mov.u64 %rd2477, 387276957; | |
mov.u64 %rd2476, 3041712726; | |
mov.u64 %rd2475, 3986602516; | |
mov.u64 %rd2474, 2835769497; | |
mov.u64 %rd2473, 3668340011; | |
mov.u64 %rd2471, 2027808484; | |
mov.u64 %rd2470, 1993301258; | |
mov.u64 %rd2468, 842468239; | |
mov.u64 %rd2467, 2654435769; | |
mov.u64 %rd2465, 3528531795; | |
mov.u64 %rd2464, 1013904242; | |
mov.u64 %rd2463, 3449720151; | |
LBB29_8: | |
setp.ne.s32 %p8, %r5, 0; | |
shr.u64 %rd654, %rd2472, 32; | |
shr.u64 %rd655, %rd2462, 32; | |
mul.lo.s64 %rd656, %rd655, %rd2463; | |
and.b64 %rd657, %rd656, 4294967295; | |
xor.b64 %rd658, %rd657, %rd654; | |
xor.b64 %rd659, %rd658, %rd2464; | |
mul.lo.s64 %rd660, %rd659, %rd2465; | |
shr.u64 %rd661, %rd660, 32; | |
shr.u64 %rd662, %rd656, 32; | |
and.b64 %rd663, %rd2466, 4294967295; | |
xor.b64 %rd664, %rd663, %rd662; | |
xor.b64 %rd665, %rd664, %rd2467; | |
mul.lo.s64 %rd666, %rd665, %rd2465; | |
and.b64 %rd667, %rd666, 4294967295; | |
xor.b64 %rd668, %rd667, %rd661; | |
xor.b64 %rd669, %rd668, %rd2468; | |
mul.lo.s64 %rd670, %rd669, %rd2463; | |
shr.u64 %rd671, %rd670, 32; | |
shr.u64 %rd672, %rd666, 32; | |
and.b64 %rd673, %rd2469, 4294967295; | |
xor.b64 %rd674, %rd673, %rd672; | |
xor.b64 %rd675, %rd674, %rd2470; | |
mul.lo.s64 %rd676, %rd675, %rd2463; | |
and.b64 %rd677, %rd676, 4294967295; | |
xor.b64 %rd678, %rd677, %rd671; | |
xor.b64 %rd679, %rd678, %rd2471; | |
mul.lo.s64 %rd680, %rd679, %rd2465; | |
shr.u64 %rd681, %rd680, 32; | |
shr.u64 %rd682, %rd676, 32; | |
and.b64 %rd683, %rd2472, 4294967295; | |
xor.b64 %rd684, %rd683, %rd682; | |
xor.b64 %rd685, %rd684, %rd2473; | |
mul.lo.s64 %rd686, %rd685, %rd2465; | |
and.b64 %rd687, %rd686, 4294967295; | |
xor.b64 %rd688, %rd687, %rd681; | |
xor.b64 %rd689, %rd688, %rd2474; | |
mul.lo.s64 %rd690, %rd689, %rd2463; | |
shr.u64 %rd691, %rd690, 32; | |
shr.u64 %rd692, %rd686, 32; | |
and.b64 %rd693, %rd660, 4294967295; | |
xor.b64 %rd694, %rd693, %rd692; | |
xor.b64 %rd695, %rd694, %rd2475; | |
mul.lo.s64 %rd696, %rd695, %rd2463; | |
and.b64 %rd697, %rd696, 4294967295; | |
xor.b64 %rd698, %rd697, %rd691; | |
xor.b64 %rd699, %rd698, %rd2476; | |
mul.lo.s64 %rd700, %rd699, %rd2465; | |
shr.u64 %rd701, %rd700, 32; | |
shr.u64 %rd702, %rd696, 32; | |
and.b64 %rd703, %rd670, 4294967295; | |
xor.b64 %rd704, %rd703, %rd702; | |
xor.b64 %rd705, %rd704, %rd2477; | |
mul.lo.s64 %rd706, %rd705, %rd2465; | |
and.b64 %rd707, %rd706, 4294967295; | |
xor.b64 %rd708, %rd707, %rd701; | |
xor.b64 %rd709, %rd708, %rd2478; | |
mul.lo.s64 %rd710, %rd709, %rd2463; | |
shr.u64 %rd711, %rd710, 32; | |
shr.u64 %rd712, %rd706, 32; | |
xor.b64 %rd713, %rd680, %rd712; | |
xor.b64 %rd714, %rd713, %rd2479; | |
mul.lo.s64 %rd715, %rd714, %rd2463; | |
xor.b64 %rd716, %rd711, %rd715; | |
cvt.u32.u64 %r69, %rd716; | |
xor.b32 %r70, %r313, %r69; | |
mul.lo.s32 %r71, %r70, %r314; | |
shr.u32 %r72, %r71, 9; | |
cvt.rn.f32.u32 %f36, %r72; | |
mul.rn.f32 %f37, %f36, 0f34000000; | |
cvt.rn.f16.f32 %h10, %f37; | |
mov.b16 %h11, 0x2E66; | |
setp.ge.f16 %p9, %h10, %h11; | |
ld.global.nc.b16 %h12, [%rd44+2]; | |
ld.global.nc.f32 %f38, [%rd45+4]; | |
cvt.rn.f16.f32 %h13, %f38; | |
add.rn.f16 %h14, %h12, %h13; | |
mov.b16 %h15, 0x3C72; | |
mul.rn.f16 %h16, %h14, %h15; | |
selp.b16 %h17, %h16, 0x0000, %p9; | |
cvt.f32.f16 %f39, %h17; | |
ld.global.nc.b16 %h18, [%rd46+2]; | |
cvt.f32.f16 %f40, %h18; | |
ld.global.nc.f32 %f41, [%rd47+4]; | |
mul.rn.f32 %f42, %f1, %f41; | |
mul.rn.f32 %f43, %f42, %f40; | |
ld.global.nc.f32 %f44, [%rd48+4]; | |
mul.rn.f32 %f45, %f2, %f42; | |
sub.rn.f32 %f46, %f44, %f45; | |
add.rn.f32 %f47, %f43, %f46; | |
add.rn.f32 %f48, %f47, %f39; | |
add.rn.f32 %f4, %f3, %f48; | |
or.b32 %r73, %r3, %r4; | |
or.b32 %r74, %r73, 128; | |
shr.u32 %r75, %r74, 2; | |
cvt.u64.u32 %rd717, %r75; | |
add.s64 %rd75, %rd11, %rd717; | |
and.b64 %rd2433, %rd75, 4294967295; | |
setp.lt.u64 %p74, %rd75, %rd11; | |
@%p8 bra LBB29_10; | |
mul.lo.s64 %rd2484, %rd2433, 3528531795; | |
selp.u64 %rd760, 1, 0, %p74; | |
add.s64 %rd761, %rd2461, %rd760; | |
xor.b64 %rd762, %rd761, %rd2484; | |
shr.u64 %rd763, %rd762, 32; | |
mul.lo.s64 %rd2487, %rd763, 3449720151; | |
shr.u64 %rd764, %rd2487, 32; | |
and.b64 %rd765, %rd761, 4294967295; | |
mul.lo.s64 %rd766, %rd765, 3449720151; | |
and.b64 %rd767, %rd766, 4294967295; | |
xor.b64 %rd768, %rd767, %rd764; | |
xor.b64 %rd769, %rd768, 2654435769; | |
mul.lo.s64 %rd2490, %rd769, 3528531795; | |
xor.b64 %rd2480, %rd766, %rd75; | |
mov.u32 %r317, -1879881855; | |
mov.u32 %r316, -845247145; | |
mov.u32 %r315, 534103459; | |
mov.u64 %rd2498, 3678237736; | |
mov.u64 %rd2497, 3041712726; | |
mov.u64 %rd2496, 1401181199; | |
mov.u64 %rd2495, 2835769497; | |
mov.u64 %rd2494, 1684936478; | |
mov.u64 %rd2493, 2027808484; | |
mov.u64 %rd2492, 387276957; | |
mov.u64 %rd2491, 842468239; | |
mov.u64 %rd2489, 3986602516; | |
mov.u64 %rd2488, 1013904242; | |
mov.u64 %rd2486, 3668340011; | |
mov.u64 %rd2485, 3144134277; | |
mov.u64 %rd2483, 3449720151; | |
mov.u64 %rd2482, 1993301258; | |
mov.u64 %rd2481, 3528531795; | |
bra.uni LBB29_11; | |
LBB29_10: | |
selp.u64 %rd733, 1, 0, %p74; | |
add.s64 %rd734, %rd2461, %rd733; | |
and.b64 %rd735, %rd734, 4294967295; | |
mul.lo.s64 %rd2484, %rd735, 3449720151; | |
xor.b64 %rd736, %rd2484, %rd75; | |
shr.u64 %rd737, %rd736, 32; | |
mul.lo.s64 %rd2487, %rd737, 3528531795; | |
shr.u64 %rd738, %rd2487, 32; | |
mul.lo.s64 %rd740, %rd2433, 3528531795; | |
and.b64 %rd741, %rd740, 4294967295; | |
xor.b64 %rd742, %rd741, %rd738; | |
xor.b64 %rd743, %rd742, 3144134277; | |
mul.lo.s64 %rd2490, %rd743, 3449720151; | |
xor.b64 %rd2480, %rd734, %rd740; | |
mov.u32 %r317, -1767562579; | |
mov.u32 %r316, -766435501; | |
mov.u32 %r315, 1401181199; | |
mov.u64 %rd2498, 4055616968; | |
mov.u64 %rd2497, 1684936478; | |
mov.u64 %rd2496, 534103459; | |
mov.u64 %rd2495, 387276957; | |
mov.u64 %rd2494, 3041712726; | |
mov.u64 %rd2493, 3986602516; | |
mov.u64 %rd2492, 2835769497; | |
mov.u64 %rd2491, 3668340011; | |
mov.u64 %rd2489, 2027808484; | |
mov.u64 %rd2488, 1993301258; | |
mov.u64 %rd2486, 842468239; | |
mov.u64 %rd2485, 2654435769; | |
mov.u64 %rd2483, 3528531795; | |
mov.u64 %rd2482, 1013904242; | |
mov.u64 %rd2481, 3449720151; | |
LBB29_11: | |
shr.u64 %rd770, %rd2490, 32; | |
shr.u64 %rd771, %rd2480, 32; | |
mul.lo.s64 %rd772, %rd771, %rd2481; | |
and.b64 %rd773, %rd772, 4294967295; | |
xor.b64 %rd774, %rd773, %rd770; | |
xor.b64 %rd775, %rd774, %rd2482; | |
mul.lo.s64 %rd776, %rd775, %rd2483; | |
shr.u64 %rd777, %rd776, 32; | |
shr.u64 %rd778, %rd772, 32; | |
and.b64 %rd779, %rd2484, 4294967295; | |
xor.b64 %rd780, %rd779, %rd778; | |
xor.b64 %rd781, %rd780, %rd2485; | |
mul.lo.s64 %rd782, %rd781, %rd2483; | |
and.b64 %rd783, %rd782, 4294967295; | |
xor.b64 %rd784, %rd783, %rd777; | |
xor.b64 %rd785, %rd784, %rd2486; | |
mul.lo.s64 %rd786, %rd785, %rd2481; | |
shr.u64 %rd787, %rd786, 32; | |
shr.u64 %rd788, %rd782, 32; | |
and.b64 %rd789, %rd2487, 4294967295; | |
xor.b64 %rd790, %rd789, %rd788; | |
xor.b64 %rd791, %rd790, %rd2488; | |
mul.lo.s64 %rd792, %rd791, %rd2481; | |
and.b64 %rd793, %rd792, 4294967295; | |
xor.b64 %rd794, %rd793, %rd787; | |
xor.b64 %rd795, %rd794, %rd2489; | |
mul.lo.s64 %rd796, %rd795, %rd2483; | |
shr.u64 %rd797, %rd796, 32; | |
shr.u64 %rd798, %rd792, 32; | |
and.b64 %rd799, %rd2490, 4294967295; | |
xor.b64 %rd800, %rd799, %rd798; | |
xor.b64 %rd801, %rd800, %rd2491; | |
mul.lo.s64 %rd802, %rd801, %rd2483; | |
and.b64 %rd803, %rd802, 4294967295; | |
xor.b64 %rd804, %rd803, %rd797; | |
xor.b64 %rd805, %rd804, %rd2492; | |
mul.lo.s64 %rd806, %rd805, %rd2481; | |
shr.u64 %rd807, %rd806, 32; | |
shr.u64 %rd808, %rd802, 32; | |
and.b64 %rd809, %rd776, 4294967295; | |
xor.b64 %rd810, %rd809, %rd808; | |
xor.b64 %rd811, %rd810, %rd2493; | |
mul.lo.s64 %rd812, %rd811, %rd2481; | |
and.b64 %rd813, %rd812, 4294967295; | |
xor.b64 %rd814, %rd813, %rd807; | |
xor.b64 %rd815, %rd814, %rd2494; | |
mul.lo.s64 %rd816, %rd815, %rd2483; | |
shr.u64 %rd817, %rd816, 32; | |
shr.u64 %rd818, %rd812, 32; | |
and.b64 %rd819, %rd786, 4294967295; | |
xor.b64 %rd820, %rd819, %rd818; | |
xor.b64 %rd821, %rd820, %rd2495; | |
mul.lo.s64 %rd822, %rd821, %rd2483; | |
and.b64 %rd823, %rd822, 4294967295; | |
xor.b64 %rd824, %rd823, %rd817; | |
xor.b64 %rd825, %rd824, %rd2496; | |
mul.lo.s64 %rd826, %rd825, %rd2481; | |
shr.u64 %rd827, %rd826, 32; | |
shr.u64 %rd828, %rd822, 32; | |
and.b64 %rd829, %rd796, 4294967295; | |
xor.b64 %rd830, %rd829, %rd828; | |
xor.b64 %rd831, %rd830, %rd2497; | |
mul.lo.s64 %rd832, %rd831, %rd2481; | |
and.b64 %rd833, %rd832, 4294967295; | |
xor.b64 %rd834, %rd833, %rd827; | |
xor.b64 %rd835, %rd834, %rd2498; | |
mul.lo.s64 %rd836, %rd835, %rd2483; | |
shr.u64 %rd837, %rd836, 32; | |
cvt.u32.u64 %r82, %rd837; | |
shr.u64 %rd838, %rd832, 32; | |
xor.b64 %rd839, %rd838, %rd806; | |
cvt.u32.u64 %r83, %rd839; | |
xor.b32 %r84, %r315, %r83; | |
mul.lo.s32 %r85, %r84, %r316; | |
xor.b32 %r86, %r85, %r82; | |
xor.b32 %r87, %r86, %r317; | |
shr.u32 %r88, %r87, 9; | |
cvt.rn.f32.u32 %f49, %r88; | |
mul.rn.f32 %f50, %f49, 0f34000000; | |
cvt.rn.f16.f32 %h19, %f50; | |
mov.b16 %h20, 0x2E66; | |
setp.ge.f16 %p12, %h19, %h20; | |
ld.global.nc.b16 %h21, [%rd44+256]; | |
ld.global.nc.f32 %f51, [%rd45+512]; | |
cvt.rn.f16.f32 %h22, %f51; | |
add.rn.f16 %h23, %h21, %h22; | |
mov.b16 %h24, 0x3C72; | |
mul.rn.f16 %h25, %h23, %h24; | |
selp.b16 %h26, %h25, 0x0000, %p12; | |
cvt.f32.f16 %f52, %h26; | |
ld.global.nc.b16 %h27, [%rd46+256]; | |
cvt.f32.f16 %f53, %h27; | |
ld.global.nc.f32 %f54, [%rd47+512]; | |
mul.rn.f32 %f55, %f1, %f54; | |
mul.rn.f32 %f56, %f55, %f53; | |
ld.global.nc.f32 %f57, [%rd48+512]; | |
mul.rn.f32 %f58, %f2, %f55; | |
sub.rn.f32 %f59, %f57, %f58; | |
add.rn.f32 %f60, %f56, %f59; | |
add.rn.f32 %f61, %f60, %f52; | |
add.rn.f32 %f5, %f4, %f61; | |
or.b32 %r89, %r3, 129; | |
or.b32 %r90, %r89, %r4; | |
and.b32 %r91, %r89, 3; | |
shr.u32 %r92, %r90, 2; | |
setp.ne.s32 %p13, %r91, 1; | |
cvt.u64.u32 %rd840, %r92; | |
add.s64 %rd103, %rd11, %rd840; | |
and.b64 %rd2430, %rd103, 4294967295; | |
setp.lt.u64 %p73, %rd103, %rd11; | |
@%p13 bra LBB29_13; | |
mul.lo.s64 %rd2503, %rd2430, 3528531795; | |
selp.u64 %rd881, 1, 0, %p73; | |
add.s64 %rd882, %rd2461, %rd881; | |
xor.b64 %rd883, %rd882, %rd2503; | |
shr.u64 %rd884, %rd883, 32; | |
mul.lo.s64 %rd2506, %rd884, 3449720151; | |
shr.u64 %rd885, %rd2506, 32; | |
and.b64 %rd886, %rd882, 4294967295; | |
mul.lo.s64 %rd887, %rd886, 3449720151; | |
and.b64 %rd888, %rd887, 4294967295; | |
xor.b64 %rd889, %rd888, %rd885; | |
xor.b64 %rd890, %rd889, 2654435769; | |
mul.lo.s64 %rd2509, %rd890, 3528531795; | |
xor.b64 %rd2499, %rd887, %rd103; | |
mov.u32 %r319, -845247145; | |
mov.u32 %r318, -616729560; | |
mov.u64 %rd2516, 3041712726; | |
mov.u64 %rd2515, 1401181199; | |
mov.u64 %rd2514, 2835769497; | |
mov.u64 %rd2513, 1684936478; | |
mov.u64 %rd2512, 2027808484; | |
mov.u64 %rd2511, 387276957; | |
mov.u64 %rd2510, 842468239; | |
mov.u64 %rd2508, 3986602516; | |
mov.u64 %rd2507, 1013904242; | |
mov.u64 %rd2505, 3668340011; | |
mov.u64 %rd2504, 3144134277; | |
mov.u64 %rd2502, 3449720151; | |
mov.u64 %rd2501, 1993301258; | |
mov.u64 %rd2500, 3528531795; | |
bra.uni LBB29_14; | |
LBB29_13: | |
selp.u64 %rd855, 1, 0, %p73; | |
add.s64 %rd856, %rd2461, %rd855; | |
and.b64 %rd857, %rd856, 4294967295; | |
mul.lo.s64 %rd2503, %rd857, 3449720151; | |
xor.b64 %rd858, %rd2503, %rd103; | |
shr.u64 %rd859, %rd858, 32; | |
mul.lo.s64 %rd2506, %rd859, 3528531795; | |
shr.u64 %rd860, %rd2506, 32; | |
mul.lo.s64 %rd862, %rd2430, 3528531795; | |
and.b64 %rd863, %rd862, 4294967295; | |
xor.b64 %rd864, %rd863, %rd860; | |
xor.b64 %rd865, %rd864, 3144134277; | |
mul.lo.s64 %rd2509, %rd865, 3449720151; | |
xor.b64 %rd2499, %rd856, %rd862; | |
mov.u32 %r319, -766435501; | |
mov.u32 %r318, -239350328; | |
mov.u64 %rd2516, 1684936478; | |
mov.u64 %rd2515, 534103459; | |
mov.u64 %rd2514, 387276957; | |
mov.u64 %rd2513, 3041712726; | |
mov.u64 %rd2512, 3986602516; | |
mov.u64 %rd2511, 2835769497; | |
mov.u64 %rd2510, 3668340011; | |
mov.u64 %rd2508, 2027808484; | |
mov.u64 %rd2507, 1993301258; | |
mov.u64 %rd2505, 842468239; | |
mov.u64 %rd2504, 2654435769; | |
mov.u64 %rd2502, 3528531795; | |
mov.u64 %rd2501, 1013904242; | |
mov.u64 %rd2500, 3449720151; | |
LBB29_14: | |
shr.u64 %rd891, %rd2509, 32; | |
shr.u64 %rd892, %rd2499, 32; | |
mul.lo.s64 %rd893, %rd892, %rd2500; | |
and.b64 %rd894, %rd893, 4294967295; | |
xor.b64 %rd895, %rd894, %rd891; | |
xor.b64 %rd896, %rd895, %rd2501; | |
mul.lo.s64 %rd897, %rd896, %rd2502; | |
shr.u64 %rd898, %rd897, 32; | |
shr.u64 %rd899, %rd893, 32; | |
and.b64 %rd900, %rd2503, 4294967295; | |
xor.b64 %rd901, %rd900, %rd899; | |
xor.b64 %rd902, %rd901, %rd2504; | |
mul.lo.s64 %rd903, %rd902, %rd2502; | |
and.b64 %rd904, %rd903, 4294967295; | |
xor.b64 %rd905, %rd904, %rd898; | |
xor.b64 %rd906, %rd905, %rd2505; | |
mul.lo.s64 %rd907, %rd906, %rd2500; | |
shr.u64 %rd908, %rd907, 32; | |
shr.u64 %rd909, %rd903, 32; | |
and.b64 %rd910, %rd2506, 4294967295; | |
xor.b64 %rd911, %rd910, %rd909; | |
xor.b64 %rd912, %rd911, %rd2507; | |
mul.lo.s64 %rd913, %rd912, %rd2500; | |
and.b64 %rd914, %rd913, 4294967295; | |
xor.b64 %rd915, %rd914, %rd908; | |
xor.b64 %rd916, %rd915, %rd2508; | |
mul.lo.s64 %rd917, %rd916, %rd2502; | |
shr.u64 %rd918, %rd917, 32; | |
shr.u64 %rd919, %rd913, 32; | |
and.b64 %rd920, %rd2509, 4294967295; | |
xor.b64 %rd921, %rd920, %rd919; | |
xor.b64 %rd922, %rd921, %rd2510; | |
mul.lo.s64 %rd923, %rd922, %rd2502; | |
and.b64 %rd924, %rd923, 4294967295; | |
xor.b64 %rd925, %rd924, %rd918; | |
xor.b64 %rd926, %rd925, %rd2511; | |
mul.lo.s64 %rd927, %rd926, %rd2500; | |
shr.u64 %rd928, %rd927, 32; | |
shr.u64 %rd929, %rd923, 32; | |
and.b64 %rd930, %rd897, 4294967295; | |
xor.b64 %rd931, %rd930, %rd929; | |
xor.b64 %rd932, %rd931, %rd2512; | |
mul.lo.s64 %rd933, %rd932, %rd2500; | |
and.b64 %rd934, %rd933, 4294967295; | |
xor.b64 %rd935, %rd934, %rd928; | |
xor.b64 %rd936, %rd935, %rd2513; | |
mul.lo.s64 %rd937, %rd936, %rd2502; | |
shr.u64 %rd938, %rd937, 32; | |
shr.u64 %rd939, %rd933, 32; | |
and.b64 %rd940, %rd907, 4294967295; | |
xor.b64 %rd941, %rd940, %rd939; | |
xor.b64 %rd942, %rd941, %rd2514; | |
mul.lo.s64 %rd943, %rd942, %rd2502; | |
and.b64 %rd944, %rd943, 4294967295; | |
xor.b64 %rd945, %rd944, %rd938; | |
xor.b64 %rd946, %rd945, %rd2515; | |
mul.lo.s64 %rd947, %rd946, %rd2500; | |
shr.u64 %rd948, %rd947, 32; | |
shr.u64 %rd949, %rd943, 32; | |
xor.b64 %rd950, %rd917, %rd949; | |
xor.b64 %rd951, %rd950, %rd2516; | |
mul.lo.s64 %rd952, %rd951, %rd2500; | |
xor.b64 %rd953, %rd948, %rd952; | |
cvt.u32.u64 %r97, %rd953; | |
xor.b32 %r98, %r318, %r97; | |
mul.lo.s32 %r99, %r98, %r319; | |
shr.u32 %r100, %r99, 9; | |
cvt.rn.f32.u32 %f62, %r100; | |
mul.rn.f32 %f63, %f62, 0f34000000; | |
cvt.rn.f16.f32 %h28, %f63; | |
mov.b16 %h29, 0x2E66; | |
setp.ge.f16 %p17, %h28, %h29; | |
ld.global.nc.b16 %h30, [%rd44+258]; | |
ld.global.nc.f32 %f64, [%rd45+516]; | |
cvt.rn.f16.f32 %h31, %f64; | |
add.rn.f16 %h32, %h30, %h31; | |
mov.b16 %h33, 0x3C72; | |
mul.rn.f16 %h34, %h32, %h33; | |
selp.b16 %h35, %h34, 0x0000, %p17; | |
cvt.f32.f16 %f65, %h35; | |
ld.global.nc.b16 %h36, [%rd46+258]; | |
cvt.f32.f16 %f66, %h36; | |
ld.global.nc.f32 %f67, [%rd47+516]; | |
mul.rn.f32 %f68, %f1, %f67; | |
mul.rn.f32 %f69, %f68, %f66; | |
ld.global.nc.f32 %f70, [%rd48+516]; | |
mul.rn.f32 %f71, %f2, %f68; | |
sub.rn.f32 %f72, %f70, %f71; | |
add.rn.f32 %f73, %f69, %f72; | |
add.rn.f32 %f74, %f73, %f65; | |
add.rn.f32 %f6, %f5, %f74; | |
or.b32 %r102, %r73, 256; | |
shr.u32 %r103, %r102, 2; | |
cvt.u64.u32 %rd954, %r103; | |
add.s64 %rd130, %rd11, %rd954; | |
and.b64 %rd2426, %rd130, 4294967295; | |
setp.lt.u64 %p72, %rd130, %rd11; | |
@%p8 bra LBB29_16; | |
mul.lo.s64 %rd2521, %rd2426, 3528531795; | |
selp.u64 %rd997, 1, 0, %p72; | |
add.s64 %rd998, %rd2461, %rd997; | |
xor.b64 %rd999, %rd998, %rd2521; | |
shr.u64 %rd1000, %rd999, 32; | |
mul.lo.s64 %rd2524, %rd1000, 3449720151; | |
shr.u64 %rd1001, %rd2524, 32; | |
and.b64 %rd1002, %rd998, 4294967295; | |
mul.lo.s64 %rd1003, %rd1002, 3449720151; | |
and.b64 %rd1004, %rd1003, 4294967295; | |
xor.b64 %rd1005, %rd1004, %rd1001; | |
xor.b64 %rd1006, %rd1005, 2654435769; | |
mul.lo.s64 %rd2527, %rd1006, 3528531795; | |
xor.b64 %rd2517, %rd1003, %rd130; | |
mov.u32 %r322, -1879881855; | |
mov.u32 %r321, -845247145; | |
mov.u32 %r320, 534103459; | |
mov.u64 %rd2535, 3678237736; | |
mov.u64 %rd2534, 3041712726; | |
mov.u64 %rd2533, 1401181199; | |
mov.u64 %rd2532, 2835769497; | |
mov.u64 %rd2531, 1684936478; | |
mov.u64 %rd2530, 2027808484; | |
mov.u64 %rd2529, 387276957; | |
mov.u64 %rd2528, 842468239; | |
mov.u64 %rd2526, 3986602516; | |
mov.u64 %rd2525, 1013904242; | |
mov.u64 %rd2523, 3668340011; | |
mov.u64 %rd2522, 3144134277; | |
mov.u64 %rd2520, 3449720151; | |
mov.u64 %rd2519, 1993301258; | |
mov.u64 %rd2518, 3528531795; | |
bra.uni LBB29_17; | |
LBB29_16: | |
selp.u64 %rd970, 1, 0, %p72; | |
add.s64 %rd971, %rd2461, %rd970; | |
and.b64 %rd972, %rd971, 4294967295; | |
mul.lo.s64 %rd2521, %rd972, 3449720151; | |
xor.b64 %rd973, %rd2521, %rd130; | |
shr.u64 %rd974, %rd973, 32; | |
mul.lo.s64 %rd2524, %rd974, 3528531795; | |
shr.u64 %rd975, %rd2524, 32; | |
mul.lo.s64 %rd977, %rd2426, 3528531795; | |
and.b64 %rd978, %rd977, 4294967295; | |
xor.b64 %rd979, %rd978, %rd975; | |
xor.b64 %rd980, %rd979, 3144134277; | |
mul.lo.s64 %rd2527, %rd980, 3449720151; | |
xor.b64 %rd2517, %rd971, %rd977; | |
mov.u32 %r322, -1767562579; | |
mov.u32 %r321, -766435501; | |
mov.u32 %r320, 1401181199; | |
mov.u64 %rd2535, 4055616968; | |
mov.u64 %rd2534, 1684936478; | |
mov.u64 %rd2533, 534103459; | |
mov.u64 %rd2532, 387276957; | |
mov.u64 %rd2531, 3041712726; | |
mov.u64 %rd2530, 3986602516; | |
mov.u64 %rd2529, 2835769497; | |
mov.u64 %rd2528, 3668340011; | |
mov.u64 %rd2526, 2027808484; | |
mov.u64 %rd2525, 1993301258; | |
mov.u64 %rd2523, 842468239; | |
mov.u64 %rd2522, 2654435769; | |
mov.u64 %rd2520, 3528531795; | |
mov.u64 %rd2519, 1013904242; | |
mov.u64 %rd2518, 3449720151; | |
LBB29_17: | |
shr.u64 %rd1007, %rd2527, 32; | |
shr.u64 %rd1008, %rd2517, 32; | |
mul.lo.s64 %rd1009, %rd1008, %rd2518; | |
and.b64 %rd1010, %rd1009, 4294967295; | |
xor.b64 %rd1011, %rd1010, %rd1007; | |
xor.b64 %rd1012, %rd1011, %rd2519; | |
mul.lo.s64 %rd1013, %rd1012, %rd2520; | |
shr.u64 %rd1014, %rd1013, 32; | |
shr.u64 %rd1015, %rd1009, 32; | |
and.b64 %rd1016, %rd2521, 4294967295; | |
xor.b64 %rd1017, %rd1016, %rd1015; | |
xor.b64 %rd1018, %rd1017, %rd2522; | |
mul.lo.s64 %rd1019, %rd1018, %rd2520; | |
and.b64 %rd1020, %rd1019, 4294967295; | |
xor.b64 %rd1021, %rd1020, %rd1014; | |
xor.b64 %rd1022, %rd1021, %rd2523; | |
mul.lo.s64 %rd1023, %rd1022, %rd2518; | |
shr.u64 %rd1024, %rd1023, 32; | |
shr.u64 %rd1025, %rd1019, 32; | |
and.b64 %rd1026, %rd2524, 4294967295; | |
xor.b64 %rd1027, %rd1026, %rd1025; | |
xor.b64 %rd1028, %rd1027, %rd2525; | |
mul.lo.s64 %rd1029, %rd1028, %rd2518; | |
and.b64 %rd1030, %rd1029, 4294967295; | |
xor.b64 %rd1031, %rd1030, %rd1024; | |
xor.b64 %rd1032, %rd1031, %rd2526; | |
mul.lo.s64 %rd1033, %rd1032, %rd2520; | |
shr.u64 %rd1034, %rd1033, 32; | |
shr.u64 %rd1035, %rd1029, 32; | |
and.b64 %rd1036, %rd2527, 4294967295; | |
xor.b64 %rd1037, %rd1036, %rd1035; | |
xor.b64 %rd1038, %rd1037, %rd2528; | |
mul.lo.s64 %rd1039, %rd1038, %rd2520; | |
and.b64 %rd1040, %rd1039, 4294967295; | |
xor.b64 %rd1041, %rd1040, %rd1034; | |
xor.b64 %rd1042, %rd1041, %rd2529; | |
mul.lo.s64 %rd1043, %rd1042, %rd2518; | |
shr.u64 %rd1044, %rd1043, 32; | |
shr.u64 %rd1045, %rd1039, 32; | |
and.b64 %rd1046, %rd1013, 4294967295; | |
xor.b64 %rd1047, %rd1046, %rd1045; | |
xor.b64 %rd1048, %rd1047, %rd2530; | |
mul.lo.s64 %rd1049, %rd1048, %rd2518; | |
and.b64 %rd1050, %rd1049, 4294967295; | |
xor.b64 %rd1051, %rd1050, %rd1044; | |
xor.b64 %rd1052, %rd1051, %rd2531; | |
mul.lo.s64 %rd1053, %rd1052, %rd2520; | |
shr.u64 %rd1054, %rd1053, 32; | |
shr.u64 %rd1055, %rd1049, 32; | |
and.b64 %rd1056, %rd1023, 4294967295; | |
xor.b64 %rd1057, %rd1056, %rd1055; | |
xor.b64 %rd1058, %rd1057, %rd2532; | |
mul.lo.s64 %rd1059, %rd1058, %rd2520; | |
and.b64 %rd1060, %rd1059, 4294967295; | |
xor.b64 %rd1061, %rd1060, %rd1054; | |
xor.b64 %rd1062, %rd1061, %rd2533; | |
mul.lo.s64 %rd1063, %rd1062, %rd2518; | |
shr.u64 %rd1064, %rd1063, 32; | |
shr.u64 %rd1065, %rd1059, 32; | |
and.b64 %rd1066, %rd1033, 4294967295; | |
xor.b64 %rd1067, %rd1066, %rd1065; | |
xor.b64 %rd1068, %rd1067, %rd2534; | |
mul.lo.s64 %rd1069, %rd1068, %rd2518; | |
and.b64 %rd1070, %rd1069, 4294967295; | |
xor.b64 %rd1071, %rd1070, %rd1064; | |
xor.b64 %rd1072, %rd1071, %rd2535; | |
mul.lo.s64 %rd1073, %rd1072, %rd2520; | |
shr.u64 %rd1074, %rd1073, 32; | |
cvt.u32.u64 %r110, %rd1074; | |
shr.u64 %rd1075, %rd1069, 32; | |
xor.b64 %rd1076, %rd1075, %rd1043; | |
cvt.u32.u64 %r111, %rd1076; | |
xor.b32 %r112, %r320, %r111; | |
mul.lo.s32 %r113, %r112, %r321; | |
xor.b32 %r114, %r113, %r110; | |
xor.b32 %r115, %r114, %r322; | |
shr.u32 %r116, %r115, 9; | |
cvt.rn.f32.u32 %f75, %r116; | |
mul.rn.f32 %f76, %f75, 0f34000000; | |
cvt.rn.f16.f32 %h37, %f76; | |
mov.b16 %h38, 0x2E66; | |
setp.ge.f16 %p20, %h37, %h38; | |
ld.global.nc.b16 %h39, [%rd44+512]; | |
ld.global.nc.f32 %f77, [%rd45+1024]; | |
cvt.rn.f16.f32 %h40, %f77; | |
add.rn.f16 %h41, %h39, %h40; | |
mov.b16 %h42, 0x3C72; | |
mul.rn.f16 %h43, %h41, %h42; | |
selp.b16 %h44, %h43, 0x0000, %p20; | |
cvt.f32.f16 %f78, %h44; | |
ld.global.nc.b16 %h45, [%rd46+512]; | |
cvt.f32.f16 %f79, %h45; | |
ld.global.nc.f32 %f80, [%rd47+1024]; | |
mul.rn.f32 %f81, %f1, %f80; | |
mul.rn.f32 %f82, %f81, %f79; | |
ld.global.nc.f32 %f83, [%rd48+1024]; | |
mul.rn.f32 %f84, %f2, %f81; | |
sub.rn.f32 %f85, %f83, %f84; | |
add.rn.f32 %f86, %f82, %f85; | |
add.rn.f32 %f87, %f86, %f78; | |
add.rn.f32 %f7, %f6, %f87; | |
or.b32 %r117, %r3, 257; | |
or.b32 %r118, %r117, %r4; | |
and.b32 %r119, %r117, 3; | |
shr.u32 %r120, %r118, 2; | |
setp.ne.s32 %p21, %r119, 1; | |
cvt.u64.u32 %rd1077, %r120; | |
add.s64 %rd158, %rd11, %rd1077; | |
and.b64 %rd2423, %rd158, 4294967295; | |
setp.lt.u64 %p71, %rd158, %rd11; | |
@%p21 bra LBB29_19; | |
mul.lo.s64 %rd2540, %rd2423, 3528531795; | |
selp.u64 %rd1118, 1, 0, %p71; | |
add.s64 %rd1119, %rd2461, %rd1118; | |
xor.b64 %rd1120, %rd1119, %rd2540; | |
shr.u64 %rd1121, %rd1120, 32; | |
mul.lo.s64 %rd2543, %rd1121, 3449720151; | |
shr.u64 %rd1122, %rd2543, 32; | |
and.b64 %rd1123, %rd1119, 4294967295; | |
mul.lo.s64 %rd1124, %rd1123, 3449720151; | |
and.b64 %rd1125, %rd1124, 4294967295; | |
xor.b64 %rd1126, %rd1125, %rd1122; | |
xor.b64 %rd1127, %rd1126, 2654435769; | |
mul.lo.s64 %rd2546, %rd1127, 3528531795; | |
xor.b64 %rd2536, %rd1124, %rd158; | |
mov.u32 %r324, -845247145; | |
mov.u32 %r323, -616729560; | |
mov.u64 %rd2553, 3041712726; | |
mov.u64 %rd2552, 1401181199; | |
mov.u64 %rd2551, 2835769497; | |
mov.u64 %rd2550, 1684936478; | |
mov.u64 %rd2549, 2027808484; | |
mov.u64 %rd2548, 387276957; | |
mov.u64 %rd2547, 842468239; | |
mov.u64 %rd2545, 3986602516; | |
mov.u64 %rd2544, 1013904242; | |
mov.u64 %rd2542, 3668340011; | |
mov.u64 %rd2541, 3144134277; | |
mov.u64 %rd2539, 3449720151; | |
mov.u64 %rd2538, 1993301258; | |
mov.u64 %rd2537, 3528531795; | |
bra.uni LBB29_20; | |
LBB29_19: | |
selp.u64 %rd1092, 1, 0, %p71; | |
add.s64 %rd1093, %rd2461, %rd1092; | |
and.b64 %rd1094, %rd1093, 4294967295; | |
mul.lo.s64 %rd2540, %rd1094, 3449720151; | |
xor.b64 %rd1095, %rd2540, %rd158; | |
shr.u64 %rd1096, %rd1095, 32; | |
mul.lo.s64 %rd2543, %rd1096, 3528531795; | |
shr.u64 %rd1097, %rd2543, 32; | |
mul.lo.s64 %rd1099, %rd2423, 3528531795; | |
and.b64 %rd1100, %rd1099, 4294967295; | |
xor.b64 %rd1101, %rd1100, %rd1097; | |
xor.b64 %rd1102, %rd1101, 3144134277; | |
mul.lo.s64 %rd2546, %rd1102, 3449720151; | |
xor.b64 %rd2536, %rd1093, %rd1099; | |
mov.u32 %r324, -766435501; | |
mov.u32 %r323, -239350328; | |
mov.u64 %rd2553, 1684936478; | |
mov.u64 %rd2552, 534103459; | |
mov.u64 %rd2551, 387276957; | |
mov.u64 %rd2550, 3041712726; | |
mov.u64 %rd2549, 3986602516; | |
mov.u64 %rd2548, 2835769497; | |
mov.u64 %rd2547, 3668340011; | |
mov.u64 %rd2545, 2027808484; | |
mov.u64 %rd2544, 1993301258; | |
mov.u64 %rd2542, 842468239; | |
mov.u64 %rd2541, 2654435769; | |
mov.u64 %rd2539, 3528531795; | |
mov.u64 %rd2538, 1013904242; | |
mov.u64 %rd2537, 3449720151; | |
LBB29_20: | |
shr.u64 %rd1128, %rd2546, 32; | |
shr.u64 %rd1129, %rd2536, 32; | |
mul.lo.s64 %rd1130, %rd1129, %rd2537; | |
and.b64 %rd1131, %rd1130, 4294967295; | |
xor.b64 %rd1132, %rd1131, %rd1128; | |
xor.b64 %rd1133, %rd1132, %rd2538; | |
mul.lo.s64 %rd1134, %rd1133, %rd2539; | |
shr.u64 %rd1135, %rd1134, 32; | |
shr.u64 %rd1136, %rd1130, 32; | |
and.b64 %rd1137, %rd2540, 4294967295; | |
xor.b64 %rd1138, %rd1137, %rd1136; | |
xor.b64 %rd1139, %rd1138, %rd2541; | |
mul.lo.s64 %rd1140, %rd1139, %rd2539; | |
and.b64 %rd1141, %rd1140, 4294967295; | |
xor.b64 %rd1142, %rd1141, %rd1135; | |
xor.b64 %rd1143, %rd1142, %rd2542; | |
mul.lo.s64 %rd1144, %rd1143, %rd2537; | |
shr.u64 %rd1145, %rd1144, 32; | |
shr.u64 %rd1146, %rd1140, 32; | |
and.b64 %rd1147, %rd2543, 4294967295; | |
xor.b64 %rd1148, %rd1147, %rd1146; | |
xor.b64 %rd1149, %rd1148, %rd2544; | |
mul.lo.s64 %rd1150, %rd1149, %rd2537; | |
and.b64 %rd1151, %rd1150, 4294967295; | |
xor.b64 %rd1152, %rd1151, %rd1145; | |
xor.b64 %rd1153, %rd1152, %rd2545; | |
mul.lo.s64 %rd1154, %rd1153, %rd2539; | |
shr.u64 %rd1155, %rd1154, 32; | |
shr.u64 %rd1156, %rd1150, 32; | |
and.b64 %rd1157, %rd2546, 4294967295; | |
xor.b64 %rd1158, %rd1157, %rd1156; | |
xor.b64 %rd1159, %rd1158, %rd2547; | |
mul.lo.s64 %rd1160, %rd1159, %rd2539; | |
and.b64 %rd1161, %rd1160, 4294967295; | |
xor.b64 %rd1162, %rd1161, %rd1155; | |
xor.b64 %rd1163, %rd1162, %rd2548; | |
mul.lo.s64 %rd1164, %rd1163, %rd2537; | |
shr.u64 %rd1165, %rd1164, 32; | |
shr.u64 %rd1166, %rd1160, 32; | |
and.b64 %rd1167, %rd1134, 4294967295; | |
xor.b64 %rd1168, %rd1167, %rd1166; | |
xor.b64 %rd1169, %rd1168, %rd2549; | |
mul.lo.s64 %rd1170, %rd1169, %rd2537; | |
and.b64 %rd1171, %rd1170, 4294967295; | |
xor.b64 %rd1172, %rd1171, %rd1165; | |
xor.b64 %rd1173, %rd1172, %rd2550; | |
mul.lo.s64 %rd1174, %rd1173, %rd2539; | |
shr.u64 %rd1175, %rd1174, 32; | |
shr.u64 %rd1176, %rd1170, 32; | |
and.b64 %rd1177, %rd1144, 4294967295; | |
xor.b64 %rd1178, %rd1177, %rd1176; | |
xor.b64 %rd1179, %rd1178, %rd2551; | |
mul.lo.s64 %rd1180, %rd1179, %rd2539; | |
and.b64 %rd1181, %rd1180, 4294967295; | |
xor.b64 %rd1182, %rd1181, %rd1175; | |
xor.b64 %rd1183, %rd1182, %rd2552; | |
mul.lo.s64 %rd1184, %rd1183, %rd2537; | |
shr.u64 %rd1185, %rd1184, 32; | |
shr.u64 %rd1186, %rd1180, 32; | |
xor.b64 %rd1187, %rd1154, %rd1186; | |
xor.b64 %rd1188, %rd1187, %rd2553; | |
mul.lo.s64 %rd1189, %rd1188, %rd2537; | |
xor.b64 %rd1190, %rd1185, %rd1189; | |
cvt.u32.u64 %r125, %rd1190; | |
xor.b32 %r126, %r323, %r125; | |
mul.lo.s32 %r127, %r126, %r324; | |
shr.u32 %r128, %r127, 9; | |
cvt.rn.f32.u32 %f88, %r128; | |
mul.rn.f32 %f89, %f88, 0f34000000; | |
cvt.rn.f16.f32 %h46, %f89; | |
mov.b16 %h47, 0x2E66; | |
setp.ge.f16 %p25, %h46, %h47; | |
ld.global.nc.b16 %h48, [%rd44+514]; | |
ld.global.nc.f32 %f90, [%rd45+1028]; | |
cvt.rn.f16.f32 %h49, %f90; | |
add.rn.f16 %h50, %h48, %h49; | |
mov.b16 %h51, 0x3C72; | |
mul.rn.f16 %h52, %h50, %h51; | |
selp.b16 %h53, %h52, 0x0000, %p25; | |
cvt.f32.f16 %f91, %h53; | |
ld.global.nc.b16 %h54, [%rd46+514]; | |
cvt.f32.f16 %f92, %h54; | |
ld.global.nc.f32 %f93, [%rd47+1028]; | |
mul.rn.f32 %f94, %f1, %f93; | |
mul.rn.f32 %f95, %f94, %f92; | |
ld.global.nc.f32 %f96, [%rd48+1028]; | |
mul.rn.f32 %f97, %f2, %f94; | |
sub.rn.f32 %f98, %f96, %f97; | |
add.rn.f32 %f99, %f95, %f98; | |
add.rn.f32 %f100, %f99, %f91; | |
add.rn.f32 %f8, %f7, %f100; | |
or.b32 %r130, %r73, 384; | |
shr.u32 %r131, %r130, 2; | |
cvt.u64.u32 %rd1191, %r131; | |
add.s64 %rd185, %rd11, %rd1191; | |
and.b64 %rd2419, %rd185, 4294967295; | |
setp.lt.u64 %p70, %rd185, %rd11; | |
@%p8 bra LBB29_22; | |
mul.lo.s64 %rd2558, %rd2419, 3528531795; | |
selp.u64 %rd1234, 1, 0, %p70; | |
add.s64 %rd1235, %rd2461, %rd1234; | |
xor.b64 %rd1236, %rd1235, %rd2558; | |
shr.u64 %rd1237, %rd1236, 32; | |
mul.lo.s64 %rd2561, %rd1237, 3449720151; | |
shr.u64 %rd1238, %rd2561, 32; | |
and.b64 %rd1239, %rd1235, 4294967295; | |
mul.lo.s64 %rd1240, %rd1239, 3449720151; | |
and.b64 %rd1241, %rd1240, 4294967295; | |
xor.b64 %rd1242, %rd1241, %rd1238; | |
xor.b64 %rd1243, %rd1242, 2654435769; | |
mul.lo.s64 %rd2564, %rd1243, 3528531795; | |
xor.b64 %rd2554, %rd1240, %rd185; | |
mov.u32 %r327, -1879881855; | |
mov.u32 %r326, -845247145; | |
mov.u32 %r325, 534103459; | |
mov.u64 %rd2572, 3678237736; | |
mov.u64 %rd2571, 3041712726; | |
mov.u64 %rd2570, 1401181199; | |
mov.u64 %rd2569, 2835769497; | |
mov.u64 %rd2568, 1684936478; | |
mov.u64 %rd2567, 2027808484; | |
mov.u64 %rd2566, 387276957; | |
mov.u64 %rd2565, 842468239; | |
mov.u64 %rd2563, 3986602516; | |
mov.u64 %rd2562, 1013904242; | |
mov.u64 %rd2560, 3668340011; | |
mov.u64 %rd2559, 3144134277; | |
mov.u64 %rd2557, 3449720151; | |
mov.u64 %rd2556, 1993301258; | |
mov.u64 %rd2555, 3528531795; | |
bra.uni LBB29_23; | |
LBB29_22: | |
selp.u64 %rd1207, 1, 0, %p70; | |
add.s64 %rd1208, %rd2461, %rd1207; | |
and.b64 %rd1209, %rd1208, 4294967295; | |
mul.lo.s64 %rd2558, %rd1209, 3449720151; | |
xor.b64 %rd1210, %rd2558, %rd185; | |
shr.u64 %rd1211, %rd1210, 32; | |
mul.lo.s64 %rd2561, %rd1211, 3528531795; | |
shr.u64 %rd1212, %rd2561, 32; | |
mul.lo.s64 %rd1214, %rd2419, 3528531795; | |
and.b64 %rd1215, %rd1214, 4294967295; | |
xor.b64 %rd1216, %rd1215, %rd1212; | |
xor.b64 %rd1217, %rd1216, 3144134277; | |
mul.lo.s64 %rd2564, %rd1217, 3449720151; | |
xor.b64 %rd2554, %rd1208, %rd1214; | |
mov.u32 %r327, -1767562579; | |
mov.u32 %r326, -766435501; | |
mov.u32 %r325, 1401181199; | |
mov.u64 %rd2572, 4055616968; | |
mov.u64 %rd2571, 1684936478; | |
mov.u64 %rd2570, 534103459; | |
mov.u64 %rd2569, 387276957; | |
mov.u64 %rd2568, 3041712726; | |
mov.u64 %rd2567, 3986602516; | |
mov.u64 %rd2566, 2835769497; | |
mov.u64 %rd2565, 3668340011; | |
mov.u64 %rd2563, 2027808484; | |
mov.u64 %rd2562, 1993301258; | |
mov.u64 %rd2560, 842468239; | |
mov.u64 %rd2559, 2654435769; | |
mov.u64 %rd2557, 3528531795; | |
mov.u64 %rd2556, 1013904242; | |
mov.u64 %rd2555, 3449720151; | |
LBB29_23: | |
shr.u64 %rd1244, %rd2564, 32; | |
shr.u64 %rd1245, %rd2554, 32; | |
mul.lo.s64 %rd1246, %rd1245, %rd2555; | |
and.b64 %rd1247, %rd1246, 4294967295; | |
xor.b64 %rd1248, %rd1247, %rd1244; | |
xor.b64 %rd1249, %rd1248, %rd2556; | |
mul.lo.s64 %rd1250, %rd1249, %rd2557; | |
shr.u64 %rd1251, %rd1250, 32; | |
shr.u64 %rd1252, %rd1246, 32; | |
and.b64 %rd1253, %rd2558, 4294967295; | |
xor.b64 %rd1254, %rd1253, %rd1252; | |
xor.b64 %rd1255, %rd1254, %rd2559; | |
mul.lo.s64 %rd1256, %rd1255, %rd2557; | |
and.b64 %rd1257, %rd1256, 4294967295; | |
xor.b64 %rd1258, %rd1257, %rd1251; | |
xor.b64 %rd1259, %rd1258, %rd2560; | |
mul.lo.s64 %rd1260, %rd1259, %rd2555; | |
shr.u64 %rd1261, %rd1260, 32; | |
shr.u64 %rd1262, %rd1256, 32; | |
and.b64 %rd1263, %rd2561, 4294967295; | |
xor.b64 %rd1264, %rd1263, %rd1262; | |
xor.b64 %rd1265, %rd1264, %rd2562; | |
mul.lo.s64 %rd1266, %rd1265, %rd2555; | |
and.b64 %rd1267, %rd1266, 4294967295; | |
xor.b64 %rd1268, %rd1267, %rd1261; | |
xor.b64 %rd1269, %rd1268, %rd2563; | |
mul.lo.s64 %rd1270, %rd1269, %rd2557; | |
shr.u64 %rd1271, %rd1270, 32; | |
shr.u64 %rd1272, %rd1266, 32; | |
and.b64 %rd1273, %rd2564, 4294967295; | |
xor.b64 %rd1274, %rd1273, %rd1272; | |
xor.b64 %rd1275, %rd1274, %rd2565; | |
mul.lo.s64 %rd1276, %rd1275, %rd2557; | |
and.b64 %rd1277, %rd1276, 4294967295; | |
xor.b64 %rd1278, %rd1277, %rd1271; | |
xor.b64 %rd1279, %rd1278, %rd2566; | |
mul.lo.s64 %rd1280, %rd1279, %rd2555; | |
shr.u64 %rd1281, %rd1280, 32; | |
shr.u64 %rd1282, %rd1276, 32; | |
and.b64 %rd1283, %rd1250, 4294967295; | |
xor.b64 %rd1284, %rd1283, %rd1282; | |
xor.b64 %rd1285, %rd1284, %rd2567; | |
mul.lo.s64 %rd1286, %rd1285, %rd2555; | |
and.b64 %rd1287, %rd1286, 4294967295; | |
xor.b64 %rd1288, %rd1287, %rd1281; | |
xor.b64 %rd1289, %rd1288, %rd2568; | |
mul.lo.s64 %rd1290, %rd1289, %rd2557; | |
shr.u64 %rd1291, %rd1290, 32; | |
shr.u64 %rd1292, %rd1286, 32; | |
and.b64 %rd1293, %rd1260, 4294967295; | |
xor.b64 %rd1294, %rd1293, %rd1292; | |
xor.b64 %rd1295, %rd1294, %rd2569; | |
mul.lo.s64 %rd1296, %rd1295, %rd2557; | |
and.b64 %rd1297, %rd1296, 4294967295; | |
xor.b64 %rd1298, %rd1297, %rd1291; | |
xor.b64 %rd1299, %rd1298, %rd2570; | |
mul.lo.s64 %rd1300, %rd1299, %rd2555; | |
shr.u64 %rd1301, %rd1300, 32; | |
shr.u64 %rd1302, %rd1296, 32; | |
and.b64 %rd1303, %rd1270, 4294967295; | |
xor.b64 %rd1304, %rd1303, %rd1302; | |
xor.b64 %rd1305, %rd1304, %rd2571; | |
mul.lo.s64 %rd1306, %rd1305, %rd2555; | |
and.b64 %rd1307, %rd1306, 4294967295; | |
xor.b64 %rd1308, %rd1307, %rd1301; | |
xor.b64 %rd1309, %rd1308, %rd2572; | |
mul.lo.s64 %rd1310, %rd1309, %rd2557; | |
shr.u64 %rd1311, %rd1310, 32; | |
cvt.u32.u64 %r138, %rd1311; | |
shr.u64 %rd1312, %rd1306, 32; | |
xor.b64 %rd1313, %rd1312, %rd1280; | |
cvt.u32.u64 %r139, %rd1313; | |
xor.b32 %r140, %r325, %r139; | |
mul.lo.s32 %r141, %r140, %r326; | |
xor.b32 %r142, %r141, %r138; | |
xor.b32 %r143, %r142, %r327; | |
shr.u32 %r144, %r143, 9; | |
cvt.rn.f32.u32 %f101, %r144; | |
mul.rn.f32 %f102, %f101, 0f34000000; | |
cvt.rn.f16.f32 %h55, %f102; | |
mov.b16 %h56, 0x2E66; | |
setp.ge.f16 %p28, %h55, %h56; | |
ld.global.nc.b16 %h57, [%rd44+768]; | |
ld.global.nc.f32 %f103, [%rd45+1536]; | |
cvt.rn.f16.f32 %h58, %f103; | |
add.rn.f16 %h59, %h57, %h58; | |
mov.b16 %h60, 0x3C72; | |
mul.rn.f16 %h61, %h59, %h60; | |
selp.b16 %h62, %h61, 0x0000, %p28; | |
cvt.f32.f16 %f104, %h62; | |
ld.global.nc.b16 %h63, [%rd46+768]; | |
cvt.f32.f16 %f105, %h63; | |
ld.global.nc.f32 %f106, [%rd47+1536]; | |
mul.rn.f32 %f107, %f1, %f106; | |
mul.rn.f32 %f108, %f107, %f105; | |
ld.global.nc.f32 %f109, [%rd48+1536]; | |
mul.rn.f32 %f110, %f2, %f107; | |
sub.rn.f32 %f111, %f109, %f110; | |
add.rn.f32 %f112, %f108, %f111; | |
add.rn.f32 %f113, %f112, %f104; | |
add.rn.f32 %f9, %f8, %f113; | |
or.b32 %r145, %r3, 385; | |
or.b32 %r146, %r145, %r4; | |
and.b32 %r147, %r145, 3; | |
shr.u32 %r148, %r146, 2; | |
setp.ne.s32 %p29, %r147, 1; | |
cvt.u64.u32 %rd1314, %r148; | |
add.s64 %rd213, %rd11, %rd1314; | |
@%p29 bra LBB29_25; | |
and.b64 %rd1354, %rd213, 4294967295; | |
mul.lo.s64 %rd2577, %rd1354, 3528531795; | |
setp.lt.u64 %p31, %rd213, %rd11; | |
selp.u64 %rd1355, 1, 0, %p31; | |
add.s64 %rd1356, %rd2461, %rd1355; | |
xor.b64 %rd1357, %rd1356, %rd2577; | |
shr.u64 %rd1358, %rd1357, 32; | |
mul.lo.s64 %rd2580, %rd1358, 3449720151; | |
shr.u64 %rd1359, %rd2580, 32; | |
and.b64 %rd1360, %rd1356, 4294967295; | |
mul.lo.s64 %rd1361, %rd1360, 3449720151; | |
and.b64 %rd1362, %rd1361, 4294967295; | |
xor.b64 %rd1363, %rd1362, %rd1359; | |
xor.b64 %rd1364, %rd1363, 2654435769; | |
mul.lo.s64 %rd2583, %rd1364, 3528531795; | |
xor.b64 %rd2573, %rd1361, %rd213; | |
mov.u32 %r329, -845247145; | |
mov.u32 %r328, -616729560; | |
mov.u64 %rd2590, 3041712726; | |
mov.u64 %rd2589, 1401181199; | |
mov.u64 %rd2588, 2835769497; | |
mov.u64 %rd2587, 1684936478; | |
mov.u64 %rd2586, 2027808484; | |
mov.u64 %rd2585, 387276957; | |
mov.u64 %rd2584, 842468239; | |
mov.u64 %rd2582, 3986602516; | |
mov.u64 %rd2581, 1013904242; | |
mov.u64 %rd2579, 3668340011; | |
mov.u64 %rd2578, 3144134277; | |
mov.u64 %rd2576, 3449720151; | |
mov.u64 %rd2575, 1993301258; | |
mov.u64 %rd2574, 3528531795; | |
bra.uni LBB29_26; | |
LBB29_25: | |
setp.lt.u64 %p30, %rd213, %rd11; | |
selp.u64 %rd1329, 1, 0, %p30; | |
add.s64 %rd1330, %rd2461, %rd1329; | |
and.b64 %rd1331, %rd1330, 4294967295; | |
mul.lo.s64 %rd2577, %rd1331, 3449720151; | |
xor.b64 %rd1332, %rd2577, %rd213; | |
shr.u64 %rd1333, %rd1332, 32; | |
mul.lo.s64 %rd2580, %rd1333, 3528531795; | |
shr.u64 %rd1334, %rd2580, 32; | |
and.b64 %rd1335, %rd213, 4294967295; | |
mul.lo.s64 %rd1336, %rd1335, 3528531795; | |
and.b64 %rd1337, %rd1336, 4294967295; | |
xor.b64 %rd1338, %rd1337, %rd1334; | |
xor.b64 %rd1339, %rd1338, 3144134277; | |
mul.lo.s64 %rd2583, %rd1339, 3449720151; | |
xor.b64 %rd2573, %rd1330, %rd1336; | |
mov.u32 %r329, -766435501; | |
mov.u32 %r328, -239350328; | |
mov.u64 %rd2590, 1684936478; | |
mov.u64 %rd2589, 534103459; | |
mov.u64 %rd2588, 387276957; | |
mov.u64 %rd2587, 3041712726; | |
mov.u64 %rd2586, 3986602516; | |
mov.u64 %rd2585, 2835769497; | |
mov.u64 %rd2584, 3668340011; | |
mov.u64 %rd2582, 2027808484; | |
mov.u64 %rd2581, 1993301258; | |
mov.u64 %rd2579, 842468239; | |
mov.u64 %rd2578, 2654435769; | |
mov.u64 %rd2576, 3528531795; | |
mov.u64 %rd2575, 1013904242; | |
mov.u64 %rd2574, 3449720151; | |
LBB29_26: | |
shr.u64 %rd1365, %rd2583, 32; | |
shr.u64 %rd1366, %rd2573, 32; | |
mul.lo.s64 %rd1367, %rd1366, %rd2574; | |
and.b64 %rd1368, %rd1367, 4294967295; | |
xor.b64 %rd1369, %rd1368, %rd1365; | |
xor.b64 %rd1370, %rd1369, %rd2575; | |
mul.lo.s64 %rd1371, %rd1370, %rd2576; | |
shr.u64 %rd1372, %rd1371, 32; | |
shr.u64 %rd1373, %rd1367, 32; | |
and.b64 %rd1374, %rd2577, 4294967295; | |
xor.b64 %rd1375, %rd1374, %rd1373; | |
xor.b64 %rd1376, %rd1375, %rd2578; | |
mul.lo.s64 %rd1377, %rd1376, %rd2576; | |
and.b64 %rd1378, %rd1377, 4294967295; | |
xor.b64 %rd1379, %rd1378, %rd1372; | |
xor.b64 %rd1380, %rd1379, %rd2579; | |
mul.lo.s64 %rd1381, %rd1380, %rd2574; | |
shr.u64 %rd1382, %rd1381, 32; | |
shr.u64 %rd1383, %rd1377, 32; | |
and.b64 %rd1384, %rd2580, 4294967295; | |
xor.b64 %rd1385, %rd1384, %rd1383; | |
xor.b64 %rd1386, %rd1385, %rd2581; | |
mul.lo.s64 %rd1387, %rd1386, %rd2574; | |
and.b64 %rd1388, %rd1387, 4294967295; | |
xor.b64 %rd1389, %rd1388, %rd1382; | |
xor.b64 %rd1390, %rd1389, %rd2582; | |
mul.lo.s64 %rd1391, %rd1390, %rd2576; | |
shr.u64 %rd1392, %rd1391, 32; | |
shr.u64 %rd1393, %rd1387, 32; | |
and.b64 %rd1394, %rd2583, 4294967295; | |
xor.b64 %rd1395, %rd1394, %rd1393; | |
xor.b64 %rd1396, %rd1395, %rd2584; | |
mul.lo.s64 %rd1397, %rd1396, %rd2576; | |
and.b64 %rd1398, %rd1397, 4294967295; | |
xor.b64 %rd1399, %rd1398, %rd1392; | |
xor.b64 %rd1400, %rd1399, %rd2585; | |
mul.lo.s64 %rd1401, %rd1400, %rd2574; | |
shr.u64 %rd1402, %rd1401, 32; | |
shr.u64 %rd1403, %rd1397, 32; | |
and.b64 %rd1404, %rd1371, 4294967295; | |
xor.b64 %rd1405, %rd1404, %rd1403; | |
xor.b64 %rd1406, %rd1405, %rd2586; | |
mul.lo.s64 %rd1407, %rd1406, %rd2574; | |
and.b64 %rd1408, %rd1407, 4294967295; | |
xor.b64 %rd1409, %rd1408, %rd1402; | |
xor.b64 %rd1410, %rd1409, %rd2587; | |
mul.lo.s64 %rd1411, %rd1410, %rd2576; | |
shr.u64 %rd1412, %rd1411, 32; | |
shr.u64 %rd1413, %rd1407, 32; | |
and.b64 %rd1414, %rd1381, 4294967295; | |
xor.b64 %rd1415, %rd1414, %rd1413; | |
xor.b64 %rd1416, %rd1415, %rd2588; | |
mul.lo.s64 %rd1417, %rd1416, %rd2576; | |
and.b64 %rd1418, %rd1417, 4294967295; | |
xor.b64 %rd1419, %rd1418, %rd1412; | |
xor.b64 %rd1420, %rd1419, %rd2589; | |
mul.lo.s64 %rd1421, %rd1420, %rd2574; | |
shr.u64 %rd1422, %rd1421, 32; | |
shr.u64 %rd1423, %rd1417, 32; | |
xor.b64 %rd1424, %rd1391, %rd1423; | |
xor.b64 %rd1425, %rd1424, %rd2590; | |
mul.lo.s64 %rd1426, %rd1425, %rd2574; | |
xor.b64 %rd1427, %rd1422, %rd1426; | |
cvt.u32.u64 %r153, %rd1427; | |
xor.b32 %r154, %r328, %r153; | |
mul.lo.s32 %r155, %r154, %r329; | |
shr.u32 %r156, %r155, 9; | |
cvt.rn.f32.u32 %f114, %r156; | |
mul.rn.f32 %f115, %f114, 0f34000000; | |
cvt.rn.f16.f32 %h64, %f115; | |
mov.b16 %h65, 0x2E66; | |
setp.ge.f16 %p33, %h64, %h65; | |
ld.global.nc.b16 %h66, [%rd44+770]; | |
ld.global.nc.f32 %f116, [%rd45+1540]; | |
cvt.rn.f16.f32 %h67, %f116; | |
add.rn.f16 %h68, %h66, %h67; | |
mov.b16 %h69, 0x3C72; | |
mul.rn.f16 %h70, %h68, %h69; | |
selp.b16 %h71, %h70, 0x0000, %p33; | |
cvt.f32.f16 %f117, %h71; | |
ld.global.nc.b16 %h72, [%rd46+770]; | |
cvt.f32.f16 %f118, %h72; | |
ld.global.nc.f32 %f119, [%rd47+1540]; | |
mul.rn.f32 %f120, %f1, %f119; | |
mul.rn.f32 %f121, %f120, %f118; | |
ld.global.nc.f32 %f122, [%rd48+1540]; | |
mul.rn.f32 %f123, %f2, %f120; | |
sub.rn.f32 %f124, %f122, %f123; | |
add.rn.f32 %f125, %f121, %f124; | |
add.rn.f32 %f126, %f125, %f117; | |
add.rn.f32 %f10, %f9, %f126; | |
or.b32 %r158, %r73, 512; | |
shr.u32 %r159, %r158, 2; | |
cvt.u64.u32 %rd1428, %r159; | |
add.s64 %rd240, %rd11, %rd1428; | |
@%p8 bra LBB29_28; | |
// Two alternative setups for the rounds at LBB29_29; the `@%p8 bra LBB29_28`
// just above selects one. Both arms write the same registers
// (%rd2591-%rd2609, %r330-%r332) from %rd240 and %rd2461, but they pair the
// multipliers 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) with the two
// inputs in opposite order and load the per-round xor constants in a
// different order.
// NOTE(review): these multipliers, together with 2654435769 (0x9E3779B9) and
// 3144134277 (0xBB67AE85), match the published Philox4x32 constants, so this
// is presumably an unrolled Philox-style counter RNG - confirm against the
// generating compiler.
//
// Fall-through arm (%p8 false).
and.b64 %rd1470, %rd240, 4294967295; | |
mul.lo.s64 %rd2595, %rd1470, 3528531795; | |
// %rd240 was produced by `add.s64 %rd240, %rd11, ...` just before the branch;
// %rd240 < %rd11 (unsigned) means that add wrapped, so carry 1 into %rd2461.
setp.lt.u64 %p35, %rd240, %rd11; | |
selp.u64 %rd1471, 1, 0, %p35; | |
add.s64 %rd1472, %rd2461, %rd1471; | |
xor.b64 %rd1473, %rd1472, %rd2595; | |
shr.u64 %rd1474, %rd1473, 32; | |
mul.lo.s64 %rd2598, %rd1474, 3449720151; | |
shr.u64 %rd1475, %rd2598, 32; | |
and.b64 %rd1476, %rd1472, 4294967295; | |
mul.lo.s64 %rd1477, %rd1476, 3449720151; | |
and.b64 %rd1478, %rd1477, 4294967295; | |
xor.b64 %rd1479, %rd1478, %rd1475; | |
xor.b64 %rd1480, %rd1479, 2654435769; | |
mul.lo.s64 %rd2601, %rd1480, 3528531795; | |
xor.b64 %rd2591, %rd1477, %rd240; | |
// Constants consumed by LBB29_29 for this arm: three 32-bit finalisation
// values, the per-round xor constants, and the two multipliers
// (%rd2592 / %rd2594).
mov.u32 %r332, -1879881855; | |
mov.u32 %r331, -845247145; | |
mov.u32 %r330, 534103459; | |
mov.u64 %rd2609, 3678237736; | |
mov.u64 %rd2608, 3041712726; | |
mov.u64 %rd2607, 1401181199; | |
mov.u64 %rd2606, 2835769497; | |
mov.u64 %rd2605, 1684936478; | |
mov.u64 %rd2604, 2027808484; | |
mov.u64 %rd2603, 387276957; | |
mov.u64 %rd2602, 842468239; | |
mov.u64 %rd2600, 3986602516; | |
mov.u64 %rd2599, 1013904242; | |
mov.u64 %rd2597, 3668340011; | |
mov.u64 %rd2596, 3144134277; | |
mov.u64 %rd2594, 3449720151; | |
mov.u64 %rd2593, 1993301258; | |
mov.u64 %rd2592, 3528531795; | |
bra.uni LBB29_29; | |
// Taken arm (%p8 true): the mirror image of the arm above - the carry-adjusted
// %rd2461 is multiplied by 3449720151 first and %rd240 by 3528531795, and the
// first xor constant is 3144134277 instead of 2654435769.
LBB29_28: | |
setp.lt.u64 %p34, %rd240, %rd11; | |
selp.u64 %rd1444, 1, 0, %p34; | |
add.s64 %rd1445, %rd2461, %rd1444; | |
and.b64 %rd1446, %rd1445, 4294967295; | |
mul.lo.s64 %rd2595, %rd1446, 3449720151; | |
xor.b64 %rd1447, %rd2595, %rd240; | |
shr.u64 %rd1448, %rd1447, 32; | |
mul.lo.s64 %rd2598, %rd1448, 3528531795; | |
shr.u64 %rd1449, %rd2598, 32; | |
and.b64 %rd1450, %rd240, 4294967295; | |
mul.lo.s64 %rd1451, %rd1450, 3528531795; | |
and.b64 %rd1452, %rd1451, 4294967295; | |
xor.b64 %rd1453, %rd1452, %rd1449; | |
xor.b64 %rd1454, %rd1453, 3144134277; | |
mul.lo.s64 %rd2601, %rd1454, 3449720151; | |
xor.b64 %rd2591, %rd1445, %rd1451; | |
// Same register set as the other arm, with the constants reordered and the
// two multipliers exchanged between %rd2592 and %rd2594.
mov.u32 %r332, -1767562579; | |
mov.u32 %r331, -766435501; | |
mov.u32 %r330, 1401181199; | |
mov.u64 %rd2609, 4055616968; | |
mov.u64 %rd2608, 1684936478; | |
mov.u64 %rd2607, 534103459; | |
mov.u64 %rd2606, 387276957; | |
mov.u64 %rd2605, 3041712726; | |
mov.u64 %rd2604, 3986602516; | |
mov.u64 %rd2603, 2835769497; | |
mov.u64 %rd2602, 3668340011; | |
mov.u64 %rd2600, 2027808484; | |
mov.u64 %rd2599, 1993301258; | |
mov.u64 %rd2597, 842468239; | |
mov.u64 %rd2596, 2654435769; | |
mov.u64 %rd2594, 3528531795; | |
mov.u64 %rd2593, 1013904242; | |
mov.u64 %rd2592, 3449720151; | |
// Join point for the two setup arms above. Runs the unrolled multiply/xor
// rounds on %rd2591-%rd2609, converts the result to a value in [0, 1), uses it
// as a keep/zero mask for one loaded value, and adds that step's contribution
// to the running f32 sum (%f10 -> %f11). Ends by preparing the next step and
// branching to its setup arm.
LBB29_29: | |
// Round pattern used throughout: `shr ..., 32` takes the high 32 bits of a
// 64-bit product and `and ..., 4294967295` the low 32 bits; each new product
// is (low ^ high ^ round constant) * multiplier, alternating between the
// multipliers held in %rd2592 and %rd2594.
shr.u64 %rd1481, %rd2601, 32; | |
shr.u64 %rd1482, %rd2591, 32; | |
mul.lo.s64 %rd1483, %rd1482, %rd2592; | |
and.b64 %rd1484, %rd1483, 4294967295; | |
xor.b64 %rd1485, %rd1484, %rd1481; | |
xor.b64 %rd1486, %rd1485, %rd2593; | |
mul.lo.s64 %rd1487, %rd1486, %rd2594; | |
shr.u64 %rd1488, %rd1487, 32; | |
shr.u64 %rd1489, %rd1483, 32; | |
and.b64 %rd1490, %rd2595, 4294967295; | |
xor.b64 %rd1491, %rd1490, %rd1489; | |
xor.b64 %rd1492, %rd1491, %rd2596; | |
mul.lo.s64 %rd1493, %rd1492, %rd2594; | |
and.b64 %rd1494, %rd1493, 4294967295; | |
xor.b64 %rd1495, %rd1494, %rd1488; | |
xor.b64 %rd1496, %rd1495, %rd2597; | |
mul.lo.s64 %rd1497, %rd1496, %rd2592; | |
shr.u64 %rd1498, %rd1497, 32; | |
shr.u64 %rd1499, %rd1493, 32; | |
and.b64 %rd1500, %rd2598, 4294967295; | |
xor.b64 %rd1501, %rd1500, %rd1499; | |
xor.b64 %rd1502, %rd1501, %rd2599; | |
mul.lo.s64 %rd1503, %rd1502, %rd2592; | |
and.b64 %rd1504, %rd1503, 4294967295; | |
xor.b64 %rd1505, %rd1504, %rd1498; | |
xor.b64 %rd1506, %rd1505, %rd2600; | |
mul.lo.s64 %rd1507, %rd1506, %rd2594; | |
shr.u64 %rd1508, %rd1507, 32; | |
shr.u64 %rd1509, %rd1503, 32; | |
and.b64 %rd1510, %rd2601, 4294967295; | |
xor.b64 %rd1511, %rd1510, %rd1509; | |
xor.b64 %rd1512, %rd1511, %rd2602; | |
mul.lo.s64 %rd1513, %rd1512, %rd2594; | |
and.b64 %rd1514, %rd1513, 4294967295; | |
xor.b64 %rd1515, %rd1514, %rd1508; | |
xor.b64 %rd1516, %rd1515, %rd2603; | |
mul.lo.s64 %rd1517, %rd1516, %rd2592; | |
shr.u64 %rd1518, %rd1517, 32; | |
shr.u64 %rd1519, %rd1513, 32; | |
and.b64 %rd1520, %rd1487, 4294967295; | |
xor.b64 %rd1521, %rd1520, %rd1519; | |
xor.b64 %rd1522, %rd1521, %rd2604; | |
mul.lo.s64 %rd1523, %rd1522, %rd2592; | |
and.b64 %rd1524, %rd1523, 4294967295; | |
xor.b64 %rd1525, %rd1524, %rd1518; | |
xor.b64 %rd1526, %rd1525, %rd2605; | |
mul.lo.s64 %rd1527, %rd1526, %rd2594; | |
shr.u64 %rd1528, %rd1527, 32; | |
shr.u64 %rd1529, %rd1523, 32; | |
and.b64 %rd1530, %rd1497, 4294967295; | |
xor.b64 %rd1531, %rd1530, %rd1529; | |
xor.b64 %rd1532, %rd1531, %rd2606; | |
mul.lo.s64 %rd1533, %rd1532, %rd2594; | |
and.b64 %rd1534, %rd1533, 4294967295; | |
xor.b64 %rd1535, %rd1534, %rd1528; | |
xor.b64 %rd1536, %rd1535, %rd2607; | |
mul.lo.s64 %rd1537, %rd1536, %rd2592; | |
shr.u64 %rd1538, %rd1537, 32; | |
shr.u64 %rd1539, %rd1533, 32; | |
and.b64 %rd1540, %rd1507, 4294967295; | |
xor.b64 %rd1541, %rd1540, %rd1539; | |
xor.b64 %rd1542, %rd1541, %rd2608; | |
mul.lo.s64 %rd1543, %rd1542, %rd2592; | |
and.b64 %rd1544, %rd1543, 4294967295; | |
xor.b64 %rd1545, %rd1544, %rd1538; | |
xor.b64 %rd1546, %rd1545, %rd2609; | |
mul.lo.s64 %rd1547, %rd1546, %rd2594; | |
// Fold the last round outputs into one 32-bit word (cvt.u32.u64 keeps the low
// 32 bits) using the arm-specific constants %r330-%r332.
shr.u64 %rd1548, %rd1547, 32; | |
cvt.u32.u64 %r166, %rd1548; | |
shr.u64 %rd1549, %rd1543, 32; | |
xor.b64 %rd1550, %rd1549, %rd1517; | |
cvt.u32.u64 %r167, %rd1550; | |
xor.b32 %r168, %r330, %r167; | |
mul.lo.s32 %r169, %r168, %r331; | |
xor.b32 %r170, %r169, %r166; | |
xor.b32 %r171, %r170, %r332; | |
// Keep the top 23 bits and scale by 2^-23 (0f34000000) to get a float in
// [0, 1), then round it to f16.
shr.u32 %r172, %r171, 9; | |
cvt.rn.f32.u32 %f127, %r172; | |
mul.rn.f32 %f128, %f127, 0f34000000; | |
cvt.rn.f16.f32 %h73, %f128; | |
// 0x2E66 is f16 ~0.1; %p36 is true when the random value is >= that threshold.
mov.b16 %h74, 0x2E66; | |
setp.ge.f16 %p36, %h73, %h74; | |
// Masked term: (f16 load + f32 load rounded to f16) * 0x3C72 (f16 ~1.111),
// replaced by zero when %p36 is false.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout at rate
// 0.1 - confirm against the model that produced this kernel.
ld.global.nc.b16 %h75, [%rd44+1024]; | |
ld.global.nc.f32 %f129, [%rd45+2048]; | |
cvt.rn.f16.f32 %h76, %f129; | |
add.rn.f16 %h77, %h75, %h76; | |
mov.b16 %h78, 0x3C72; | |
mul.rn.f16 %h79, %h77, %h78; | |
selp.b16 %h80, %h79, 0x0000, %p36; | |
cvt.f32.f16 %f130, %h80; | |
// %f139 = %f1*x*y + (z - %f2*%f1*x) + masked term, where y, x, z are the
// loads from %rd46, %rd47 and %rd48 below; %f11 = %f10 + %f139.
ld.global.nc.b16 %h81, [%rd46+1024]; | |
cvt.f32.f16 %f131, %h81; | |
ld.global.nc.f32 %f132, [%rd47+2048]; | |
mul.rn.f32 %f133, %f1, %f132; | |
mul.rn.f32 %f134, %f133, %f131; | |
ld.global.nc.f32 %f135, [%rd48+2048]; | |
mul.rn.f32 %f136, %f2, %f133; | |
sub.rn.f32 %f137, %f135, %f136; | |
add.rn.f32 %f138, %f134, %f137; | |
add.rn.f32 %f139, %f138, %f130; | |
add.rn.f32 %f11, %f10, %f139; | |
// Prepare the next step: %rd268 = %rd11 + ((%r3 | 513 | %r4) >> 2); the low
// two bits of (%r3 | 513) decide which setup arm runs (LBB29_31 when != 1).
or.b32 %r173, %r3, 513; | |
or.b32 %r174, %r173, %r4; | |
and.b32 %r175, %r173, 3; | |
shr.u32 %r176, %r174, 2; | |
setp.ne.s32 %p37, %r175, 1; | |
cvt.u64.u32 %rd1551, %r176; | |
add.s64 %rd268, %rd11, %rd1551; | |
@%p37 bra LBB29_31; | |
// Two alternative setups for the rounds at LBB29_32; the `@%p37 bra LBB29_31`
// just above selects one. Both arms write the same registers
// (%rd2610-%rd2627, %r333, %r334) from %rd268 and %rd2461, pairing the
// multipliers 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) with the two
// inputs in opposite order.
// NOTE(review): these values, with 2654435769 (0x9E3779B9) and 3144134277
// (0xBB67AE85), match the published Philox4x32 constants - presumably a
// Philox-style counter RNG; confirm against the generating compiler.
//
// Fall-through arm (%p37 false).
and.b64 %rd1591, %rd268, 4294967295; | |
mul.lo.s64 %rd2614, %rd1591, 3528531795; | |
// %rd268 came from `add.s64 %rd268, %rd11, ...`; %rd268 < %rd11 (unsigned)
// means that add wrapped, so carry 1 into %rd2461.
setp.lt.u64 %p39, %rd268, %rd11; | |
selp.u64 %rd1592, 1, 0, %p39; | |
add.s64 %rd1593, %rd2461, %rd1592; | |
xor.b64 %rd1594, %rd1593, %rd2614; | |
shr.u64 %rd1595, %rd1594, 32; | |
mul.lo.s64 %rd2617, %rd1595, 3449720151; | |
shr.u64 %rd1596, %rd2617, 32; | |
and.b64 %rd1597, %rd1593, 4294967295; | |
mul.lo.s64 %rd1598, %rd1597, 3449720151; | |
and.b64 %rd1599, %rd1598, 4294967295; | |
xor.b64 %rd1600, %rd1599, %rd1596; | |
xor.b64 %rd1601, %rd1600, 2654435769; | |
mul.lo.s64 %rd2620, %rd1601, 3528531795; | |
xor.b64 %rd2610, %rd1598, %rd268; | |
// Constants consumed by LBB29_32 for this arm: two 32-bit finalisation values,
// the per-round xor constants, and the two multipliers (%rd2611 / %rd2613).
mov.u32 %r334, -845247145; | |
mov.u32 %r333, -616729560; | |
mov.u64 %rd2627, 3041712726; | |
mov.u64 %rd2626, 1401181199; | |
mov.u64 %rd2625, 2835769497; | |
mov.u64 %rd2624, 1684936478; | |
mov.u64 %rd2623, 2027808484; | |
mov.u64 %rd2622, 387276957; | |
mov.u64 %rd2621, 842468239; | |
mov.u64 %rd2619, 3986602516; | |
mov.u64 %rd2618, 1013904242; | |
mov.u64 %rd2616, 3668340011; | |
mov.u64 %rd2615, 3144134277; | |
mov.u64 %rd2613, 3449720151; | |
mov.u64 %rd2612, 1993301258; | |
mov.u64 %rd2611, 3528531795; | |
bra.uni LBB29_32; | |
// Taken arm (%p37 true): mirror image of the arm above, with the roles of the
// two inputs and the two multipliers exchanged.
LBB29_31: | |
setp.lt.u64 %p38, %rd268, %rd11; | |
selp.u64 %rd1566, 1, 0, %p38; | |
add.s64 %rd1567, %rd2461, %rd1566; | |
and.b64 %rd1568, %rd1567, 4294967295; | |
mul.lo.s64 %rd2614, %rd1568, 3449720151; | |
xor.b64 %rd1569, %rd2614, %rd268; | |
shr.u64 %rd1570, %rd1569, 32; | |
mul.lo.s64 %rd2617, %rd1570, 3528531795; | |
shr.u64 %rd1571, %rd2617, 32; | |
and.b64 %rd1572, %rd268, 4294967295; | |
mul.lo.s64 %rd1573, %rd1572, 3528531795; | |
and.b64 %rd1574, %rd1573, 4294967295; | |
xor.b64 %rd1575, %rd1574, %rd1571; | |
xor.b64 %rd1576, %rd1575, 3144134277; | |
mul.lo.s64 %rd2620, %rd1576, 3449720151; | |
xor.b64 %rd2610, %rd1567, %rd1573; | |
// Same register set as the other arm, with the constants reordered and the
// two multipliers exchanged between %rd2611 and %rd2613.
mov.u32 %r334, -766435501; | |
mov.u32 %r333, -239350328; | |
mov.u64 %rd2627, 1684936478; | |
mov.u64 %rd2626, 534103459; | |
mov.u64 %rd2625, 387276957; | |
mov.u64 %rd2624, 3041712726; | |
mov.u64 %rd2623, 3986602516; | |
mov.u64 %rd2622, 2835769497; | |
mov.u64 %rd2621, 3668340011; | |
mov.u64 %rd2619, 2027808484; | |
mov.u64 %rd2618, 1993301258; | |
mov.u64 %rd2616, 842468239; | |
mov.u64 %rd2615, 2654435769; | |
mov.u64 %rd2613, 3528531795; | |
mov.u64 %rd2612, 1013904242; | |
mov.u64 %rd2611, 3449720151; | |
// Join point for the two setup arms above. Runs the unrolled multiply/xor
// rounds on %rd2610-%rd2627, converts the result to a value in [0, 1), uses it
// as a keep/zero mask for one loaded value, and adds that step's contribution
// to the running f32 sum (%f11 -> %f12). Ends by preparing the next step and
// branching to its setup arm.
LBB29_32: | |
// Round pattern used throughout: `shr ..., 32` takes the high 32 bits of a
// 64-bit product and `and ..., 4294967295` the low 32 bits; each new product
// is (low ^ high ^ round constant) * multiplier, alternating between the
// multipliers held in %rd2611 and %rd2613.
shr.u64 %rd1602, %rd2620, 32; | |
shr.u64 %rd1603, %rd2610, 32; | |
mul.lo.s64 %rd1604, %rd1603, %rd2611; | |
and.b64 %rd1605, %rd1604, 4294967295; | |
xor.b64 %rd1606, %rd1605, %rd1602; | |
xor.b64 %rd1607, %rd1606, %rd2612; | |
mul.lo.s64 %rd1608, %rd1607, %rd2613; | |
shr.u64 %rd1609, %rd1608, 32; | |
shr.u64 %rd1610, %rd1604, 32; | |
and.b64 %rd1611, %rd2614, 4294967295; | |
xor.b64 %rd1612, %rd1611, %rd1610; | |
xor.b64 %rd1613, %rd1612, %rd2615; | |
mul.lo.s64 %rd1614, %rd1613, %rd2613; | |
and.b64 %rd1615, %rd1614, 4294967295; | |
xor.b64 %rd1616, %rd1615, %rd1609; | |
xor.b64 %rd1617, %rd1616, %rd2616; | |
mul.lo.s64 %rd1618, %rd1617, %rd2611; | |
shr.u64 %rd1619, %rd1618, 32; | |
shr.u64 %rd1620, %rd1614, 32; | |
and.b64 %rd1621, %rd2617, 4294967295; | |
xor.b64 %rd1622, %rd1621, %rd1620; | |
xor.b64 %rd1623, %rd1622, %rd2618; | |
mul.lo.s64 %rd1624, %rd1623, %rd2611; | |
and.b64 %rd1625, %rd1624, 4294967295; | |
xor.b64 %rd1626, %rd1625, %rd1619; | |
xor.b64 %rd1627, %rd1626, %rd2619; | |
mul.lo.s64 %rd1628, %rd1627, %rd2613; | |
shr.u64 %rd1629, %rd1628, 32; | |
shr.u64 %rd1630, %rd1624, 32; | |
and.b64 %rd1631, %rd2620, 4294967295; | |
xor.b64 %rd1632, %rd1631, %rd1630; | |
xor.b64 %rd1633, %rd1632, %rd2621; | |
mul.lo.s64 %rd1634, %rd1633, %rd2613; | |
and.b64 %rd1635, %rd1634, 4294967295; | |
xor.b64 %rd1636, %rd1635, %rd1629; | |
xor.b64 %rd1637, %rd1636, %rd2622; | |
mul.lo.s64 %rd1638, %rd1637, %rd2611; | |
shr.u64 %rd1639, %rd1638, 32; | |
shr.u64 %rd1640, %rd1634, 32; | |
and.b64 %rd1641, %rd1608, 4294967295; | |
xor.b64 %rd1642, %rd1641, %rd1640; | |
xor.b64 %rd1643, %rd1642, %rd2623; | |
mul.lo.s64 %rd1644, %rd1643, %rd2611; | |
and.b64 %rd1645, %rd1644, 4294967295; | |
xor.b64 %rd1646, %rd1645, %rd1639; | |
xor.b64 %rd1647, %rd1646, %rd2624; | |
mul.lo.s64 %rd1648, %rd1647, %rd2613; | |
shr.u64 %rd1649, %rd1648, 32; | |
shr.u64 %rd1650, %rd1644, 32; | |
and.b64 %rd1651, %rd1618, 4294967295; | |
xor.b64 %rd1652, %rd1651, %rd1650; | |
xor.b64 %rd1653, %rd1652, %rd2625; | |
mul.lo.s64 %rd1654, %rd1653, %rd2613; | |
and.b64 %rd1655, %rd1654, 4294967295; | |
xor.b64 %rd1656, %rd1655, %rd1649; | |
xor.b64 %rd1657, %rd1656, %rd2626; | |
mul.lo.s64 %rd1658, %rd1657, %rd2611; | |
// Fold the last round outputs into one 32-bit word (cvt.u32.u64 keeps the low
// 32 bits), then apply the arm-specific xor/multiply with %r333 / %r334.
shr.u64 %rd1659, %rd1658, 32; | |
shr.u64 %rd1660, %rd1654, 32; | |
xor.b64 %rd1661, %rd1628, %rd1660; | |
xor.b64 %rd1662, %rd1661, %rd2627; | |
mul.lo.s64 %rd1663, %rd1662, %rd2611; | |
xor.b64 %rd1664, %rd1659, %rd1663; | |
cvt.u32.u64 %r181, %rd1664; | |
xor.b32 %r182, %r333, %r181; | |
mul.lo.s32 %r183, %r182, %r334; | |
// Keep the top 23 bits and scale by 2^-23 (0f34000000) to get a float in
// [0, 1), then round it to f16.
shr.u32 %r184, %r183, 9; | |
cvt.rn.f32.u32 %f140, %r184; | |
mul.rn.f32 %f141, %f140, 0f34000000; | |
cvt.rn.f16.f32 %h82, %f141; | |
// 0x2E66 is f16 ~0.1; %p41 is true when the random value is >= that threshold.
mov.b16 %h83, 0x2E66; | |
setp.ge.f16 %p41, %h82, %h83; | |
// Masked term: (f16 load + f32 load rounded to f16) * 0x3C72 (f16 ~1.111),
// replaced by zero when %p41 is false.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout at rate
// 0.1 - confirm against the model that produced this kernel.
ld.global.nc.b16 %h84, [%rd44+1026]; | |
ld.global.nc.f32 %f142, [%rd45+2052]; | |
cvt.rn.f16.f32 %h85, %f142; | |
add.rn.f16 %h86, %h84, %h85; | |
mov.b16 %h87, 0x3C72; | |
mul.rn.f16 %h88, %h86, %h87; | |
selp.b16 %h89, %h88, 0x0000, %p41; | |
cvt.f32.f16 %f143, %h89; | |
// %f152 = %f1*x*y + (z - %f2*%f1*x) + masked term, where y, x, z are the
// loads from %rd46, %rd47 and %rd48 below; %f12 = %f11 + %f152.
ld.global.nc.b16 %h90, [%rd46+1026]; | |
cvt.f32.f16 %f144, %h90; | |
ld.global.nc.f32 %f145, [%rd47+2052]; | |
mul.rn.f32 %f146, %f1, %f145; | |
mul.rn.f32 %f147, %f146, %f144; | |
ld.global.nc.f32 %f148, [%rd48+2052]; | |
mul.rn.f32 %f149, %f2, %f146; | |
sub.rn.f32 %f150, %f148, %f149; | |
add.rn.f32 %f151, %f147, %f150; | |
add.rn.f32 %f152, %f151, %f143; | |
add.rn.f32 %f12, %f11, %f152; | |
// Prepare the next step: %rd295 = %rd11 + ((%r73 | 640) >> 2); %p8 (set
// outside this block) decides which setup arm runs.
or.b32 %r186, %r73, 640; | |
shr.u32 %r187, %r186, 2; | |
cvt.u64.u32 %rd1665, %r187; | |
add.s64 %rd295, %rd11, %rd1665; | |
@%p8 bra LBB29_34; | |
// Two alternative setups for the rounds at LBB29_35; the `@%p8 bra LBB29_34`
// just above selects one. Both arms write the same registers
// (%rd2628-%rd2646, %r335-%r337) from %rd295 and %rd2461, pairing the
// multipliers 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) with the two
// inputs in opposite order.
// NOTE(review): these values, with 2654435769 (0x9E3779B9) and 3144134277
// (0xBB67AE85), match the published Philox4x32 constants - presumably a
// Philox-style counter RNG; confirm against the generating compiler.
//
// Fall-through arm (%p8 false).
and.b64 %rd1707, %rd295, 4294967295; | |
mul.lo.s64 %rd2632, %rd1707, 3528531795; | |
// %rd295 came from `add.s64 %rd295, %rd11, ...`; %rd295 < %rd11 (unsigned)
// means that add wrapped, so carry 1 into %rd2461.
setp.lt.u64 %p43, %rd295, %rd11; | |
selp.u64 %rd1708, 1, 0, %p43; | |
add.s64 %rd1709, %rd2461, %rd1708; | |
xor.b64 %rd1710, %rd1709, %rd2632; | |
shr.u64 %rd1711, %rd1710, 32; | |
mul.lo.s64 %rd2635, %rd1711, 3449720151; | |
shr.u64 %rd1712, %rd2635, 32; | |
and.b64 %rd1713, %rd1709, 4294967295; | |
mul.lo.s64 %rd1714, %rd1713, 3449720151; | |
and.b64 %rd1715, %rd1714, 4294967295; | |
xor.b64 %rd1716, %rd1715, %rd1712; | |
xor.b64 %rd1717, %rd1716, 2654435769; | |
mul.lo.s64 %rd2638, %rd1717, 3528531795; | |
xor.b64 %rd2628, %rd1714, %rd295; | |
// Constants consumed by LBB29_35 for this arm: three 32-bit finalisation
// values, the per-round xor constants, and the two multipliers
// (%rd2629 / %rd2631).
mov.u32 %r337, -1879881855; | |
mov.u32 %r336, -845247145; | |
mov.u32 %r335, 534103459; | |
mov.u64 %rd2646, 3678237736; | |
mov.u64 %rd2645, 3041712726; | |
mov.u64 %rd2644, 1401181199; | |
mov.u64 %rd2643, 2835769497; | |
mov.u64 %rd2642, 1684936478; | |
mov.u64 %rd2641, 2027808484; | |
mov.u64 %rd2640, 387276957; | |
mov.u64 %rd2639, 842468239; | |
mov.u64 %rd2637, 3986602516; | |
mov.u64 %rd2636, 1013904242; | |
mov.u64 %rd2634, 3668340011; | |
mov.u64 %rd2633, 3144134277; | |
mov.u64 %rd2631, 3449720151; | |
mov.u64 %rd2630, 1993301258; | |
mov.u64 %rd2629, 3528531795; | |
bra.uni LBB29_35; | |
// Taken arm (%p8 true): mirror image of the arm above, with the roles of the
// two inputs and the two multipliers exchanged.
LBB29_34: | |
setp.lt.u64 %p42, %rd295, %rd11; | |
selp.u64 %rd1681, 1, 0, %p42; | |
add.s64 %rd1682, %rd2461, %rd1681; | |
and.b64 %rd1683, %rd1682, 4294967295; | |
mul.lo.s64 %rd2632, %rd1683, 3449720151; | |
xor.b64 %rd1684, %rd2632, %rd295; | |
shr.u64 %rd1685, %rd1684, 32; | |
mul.lo.s64 %rd2635, %rd1685, 3528531795; | |
shr.u64 %rd1686, %rd2635, 32; | |
and.b64 %rd1687, %rd295, 4294967295; | |
mul.lo.s64 %rd1688, %rd1687, 3528531795; | |
and.b64 %rd1689, %rd1688, 4294967295; | |
xor.b64 %rd1690, %rd1689, %rd1686; | |
xor.b64 %rd1691, %rd1690, 3144134277; | |
mul.lo.s64 %rd2638, %rd1691, 3449720151; | |
xor.b64 %rd2628, %rd1682, %rd1688; | |
// Same register set as the other arm, with the constants reordered and the
// two multipliers exchanged between %rd2629 and %rd2631.
mov.u32 %r337, -1767562579; | |
mov.u32 %r336, -766435501; | |
mov.u32 %r335, 1401181199; | |
mov.u64 %rd2646, 4055616968; | |
mov.u64 %rd2645, 1684936478; | |
mov.u64 %rd2644, 534103459; | |
mov.u64 %rd2643, 387276957; | |
mov.u64 %rd2642, 3041712726; | |
mov.u64 %rd2641, 3986602516; | |
mov.u64 %rd2640, 2835769497; | |
mov.u64 %rd2639, 3668340011; | |
mov.u64 %rd2637, 2027808484; | |
mov.u64 %rd2636, 1993301258; | |
mov.u64 %rd2634, 842468239; | |
mov.u64 %rd2633, 2654435769; | |
mov.u64 %rd2631, 3528531795; | |
mov.u64 %rd2630, 1013904242; | |
mov.u64 %rd2629, 3449720151; | |
// Join point for the two setup arms above. Runs the unrolled multiply/xor
// rounds on %rd2628-%rd2646, converts the result to a value in [0, 1), uses it
// as a keep/zero mask for one loaded value, and adds that step's contribution
// to the running f32 sum (%f12 -> %f13). Ends by preparing the next step and
// branching to its setup arm.
LBB29_35: | |
// Round pattern used throughout: `shr ..., 32` takes the high 32 bits of a
// 64-bit product and `and ..., 4294967295` the low 32 bits; each new product
// is (low ^ high ^ round constant) * multiplier, alternating between the
// multipliers held in %rd2629 and %rd2631.
shr.u64 %rd1718, %rd2638, 32; | |
shr.u64 %rd1719, %rd2628, 32; | |
mul.lo.s64 %rd1720, %rd1719, %rd2629; | |
and.b64 %rd1721, %rd1720, 4294967295; | |
xor.b64 %rd1722, %rd1721, %rd1718; | |
xor.b64 %rd1723, %rd1722, %rd2630; | |
mul.lo.s64 %rd1724, %rd1723, %rd2631; | |
shr.u64 %rd1725, %rd1724, 32; | |
shr.u64 %rd1726, %rd1720, 32; | |
and.b64 %rd1727, %rd2632, 4294967295; | |
xor.b64 %rd1728, %rd1727, %rd1726; | |
xor.b64 %rd1729, %rd1728, %rd2633; | |
mul.lo.s64 %rd1730, %rd1729, %rd2631; | |
and.b64 %rd1731, %rd1730, 4294967295; | |
xor.b64 %rd1732, %rd1731, %rd1725; | |
xor.b64 %rd1733, %rd1732, %rd2634; | |
mul.lo.s64 %rd1734, %rd1733, %rd2629; | |
shr.u64 %rd1735, %rd1734, 32; | |
shr.u64 %rd1736, %rd1730, 32; | |
and.b64 %rd1737, %rd2635, 4294967295; | |
xor.b64 %rd1738, %rd1737, %rd1736; | |
xor.b64 %rd1739, %rd1738, %rd2636; | |
mul.lo.s64 %rd1740, %rd1739, %rd2629; | |
and.b64 %rd1741, %rd1740, 4294967295; | |
xor.b64 %rd1742, %rd1741, %rd1735; | |
xor.b64 %rd1743, %rd1742, %rd2637; | |
mul.lo.s64 %rd1744, %rd1743, %rd2631; | |
shr.u64 %rd1745, %rd1744, 32; | |
shr.u64 %rd1746, %rd1740, 32; | |
and.b64 %rd1747, %rd2638, 4294967295; | |
xor.b64 %rd1748, %rd1747, %rd1746; | |
xor.b64 %rd1749, %rd1748, %rd2639; | |
mul.lo.s64 %rd1750, %rd1749, %rd2631; | |
and.b64 %rd1751, %rd1750, 4294967295; | |
xor.b64 %rd1752, %rd1751, %rd1745; | |
xor.b64 %rd1753, %rd1752, %rd2640; | |
mul.lo.s64 %rd1754, %rd1753, %rd2629; | |
shr.u64 %rd1755, %rd1754, 32; | |
shr.u64 %rd1756, %rd1750, 32; | |
and.b64 %rd1757, %rd1724, 4294967295; | |
xor.b64 %rd1758, %rd1757, %rd1756; | |
xor.b64 %rd1759, %rd1758, %rd2641; | |
mul.lo.s64 %rd1760, %rd1759, %rd2629; | |
and.b64 %rd1761, %rd1760, 4294967295; | |
xor.b64 %rd1762, %rd1761, %rd1755; | |
xor.b64 %rd1763, %rd1762, %rd2642; | |
mul.lo.s64 %rd1764, %rd1763, %rd2631; | |
shr.u64 %rd1765, %rd1764, 32; | |
shr.u64 %rd1766, %rd1760, 32; | |
and.b64 %rd1767, %rd1734, 4294967295; | |
xor.b64 %rd1768, %rd1767, %rd1766; | |
xor.b64 %rd1769, %rd1768, %rd2643; | |
mul.lo.s64 %rd1770, %rd1769, %rd2631; | |
and.b64 %rd1771, %rd1770, 4294967295; | |
xor.b64 %rd1772, %rd1771, %rd1765; | |
xor.b64 %rd1773, %rd1772, %rd2644; | |
mul.lo.s64 %rd1774, %rd1773, %rd2629; | |
shr.u64 %rd1775, %rd1774, 32; | |
shr.u64 %rd1776, %rd1770, 32; | |
and.b64 %rd1777, %rd1744, 4294967295; | |
xor.b64 %rd1778, %rd1777, %rd1776; | |
xor.b64 %rd1779, %rd1778, %rd2645; | |
mul.lo.s64 %rd1780, %rd1779, %rd2629; | |
and.b64 %rd1781, %rd1780, 4294967295; | |
xor.b64 %rd1782, %rd1781, %rd1775; | |
xor.b64 %rd1783, %rd1782, %rd2646; | |
mul.lo.s64 %rd1784, %rd1783, %rd2631; | |
// Fold the last round outputs into one 32-bit word (cvt.u32.u64 keeps the low
// 32 bits) using the arm-specific constants %r335-%r337.
shr.u64 %rd1785, %rd1784, 32; | |
cvt.u32.u64 %r194, %rd1785; | |
shr.u64 %rd1786, %rd1780, 32; | |
xor.b64 %rd1787, %rd1786, %rd1754; | |
cvt.u32.u64 %r195, %rd1787; | |
xor.b32 %r196, %r335, %r195; | |
mul.lo.s32 %r197, %r196, %r336; | |
xor.b32 %r198, %r197, %r194; | |
xor.b32 %r199, %r198, %r337; | |
// Keep the top 23 bits and scale by 2^-23 (0f34000000) to get a float in
// [0, 1), then round it to f16.
shr.u32 %r200, %r199, 9; | |
cvt.rn.f32.u32 %f153, %r200; | |
mul.rn.f32 %f154, %f153, 0f34000000; | |
cvt.rn.f16.f32 %h91, %f154; | |
// 0x2E66 is f16 ~0.1; %p44 is true when the random value is >= that threshold.
mov.b16 %h92, 0x2E66; | |
setp.ge.f16 %p44, %h91, %h92; | |
// Masked term: (f16 load + f32 load rounded to f16) * 0x3C72 (f16 ~1.111),
// replaced by zero when %p44 is false.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout at rate
// 0.1 - confirm against the model that produced this kernel.
ld.global.nc.b16 %h93, [%rd44+1280]; | |
ld.global.nc.f32 %f155, [%rd45+2560]; | |
cvt.rn.f16.f32 %h94, %f155; | |
add.rn.f16 %h95, %h93, %h94; | |
mov.b16 %h96, 0x3C72; | |
mul.rn.f16 %h97, %h95, %h96; | |
selp.b16 %h98, %h97, 0x0000, %p44; | |
cvt.f32.f16 %f156, %h98; | |
// %f165 = %f1*x*y + (z - %f2*%f1*x) + masked term, where y, x, z are the
// loads from %rd46, %rd47 and %rd48 below; %f13 = %f12 + %f165.
ld.global.nc.b16 %h99, [%rd46+1280]; | |
cvt.f32.f16 %f157, %h99; | |
ld.global.nc.f32 %f158, [%rd47+2560]; | |
mul.rn.f32 %f159, %f1, %f158; | |
mul.rn.f32 %f160, %f159, %f157; | |
ld.global.nc.f32 %f161, [%rd48+2560]; | |
mul.rn.f32 %f162, %f2, %f159; | |
sub.rn.f32 %f163, %f161, %f162; | |
add.rn.f32 %f164, %f160, %f163; | |
add.rn.f32 %f165, %f164, %f156; | |
add.rn.f32 %f13, %f12, %f165; | |
// Prepare the next step: %rd323 = %rd11 + ((%r3 | 641 | %r4) >> 2); the low
// two bits of (%r3 | 641) decide which setup arm runs (LBB29_37 when != 1).
or.b32 %r201, %r3, 641; | |
or.b32 %r202, %r201, %r4; | |
and.b32 %r203, %r201, 3; | |
shr.u32 %r204, %r202, 2; | |
setp.ne.s32 %p45, %r203, 1; | |
cvt.u64.u32 %rd1788, %r204; | |
add.s64 %rd323, %rd11, %rd1788; | |
@%p45 bra LBB29_37; | |
// Two alternative setups for the rounds at LBB29_38; the `@%p45 bra LBB29_37`
// just above selects one. Both arms write the same registers
// (%rd2647-%rd2664, %r338, %r339) from %rd323 and %rd2461, pairing the
// multipliers 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) with the two
// inputs in opposite order.
// NOTE(review): these values, with 2654435769 (0x9E3779B9) and 3144134277
// (0xBB67AE85), match the published Philox4x32 constants - presumably a
// Philox-style counter RNG; confirm against the generating compiler.
//
// Fall-through arm (%p45 false).
and.b64 %rd1828, %rd323, 4294967295; | |
mul.lo.s64 %rd2651, %rd1828, 3528531795; | |
// %rd323 came from `add.s64 %rd323, %rd11, ...`; %rd323 < %rd11 (unsigned)
// means that add wrapped, so carry 1 into %rd2461.
setp.lt.u64 %p47, %rd323, %rd11; | |
selp.u64 %rd1829, 1, 0, %p47; | |
add.s64 %rd1830, %rd2461, %rd1829; | |
xor.b64 %rd1831, %rd1830, %rd2651; | |
shr.u64 %rd1832, %rd1831, 32; | |
mul.lo.s64 %rd2654, %rd1832, 3449720151; | |
shr.u64 %rd1833, %rd2654, 32; | |
and.b64 %rd1834, %rd1830, 4294967295; | |
mul.lo.s64 %rd1835, %rd1834, 3449720151; | |
and.b64 %rd1836, %rd1835, 4294967295; | |
xor.b64 %rd1837, %rd1836, %rd1833; | |
xor.b64 %rd1838, %rd1837, 2654435769; | |
mul.lo.s64 %rd2657, %rd1838, 3528531795; | |
xor.b64 %rd2647, %rd1835, %rd323; | |
// Constants consumed by LBB29_38 for this arm: two 32-bit finalisation values,
// the per-round xor constants, and the two multipliers (%rd2648 / %rd2650).
mov.u32 %r339, -845247145; | |
mov.u32 %r338, -616729560; | |
mov.u64 %rd2664, 3041712726; | |
mov.u64 %rd2663, 1401181199; | |
mov.u64 %rd2662, 2835769497; | |
mov.u64 %rd2661, 1684936478; | |
mov.u64 %rd2660, 2027808484; | |
mov.u64 %rd2659, 387276957; | |
mov.u64 %rd2658, 842468239; | |
mov.u64 %rd2656, 3986602516; | |
mov.u64 %rd2655, 1013904242; | |
mov.u64 %rd2653, 3668340011; | |
mov.u64 %rd2652, 3144134277; | |
mov.u64 %rd2650, 3449720151; | |
mov.u64 %rd2649, 1993301258; | |
mov.u64 %rd2648, 3528531795; | |
bra.uni LBB29_38; | |
// Taken arm (%p45 true): mirror image of the arm above, with the roles of the
// two inputs and the two multipliers exchanged.
LBB29_37: | |
setp.lt.u64 %p46, %rd323, %rd11; | |
selp.u64 %rd1803, 1, 0, %p46; | |
add.s64 %rd1804, %rd2461, %rd1803; | |
and.b64 %rd1805, %rd1804, 4294967295; | |
mul.lo.s64 %rd2651, %rd1805, 3449720151; | |
xor.b64 %rd1806, %rd2651, %rd323; | |
shr.u64 %rd1807, %rd1806, 32; | |
mul.lo.s64 %rd2654, %rd1807, 3528531795; | |
shr.u64 %rd1808, %rd2654, 32; | |
and.b64 %rd1809, %rd323, 4294967295; | |
mul.lo.s64 %rd1810, %rd1809, 3528531795; | |
and.b64 %rd1811, %rd1810, 4294967295; | |
xor.b64 %rd1812, %rd1811, %rd1808; | |
xor.b64 %rd1813, %rd1812, 3144134277; | |
mul.lo.s64 %rd2657, %rd1813, 3449720151; | |
xor.b64 %rd2647, %rd1804, %rd1810; | |
// Same register set as the other arm, with the constants reordered and the
// two multipliers exchanged between %rd2648 and %rd2650.
mov.u32 %r339, -766435501; | |
mov.u32 %r338, -239350328; | |
mov.u64 %rd2664, 1684936478; | |
mov.u64 %rd2663, 534103459; | |
mov.u64 %rd2662, 387276957; | |
mov.u64 %rd2661, 3041712726; | |
mov.u64 %rd2660, 3986602516; | |
mov.u64 %rd2659, 2835769497; | |
mov.u64 %rd2658, 3668340011; | |
mov.u64 %rd2656, 2027808484; | |
mov.u64 %rd2655, 1993301258; | |
mov.u64 %rd2653, 842468239; | |
mov.u64 %rd2652, 2654435769; | |
mov.u64 %rd2650, 3528531795; | |
mov.u64 %rd2649, 1013904242; | |
mov.u64 %rd2648, 3449720151; | |
// Join point for the two setup arms above. Runs the unrolled multiply/xor
// rounds on %rd2647-%rd2664, converts the result to a value in [0, 1), uses it
// as a keep/zero mask for one loaded value, and adds that step's contribution
// to the running f32 sum (%f13 -> %f14). Ends by preparing the next step and
// branching to its setup arm.
LBB29_38: | |
// Round pattern used throughout: `shr ..., 32` takes the high 32 bits of a
// 64-bit product and `and ..., 4294967295` the low 32 bits; each new product
// is (low ^ high ^ round constant) * multiplier, alternating between the
// multipliers held in %rd2648 and %rd2650.
shr.u64 %rd1839, %rd2657, 32; | |
shr.u64 %rd1840, %rd2647, 32; | |
mul.lo.s64 %rd1841, %rd1840, %rd2648; | |
and.b64 %rd1842, %rd1841, 4294967295; | |
xor.b64 %rd1843, %rd1842, %rd1839; | |
xor.b64 %rd1844, %rd1843, %rd2649; | |
mul.lo.s64 %rd1845, %rd1844, %rd2650; | |
shr.u64 %rd1846, %rd1845, 32; | |
shr.u64 %rd1847, %rd1841, 32; | |
and.b64 %rd1848, %rd2651, 4294967295; | |
xor.b64 %rd1849, %rd1848, %rd1847; | |
xor.b64 %rd1850, %rd1849, %rd2652; | |
mul.lo.s64 %rd1851, %rd1850, %rd2650; | |
and.b64 %rd1852, %rd1851, 4294967295; | |
xor.b64 %rd1853, %rd1852, %rd1846; | |
xor.b64 %rd1854, %rd1853, %rd2653; | |
mul.lo.s64 %rd1855, %rd1854, %rd2648; | |
shr.u64 %rd1856, %rd1855, 32; | |
shr.u64 %rd1857, %rd1851, 32; | |
and.b64 %rd1858, %rd2654, 4294967295; | |
xor.b64 %rd1859, %rd1858, %rd1857; | |
xor.b64 %rd1860, %rd1859, %rd2655; | |
mul.lo.s64 %rd1861, %rd1860, %rd2648; | |
and.b64 %rd1862, %rd1861, 4294967295; | |
xor.b64 %rd1863, %rd1862, %rd1856; | |
xor.b64 %rd1864, %rd1863, %rd2656; | |
mul.lo.s64 %rd1865, %rd1864, %rd2650; | |
shr.u64 %rd1866, %rd1865, 32; | |
shr.u64 %rd1867, %rd1861, 32; | |
and.b64 %rd1868, %rd2657, 4294967295; | |
xor.b64 %rd1869, %rd1868, %rd1867; | |
xor.b64 %rd1870, %rd1869, %rd2658; | |
mul.lo.s64 %rd1871, %rd1870, %rd2650; | |
and.b64 %rd1872, %rd1871, 4294967295; | |
xor.b64 %rd1873, %rd1872, %rd1866; | |
xor.b64 %rd1874, %rd1873, %rd2659; | |
mul.lo.s64 %rd1875, %rd1874, %rd2648; | |
shr.u64 %rd1876, %rd1875, 32; | |
shr.u64 %rd1877, %rd1871, 32; | |
and.b64 %rd1878, %rd1845, 4294967295; | |
xor.b64 %rd1879, %rd1878, %rd1877; | |
xor.b64 %rd1880, %rd1879, %rd2660; | |
mul.lo.s64 %rd1881, %rd1880, %rd2648; | |
and.b64 %rd1882, %rd1881, 4294967295; | |
xor.b64 %rd1883, %rd1882, %rd1876; | |
xor.b64 %rd1884, %rd1883, %rd2661; | |
mul.lo.s64 %rd1885, %rd1884, %rd2650; | |
shr.u64 %rd1886, %rd1885, 32; | |
shr.u64 %rd1887, %rd1881, 32; | |
and.b64 %rd1888, %rd1855, 4294967295; | |
xor.b64 %rd1889, %rd1888, %rd1887; | |
xor.b64 %rd1890, %rd1889, %rd2662; | |
mul.lo.s64 %rd1891, %rd1890, %rd2650; | |
and.b64 %rd1892, %rd1891, 4294967295; | |
xor.b64 %rd1893, %rd1892, %rd1886; | |
xor.b64 %rd1894, %rd1893, %rd2663; | |
mul.lo.s64 %rd1895, %rd1894, %rd2648; | |
// Fold the last round outputs into one 32-bit word (cvt.u32.u64 keeps the low
// 32 bits), then apply the arm-specific xor/multiply with %r338 / %r339.
shr.u64 %rd1896, %rd1895, 32; | |
shr.u64 %rd1897, %rd1891, 32; | |
xor.b64 %rd1898, %rd1865, %rd1897; | |
xor.b64 %rd1899, %rd1898, %rd2664; | |
mul.lo.s64 %rd1900, %rd1899, %rd2648; | |
xor.b64 %rd1901, %rd1896, %rd1900; | |
cvt.u32.u64 %r209, %rd1901; | |
xor.b32 %r210, %r338, %r209; | |
mul.lo.s32 %r211, %r210, %r339; | |
// Keep the top 23 bits and scale by 2^-23 (0f34000000) to get a float in
// [0, 1), then round it to f16.
shr.u32 %r212, %r211, 9; | |
cvt.rn.f32.u32 %f166, %r212; | |
mul.rn.f32 %f167, %f166, 0f34000000; | |
cvt.rn.f16.f32 %h100, %f167; | |
// 0x2E66 is f16 ~0.1; %p49 is true when the random value is >= that threshold.
mov.b16 %h101, 0x2E66; | |
setp.ge.f16 %p49, %h100, %h101; | |
// Masked term: (f16 load + f32 load rounded to f16) * 0x3C72 (f16 ~1.111),
// replaced by zero when %p49 is false.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout at rate
// 0.1 - confirm against the model that produced this kernel.
ld.global.nc.b16 %h102, [%rd44+1282]; | |
ld.global.nc.f32 %f168, [%rd45+2564]; | |
cvt.rn.f16.f32 %h103, %f168; | |
add.rn.f16 %h104, %h102, %h103; | |
mov.b16 %h105, 0x3C72; | |
mul.rn.f16 %h106, %h104, %h105; | |
selp.b16 %h107, %h106, 0x0000, %p49; | |
cvt.f32.f16 %f169, %h107; | |
// %f178 = %f1*x*y + (z - %f2*%f1*x) + masked term, where y, x, z are the
// loads from %rd46, %rd47 and %rd48 below; %f14 = %f13 + %f178.
ld.global.nc.b16 %h108, [%rd46+1282]; | |
cvt.f32.f16 %f170, %h108; | |
ld.global.nc.f32 %f171, [%rd47+2564]; | |
mul.rn.f32 %f172, %f1, %f171; | |
mul.rn.f32 %f173, %f172, %f170; | |
ld.global.nc.f32 %f174, [%rd48+2564]; | |
mul.rn.f32 %f175, %f2, %f172; | |
sub.rn.f32 %f176, %f174, %f175; | |
add.rn.f32 %f177, %f173, %f176; | |
add.rn.f32 %f178, %f177, %f169; | |
add.rn.f32 %f14, %f13, %f178; | |
// Prepare the next step: %rd350 = %rd11 + ((%r73 | 768) >> 2); %p8 (set
// outside this block) decides which setup arm runs.
or.b32 %r214, %r73, 768; | |
shr.u32 %r215, %r214, 2; | |
cvt.u64.u32 %rd1902, %r215; | |
add.s64 %rd350, %rd11, %rd1902; | |
@%p8 bra LBB29_40; | |
// Two alternative setups for the rounds at LBB29_41; the `@%p8 bra LBB29_40`
// just above selects one. Both arms write the same registers
// (%rd2665-%rd2683, %r340-%r342) from %rd350 and %rd2461, pairing the
// multipliers 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) with the two
// inputs in opposite order.
// NOTE(review): these values, with 2654435769 (0x9E3779B9) and 3144134277
// (0xBB67AE85), match the published Philox4x32 constants - presumably a
// Philox-style counter RNG; confirm against the generating compiler.
//
// Fall-through arm (%p8 false).
and.b64 %rd1944, %rd350, 4294967295; | |
mul.lo.s64 %rd2669, %rd1944, 3528531795; | |
// %rd350 came from `add.s64 %rd350, %rd11, ...`; %rd350 < %rd11 (unsigned)
// means that add wrapped, so carry 1 into %rd2461.
setp.lt.u64 %p51, %rd350, %rd11; | |
selp.u64 %rd1945, 1, 0, %p51; | |
add.s64 %rd1946, %rd2461, %rd1945; | |
xor.b64 %rd1947, %rd1946, %rd2669; | |
shr.u64 %rd1948, %rd1947, 32; | |
mul.lo.s64 %rd2672, %rd1948, 3449720151; | |
shr.u64 %rd1949, %rd2672, 32; | |
and.b64 %rd1950, %rd1946, 4294967295; | |
mul.lo.s64 %rd1951, %rd1950, 3449720151; | |
and.b64 %rd1952, %rd1951, 4294967295; | |
xor.b64 %rd1953, %rd1952, %rd1949; | |
xor.b64 %rd1954, %rd1953, 2654435769; | |
mul.lo.s64 %rd2675, %rd1954, 3528531795; | |
xor.b64 %rd2665, %rd1951, %rd350; | |
// Constants consumed by LBB29_41 for this arm: three 32-bit finalisation
// values, the per-round xor constants, and the two multipliers
// (%rd2666 / %rd2668).
mov.u32 %r342, -1879881855; | |
mov.u32 %r341, -845247145; | |
mov.u32 %r340, 534103459; | |
mov.u64 %rd2683, 3678237736; | |
mov.u64 %rd2682, 3041712726; | |
mov.u64 %rd2681, 1401181199; | |
mov.u64 %rd2680, 2835769497; | |
mov.u64 %rd2679, 1684936478; | |
mov.u64 %rd2678, 2027808484; | |
mov.u64 %rd2677, 387276957; | |
mov.u64 %rd2676, 842468239; | |
mov.u64 %rd2674, 3986602516; | |
mov.u64 %rd2673, 1013904242; | |
mov.u64 %rd2671, 3668340011; | |
mov.u64 %rd2670, 3144134277; | |
mov.u64 %rd2668, 3449720151; | |
mov.u64 %rd2667, 1993301258; | |
mov.u64 %rd2666, 3528531795; | |
bra.uni LBB29_41; | |
// Taken arm (%p8 true): mirror image of the arm above, with the roles of the
// two inputs and the two multipliers exchanged.
LBB29_40: | |
setp.lt.u64 %p50, %rd350, %rd11; | |
selp.u64 %rd1918, 1, 0, %p50; | |
add.s64 %rd1919, %rd2461, %rd1918; | |
and.b64 %rd1920, %rd1919, 4294967295; | |
mul.lo.s64 %rd2669, %rd1920, 3449720151; | |
xor.b64 %rd1921, %rd2669, %rd350; | |
shr.u64 %rd1922, %rd1921, 32; | |
mul.lo.s64 %rd2672, %rd1922, 3528531795; | |
shr.u64 %rd1923, %rd2672, 32; | |
and.b64 %rd1924, %rd350, 4294967295; | |
mul.lo.s64 %rd1925, %rd1924, 3528531795; | |
and.b64 %rd1926, %rd1925, 4294967295; | |
xor.b64 %rd1927, %rd1926, %rd1923; | |
xor.b64 %rd1928, %rd1927, 3144134277; | |
mul.lo.s64 %rd2675, %rd1928, 3449720151; | |
xor.b64 %rd2665, %rd1919, %rd1925; | |
// Same register set as the other arm, with the constants reordered and the
// two multipliers exchanged between %rd2666 and %rd2668.
mov.u32 %r342, -1767562579; | |
mov.u32 %r341, -766435501; | |
mov.u32 %r340, 1401181199; | |
mov.u64 %rd2683, 4055616968; | |
mov.u64 %rd2682, 1684936478; | |
mov.u64 %rd2681, 534103459; | |
mov.u64 %rd2680, 387276957; | |
mov.u64 %rd2679, 3041712726; | |
mov.u64 %rd2678, 3986602516; | |
mov.u64 %rd2677, 2835769497; | |
mov.u64 %rd2676, 3668340011; | |
mov.u64 %rd2674, 2027808484; | |
mov.u64 %rd2673, 1993301258; | |
mov.u64 %rd2671, 842468239; | |
mov.u64 %rd2670, 2654435769; | |
mov.u64 %rd2668, 3528531795; | |
mov.u64 %rd2667, 1013904242; | |
mov.u64 %rd2666, 3449720151; | |
// Join point for the two setup arms above. Runs the unrolled multiply/xor
// rounds on %rd2665-%rd2683, converts the result to a value in [0, 1), uses it
// as a keep/zero mask for one loaded value, and adds that step's contribution
// to the running f32 sum (%f14 -> %f15). Ends by preparing the next step and
// branching to its setup arm.
LBB29_41: | |
// Round pattern used throughout: `shr ..., 32` takes the high 32 bits of a
// 64-bit product and `and ..., 4294967295` the low 32 bits; each new product
// is (low ^ high ^ round constant) * multiplier, alternating between the
// multipliers held in %rd2666 and %rd2668.
shr.u64 %rd1955, %rd2675, 32; | |
shr.u64 %rd1956, %rd2665, 32; | |
mul.lo.s64 %rd1957, %rd1956, %rd2666; | |
and.b64 %rd1958, %rd1957, 4294967295; | |
xor.b64 %rd1959, %rd1958, %rd1955; | |
xor.b64 %rd1960, %rd1959, %rd2667; | |
mul.lo.s64 %rd1961, %rd1960, %rd2668; | |
shr.u64 %rd1962, %rd1961, 32; | |
shr.u64 %rd1963, %rd1957, 32; | |
and.b64 %rd1964, %rd2669, 4294967295; | |
xor.b64 %rd1965, %rd1964, %rd1963; | |
xor.b64 %rd1966, %rd1965, %rd2670; | |
mul.lo.s64 %rd1967, %rd1966, %rd2668; | |
and.b64 %rd1968, %rd1967, 4294967295; | |
xor.b64 %rd1969, %rd1968, %rd1962; | |
xor.b64 %rd1970, %rd1969, %rd2671; | |
mul.lo.s64 %rd1971, %rd1970, %rd2666; | |
shr.u64 %rd1972, %rd1971, 32; | |
shr.u64 %rd1973, %rd1967, 32; | |
and.b64 %rd1974, %rd2672, 4294967295; | |
xor.b64 %rd1975, %rd1974, %rd1973; | |
xor.b64 %rd1976, %rd1975, %rd2673; | |
mul.lo.s64 %rd1977, %rd1976, %rd2666; | |
and.b64 %rd1978, %rd1977, 4294967295; | |
xor.b64 %rd1979, %rd1978, %rd1972; | |
xor.b64 %rd1980, %rd1979, %rd2674; | |
mul.lo.s64 %rd1981, %rd1980, %rd2668; | |
shr.u64 %rd1982, %rd1981, 32; | |
shr.u64 %rd1983, %rd1977, 32; | |
and.b64 %rd1984, %rd2675, 4294967295; | |
xor.b64 %rd1985, %rd1984, %rd1983; | |
xor.b64 %rd1986, %rd1985, %rd2676; | |
mul.lo.s64 %rd1987, %rd1986, %rd2668; | |
and.b64 %rd1988, %rd1987, 4294967295; | |
xor.b64 %rd1989, %rd1988, %rd1982; | |
xor.b64 %rd1990, %rd1989, %rd2677; | |
mul.lo.s64 %rd1991, %rd1990, %rd2666; | |
shr.u64 %rd1992, %rd1991, 32; | |
shr.u64 %rd1993, %rd1987, 32; | |
and.b64 %rd1994, %rd1961, 4294967295; | |
xor.b64 %rd1995, %rd1994, %rd1993; | |
xor.b64 %rd1996, %rd1995, %rd2678; | |
mul.lo.s64 %rd1997, %rd1996, %rd2666; | |
and.b64 %rd1998, %rd1997, 4294967295; | |
xor.b64 %rd1999, %rd1998, %rd1992; | |
xor.b64 %rd2000, %rd1999, %rd2679; | |
mul.lo.s64 %rd2001, %rd2000, %rd2668; | |
shr.u64 %rd2002, %rd2001, 32; | |
shr.u64 %rd2003, %rd1997, 32; | |
and.b64 %rd2004, %rd1971, 4294967295; | |
xor.b64 %rd2005, %rd2004, %rd2003; | |
xor.b64 %rd2006, %rd2005, %rd2680; | |
mul.lo.s64 %rd2007, %rd2006, %rd2668; | |
and.b64 %rd2008, %rd2007, 4294967295; | |
xor.b64 %rd2009, %rd2008, %rd2002; | |
xor.b64 %rd2010, %rd2009, %rd2681; | |
mul.lo.s64 %rd2011, %rd2010, %rd2666; | |
shr.u64 %rd2012, %rd2011, 32; | |
shr.u64 %rd2013, %rd2007, 32; | |
and.b64 %rd2014, %rd1981, 4294967295; | |
xor.b64 %rd2015, %rd2014, %rd2013; | |
xor.b64 %rd2016, %rd2015, %rd2682; | |
mul.lo.s64 %rd2017, %rd2016, %rd2666; | |
and.b64 %rd2018, %rd2017, 4294967295; | |
xor.b64 %rd2019, %rd2018, %rd2012; | |
xor.b64 %rd2020, %rd2019, %rd2683; | |
mul.lo.s64 %rd2021, %rd2020, %rd2668; | |
// Fold the last round outputs into one 32-bit word (cvt.u32.u64 keeps the low
// 32 bits) using the arm-specific constants %r340-%r342.
shr.u64 %rd2022, %rd2021, 32; | |
cvt.u32.u64 %r222, %rd2022; | |
shr.u64 %rd2023, %rd2017, 32; | |
xor.b64 %rd2024, %rd2023, %rd1991; | |
cvt.u32.u64 %r223, %rd2024; | |
xor.b32 %r224, %r340, %r223; | |
mul.lo.s32 %r225, %r224, %r341; | |
xor.b32 %r226, %r225, %r222; | |
xor.b32 %r227, %r226, %r342; | |
// Keep the top 23 bits and scale by 2^-23 (0f34000000) to get a float in
// [0, 1), then round it to f16.
shr.u32 %r228, %r227, 9; | |
cvt.rn.f32.u32 %f179, %r228; | |
mul.rn.f32 %f180, %f179, 0f34000000; | |
cvt.rn.f16.f32 %h109, %f180; | |
// 0x2E66 is f16 ~0.1; %p52 is true when the random value is >= that threshold.
mov.b16 %h110, 0x2E66; | |
setp.ge.f16 %p52, %h109, %h110; | |
// Masked term: (f16 load + f32 load rounded to f16) * 0x3C72 (f16 ~1.111),
// replaced by zero when %p52 is false.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout at rate
// 0.1 - confirm against the model that produced this kernel.
ld.global.nc.b16 %h111, [%rd44+1536]; | |
ld.global.nc.f32 %f181, [%rd45+3072]; | |
cvt.rn.f16.f32 %h112, %f181; | |
add.rn.f16 %h113, %h111, %h112; | |
mov.b16 %h114, 0x3C72; | |
mul.rn.f16 %h115, %h113, %h114; | |
selp.b16 %h116, %h115, 0x0000, %p52; | |
cvt.f32.f16 %f182, %h116; | |
// %f191 = %f1*x*y + (z - %f2*%f1*x) + masked term, where y, x, z are the
// loads from %rd46, %rd47 and %rd48 below; %f15 = %f14 + %f191.
ld.global.nc.b16 %h117, [%rd46+1536]; | |
cvt.f32.f16 %f183, %h117; | |
ld.global.nc.f32 %f184, [%rd47+3072]; | |
mul.rn.f32 %f185, %f1, %f184; | |
mul.rn.f32 %f186, %f185, %f183; | |
ld.global.nc.f32 %f187, [%rd48+3072]; | |
mul.rn.f32 %f188, %f2, %f185; | |
sub.rn.f32 %f189, %f187, %f188; | |
add.rn.f32 %f190, %f186, %f189; | |
add.rn.f32 %f191, %f190, %f182; | |
add.rn.f32 %f15, %f14, %f191; | |
// Prepare the next step: %rd378 = %rd11 + ((%r3 | 769 | %r4) >> 2); the low
// two bits of (%r3 | 769) decide which setup arm runs (LBB29_43 when != 1).
or.b32 %r229, %r3, 769; | |
or.b32 %r230, %r229, %r4; | |
and.b32 %r231, %r229, 3; | |
shr.u32 %r232, %r230, 2; | |
setp.ne.s32 %p53, %r231, 1; | |
cvt.u64.u32 %rd2025, %r232; | |
add.s64 %rd378, %rd11, %rd2025; | |
@%p53 bra LBB29_43; | |
and.b64 %rd2065, %rd378, 4294967295; | |
mul.lo.s64 %rd2688, %rd2065, 3528531795; | |
setp.lt.u64 %p55, %rd378, %rd11; | |
selp.u64 %rd2066, 1, 0, %p55; | |
add.s64 %rd2067, %rd2461, %rd2066; | |
xor.b64 %rd2068, %rd2067, %rd2688; | |
shr.u64 %rd2069, %rd2068, 32; | |
mul.lo.s64 %rd2691, %rd2069, 3449720151; | |
shr.u64 %rd2070, %rd2691, 32; | |
and.b64 %rd2071, %rd2067, 4294967295; | |
mul.lo.s64 %rd2072, %rd2071, 3449720151; | |
and.b64 %rd2073, %rd2072, 4294967295; | |
xor.b64 %rd2074, %rd2073, %rd2070; | |
xor.b64 %rd2075, %rd2074, 2654435769; | |
mul.lo.s64 %rd2694, %rd2075, 3528531795; | |
xor.b64 %rd2684, %rd2072, %rd378; | |
mov.u32 %r344, -845247145; | |
mov.u32 %r343, -616729560; | |
mov.u64 %rd2701, 3041712726; | |
mov.u64 %rd2700, 1401181199; | |
mov.u64 %rd2699, 2835769497; | |
mov.u64 %rd2698, 1684936478; | |
mov.u64 %rd2697, 2027808484; | |
mov.u64 %rd2696, 387276957; | |
mov.u64 %rd2695, 842468239; | |
mov.u64 %rd2693, 3986602516; | |
mov.u64 %rd2692, 1013904242; | |
mov.u64 %rd2690, 3668340011; | |
mov.u64 %rd2689, 3144134277; | |
mov.u64 %rd2687, 3449720151; | |
mov.u64 %rd2686, 1993301258; | |
mov.u64 %rd2685, 3528531795; | |
bra.uni LBB29_44; | |
LBB29_43: | |
setp.lt.u64 %p54, %rd378, %rd11; | |
selp.u64 %rd2040, 1, 0, %p54; | |
add.s64 %rd2041, %rd2461, %rd2040; | |
and.b64 %rd2042, %rd2041, 4294967295; | |
mul.lo.s64 %rd2688, %rd2042, 3449720151; | |
xor.b64 %rd2043, %rd2688, %rd378; | |
shr.u64 %rd2044, %rd2043, 32; | |
mul.lo.s64 %rd2691, %rd2044, 3528531795; | |
shr.u64 %rd2045, %rd2691, 32; | |
and.b64 %rd2046, %rd378, 4294967295; | |
mul.lo.s64 %rd2047, %rd2046, 3528531795; | |
and.b64 %rd2048, %rd2047, 4294967295; | |
xor.b64 %rd2049, %rd2048, %rd2045; | |
xor.b64 %rd2050, %rd2049, 3144134277; | |
mul.lo.s64 %rd2694, %rd2050, 3449720151; | |
xor.b64 %rd2684, %rd2041, %rd2047; | |
mov.u32 %r344, -766435501; | |
mov.u32 %r343, -239350328; | |
mov.u64 %rd2701, 1684936478; | |
mov.u64 %rd2700, 534103459; | |
mov.u64 %rd2699, 387276957; | |
mov.u64 %rd2698, 3041712726; | |
mov.u64 %rd2697, 3986602516; | |
mov.u64 %rd2696, 2835769497; | |
mov.u64 %rd2695, 3668340011; | |
mov.u64 %rd2693, 2027808484; | |
mov.u64 %rd2692, 1993301258; | |
mov.u64 %rd2690, 842468239; | |
mov.u64 %rd2689, 2654435769; | |
mov.u64 %rd2687, 3528531795; | |
mov.u64 %rd2686, 1013904242; | |
mov.u64 %rd2685, 3449720151; | |
LBB29_44: | |
shr.u64 %rd2076, %rd2694, 32; | |
shr.u64 %rd2077, %rd2684, 32; | |
mul.lo.s64 %rd2078, %rd2077, %rd2685; | |
and.b64 %rd2079, %rd2078, 4294967295; | |
xor.b64 %rd2080, %rd2079, %rd2076; | |
xor.b64 %rd2081, %rd2080, %rd2686; | |
mul.lo.s64 %rd2082, %rd2081, %rd2687; | |
shr.u64 %rd2083, %rd2082, 32; | |
shr.u64 %rd2084, %rd2078, 32; | |
and.b64 %rd2085, %rd2688, 4294967295; | |
xor.b64 %rd2086, %rd2085, %rd2084; | |
xor.b64 %rd2087, %rd2086, %rd2689; | |
mul.lo.s64 %rd2088, %rd2087, %rd2687; | |
and.b64 %rd2089, %rd2088, 4294967295; | |
xor.b64 %rd2090, %rd2089, %rd2083; | |
xor.b64 %rd2091, %rd2090, %rd2690; | |
mul.lo.s64 %rd2092, %rd2091, %rd2685; | |
shr.u64 %rd2093, %rd2092, 32; | |
shr.u64 %rd2094, %rd2088, 32; | |
and.b64 %rd2095, %rd2691, 4294967295; | |
xor.b64 %rd2096, %rd2095, %rd2094; | |
xor.b64 %rd2097, %rd2096, %rd2692; | |
mul.lo.s64 %rd2098, %rd2097, %rd2685; | |
and.b64 %rd2099, %rd2098, 4294967295; | |
xor.b64 %rd2100, %rd2099, %rd2093; | |
xor.b64 %rd2101, %rd2100, %rd2693; | |
mul.lo.s64 %rd2102, %rd2101, %rd2687; | |
shr.u64 %rd2103, %rd2102, 32; | |
shr.u64 %rd2104, %rd2098, 32; | |
and.b64 %rd2105, %rd2694, 4294967295; | |
xor.b64 %rd2106, %rd2105, %rd2104; | |
xor.b64 %rd2107, %rd2106, %rd2695; | |
mul.lo.s64 %rd2108, %rd2107, %rd2687; | |
and.b64 %rd2109, %rd2108, 4294967295; | |
xor.b64 %rd2110, %rd2109, %rd2103; | |
xor.b64 %rd2111, %rd2110, %rd2696; | |
mul.lo.s64 %rd2112, %rd2111, %rd2685; | |
shr.u64 %rd2113, %rd2112, 32; | |
shr.u64 %rd2114, %rd2108, 32; | |
and.b64 %rd2115, %rd2082, 4294967295; | |
xor.b64 %rd2116, %rd2115, %rd2114; | |
xor.b64 %rd2117, %rd2116, %rd2697; | |
mul.lo.s64 %rd2118, %rd2117, %rd2685; | |
and.b64 %rd2119, %rd2118, 4294967295; | |
xor.b64 %rd2120, %rd2119, %rd2113; | |
xor.b64 %rd2121, %rd2120, %rd2698; | |
mul.lo.s64 %rd2122, %rd2121, %rd2687; | |
shr.u64 %rd2123, %rd2122, 32; | |
shr.u64 %rd2124, %rd2118, 32; | |
and.b64 %rd2125, %rd2092, 4294967295; | |
xor.b64 %rd2126, %rd2125, %rd2124; | |
xor.b64 %rd2127, %rd2126, %rd2699; | |
mul.lo.s64 %rd2128, %rd2127, %rd2687; | |
and.b64 %rd2129, %rd2128, 4294967295; | |
xor.b64 %rd2130, %rd2129, %rd2123; | |
xor.b64 %rd2131, %rd2130, %rd2700; | |
mul.lo.s64 %rd2132, %rd2131, %rd2685; | |
shr.u64 %rd2133, %rd2132, 32; | |
shr.u64 %rd2134, %rd2128, 32; | |
xor.b64 %rd2135, %rd2102, %rd2134; | |
xor.b64 %rd2136, %rd2135, %rd2701; | |
mul.lo.s64 %rd2137, %rd2136, %rd2685; | |
xor.b64 %rd2138, %rd2133, %rd2137; | |
cvt.u32.u64 %r237, %rd2138; | |
xor.b32 %r238, %r343, %r237; | |
mul.lo.s32 %r239, %r238, %r344; | |
shr.u32 %r240, %r239, 9; | |
cvt.rn.f32.u32 %f192, %r240; | |
mul.rn.f32 %f193, %f192, 0f34000000; | |
cvt.rn.f16.f32 %h118, %f193; | |
mov.b16 %h119, 0x2E66; | |
setp.ge.f16 %p57, %h118, %h119; | |
ld.global.nc.b16 %h120, [%rd44+1538]; | |
ld.global.nc.f32 %f194, [%rd45+3076]; | |
cvt.rn.f16.f32 %h121, %f194; | |
add.rn.f16 %h122, %h120, %h121; | |
mov.b16 %h123, 0x3C72; | |
mul.rn.f16 %h124, %h122, %h123; | |
selp.b16 %h125, %h124, 0x0000, %p57; | |
cvt.f32.f16 %f195, %h125; | |
ld.global.nc.b16 %h126, [%rd46+1538]; | |
cvt.f32.f16 %f196, %h126; | |
ld.global.nc.f32 %f197, [%rd47+3076]; | |
mul.rn.f32 %f198, %f1, %f197; | |
mul.rn.f32 %f199, %f198, %f196; | |
ld.global.nc.f32 %f200, [%rd48+3076]; | |
mul.rn.f32 %f201, %f2, %f198; | |
sub.rn.f32 %f202, %f200, %f201; | |
add.rn.f32 %f203, %f199, %f202; | |
add.rn.f32 %f204, %f203, %f195; | |
add.rn.f32 %f16, %f15, %f204; | |
or.b32 %r242, %r73, 896; | |
shr.u32 %r243, %r242, 2; | |
cvt.u64.u32 %rd2139, %r243; | |
add.s64 %rd405, %rd11, %rd2139; | |
@%p8 bra LBB29_46; | |
mov.u32 %r347, -1879881855; | |
mov.u32 %r345, 534103459; | |
mov.u64 %rd2720, 3678237736; | |
and.b64 %rd2181, %rd405, 4294967295; | |
mul.lo.s64 %rd2706, %rd2181, 3528531795; | |
setp.lt.u64 %p59, %rd405, %rd11; | |
selp.u64 %rd2182, 1, 0, %p59; | |
add.s64 %rd2183, %rd2461, %rd2182; | |
xor.b64 %rd2184, %rd2183, %rd2706; | |
shr.u64 %rd2185, %rd2184, 32; | |
mul.lo.s64 %rd2709, %rd2185, 3449720151; | |
shr.u64 %rd2186, %rd2709, 32; | |
and.b64 %rd2187, %rd2183, 4294967295; | |
mul.lo.s64 %rd2188, %rd2187, 3449720151; | |
and.b64 %rd2189, %rd2188, 4294967295; | |
xor.b64 %rd2190, %rd2189, %rd2186; | |
xor.b64 %rd2191, %rd2190, 2654435769; | |
mul.lo.s64 %rd2712, %rd2191, 3528531795; | |
xor.b64 %rd2702, %rd2188, %rd405; | |
mov.u32 %r346, -845247145; | |
mov.u64 %rd2719, 3041712726; | |
mov.u64 %rd2718, 1401181199; | |
mov.u64 %rd2717, 2835769497; | |
mov.u64 %rd2716, 1684936478; | |
mov.u64 %rd2715, 2027808484; | |
mov.u64 %rd2714, 387276957; | |
mov.u64 %rd2713, 842468239; | |
mov.u64 %rd2711, 3986602516; | |
mov.u64 %rd2710, 1013904242; | |
mov.u64 %rd2708, 3668340011; | |
mov.u64 %rd2707, 3144134277; | |
mov.u64 %rd2705, 3449720151; | |
mov.u64 %rd2704, 1993301258; | |
mov.u64 %rd2703, 3528531795; | |
bra.uni LBB29_47; | |
LBB29_46: | |
setp.lt.u64 %p58, %rd405, %rd11; | |
selp.u64 %rd2155, 1, 0, %p58; | |
add.s64 %rd2156, %rd2461, %rd2155; | |
and.b64 %rd2157, %rd2156, 4294967295; | |
mul.lo.s64 %rd2706, %rd2157, 3449720151; | |
xor.b64 %rd2158, %rd2706, %rd405; | |
shr.u64 %rd2159, %rd2158, 32; | |
mul.lo.s64 %rd2709, %rd2159, 3528531795; | |
shr.u64 %rd2160, %rd2709, 32; | |
and.b64 %rd2161, %rd405, 4294967295; | |
mul.lo.s64 %rd2162, %rd2161, 3528531795; | |
and.b64 %rd2163, %rd2162, 4294967295; | |
xor.b64 %rd2164, %rd2163, %rd2160; | |
xor.b64 %rd2165, %rd2164, 3144134277; | |
mul.lo.s64 %rd2712, %rd2165, 3449720151; | |
xor.b64 %rd2702, %rd2156, %rd2162; | |
mov.u32 %r347, -1767562579; | |
mov.u32 %r346, -766435501; | |
mov.u32 %r345, 1401181199; | |
mov.u64 %rd2720, 4055616968; | |
mov.u64 %rd2719, 1684936478; | |
mov.u64 %rd2718, 534103459; | |
mov.u64 %rd2717, 387276957; | |
mov.u64 %rd2716, 3041712726; | |
mov.u64 %rd2715, 3986602516; | |
mov.u64 %rd2714, 2835769497; | |
mov.u64 %rd2713, 3668340011; | |
mov.u64 %rd2711, 2027808484; | |
mov.u64 %rd2710, 1993301258; | |
mov.u64 %rd2708, 842468239; | |
mov.u64 %rd2707, 2654435769; | |
mov.u64 %rd2705, 3528531795; | |
mov.u64 %rd2704, 1013904242; | |
mov.u64 %rd2703, 3449720151; | |
LBB29_47: | |
shr.u64 %rd2192, %rd2712, 32; | |
shr.u64 %rd2193, %rd2702, 32; | |
mul.lo.s64 %rd2194, %rd2193, %rd2703; | |
and.b64 %rd2195, %rd2194, 4294967295; | |
xor.b64 %rd2196, %rd2195, %rd2192; | |
xor.b64 %rd2197, %rd2196, %rd2704; | |
mul.lo.s64 %rd2198, %rd2197, %rd2705; | |
shr.u64 %rd2199, %rd2198, 32; | |
shr.u64 %rd2200, %rd2194, 32; | |
and.b64 %rd2201, %rd2706, 4294967295; | |
xor.b64 %rd2202, %rd2201, %rd2200; | |
xor.b64 %rd2203, %rd2202, %rd2707; | |
mul.lo.s64 %rd2204, %rd2203, %rd2705; | |
and.b64 %rd2205, %rd2204, 4294967295; | |
xor.b64 %rd2206, %rd2205, %rd2199; | |
xor.b64 %rd2207, %rd2206, %rd2708; | |
mul.lo.s64 %rd2208, %rd2207, %rd2703; | |
shr.u64 %rd2209, %rd2208, 32; | |
shr.u64 %rd2210, %rd2204, 32; | |
and.b64 %rd2211, %rd2709, 4294967295; | |
xor.b64 %rd2212, %rd2211, %rd2210; | |
xor.b64 %rd2213, %rd2212, %rd2710; | |
mul.lo.s64 %rd2214, %rd2213, %rd2703; | |
and.b64 %rd2215, %rd2214, 4294967295; | |
xor.b64 %rd2216, %rd2215, %rd2209; | |
xor.b64 %rd2217, %rd2216, %rd2711; | |
mul.lo.s64 %rd2218, %rd2217, %rd2705; | |
shr.u64 %rd2219, %rd2218, 32; | |
shr.u64 %rd2220, %rd2214, 32; | |
and.b64 %rd2221, %rd2712, 4294967295; | |
xor.b64 %rd2222, %rd2221, %rd2220; | |
xor.b64 %rd2223, %rd2222, %rd2713; | |
mul.lo.s64 %rd2224, %rd2223, %rd2705; | |
and.b64 %rd2225, %rd2224, 4294967295; | |
xor.b64 %rd2226, %rd2225, %rd2219; | |
xor.b64 %rd2227, %rd2226, %rd2714; | |
mul.lo.s64 %rd2228, %rd2227, %rd2703; | |
shr.u64 %rd2229, %rd2228, 32; | |
shr.u64 %rd2230, %rd2224, 32; | |
and.b64 %rd2231, %rd2198, 4294967295; | |
xor.b64 %rd2232, %rd2231, %rd2230; | |
xor.b64 %rd2233, %rd2232, %rd2715; | |
mul.lo.s64 %rd2234, %rd2233, %rd2703; | |
and.b64 %rd2235, %rd2234, 4294967295; | |
xor.b64 %rd2236, %rd2235, %rd2229; | |
xor.b64 %rd2237, %rd2236, %rd2716; | |
mul.lo.s64 %rd2238, %rd2237, %rd2705; | |
shr.u64 %rd2239, %rd2238, 32; | |
shr.u64 %rd2240, %rd2234, 32; | |
and.b64 %rd2241, %rd2208, 4294967295; | |
xor.b64 %rd2242, %rd2241, %rd2240; | |
xor.b64 %rd2243, %rd2242, %rd2717; | |
mul.lo.s64 %rd2244, %rd2243, %rd2705; | |
and.b64 %rd2245, %rd2244, 4294967295; | |
xor.b64 %rd2246, %rd2245, %rd2239; | |
xor.b64 %rd2247, %rd2246, %rd2718; | |
mul.lo.s64 %rd2248, %rd2247, %rd2703; | |
shr.u64 %rd2249, %rd2248, 32; | |
shr.u64 %rd2250, %rd2244, 32; | |
and.b64 %rd2251, %rd2218, 4294967295; | |
xor.b64 %rd2252, %rd2251, %rd2250; | |
xor.b64 %rd2253, %rd2252, %rd2719; | |
mul.lo.s64 %rd2254, %rd2253, %rd2703; | |
and.b64 %rd2255, %rd2254, 4294967295; | |
xor.b64 %rd2256, %rd2255, %rd2249; | |
xor.b64 %rd2257, %rd2256, %rd2720; | |
mul.lo.s64 %rd2258, %rd2257, %rd2705; | |
shr.u64 %rd2259, %rd2258, 32; | |
cvt.u32.u64 %r250, %rd2259; | |
shr.u64 %rd2260, %rd2254, 32; | |
xor.b64 %rd2261, %rd2260, %rd2228; | |
cvt.u32.u64 %r251, %rd2261; | |
xor.b32 %r252, %r345, %r251; | |
mul.lo.s32 %r253, %r252, %r346; | |
xor.b32 %r254, %r253, %r250; | |
xor.b32 %r255, %r254, %r347; | |
shr.u32 %r256, %r255, 9; | |
cvt.rn.f32.u32 %f205, %r256; | |
mul.rn.f32 %f206, %f205, 0f34000000; | |
cvt.rn.f16.f32 %h127, %f206; | |
mov.b16 %h128, 0x2E66; | |
setp.ge.f16 %p60, %h127, %h128; | |
ld.global.nc.b16 %h129, [%rd44+1792]; | |
ld.global.nc.f32 %f207, [%rd45+3584]; | |
cvt.rn.f16.f32 %h130, %f207; | |
add.rn.f16 %h131, %h129, %h130; | |
mov.b16 %h132, 0x3C72; | |
mul.rn.f16 %h133, %h131, %h132; | |
selp.b16 %h134, %h133, 0x0000, %p60; | |
cvt.f32.f16 %f208, %h134; | |
ld.global.nc.b16 %h135, [%rd46+1792]; | |
cvt.f32.f16 %f209, %h135; | |
ld.global.nc.f32 %f210, [%rd47+3584]; | |
mul.rn.f32 %f211, %f1, %f210; | |
mul.rn.f32 %f212, %f211, %f209; | |
ld.global.nc.f32 %f213, [%rd48+3584]; | |
mul.rn.f32 %f214, %f2, %f211; | |
sub.rn.f32 %f215, %f213, %f214; | |
add.rn.f32 %f216, %f212, %f215; | |
add.rn.f32 %f217, %f216, %f208; | |
add.rn.f32 %f17, %f16, %f217; | |
or.b32 %r257, %r3, 897; | |
or.b32 %r258, %r257, %r4; | |
and.b32 %r259, %r257, 3; | |
shr.u32 %r260, %r258, 2; | |
setp.ne.s32 %p61, %r259, 1; | |
cvt.u64.u32 %rd2262, %r260; | |
add.s64 %rd433, %rd11, %rd2262; | |
@%p61 bra LBB29_49; | |
mov.u32 %r349, -845247145; | |
mov.u64 %rd2737, 1401181199; | |
mov.u64 %rd2726, 3144134277; | |
mov.u32 %r348, -616729560; | |
and.b64 %rd2302, %rd433, 4294967295; | |
mul.lo.s64 %rd2725, %rd2302, 3528531795; | |
setp.lt.u64 %p63, %rd433, %rd11; | |
selp.u64 %rd2303, 1, 0, %p63; | |
add.s64 %rd2304, %rd2461, %rd2303; | |
xor.b64 %rd2305, %rd2304, %rd2725; | |
shr.u64 %rd2306, %rd2305, 32; | |
mul.lo.s64 %rd2728, %rd2306, 3449720151; | |
shr.u64 %rd2307, %rd2728, 32; | |
and.b64 %rd2308, %rd2304, 4294967295; | |
mul.lo.s64 %rd2309, %rd2308, 3449720151; | |
and.b64 %rd2310, %rd2309, 4294967295; | |
xor.b64 %rd2311, %rd2310, %rd2307; | |
xor.b64 %rd2312, %rd2311, 2654435769; | |
mul.lo.s64 %rd2731, %rd2312, 3528531795; | |
xor.b64 %rd2721, %rd2309, %rd433; | |
mov.u64 %rd2738, 3041712726; | |
mov.u64 %rd2736, 2835769497; | |
mov.u64 %rd2735, 1684936478; | |
mov.u64 %rd2734, 2027808484; | |
mov.u64 %rd2733, 387276957; | |
mov.u64 %rd2732, 842468239; | |
mov.u64 %rd2730, 3986602516; | |
mov.u64 %rd2729, 1013904242; | |
mov.u64 %rd2727, 3668340011; | |
mov.u64 %rd2724, 3449720151; | |
mov.u64 %rd2723, 1993301258; | |
mov.u64 %rd2722, 3528531795; | |
bra.uni LBB29_50; | |
LBB29_49: | |
setp.lt.u64 %p62, %rd433, %rd11; | |
selp.u64 %rd2277, 1, 0, %p62; | |
add.s64 %rd2278, %rd2461, %rd2277; | |
and.b64 %rd2279, %rd2278, 4294967295; | |
mul.lo.s64 %rd2725, %rd2279, 3449720151; | |
xor.b64 %rd2280, %rd2725, %rd433; | |
shr.u64 %rd2281, %rd2280, 32; | |
mul.lo.s64 %rd2728, %rd2281, 3528531795; | |
shr.u64 %rd2282, %rd2728, 32; | |
and.b64 %rd2283, %rd433, 4294967295; | |
mul.lo.s64 %rd2284, %rd2283, 3528531795; | |
and.b64 %rd2285, %rd2284, 4294967295; | |
xor.b64 %rd2286, %rd2285, %rd2282; | |
xor.b64 %rd2287, %rd2286, 3144134277; | |
mul.lo.s64 %rd2731, %rd2287, 3449720151; | |
xor.b64 %rd2721, %rd2278, %rd2284; | |
mov.u32 %r349, -766435501; | |
mov.u32 %r348, -239350328; | |
mov.u64 %rd2738, 1684936478; | |
mov.u64 %rd2737, 534103459; | |
mov.u64 %rd2736, 387276957; | |
mov.u64 %rd2735, 3041712726; | |
mov.u64 %rd2734, 3986602516; | |
mov.u64 %rd2733, 2835769497; | |
mov.u64 %rd2732, 3668340011; | |
mov.u64 %rd2730, 2027808484; | |
mov.u64 %rd2729, 1993301258; | |
mov.u64 %rd2727, 842468239; | |
mov.u64 %rd2726, 2654435769; | |
mov.u64 %rd2724, 3528531795; | |
mov.u64 %rd2723, 1013904242; | |
mov.u64 %rd2722, 3449720151; | |
LBB29_50: | |
shr.u64 %rd2313, %rd2731, 32; | |
shr.u64 %rd2314, %rd2721, 32; | |
mul.lo.s64 %rd2315, %rd2314, %rd2722; | |
and.b64 %rd2316, %rd2315, 4294967295; | |
xor.b64 %rd2317, %rd2316, %rd2313; | |
xor.b64 %rd2318, %rd2317, %rd2723; | |
mul.lo.s64 %rd2319, %rd2318, %rd2724; | |
shr.u64 %rd2320, %rd2319, 32; | |
shr.u64 %rd2321, %rd2315, 32; | |
and.b64 %rd2322, %rd2725, 4294967295; | |
xor.b64 %rd2323, %rd2322, %rd2321; | |
xor.b64 %rd2324, %rd2323, %rd2726; | |
mul.lo.s64 %rd2325, %rd2324, %rd2724; | |
and.b64 %rd2326, %rd2325, 4294967295; | |
xor.b64 %rd2327, %rd2326, %rd2320; | |
xor.b64 %rd2328, %rd2327, %rd2727; | |
mul.lo.s64 %rd2329, %rd2328, %rd2722; | |
shr.u64 %rd2330, %rd2329, 32; | |
shr.u64 %rd2331, %rd2325, 32; | |
and.b64 %rd2332, %rd2728, 4294967295; | |
xor.b64 %rd2333, %rd2332, %rd2331; | |
xor.b64 %rd2334, %rd2333, %rd2729; | |
mul.lo.s64 %rd2335, %rd2334, %rd2722; | |
and.b64 %rd2336, %rd2335, 4294967295; | |
xor.b64 %rd2337, %rd2336, %rd2330; | |
xor.b64 %rd2338, %rd2337, %rd2730; | |
mul.lo.s64 %rd2339, %rd2338, %rd2724; | |
shr.u64 %rd2340, %rd2339, 32; | |
shr.u64 %rd2341, %rd2335, 32; | |
and.b64 %rd2342, %rd2731, 4294967295; | |
xor.b64 %rd2343, %rd2342, %rd2341; | |
xor.b64 %rd2344, %rd2343, %rd2732; | |
mul.lo.s64 %rd2345, %rd2344, %rd2724; | |
and.b64 %rd2346, %rd2345, 4294967295; | |
xor.b64 %rd2347, %rd2346, %rd2340; | |
xor.b64 %rd2348, %rd2347, %rd2733; | |
mul.lo.s64 %rd2349, %rd2348, %rd2722; | |
shr.u64 %rd2350, %rd2349, 32; | |
shr.u64 %rd2351, %rd2345, 32; | |
and.b64 %rd2352, %rd2319, 4294967295; | |
xor.b64 %rd2353, %rd2352, %rd2351; | |
xor.b64 %rd2354, %rd2353, %rd2734; | |
mul.lo.s64 %rd2355, %rd2354, %rd2722; | |
and.b64 %rd2356, %rd2355, 4294967295; | |
xor.b64 %rd2357, %rd2356, %rd2350; | |
xor.b64 %rd2358, %rd2357, %rd2735; | |
mul.lo.s64 %rd2359, %rd2358, %rd2724; | |
shr.u64 %rd2360, %rd2359, 32; | |
shr.u64 %rd2361, %rd2355, 32; | |
and.b64 %rd2362, %rd2329, 4294967295; | |
xor.b64 %rd2363, %rd2362, %rd2361; | |
xor.b64 %rd2364, %rd2363, %rd2736; | |
mul.lo.s64 %rd2365, %rd2364, %rd2724; | |
and.b64 %rd2366, %rd2365, 4294967295; | |
xor.b64 %rd2367, %rd2366, %rd2360; | |
xor.b64 %rd2368, %rd2367, %rd2737; | |
mul.lo.s64 %rd2369, %rd2368, %rd2722; | |
shr.u64 %rd2370, %rd2369, 32; | |
shr.u64 %rd2371, %rd2365, 32; | |
xor.b64 %rd2372, %rd2339, %rd2371; | |
xor.b64 %rd2373, %rd2372, %rd2738; | |
mul.lo.s64 %rd2374, %rd2373, %rd2722; | |
xor.b64 %rd2375, %rd2370, %rd2374; | |
cvt.u32.u64 %r265, %rd2375; | |
xor.b32 %r266, %r348, %r265; | |
mul.lo.s32 %r267, %r266, %r349; | |
shr.u32 %r268, %r267, 9; | |
cvt.rn.f32.u32 %f218, %r268; | |
mul.rn.f32 %f219, %f218, 0f34000000; | |
cvt.rn.f16.f32 %h136, %f219; | |
mov.b16 %h137, 0x2E66; | |
setp.ge.f16 %p64, %h136, %h137; | |
ld.global.nc.b16 %h138, [%rd44+1794]; | |
ld.global.nc.f32 %f220, [%rd45+3588]; | |
cvt.rn.f16.f32 %h139, %f220; | |
add.rn.f16 %h140, %h138, %h139; | |
mov.b16 %h141, 0x3C72; | |
mul.rn.f16 %h142, %h140, %h141; | |
selp.b16 %h143, %h142, 0x0000, %p64; | |
cvt.f32.f16 %f221, %h143; | |
ld.global.nc.b16 %h144, [%rd46+1794]; | |
cvt.f32.f16 %f222, %h144; | |
ld.global.nc.f32 %f223, [%rd47+3588]; | |
mul.rn.f32 %f224, %f1, %f223; | |
mul.rn.f32 %f225, %f224, %f222; | |
ld.global.nc.f32 %f226, [%rd48+3588]; | |
mul.rn.f32 %f227, %f2, %f224; | |
sub.rn.f32 %f228, %f226, %f227; | |
add.rn.f32 %f229, %f225, %f228; | |
add.rn.f32 %f230, %f229, %f221; | |
add.rn.f32 %f231, %f17, %f230; | |
and.b32 %r46, %r1, 31; | |
shfl.sync.down.b32 %f232, %f231, 16, 31, -1; | |
add.rn.f32 %f233, %f232, %f231; | |
shfl.sync.down.b32 %f234, %f233, 8, 31, -1; | |
add.rn.f32 %f235, %f234, %f233; | |
shfl.sync.down.b32 %f236, %f235, 4, 31, -1; | |
add.rn.f32 %f237, %f236, %f235; | |
shfl.sync.down.b32 %f238, %f237, 2, 31, -1; | |
add.rn.f32 %f239, %f238, %f237; | |
shfl.sync.down.b32 %f240, %f239, 1, 31, -1; | |
shr.u32 %r47, %r1, 5; | |
setp.ne.s32 %p65, %r46, 0; | |
mov.u64 %rd2378, shared_cache_06; | |
@%p65 bra LBB29_2; | |
mul.wide.u32 %rd2377, %r47, 4; | |
add.s64 %rd461, %rd2378, %rd2377; | |
add.rn.f32 %f18, %f240, %f239; | |
st.shared.f32 [%rd461], %f18; | |
LBB29_2: | |
bar.sync 0; | |
setp.eq.s32 %p66, %r47, 0; | |
@%p66 bra LBB29_52; | |
bra.uni LBB29_3; | |
LBB29_52: | |
add.u64 %rd472, %SP, 0; | |
add.u64 %rd10, %SPL, 0; | |
mul.wide.u32 %rd2379, %r46, 4; | |
add.s64 %rd462, %rd2378, %rd2379; | |
cvta.shared.u64 %rd2381, %rd462; | |
mov.u32 %r269, 0; | |
st.local.u32 [%rd10], %r269; | |
setp.lt.u32 %p67, %r1, 2; | |
selp.b64 %rd2383, %rd2381, %rd472, %p67; | |
ld.f32 %f241, [%rd2383]; | |
shfl.sync.down.b32 %f242, %f241, 16, 31, -1; | |
add.rn.f32 %f243, %f241, %f242; | |
shfl.sync.down.b32 %f244, %f243, 8, 31, -1; | |
add.rn.f32 %f245, %f243, %f244; | |
shfl.sync.down.b32 %f246, %f245, 4, 31, -1; | |
add.rn.f32 %f247, %f245, %f246; | |
shfl.sync.down.b32 %f248, %f247, 2, 31, -1; | |
add.rn.f32 %f249, %f247, %f248; | |
shfl.sync.down.b32 %f250, %f249, 1, 31, -1; | |
add.rn.f32 %f251, %f249, %f250; | |
st.f32 [%rd2383], %f251; | |
setp.ne.s32 %p68, %r1, 0; | |
@%p68 bra LBB29_3; | |
ld.param.u64 %rd469, [fusion_2248_param_3]; | |
cvt.u64.u32 %rd43, %r2; | |
cvta.to.global.u64 %rd6, %rd469; | |
shl.b64 %rd2376, %rd43, 2; | |
add.s64 %rd460, %rd6, %rd2376; | |
ld.shared.f32 %f252, [%rd462]; | |
atom.global.add.f32 %f253, [%rd460], %f252; | |
LBB29_3: | |
ret; | |
} | |
// .globl fusion_2246 | |
.visible .entry fusion_2246( | |
.param .u64 fusion_2246_param_0, | |
.param .u64 fusion_2246_param_1, | |
.param .u64 fusion_2246_param_2, | |
.param .u64 fusion_2246_param_3, | |
.param .u64 fusion_2246_param_4, | |
.param .u64 fusion_2246_param_5, | |
.param .u64 fusion_2246_param_6, | |
.param .u64 fusion_2246_param_7, | |
.param .u64 fusion_2246_param_8, | |
.param .u64 fusion_2246_param_9, | |
.param .u64 fusion_2246_param_10 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
.local .align 4 .b8 __local_depot30[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<75>; | |
.reg .b16 %h<145>; | |
.reg .f32 %f<288>; | |
.reg .b32 %r<350>; | |
.reg .b64 %rd<2742>; | |
mov.u64 %SPL, __local_depot30; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd464, [fusion_2246_param_0]; | |
ld.param.u64 %rd465, [fusion_2246_param_9]; | |
cvta.to.global.u64 %rd1, %rd465; | |
ld.param.u64 %rd466, [fusion_2246_param_1]; | |
ld.param.u64 %rd467, [fusion_2246_param_8]; | |
cvta.to.global.u64 %rd2, %rd467; | |
ld.param.u64 %rd468, [fusion_2246_param_2]; | |
ld.param.u64 %rd469, [fusion_2246_param_7]; | |
cvta.to.global.u64 %rd3, %rd469; | |
ld.param.u64 %rd471, [fusion_2246_param_6]; | |
cvta.to.global.u64 %rd4, %rd471; | |
ld.param.u64 %rd472, [fusion_2246_param_4]; | |
ld.param.u64 %rd473, [fusion_2246_param_5]; | |
cvta.to.global.u64 %rd5, %rd473; | |
cvta.to.global.u64 %rd6, %rd472; | |
cvta.to.global.u64 %rd8, %rd468; | |
cvta.to.global.u64 %rd9, %rd466; | |
cvta.to.global.u64 %rd10, %rd464; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r2, %ctaid.x; | |
shl.b32 %r3, %r1, 1; | |
shl.b32 %r4, %r2, 10; | |
or.b32 %r48, %r4, %r3; | |
shr.u32 %r49, %r48, 2; | |
and.b32 %r5, %r1, 1; | |
setp.eq.s32 %p1, %r5, 0; | |
ld.global.nc.u64 %rd12, [%rd8]; | |
cvt.u64.u32 %rd475, %r49; | |
add.s64 %rd13, %rd12, %rd475; | |
setp.lt.u64 %p69, %rd13, %rd12; | |
and.b64 %rd2387, %rd13, 4294967295; | |
@%p1 bra LBB30_1; | |
bra.uni LBB30_4; | |
LBB30_1: | |
mul.lo.s64 %rd2449, %rd2387, 3528531795; | |
ld.global.nc.u64 %rd2464, [%rd8+8]; | |
selp.u64 %rd518, 1, 0, %p69; | |
add.s64 %rd519, %rd2464, %rd518; | |
xor.b64 %rd520, %rd519, %rd2449; | |
shr.u64 %rd521, %rd520, 32; | |
mul.lo.s64 %rd2452, %rd521, 3449720151; | |
shr.u64 %rd522, %rd2452, 32; | |
and.b64 %rd523, %rd519, 4294967295; | |
mul.lo.s64 %rd524, %rd523, 3449720151; | |
and.b64 %rd525, %rd524, 4294967295; | |
xor.b64 %rd526, %rd525, %rd522; | |
xor.b64 %rd527, %rd526, 2654435769; | |
mul.lo.s64 %rd2455, %rd527, 3528531795; | |
xor.b64 %rd2445, %rd524, %rd13; | |
mov.u32 %r312, -1879881855; | |
mov.u32 %r311, -845247145; | |
mov.u32 %r310, 534103459; | |
mov.u64 %rd2463, 3678237736; | |
mov.u64 %rd2462, 3041712726; | |
mov.u64 %rd2461, 1401181199; | |
mov.u64 %rd2460, 2835769497; | |
mov.u64 %rd2459, 1684936478; | |
mov.u64 %rd2458, 2027808484; | |
mov.u64 %rd2457, 387276957; | |
mov.u64 %rd2456, 842468239; | |
mov.u64 %rd2454, 3986602516; | |
mov.u64 %rd2453, 1013904242; | |
mov.u64 %rd2451, 3668340011; | |
mov.u64 %rd2450, 3144134277; | |
mov.u64 %rd2448, 3449720151; | |
mov.u64 %rd2447, 1993301258; | |
mov.u64 %rd2446, 3528531795; | |
bra.uni LBB30_5; | |
LBB30_4: | |
mov.u32 %r311, -766435501; | |
mov.u64 %rd2462, 1684936478; | |
mov.u64 %rd2461, 534103459; | |
mov.u64 %rd2460, 387276957; | |
mov.u64 %rd2459, 3041712726; | |
mov.u64 %rd2458, 3986602516; | |
mov.u64 %rd2457, 2835769497; | |
mov.u64 %rd2456, 3668340011; | |
mov.u64 %rd2454, 2027808484; | |
mov.u64 %rd2453, 1993301258; | |
mov.u64 %rd2451, 842468239; | |
mov.u64 %rd2450, 2654435769; | |
mov.u64 %rd2448, 3528531795; | |
mov.u64 %rd2447, 1013904242; | |
mov.u64 %rd2446, 3449720151; | |
mov.u32 %r312, -1767562579; | |
mov.u32 %r310, 1401181199; | |
mov.u64 %rd2463, 4055616968; | |
ld.global.nc.u64 %rd2464, [%rd8+8]; | |
selp.u64 %rd491, 1, 0, %p69; | |
add.s64 %rd492, %rd2464, %rd491; | |
and.b64 %rd493, %rd492, 4294967295; | |
mul.lo.s64 %rd2449, %rd493, 3449720151; | |
xor.b64 %rd494, %rd2449, %rd13; | |
shr.u64 %rd495, %rd494, 32; | |
mul.lo.s64 %rd2452, %rd495, 3528531795; | |
shr.u64 %rd496, %rd2452, 32; | |
mul.lo.s64 %rd498, %rd2387, 3528531795; | |
and.b64 %rd499, %rd498, 4294967295; | |
xor.b64 %rd500, %rd499, %rd496; | |
xor.b64 %rd501, %rd500, 3144134277; | |
mul.lo.s64 %rd2455, %rd501, 3449720151; | |
xor.b64 %rd2445, %rd492, %rd498; | |
LBB30_5: | |
shr.u64 %rd528, %rd2455, 32; | |
shr.u64 %rd529, %rd2445, 32; | |
mul.lo.s64 %rd530, %rd529, %rd2446; | |
and.b64 %rd531, %rd530, 4294967295; | |
xor.b64 %rd532, %rd531, %rd528; | |
xor.b64 %rd533, %rd532, %rd2447; | |
mul.lo.s64 %rd534, %rd533, %rd2448; | |
shr.u64 %rd535, %rd534, 32; | |
shr.u64 %rd536, %rd530, 32; | |
and.b64 %rd537, %rd2449, 4294967295; | |
xor.b64 %rd538, %rd537, %rd536; | |
xor.b64 %rd539, %rd538, %rd2450; | |
mul.lo.s64 %rd540, %rd539, %rd2448; | |
and.b64 %rd541, %rd540, 4294967295; | |
xor.b64 %rd542, %rd541, %rd535; | |
xor.b64 %rd543, %rd542, %rd2451; | |
mul.lo.s64 %rd544, %rd543, %rd2446; | |
shr.u64 %rd545, %rd544, 32; | |
shr.u64 %rd546, %rd540, 32; | |
and.b64 %rd547, %rd2452, 4294967295; | |
xor.b64 %rd548, %rd547, %rd546; | |
xor.b64 %rd549, %rd548, %rd2453; | |
mul.lo.s64 %rd550, %rd549, %rd2446; | |
and.b64 %rd551, %rd550, 4294967295; | |
xor.b64 %rd552, %rd551, %rd545; | |
xor.b64 %rd553, %rd552, %rd2454; | |
mul.lo.s64 %rd554, %rd553, %rd2448; | |
shr.u64 %rd555, %rd554, 32; | |
shr.u64 %rd556, %rd550, 32; | |
and.b64 %rd557, %rd2455, 4294967295; | |
xor.b64 %rd558, %rd557, %rd556; | |
xor.b64 %rd559, %rd558, %rd2456; | |
mul.lo.s64 %rd560, %rd559, %rd2448; | |
and.b64 %rd561, %rd560, 4294967295; | |
xor.b64 %rd562, %rd561, %rd555; | |
xor.b64 %rd563, %rd562, %rd2457; | |
mul.lo.s64 %rd564, %rd563, %rd2446; | |
shr.u64 %rd565, %rd564, 32; | |
shr.u64 %rd566, %rd560, 32; | |
and.b64 %rd567, %rd534, 4294967295; | |
xor.b64 %rd568, %rd567, %rd566; | |
xor.b64 %rd569, %rd568, %rd2458; | |
mul.lo.s64 %rd570, %rd569, %rd2446; | |
and.b64 %rd571, %rd570, 4294967295; | |
xor.b64 %rd572, %rd571, %rd565; | |
xor.b64 %rd573, %rd572, %rd2459; | |
mul.lo.s64 %rd574, %rd573, %rd2448; | |
shr.u64 %rd575, %rd574, 32; | |
shr.u64 %rd576, %rd570, 32; | |
and.b64 %rd577, %rd544, 4294967295; | |
xor.b64 %rd578, %rd577, %rd576; | |
xor.b64 %rd579, %rd578, %rd2460; | |
mul.lo.s64 %rd580, %rd579, %rd2448; | |
and.b64 %rd581, %rd580, 4294967295; | |
xor.b64 %rd582, %rd581, %rd575; | |
xor.b64 %rd583, %rd582, %rd2461; | |
mul.lo.s64 %rd584, %rd583, %rd2446; | |
shr.u64 %rd585, %rd584, 32; | |
shr.u64 %rd586, %rd580, 32; | |
and.b64 %rd587, %rd554, 4294967295; | |
xor.b64 %rd588, %rd587, %rd586; | |
xor.b64 %rd589, %rd588, %rd2462; | |
mul.lo.s64 %rd590, %rd589, %rd2446; | |
and.b64 %rd591, %rd590, 4294967295; | |
xor.b64 %rd592, %rd591, %rd585; | |
xor.b64 %rd593, %rd592, %rd2463; | |
mul.lo.s64 %rd594, %rd593, %rd2448; | |
shr.u64 %rd595, %rd594, 32; | |
cvt.u32.u64 %r56, %rd595; | |
shr.u64 %rd596, %rd590, 32; | |
xor.b64 %rd597, %rd596, %rd564; | |
cvt.u32.u64 %r57, %rd597; | |
xor.b32 %r58, %r310, %r57; | |
mul.lo.s32 %r59, %r58, %r311; | |
xor.b32 %r60, %r59, %r56; | |
xor.b32 %r61, %r60, %r312; | |
shr.u32 %r62, %r61, 9; | |
cvt.rn.f32.u32 %f20, %r62; | |
mul.rn.f32 %f21, %f20, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f21; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p4, %h1, %h2; | |
mul.wide.u32 %rd598, %r2, 2048; | |
add.s64 %rd599, %rd10, %rd598; | |
mul.wide.u32 %rd600, %r3, 2; | |
add.s64 %rd45, %rd599, %rd600; | |
ld.global.nc.b16 %h3, [%rd45]; | |
mul.wide.u32 %rd601, %r3, 4; | |
add.s64 %rd46, %rd1, %rd601; | |
ld.global.nc.f32 %f22, [%rd46]; | |
cvt.rn.f16.f32 %h4, %f22; | |
add.rn.f16 %h5, %h3, %h4; | |
mov.b16 %h6, 0x3C72; | |
mul.rn.f16 %h7, %h5, %h6; | |
selp.b16 %h8, %h7, 0x0000, %p4; | |
cvt.f32.f16 %f23, %h8; | |
add.s64 %rd602, %rd9, %rd598; | |
add.s64 %rd47, %rd602, %rd600; | |
ld.global.nc.b16 %h9, [%rd47]; | |
cvt.f32.f16 %f24, %h9; | |
mul.wide.u32 %rd603, %r2, 4; | |
add.s64 %rd604, %rd5, %rd603; | |
ld.global.nc.f32 %f25, [%rd604]; | |
mul.rn.f32 %f26, %f25, 0f3A800000; | |
add.rn.f32 %f27, %f26, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f1, %f27; | |
add.s64 %rd48, %rd2, %rd601; | |
ld.global.nc.f32 %f28, [%rd48]; | |
mul.rn.f32 %f29, %f1, %f28; | |
mul.rn.f32 %f30, %f29, %f24; | |
add.s64 %rd49, %rd3, %rd601; | |
ld.global.nc.f32 %f31, [%rd49]; | |
add.s64 %rd605, %rd4, %rd603; | |
ld.global.nc.f32 %f32, [%rd605]; | |
mul.rn.f32 %f2, %f32, 0f3A800000; | |
mul.rn.f32 %f33, %f29, %f2; | |
sub.rn.f32 %f34, %f31, %f33; | |
add.rn.f32 %f35, %f30, %f34; | |
add.rn.f32 %f36, %f35, %f23; | |
add.s64 %rd606, %rd6, %rd603; | |
ld.global.nc.f32 %f37, [%rd606]; | |
mul.rn.f32 %f3, %f37, 0f3A800000; | |
sub.rn.f32 %f38, %f36, %f3; | |
mul.rn.f32 %f39, %f38, %f38; | |
add.rn.f32 %f4, %f39, 0f00000000; | |
or.b32 %r63, %r3, 1; | |
and.b32 %r64, %r63, 3; | |
setp.ne.s32 %p5, %r64, 1; | |
@%p5 bra LBB30_7; | |
mul.lo.s64 %rd2469, %rd2387, 3528531795; | |
selp.u64 %rd647, 1, 0, %p69; | |
add.s64 %rd648, %rd2464, %rd647; | |
xor.b64 %rd649, %rd648, %rd2469; | |
shr.u64 %rd650, %rd649, 32; | |
mul.lo.s64 %rd2472, %rd650, 3449720151; | |
shr.u64 %rd651, %rd2472, 32; | |
and.b64 %rd652, %rd648, 4294967295; | |
mul.lo.s64 %rd653, %rd652, 3449720151; | |
and.b64 %rd654, %rd653, 4294967295; | |
xor.b64 %rd655, %rd654, %rd651; | |
xor.b64 %rd656, %rd655, 2654435769; | |
mul.lo.s64 %rd2475, %rd656, 3528531795; | |
xor.b64 %rd2465, %rd653, %rd13; | |
mov.u32 %r314, -845247145; | |
mov.u32 %r313, -616729560; | |
mov.u64 %rd2482, 3041712726; | |
mov.u64 %rd2481, 1401181199; | |
mov.u64 %rd2480, 2835769497; | |
mov.u64 %rd2479, 1684936478; | |
mov.u64 %rd2478, 2027808484; | |
mov.u64 %rd2477, 387276957; | |
mov.u64 %rd2476, 842468239; | |
mov.u64 %rd2474, 3986602516; | |
mov.u64 %rd2473, 1013904242; | |
mov.u64 %rd2471, 3668340011; | |
mov.u64 %rd2470, 3144134277; | |
mov.u64 %rd2468, 3449720151; | |
mov.u64 %rd2467, 1993301258; | |
mov.u64 %rd2466, 3528531795; | |
bra.uni LBB30_8; | |
LBB30_7: | |
mov.u32 %r313, -239350328; | |
selp.u64 %rd621, 1, 0, %p69; | |
add.s64 %rd622, %rd2464, %rd621; | |
and.b64 %rd623, %rd622, 4294967295; | |
mul.lo.s64 %rd2469, %rd623, 3449720151; | |
xor.b64 %rd624, %rd2469, %rd13; | |
shr.u64 %rd625, %rd624, 32; | |
mul.lo.s64 %rd2472, %rd625, 3528531795; | |
shr.u64 %rd626, %rd2472, 32; | |
mul.lo.s64 %rd628, %rd2387, 3528531795; | |
and.b64 %rd629, %rd628, 4294967295; | |
xor.b64 %rd630, %rd629, %rd626; | |
xor.b64 %rd631, %rd630, 3144134277; | |
mul.lo.s64 %rd2475, %rd631, 3449720151; | |
xor.b64 %rd2465, %rd622, %rd628; | |
mov.u32 %r314, -766435501; | |
mov.u64 %rd2482, 1684936478; | |
mov.u64 %rd2481, 534103459; | |
mov.u64 %rd2480, 387276957; | |
mov.u64 %rd2479, 3041712726; | |
mov.u64 %rd2478, 3986602516; | |
mov.u64 %rd2477, 2835769497; | |
mov.u64 %rd2476, 3668340011; | |
mov.u64 %rd2474, 2027808484; | |
mov.u64 %rd2473, 1993301258; | |
mov.u64 %rd2471, 842468239; | |
mov.u64 %rd2470, 2654435769; | |
mov.u64 %rd2468, 3528531795; | |
mov.u64 %rd2467, 1013904242; | |
mov.u64 %rd2466, 3449720151; | |
LBB30_8: | |
setp.ne.s32 %p8, %r5, 0; | |
shr.u64 %rd657, %rd2475, 32; | |
shr.u64 %rd658, %rd2465, 32; | |
mul.lo.s64 %rd659, %rd658, %rd2466; | |
and.b64 %rd660, %rd659, 4294967295; | |
xor.b64 %rd661, %rd660, %rd657; | |
xor.b64 %rd662, %rd661, %rd2467; | |
mul.lo.s64 %rd663, %rd662, %rd2468; | |
shr.u64 %rd664, %rd663, 32; | |
shr.u64 %rd665, %rd659, 32; | |
and.b64 %rd666, %rd2469, 4294967295; | |
xor.b64 %rd667, %rd666, %rd665; | |
xor.b64 %rd668, %rd667, %rd2470; | |
mul.lo.s64 %rd669, %rd668, %rd2468; | |
and.b64 %rd670, %rd669, 4294967295; | |
xor.b64 %rd671, %rd670, %rd664; | |
xor.b64 %rd672, %rd671, %rd2471; | |
mul.lo.s64 %rd673, %rd672, %rd2466; | |
shr.u64 %rd674, %rd673, 32; | |
shr.u64 %rd675, %rd669, 32; | |
and.b64 %rd676, %rd2472, 4294967295; | |
xor.b64 %rd677, %rd676, %rd675; | |
xor.b64 %rd678, %rd677, %rd2473; | |
mul.lo.s64 %rd679, %rd678, %rd2466; | |
and.b64 %rd680, %rd679, 4294967295; | |
xor.b64 %rd681, %rd680, %rd674; | |
xor.b64 %rd682, %rd681, %rd2474; | |
mul.lo.s64 %rd683, %rd682, %rd2468; | |
shr.u64 %rd684, %rd683, 32; | |
shr.u64 %rd685, %rd679, 32; | |
and.b64 %rd686, %rd2475, 4294967295; | |
xor.b64 %rd687, %rd686, %rd685; | |
xor.b64 %rd688, %rd687, %rd2476; | |
mul.lo.s64 %rd689, %rd688, %rd2468; | |
and.b64 %rd690, %rd689, 4294967295; | |
xor.b64 %rd691, %rd690, %rd684; | |
xor.b64 %rd692, %rd691, %rd2477; | |
mul.lo.s64 %rd693, %rd692, %rd2466; | |
shr.u64 %rd694, %rd693, 32; | |
shr.u64 %rd695, %rd689, 32; | |
and.b64 %rd696, %rd663, 4294967295; | |
xor.b64 %rd697, %rd696, %rd695; | |
xor.b64 %rd698, %rd697, %rd2478; | |
mul.lo.s64 %rd699, %rd698, %rd2466; | |
and.b64 %rd700, %rd699, 4294967295; | |
xor.b64 %rd701, %rd700, %rd694; | |
xor.b64 %rd702, %rd701, %rd2479; | |
mul.lo.s64 %rd703, %rd702, %rd2468; | |
shr.u64 %rd704, %rd703, 32; | |
shr.u64 %rd705, %rd699, 32; | |
and.b64 %rd706, %rd673, 4294967295; | |
xor.b64 %rd707, %rd706, %rd705; | |
xor.b64 %rd708, %rd707, %rd2480; | |
mul.lo.s64 %rd709, %rd708, %rd2468; | |
and.b64 %rd710, %rd709, 4294967295; | |
xor.b64 %rd711, %rd710, %rd704; | |
xor.b64 %rd712, %rd711, %rd2481; | |
mul.lo.s64 %rd713, %rd712, %rd2466; | |
shr.u64 %rd714, %rd713, 32; | |
shr.u64 %rd715, %rd709, 32; | |
xor.b64 %rd716, %rd683, %rd715; | |
xor.b64 %rd717, %rd716, %rd2482; | |
mul.lo.s64 %rd718, %rd717, %rd2466; | |
xor.b64 %rd719, %rd714, %rd718; | |
cvt.u32.u64 %r69, %rd719; | |
xor.b32 %r70, %r313, %r69; | |
mul.lo.s32 %r71, %r70, %r314; | |
shr.u32 %r72, %r71, 9; | |
cvt.rn.f32.u32 %f40, %r72; | |
mul.rn.f32 %f41, %f40, 0f34000000; | |
cvt.rn.f16.f32 %h10, %f41; | |
mov.b16 %h11, 0x2E66; | |
setp.ge.f16 %p9, %h10, %h11; | |
ld.global.nc.b16 %h12, [%rd45+2]; | |
ld.global.nc.f32 %f42, [%rd46+4]; | |
cvt.rn.f16.f32 %h13, %f42; | |
add.rn.f16 %h14, %h12, %h13; | |
mov.b16 %h15, 0x3C72; | |
mul.rn.f16 %h16, %h14, %h15; | |
selp.b16 %h17, %h16, 0x0000, %p9; | |
cvt.f32.f16 %f43, %h17; | |
ld.global.nc.b16 %h18, [%rd47+2]; | |
cvt.f32.f16 %f44, %h18; | |
ld.global.nc.f32 %f45, [%rd48+4]; | |
mul.rn.f32 %f46, %f1, %f45; | |
mul.rn.f32 %f47, %f46, %f44; | |
ld.global.nc.f32 %f48, [%rd49+4]; | |
mul.rn.f32 %f49, %f2, %f46; | |
sub.rn.f32 %f50, %f48, %f49; | |
add.rn.f32 %f51, %f47, %f50; | |
add.rn.f32 %f52, %f51, %f43; | |
sub.rn.f32 %f53, %f52, %f3; | |
mul.rn.f32 %f54, %f53, %f53; | |
add.rn.f32 %f5, %f4, %f54; | |
// 64-bit counter for this element: %rd76 = %rd12 + ((%r3 | %r4 | 128) >> 2).
// %rd2436 holds its low 32 bits; %p74 is set when the unsigned add wrapped
// around (carry out of the 64-bit addition).
or.b32 %r73, %r3, %r4; | |
or.b32 %r74, %r73, 128; | |
shr.u32 %r75, %r74, 2; | |
cvt.u64.u32 %rd720, %r75; | |
add.s64 %rd76, %rd12, %rd720; | |
and.b64 %rd2436, %rd76, 4294967295; | |
setp.lt.u64 %p74, %rd76, %rd12; | |
// %p8 is computed before this excerpt; it selects which seeding arm runs.
@%p8 bra LBB30_10; | |
// Arm executed when %p8 is false. It seeds everything read after the join at
// LBB30_11: the data-dependent values %rd2483, %rd2487, %rd2490, %rd2493 and
// a table of constants (%rd2484..%rd2501, %r315..%r317).
// %rd764 = %rd2464 + 1 if the counter add carried (%p74), else %rd2464.
// NOTE(review): 3528531795 (0xD2511F53), 3449720151 (0xCD9E8D57) and
// 2654435769 (0x9E3779B9) match the published Philox4x32 constants - confirm.
mul.lo.s64 %rd2487, %rd2436, 3528531795; | |
selp.u64 %rd763, 1, 0, %p74; | |
add.s64 %rd764, %rd2464, %rd763; | |
xor.b64 %rd765, %rd764, %rd2487; | |
shr.u64 %rd766, %rd765, 32; | |
mul.lo.s64 %rd2490, %rd766, 3449720151; | |
shr.u64 %rd767, %rd2490, 32; | |
and.b64 %rd768, %rd764, 4294967295; | |
mul.lo.s64 %rd769, %rd768, 3449720151; | |
and.b64 %rd770, %rd769, 4294967295; | |
xor.b64 %rd771, %rd770, %rd767; | |
xor.b64 %rd772, %rd771, 2654435769; | |
mul.lo.s64 %rd2493, %rd772, 3528531795; | |
xor.b64 %rd2483, %rd769, %rd76; | |
// Constant table for this arm; the other arm writes the same registers with
// different values, so the code after the join is shared.
mov.u32 %r317, -1879881855; | |
mov.u32 %r316, -845247145; | |
mov.u32 %r315, 534103459; | |
mov.u64 %rd2501, 3678237736; | |
mov.u64 %rd2500, 3041712726; | |
mov.u64 %rd2499, 1401181199; | |
mov.u64 %rd2498, 2835769497; | |
mov.u64 %rd2497, 1684936478; | |
mov.u64 %rd2496, 2027808484; | |
mov.u64 %rd2495, 387276957; | |
mov.u64 %rd2494, 842468239; | |
mov.u64 %rd2492, 3986602516; | |
mov.u64 %rd2491, 1013904242; | |
mov.u64 %rd2489, 3668340011; | |
mov.u64 %rd2488, 3144134277; | |
mov.u64 %rd2486, 3449720151; | |
mov.u64 %rd2485, 1993301258; | |
mov.u64 %rd2484, 3528531795; | |
bra.uni LBB30_11; | |
// Arm executed when %p8 is true. It writes the same registers as the
// fall-through arm (%rd2483, %rd2487, %rd2490, %rd2493 plus the constant
// table), but with the two multipliers 3449720151 / 3528531795 exchanged and
// 3144134277 (0xBB67AE85) used in place of 2654435769 in the first xor.
LBB30_10: | |
// %rd737 = %rd2464 + 1 if the counter add carried (%p74), else %rd2464.
selp.u64 %rd736, 1, 0, %p74; | |
add.s64 %rd737, %rd2464, %rd736; | |
and.b64 %rd738, %rd737, 4294967295; | |
mul.lo.s64 %rd2487, %rd738, 3449720151; | |
xor.b64 %rd739, %rd2487, %rd76; | |
shr.u64 %rd740, %rd739, 32; | |
mul.lo.s64 %rd2490, %rd740, 3528531795; | |
shr.u64 %rd741, %rd2490, 32; | |
mul.lo.s64 %rd743, %rd2436, 3528531795; | |
and.b64 %rd744, %rd743, 4294967295; | |
xor.b64 %rd745, %rd744, %rd741; | |
xor.b64 %rd746, %rd745, 3144134277; | |
mul.lo.s64 %rd2493, %rd746, 3449720151; | |
xor.b64 %rd2483, %rd737, %rd743; | |
// Constant table for this arm (falls through into LBB30_11).
mov.u32 %r317, -1767562579; | |
mov.u32 %r316, -766435501; | |
mov.u32 %r315, 1401181199; | |
mov.u64 %rd2501, 4055616968; | |
mov.u64 %rd2500, 1684936478; | |
mov.u64 %rd2499, 534103459; | |
mov.u64 %rd2498, 387276957; | |
mov.u64 %rd2497, 3041712726; | |
mov.u64 %rd2496, 3986602516; | |
mov.u64 %rd2495, 2835769497; | |
mov.u64 %rd2494, 3668340011; | |
mov.u64 %rd2492, 2027808484; | |
mov.u64 %rd2491, 1993301258; | |
mov.u64 %rd2489, 842468239; | |
mov.u64 %rd2488, 2654435769; | |
mov.u64 %rd2486, 3528531795; | |
mov.u64 %rd2485, 1013904242; | |
mov.u64 %rd2484, 3449720151; | |
// Join of the two seeding arms. Straight-line chain of steps of the form
// "take a 32-bit half of an earlier product, xor it with the upper half of
// another product and a table constant, multiply by %rd2484 or %rd2486".
// Multipliers and xor constants were chosen by whichever arm ran.
// NOTE(review): structure and constants resemble unrolled Philox4x32 rounds -
// confirm against the generator that emitted this kernel.
LBB30_11: | |
shr.u64 %rd773, %rd2493, 32; | |
shr.u64 %rd774, %rd2483, 32; | |
mul.lo.s64 %rd775, %rd774, %rd2484; | |
and.b64 %rd776, %rd775, 4294967295; | |
xor.b64 %rd777, %rd776, %rd773; | |
xor.b64 %rd778, %rd777, %rd2485; | |
mul.lo.s64 %rd779, %rd778, %rd2486; | |
shr.u64 %rd780, %rd779, 32; | |
shr.u64 %rd781, %rd775, 32; | |
and.b64 %rd782, %rd2487, 4294967295; | |
xor.b64 %rd783, %rd782, %rd781; | |
xor.b64 %rd784, %rd783, %rd2488; | |
mul.lo.s64 %rd785, %rd784, %rd2486; | |
and.b64 %rd786, %rd785, 4294967295; | |
xor.b64 %rd787, %rd786, %rd780; | |
xor.b64 %rd788, %rd787, %rd2489; | |
mul.lo.s64 %rd789, %rd788, %rd2484; | |
shr.u64 %rd790, %rd789, 32; | |
shr.u64 %rd791, %rd785, 32; | |
and.b64 %rd792, %rd2490, 4294967295; | |
xor.b64 %rd793, %rd792, %rd791; | |
xor.b64 %rd794, %rd793, %rd2491; | |
mul.lo.s64 %rd795, %rd794, %rd2484; | |
and.b64 %rd796, %rd795, 4294967295; | |
xor.b64 %rd797, %rd796, %rd790; | |
xor.b64 %rd798, %rd797, %rd2492; | |
mul.lo.s64 %rd799, %rd798, %rd2486; | |
shr.u64 %rd800, %rd799, 32; | |
shr.u64 %rd801, %rd795, 32; | |
and.b64 %rd802, %rd2493, 4294967295; | |
xor.b64 %rd803, %rd802, %rd801; | |
xor.b64 %rd804, %rd803, %rd2494; | |
mul.lo.s64 %rd805, %rd804, %rd2486; | |
and.b64 %rd806, %rd805, 4294967295; | |
xor.b64 %rd807, %rd806, %rd800; | |
xor.b64 %rd808, %rd807, %rd2495; | |
mul.lo.s64 %rd809, %rd808, %rd2484; | |
shr.u64 %rd810, %rd809, 32; | |
shr.u64 %rd811, %rd805, 32; | |
and.b64 %rd812, %rd779, 4294967295; | |
xor.b64 %rd813, %rd812, %rd811; | |
xor.b64 %rd814, %rd813, %rd2496; | |
mul.lo.s64 %rd815, %rd814, %rd2484; | |
and.b64 %rd816, %rd815, 4294967295; | |
xor.b64 %rd817, %rd816, %rd810; | |
xor.b64 %rd818, %rd817, %rd2497; | |
mul.lo.s64 %rd819, %rd818, %rd2486; | |
shr.u64 %rd820, %rd819, 32; | |
shr.u64 %rd821, %rd815, 32; | |
and.b64 %rd822, %rd789, 4294967295; | |
xor.b64 %rd823, %rd822, %rd821; | |
xor.b64 %rd824, %rd823, %rd2498; | |
mul.lo.s64 %rd825, %rd824, %rd2486; | |
and.b64 %rd826, %rd825, 4294967295; | |
xor.b64 %rd827, %rd826, %rd820; | |
xor.b64 %rd828, %rd827, %rd2499; | |
mul.lo.s64 %rd829, %rd828, %rd2484; | |
shr.u64 %rd830, %rd829, 32; | |
shr.u64 %rd831, %rd825, 32; | |
and.b64 %rd832, %rd799, 4294967295; | |
xor.b64 %rd833, %rd832, %rd831; | |
xor.b64 %rd834, %rd833, %rd2500; | |
mul.lo.s64 %rd835, %rd834, %rd2484; | |
and.b64 %rd836, %rd835, 4294967295; | |
xor.b64 %rd837, %rd836, %rd830; | |
xor.b64 %rd838, %rd837, %rd2501; | |
mul.lo.s64 %rd839, %rd838, %rd2486; | |
// Final 32-bit mix producing the random word:
// %r87 = ((%r315 ^ lo32((%rd835 >> 32) ^ %rd809)) * %r316) ^ lo32(%rd839 >> 32) ^ %r317
shr.u64 %rd840, %rd839, 32; | |
cvt.u32.u64 %r82, %rd840; | |
shr.u64 %rd841, %rd835, 32; | |
xor.b64 %rd842, %rd841, %rd809; | |
cvt.u32.u64 %r83, %rd842; | |
xor.b32 %r84, %r315, %r83; | |
mul.lo.s32 %r85, %r84, %r316; | |
xor.b32 %r86, %r85, %r82; | |
xor.b32 %r87, %r86, %r317; | |
// Turn the 32-bit random word %r87 into an f32 in [0, 1): keep the top 23 bits
// (shift right by 9) and scale by 0f34000000 (= 2^-23); then narrow to f16.
shr.u32 %r88, %r87, 9; | |
cvt.rn.f32.u32 %f55, %r88; | |
mul.rn.f32 %f56, %f55, 0f34000000; | |
cvt.rn.f16.f32 %h19, %f56; | |
// 0x2E66 is ~0.1 as an f16 bit pattern; %p12 is set when the random value >= it.
mov.b16 %h20, 0x2E66; | |
setp.ge.f16 %p12, %h19, %h20; | |
// f16 sum of [%rd45+256] and the f32 at [%rd46+512] (narrowed to f16), scaled
// by 0x3C72 (~1.111 in f16). The result is kept when %p12 is set, otherwise 0.
// Byte offsets 256 (2-byte items) and 512 (4-byte items) both address item 128.
// NOTE(review): looks like dropout with rate 0.1 and a 1/0.9 rescale - confirm.
ld.global.nc.b16 %h21, [%rd45+256]; | |
ld.global.nc.f32 %f57, [%rd46+512]; | |
cvt.rn.f16.f32 %h22, %f57; | |
add.rn.f16 %h23, %h21, %h22; | |
mov.b16 %h24, 0x3C72; | |
mul.rn.f16 %h25, %h23, %h24; | |
selp.b16 %h26, %h25, 0x0000, %p12; | |
cvt.f32.f16 %f58, %h26; | |
// %f67 = %f1*[%rd48+512]*[%rd47+256] + ([%rd49+512] - %f2*%f1*[%rd48+512]) + %f58,
// each step rounded to nearest. %f1, %f2, %f3 are defined before this excerpt.
ld.global.nc.b16 %h27, [%rd47+256]; | |
cvt.f32.f16 %f59, %h27; | |
ld.global.nc.f32 %f60, [%rd48+512]; | |
mul.rn.f32 %f61, %f1, %f60; | |
mul.rn.f32 %f62, %f61, %f59; | |
ld.global.nc.f32 %f63, [%rd49+512]; | |
mul.rn.f32 %f64, %f2, %f61; | |
sub.rn.f32 %f65, %f63, %f64; | |
add.rn.f32 %f66, %f62, %f65; | |
add.rn.f32 %f67, %f66, %f58; | |
// Running sum of squared differences from %f3: %f6 = %f5 + (%f67 - %f3)^2.
sub.rn.f32 %f68, %f67, %f3; | |
mul.rn.f32 %f69, %f68, %f68; | |
add.rn.f32 %f6, %f5, %f69; | |
// 64-bit counter for this element: %rd104 = %rd12 + ((%r3 | %r4 | 129) >> 2).
// %rd2433 holds its low 32 bits; %p73 is set when the unsigned add wrapped
// around (carry out of the 64-bit addition).
// %p13 = (((%r3 | 129) & 3) != 1) selects which seeding arm runs.
or.b32 %r89, %r3, 129; | |
or.b32 %r90, %r89, %r4; | |
and.b32 %r91, %r89, 3; | |
shr.u32 %r92, %r90, 2; | |
setp.ne.s32 %p13, %r91, 1; | |
cvt.u64.u32 %rd843, %r92; | |
add.s64 %rd104, %rd12, %rd843; | |
and.b64 %rd2433, %rd104, 4294967295; | |
setp.lt.u64 %p73, %rd104, %rd12; | |
@%p13 bra LBB30_13; | |
// Arm executed when %p13 is false. It seeds everything read after the join at
// LBB30_14: the data-dependent values %rd2502, %rd2506, %rd2509, %rd2512 and
// a table of constants (%rd2503..%rd2519, %r318, %r319).
// %rd885 = %rd2464 + 1 if the counter add carried (%p73), else %rd2464.
// NOTE(review): 3528531795 (0xD2511F53), 3449720151 (0xCD9E8D57) and
// 2654435769 (0x9E3779B9) match the published Philox4x32 constants - confirm.
mul.lo.s64 %rd2506, %rd2433, 3528531795; | |
selp.u64 %rd884, 1, 0, %p73; | |
add.s64 %rd885, %rd2464, %rd884; | |
xor.b64 %rd886, %rd885, %rd2506; | |
shr.u64 %rd887, %rd886, 32; | |
mul.lo.s64 %rd2509, %rd887, 3449720151; | |
shr.u64 %rd888, %rd2509, 32; | |
and.b64 %rd889, %rd885, 4294967295; | |
mul.lo.s64 %rd890, %rd889, 3449720151; | |
and.b64 %rd891, %rd890, 4294967295; | |
xor.b64 %rd892, %rd891, %rd888; | |
xor.b64 %rd893, %rd892, 2654435769; | |
mul.lo.s64 %rd2512, %rd893, 3528531795; | |
xor.b64 %rd2502, %rd890, %rd104; | |
// Constant table for this arm; the other arm writes the same registers with
// different values, so the code after the join is shared.
mov.u32 %r319, -845247145; | |
mov.u32 %r318, -616729560; | |
mov.u64 %rd2519, 3041712726; | |
mov.u64 %rd2518, 1401181199; | |
mov.u64 %rd2517, 2835769497; | |
mov.u64 %rd2516, 1684936478; | |
mov.u64 %rd2515, 2027808484; | |
mov.u64 %rd2514, 387276957; | |
mov.u64 %rd2513, 842468239; | |
mov.u64 %rd2511, 3986602516; | |
mov.u64 %rd2510, 1013904242; | |
mov.u64 %rd2508, 3668340011; | |
mov.u64 %rd2507, 3144134277; | |
mov.u64 %rd2505, 3449720151; | |
mov.u64 %rd2504, 1993301258; | |
mov.u64 %rd2503, 3528531795; | |
bra.uni LBB30_14; | |
// Arm executed when %p13 is true. It writes the same registers as the
// fall-through arm (%rd2502, %rd2506, %rd2509, %rd2512 plus the constant
// table), but with the two multipliers 3449720151 / 3528531795 exchanged and
// 3144134277 (0xBB67AE85) used in place of 2654435769 in the first xor.
LBB30_13: | |
// %rd859 = %rd2464 + 1 if the counter add carried (%p73), else %rd2464.
selp.u64 %rd858, 1, 0, %p73; | |
add.s64 %rd859, %rd2464, %rd858; | |
and.b64 %rd860, %rd859, 4294967295; | |
mul.lo.s64 %rd2506, %rd860, 3449720151; | |
xor.b64 %rd861, %rd2506, %rd104; | |
shr.u64 %rd862, %rd861, 32; | |
mul.lo.s64 %rd2509, %rd862, 3528531795; | |
shr.u64 %rd863, %rd2509, 32; | |
mul.lo.s64 %rd865, %rd2433, 3528531795; | |
and.b64 %rd866, %rd865, 4294967295; | |
xor.b64 %rd867, %rd866, %rd863; | |
xor.b64 %rd868, %rd867, 3144134277; | |
mul.lo.s64 %rd2512, %rd868, 3449720151; | |
xor.b64 %rd2502, %rd859, %rd865; | |
// Constant table for this arm (falls through into LBB30_14).
mov.u32 %r319, -766435501; | |
mov.u32 %r318, -239350328; | |
mov.u64 %rd2519, 1684936478; | |
mov.u64 %rd2518, 534103459; | |
mov.u64 %rd2517, 387276957; | |
mov.u64 %rd2516, 3041712726; | |
mov.u64 %rd2515, 3986602516; | |
mov.u64 %rd2514, 2835769497; | |
mov.u64 %rd2513, 3668340011; | |
mov.u64 %rd2511, 2027808484; | |
mov.u64 %rd2510, 1993301258; | |
mov.u64 %rd2508, 842468239; | |
mov.u64 %rd2507, 2654435769; | |
mov.u64 %rd2505, 3528531795; | |
mov.u64 %rd2504, 1013904242; | |
mov.u64 %rd2503, 3449720151; | |
// Join of the two seeding arms. Straight-line chain of steps of the form
// "take a 32-bit half of an earlier product, xor it with the upper half of
// another product and a table constant, multiply by %rd2503 or %rd2505".
// Multipliers and xor constants were chosen by whichever arm ran.
// NOTE(review): structure and constants resemble unrolled Philox4x32 rounds -
// confirm against the generator that emitted this kernel.
LBB30_14: | |
shr.u64 %rd894, %rd2512, 32; | |
shr.u64 %rd895, %rd2502, 32; | |
mul.lo.s64 %rd896, %rd895, %rd2503; | |
and.b64 %rd897, %rd896, 4294967295; | |
xor.b64 %rd898, %rd897, %rd894; | |
xor.b64 %rd899, %rd898, %rd2504; | |
mul.lo.s64 %rd900, %rd899, %rd2505; | |
shr.u64 %rd901, %rd900, 32; | |
shr.u64 %rd902, %rd896, 32; | |
and.b64 %rd903, %rd2506, 4294967295; | |
xor.b64 %rd904, %rd903, %rd902; | |
xor.b64 %rd905, %rd904, %rd2507; | |
mul.lo.s64 %rd906, %rd905, %rd2505; | |
and.b64 %rd907, %rd906, 4294967295; | |
xor.b64 %rd908, %rd907, %rd901; | |
xor.b64 %rd909, %rd908, %rd2508; | |
mul.lo.s64 %rd910, %rd909, %rd2503; | |
shr.u64 %rd911, %rd910, 32; | |
shr.u64 %rd912, %rd906, 32; | |
and.b64 %rd913, %rd2509, 4294967295; | |
xor.b64 %rd914, %rd913, %rd912; | |
xor.b64 %rd915, %rd914, %rd2510; | |
mul.lo.s64 %rd916, %rd915, %rd2503; | |
and.b64 %rd917, %rd916, 4294967295; | |
xor.b64 %rd918, %rd917, %rd911; | |
xor.b64 %rd919, %rd918, %rd2511; | |
mul.lo.s64 %rd920, %rd919, %rd2505; | |
shr.u64 %rd921, %rd920, 32; | |
shr.u64 %rd922, %rd916, 32; | |
and.b64 %rd923, %rd2512, 4294967295; | |
xor.b64 %rd924, %rd923, %rd922; | |
xor.b64 %rd925, %rd924, %rd2513; | |
mul.lo.s64 %rd926, %rd925, %rd2505; | |
and.b64 %rd927, %rd926, 4294967295; | |
xor.b64 %rd928, %rd927, %rd921; | |
xor.b64 %rd929, %rd928, %rd2514; | |
mul.lo.s64 %rd930, %rd929, %rd2503; | |
shr.u64 %rd931, %rd930, 32; | |
shr.u64 %rd932, %rd926, 32; | |
and.b64 %rd933, %rd900, 4294967295; | |
xor.b64 %rd934, %rd933, %rd932; | |
xor.b64 %rd935, %rd934, %rd2515; | |
mul.lo.s64 %rd936, %rd935, %rd2503; | |
and.b64 %rd937, %rd936, 4294967295; | |
xor.b64 %rd938, %rd937, %rd931; | |
xor.b64 %rd939, %rd938, %rd2516; | |
mul.lo.s64 %rd940, %rd939, %rd2505; | |
shr.u64 %rd941, %rd940, 32; | |
shr.u64 %rd942, %rd936, 32; | |
and.b64 %rd943, %rd910, 4294967295; | |
xor.b64 %rd944, %rd943, %rd942; | |
xor.b64 %rd945, %rd944, %rd2517; | |
mul.lo.s64 %rd946, %rd945, %rd2505; | |
and.b64 %rd947, %rd946, 4294967295; | |
xor.b64 %rd948, %rd947, %rd941; | |
xor.b64 %rd949, %rd948, %rd2518; | |
mul.lo.s64 %rd950, %rd949, %rd2503; | |
shr.u64 %rd951, %rd950, 32; | |
shr.u64 %rd952, %rd946, 32; | |
xor.b64 %rd953, %rd920, %rd952; | |
xor.b64 %rd954, %rd953, %rd2519; | |
mul.lo.s64 %rd955, %rd954, %rd2503; | |
// Final 32-bit mix producing the random word:
// %r99 = (%r318 ^ lo32((%rd950 >> 32) ^ %rd955)) * %r319
xor.b64 %rd956, %rd951, %rd955; | |
cvt.u32.u64 %r97, %rd956; | |
xor.b32 %r98, %r318, %r97; | |
mul.lo.s32 %r99, %r98, %r319; | |
// Turn the 32-bit random word %r99 into an f32 in [0, 1): keep the top 23 bits
// (shift right by 9) and scale by 0f34000000 (= 2^-23); then narrow to f16.
shr.u32 %r100, %r99, 9; | |
cvt.rn.f32.u32 %f70, %r100; | |
mul.rn.f32 %f71, %f70, 0f34000000; | |
cvt.rn.f16.f32 %h28, %f71; | |
// 0x2E66 is ~0.1 as an f16 bit pattern; %p17 is set when the random value >= it.
mov.b16 %h29, 0x2E66; | |
setp.ge.f16 %p17, %h28, %h29; | |
// f16 sum of [%rd45+258] and the f32 at [%rd46+516] (narrowed to f16), scaled
// by 0x3C72 (~1.111 in f16). The result is kept when %p17 is set, otherwise 0.
// Byte offsets 258 (2-byte items) and 516 (4-byte items) both address item 129.
// NOTE(review): looks like dropout with rate 0.1 and a 1/0.9 rescale - confirm.
ld.global.nc.b16 %h30, [%rd45+258]; | |
ld.global.nc.f32 %f72, [%rd46+516]; | |
cvt.rn.f16.f32 %h31, %f72; | |
add.rn.f16 %h32, %h30, %h31; | |
mov.b16 %h33, 0x3C72; | |
mul.rn.f16 %h34, %h32, %h33; | |
selp.b16 %h35, %h34, 0x0000, %p17; | |
cvt.f32.f16 %f73, %h35; | |
// %f82 = %f1*[%rd48+516]*[%rd47+258] + ([%rd49+516] - %f2*%f1*[%rd48+516]) + %f73,
// each step rounded to nearest. %f1, %f2, %f3 are defined before this excerpt.
ld.global.nc.b16 %h36, [%rd47+258]; | |
cvt.f32.f16 %f74, %h36; | |
ld.global.nc.f32 %f75, [%rd48+516]; | |
mul.rn.f32 %f76, %f1, %f75; | |
mul.rn.f32 %f77, %f76, %f74; | |
ld.global.nc.f32 %f78, [%rd49+516]; | |
mul.rn.f32 %f79, %f2, %f76; | |
sub.rn.f32 %f80, %f78, %f79; | |
add.rn.f32 %f81, %f77, %f80; | |
add.rn.f32 %f82, %f81, %f73; | |
// Running sum of squared differences from %f3: %f7 = %f6 + (%f82 - %f3)^2.
sub.rn.f32 %f83, %f82, %f3; | |
mul.rn.f32 %f84, %f83, %f83; | |
add.rn.f32 %f7, %f6, %f84; | |
// 64-bit counter for this element: %rd131 = %rd12 + ((%r73 | 256) >> 2),
// where %r73 was computed earlier in the kernel. %rd2429 holds its low 32 bits;
// %p72 is set when the unsigned add wrapped around (carry out).
or.b32 %r102, %r73, 256; | |
shr.u32 %r103, %r102, 2; | |
cvt.u64.u32 %rd957, %r103; | |
add.s64 %rd131, %rd12, %rd957; | |
and.b64 %rd2429, %rd131, 4294967295; | |
setp.lt.u64 %p72, %rd131, %rd12; | |
// %p8 is computed before this excerpt; it selects which seeding arm runs.
@%p8 bra LBB30_16; | |
// Arm executed when %p8 is false. It seeds everything read after the join at
// LBB30_17: the data-dependent values %rd2520, %rd2524, %rd2527, %rd2530 and
// a table of constants (%rd2521..%rd2538, %r320..%r322).
// %rd1001 = %rd2464 + 1 if the counter add carried (%p72), else %rd2464.
// NOTE(review): 3528531795 (0xD2511F53), 3449720151 (0xCD9E8D57) and
// 2654435769 (0x9E3779B9) match the published Philox4x32 constants - confirm.
mul.lo.s64 %rd2524, %rd2429, 3528531795; | |
selp.u64 %rd1000, 1, 0, %p72; | |
add.s64 %rd1001, %rd2464, %rd1000; | |
xor.b64 %rd1002, %rd1001, %rd2524; | |
shr.u64 %rd1003, %rd1002, 32; | |
mul.lo.s64 %rd2527, %rd1003, 3449720151; | |
shr.u64 %rd1004, %rd2527, 32; | |
and.b64 %rd1005, %rd1001, 4294967295; | |
mul.lo.s64 %rd1006, %rd1005, 3449720151; | |
and.b64 %rd1007, %rd1006, 4294967295; | |
xor.b64 %rd1008, %rd1007, %rd1004; | |
xor.b64 %rd1009, %rd1008, 2654435769; | |
mul.lo.s64 %rd2530, %rd1009, 3528531795; | |
xor.b64 %rd2520, %rd1006, %rd131; | |
// Constant table for this arm; the other arm writes the same registers with
// different values, so the code after the join is shared.
mov.u32 %r322, -1879881855; | |
mov.u32 %r321, -845247145; | |
mov.u32 %r320, 534103459; | |
mov.u64 %rd2538, 3678237736; | |
mov.u64 %rd2537, 3041712726; | |
mov.u64 %rd2536, 1401181199; | |
mov.u64 %rd2535, 2835769497; | |
mov.u64 %rd2534, 1684936478; | |
mov.u64 %rd2533, 2027808484; | |
mov.u64 %rd2532, 387276957; | |
mov.u64 %rd2531, 842468239; | |
mov.u64 %rd2529, 3986602516; | |
mov.u64 %rd2528, 1013904242; | |
mov.u64 %rd2526, 3668340011; | |
mov.u64 %rd2525, 3144134277; | |
mov.u64 %rd2523, 3449720151; | |
mov.u64 %rd2522, 1993301258; | |
mov.u64 %rd2521, 3528531795; | |
bra.uni LBB30_17; | |
// Arm executed when %p8 is true. It writes the same registers as the
// fall-through arm (%rd2520, %rd2524, %rd2527, %rd2530 plus the constant
// table), but with the two multipliers 3449720151 / 3528531795 exchanged and
// 3144134277 (0xBB67AE85) used in place of 2654435769 in the first xor.
LBB30_16: | |
// %rd974 = %rd2464 + 1 if the counter add carried (%p72), else %rd2464.
selp.u64 %rd973, 1, 0, %p72; | |
add.s64 %rd974, %rd2464, %rd973; | |
and.b64 %rd975, %rd974, 4294967295; | |
mul.lo.s64 %rd2524, %rd975, 3449720151; | |
xor.b64 %rd976, %rd2524, %rd131; | |
shr.u64 %rd977, %rd976, 32; | |
mul.lo.s64 %rd2527, %rd977, 3528531795; | |
shr.u64 %rd978, %rd2527, 32; | |
mul.lo.s64 %rd980, %rd2429, 3528531795; | |
and.b64 %rd981, %rd980, 4294967295; | |
xor.b64 %rd982, %rd981, %rd978; | |
xor.b64 %rd983, %rd982, 3144134277; | |
mul.lo.s64 %rd2530, %rd983, 3449720151; | |
xor.b64 %rd2520, %rd974, %rd980; | |
// Constant table for this arm (falls through into LBB30_17).
mov.u32 %r322, -1767562579; | |
mov.u32 %r321, -766435501; | |
mov.u32 %r320, 1401181199; | |
mov.u64 %rd2538, 4055616968; | |
mov.u64 %rd2537, 1684936478; | |
mov.u64 %rd2536, 534103459; | |
mov.u64 %rd2535, 387276957; | |
mov.u64 %rd2534, 3041712726; | |
mov.u64 %rd2533, 3986602516; | |
mov.u64 %rd2532, 2835769497; | |
mov.u64 %rd2531, 3668340011; | |
mov.u64 %rd2529, 2027808484; | |
mov.u64 %rd2528, 1993301258; | |
mov.u64 %rd2526, 842468239; | |
mov.u64 %rd2525, 2654435769; | |
mov.u64 %rd2523, 3528531795; | |
mov.u64 %rd2522, 1013904242; | |
mov.u64 %rd2521, 3449720151; | |
// Join of the two seeding arms. Straight-line chain of steps of the form
// "take a 32-bit half of an earlier product, xor it with the upper half of
// another product and a table constant, multiply by %rd2521 or %rd2523".
// Multipliers and xor constants were chosen by whichever arm ran.
// NOTE(review): structure and constants resemble unrolled Philox4x32 rounds -
// confirm against the generator that emitted this kernel.
LBB30_17: | |
shr.u64 %rd1010, %rd2530, 32; | |
shr.u64 %rd1011, %rd2520, 32; | |
mul.lo.s64 %rd1012, %rd1011, %rd2521; | |
and.b64 %rd1013, %rd1012, 4294967295; | |
xor.b64 %rd1014, %rd1013, %rd1010; | |
xor.b64 %rd1015, %rd1014, %rd2522; | |
mul.lo.s64 %rd1016, %rd1015, %rd2523; | |
shr.u64 %rd1017, %rd1016, 32; | |
shr.u64 %rd1018, %rd1012, 32; | |
and.b64 %rd1019, %rd2524, 4294967295; | |
xor.b64 %rd1020, %rd1019, %rd1018; | |
xor.b64 %rd1021, %rd1020, %rd2525; | |
mul.lo.s64 %rd1022, %rd1021, %rd2523; | |
and.b64 %rd1023, %rd1022, 4294967295; | |
xor.b64 %rd1024, %rd1023, %rd1017; | |
xor.b64 %rd1025, %rd1024, %rd2526; | |
mul.lo.s64 %rd1026, %rd1025, %rd2521; | |
shr.u64 %rd1027, %rd1026, 32; | |
shr.u64 %rd1028, %rd1022, 32; | |
and.b64 %rd1029, %rd2527, 4294967295; | |
xor.b64 %rd1030, %rd1029, %rd1028; | |
xor.b64 %rd1031, %rd1030, %rd2528; | |
mul.lo.s64 %rd1032, %rd1031, %rd2521; | |
and.b64 %rd1033, %rd1032, 4294967295; | |
xor.b64 %rd1034, %rd1033, %rd1027; | |
xor.b64 %rd1035, %rd1034, %rd2529; | |
mul.lo.s64 %rd1036, %rd1035, %rd2523; | |
shr.u64 %rd1037, %rd1036, 32; | |
shr.u64 %rd1038, %rd1032, 32; | |
and.b64 %rd1039, %rd2530, 4294967295; | |
xor.b64 %rd1040, %rd1039, %rd1038; | |
xor.b64 %rd1041, %rd1040, %rd2531; | |
mul.lo.s64 %rd1042, %rd1041, %rd2523; | |
and.b64 %rd1043, %rd1042, 4294967295; | |
xor.b64 %rd1044, %rd1043, %rd1037; | |
xor.b64 %rd1045, %rd1044, %rd2532; | |
mul.lo.s64 %rd1046, %rd1045, %rd2521; | |
shr.u64 %rd1047, %rd1046, 32; | |
shr.u64 %rd1048, %rd1042, 32; | |
and.b64 %rd1049, %rd1016, 4294967295; | |
xor.b64 %rd1050, %rd1049, %rd1048; | |
xor.b64 %rd1051, %rd1050, %rd2533; | |
mul.lo.s64 %rd1052, %rd1051, %rd2521; | |
and.b64 %rd1053, %rd1052, 4294967295; | |
xor.b64 %rd1054, %rd1053, %rd1047; | |
xor.b64 %rd1055, %rd1054, %rd2534; | |
mul.lo.s64 %rd1056, %rd1055, %rd2523; | |
shr.u64 %rd1057, %rd1056, 32; | |
shr.u64 %rd1058, %rd1052, 32; | |
and.b64 %rd1059, %rd1026, 4294967295; | |
xor.b64 %rd1060, %rd1059, %rd1058; | |
xor.b64 %rd1061, %rd1060, %rd2535; | |
mul.lo.s64 %rd1062, %rd1061, %rd2523; | |
and.b64 %rd1063, %rd1062, 4294967295; | |
xor.b64 %rd1064, %rd1063, %rd1057; | |
xor.b64 %rd1065, %rd1064, %rd2536; | |
mul.lo.s64 %rd1066, %rd1065, %rd2521; | |
shr.u64 %rd1067, %rd1066, 32; | |
shr.u64 %rd1068, %rd1062, 32; | |
and.b64 %rd1069, %rd1036, 4294967295; | |
xor.b64 %rd1070, %rd1069, %rd1068; | |
xor.b64 %rd1071, %rd1070, %rd2537; | |
mul.lo.s64 %rd1072, %rd1071, %rd2521; | |
and.b64 %rd1073, %rd1072, 4294967295; | |
xor.b64 %rd1074, %rd1073, %rd1067; | |
xor.b64 %rd1075, %rd1074, %rd2538; | |
mul.lo.s64 %rd1076, %rd1075, %rd2523; | |
// Final 32-bit mix producing the random word:
// %r115 = ((%r320 ^ lo32((%rd1072 >> 32) ^ %rd1046)) * %r321) ^ lo32(%rd1076 >> 32) ^ %r322
shr.u64 %rd1077, %rd1076, 32; | |
cvt.u32.u64 %r110, %rd1077; | |
shr.u64 %rd1078, %rd1072, 32; | |
xor.b64 %rd1079, %rd1078, %rd1046; | |
cvt.u32.u64 %r111, %rd1079; | |
xor.b32 %r112, %r320, %r111; | |
mul.lo.s32 %r113, %r112, %r321; | |
xor.b32 %r114, %r113, %r110; | |
xor.b32 %r115, %r114, %r322; | |
// Turn the 32-bit random word %r115 into an f32 in [0, 1): keep the top 23
// bits (shift right by 9) and scale by 0f34000000 (= 2^-23); then narrow to f16.
shr.u32 %r116, %r115, 9; | |
cvt.rn.f32.u32 %f85, %r116; | |
mul.rn.f32 %f86, %f85, 0f34000000; | |
cvt.rn.f16.f32 %h37, %f86; | |
// 0x2E66 is ~0.1 as an f16 bit pattern; %p20 is set when the random value >= it.
mov.b16 %h38, 0x2E66; | |
setp.ge.f16 %p20, %h37, %h38; | |
// f16 sum of [%rd45+512] and the f32 at [%rd46+1024] (narrowed to f16), scaled
// by 0x3C72 (~1.111 in f16). The result is kept when %p20 is set, otherwise 0.
// Byte offsets 512 (2-byte items) and 1024 (4-byte items) both address item 256.
// NOTE(review): looks like dropout with rate 0.1 and a 1/0.9 rescale - confirm.
ld.global.nc.b16 %h39, [%rd45+512]; | |
ld.global.nc.f32 %f87, [%rd46+1024]; | |
cvt.rn.f16.f32 %h40, %f87; | |
add.rn.f16 %h41, %h39, %h40; | |
mov.b16 %h42, 0x3C72; | |
mul.rn.f16 %h43, %h41, %h42; | |
selp.b16 %h44, %h43, 0x0000, %p20; | |
cvt.f32.f16 %f88, %h44; | |
// %f97 = %f1*[%rd48+1024]*[%rd47+512] + ([%rd49+1024] - %f2*%f1*[%rd48+1024]) + %f88,
// each step rounded to nearest. %f1, %f2, %f3 are defined before this excerpt.
ld.global.nc.b16 %h45, [%rd47+512]; | |
cvt.f32.f16 %f89, %h45; | |
ld.global.nc.f32 %f90, [%rd48+1024]; | |
mul.rn.f32 %f91, %f1, %f90; | |
mul.rn.f32 %f92, %f91, %f89; | |
ld.global.nc.f32 %f93, [%rd49+1024]; | |
mul.rn.f32 %f94, %f2, %f91; | |
sub.rn.f32 %f95, %f93, %f94; | |
add.rn.f32 %f96, %f92, %f95; | |
add.rn.f32 %f97, %f96, %f88; | |
// Running sum of squared differences from %f3: %f8 = %f7 + (%f97 - %f3)^2.
sub.rn.f32 %f98, %f97, %f3; | |
mul.rn.f32 %f99, %f98, %f98; | |
add.rn.f32 %f8, %f7, %f99; | |
// 64-bit counter for this element: %rd159 = %rd12 + ((%r3 | %r4 | 257) >> 2).
// %rd2426 holds its low 32 bits; %p71 is set when the unsigned add wrapped
// around (carry out of the 64-bit addition).
// %p21 = (((%r3 | 257) & 3) != 1) selects which seeding arm runs.
or.b32 %r117, %r3, 257; | |
or.b32 %r118, %r117, %r4; | |
and.b32 %r119, %r117, 3; | |
shr.u32 %r120, %r118, 2; | |
setp.ne.s32 %p21, %r119, 1; | |
cvt.u64.u32 %rd1080, %r120; | |
add.s64 %rd159, %rd12, %rd1080; | |
and.b64 %rd2426, %rd159, 4294967295; | |
setp.lt.u64 %p71, %rd159, %rd12; | |
@%p21 bra LBB30_19; | |
// Arm executed when %p21 is false. It seeds everything read after the join at
// LBB30_20: the data-dependent values %rd2539, %rd2543, %rd2546, %rd2549 and
// a table of constants (%rd2540..%rd2556, %r323, %r324).
// %rd1122 = %rd2464 + 1 if the counter add carried (%p71), else %rd2464.
// NOTE(review): 3528531795 (0xD2511F53), 3449720151 (0xCD9E8D57) and
// 2654435769 (0x9E3779B9) match the published Philox4x32 constants - confirm.
mul.lo.s64 %rd2543, %rd2426, 3528531795; | |
selp.u64 %rd1121, 1, 0, %p71; | |
add.s64 %rd1122, %rd2464, %rd1121; | |
xor.b64 %rd1123, %rd1122, %rd2543; | |
shr.u64 %rd1124, %rd1123, 32; | |
mul.lo.s64 %rd2546, %rd1124, 3449720151; | |
shr.u64 %rd1125, %rd2546, 32; | |
and.b64 %rd1126, %rd1122, 4294967295; | |
mul.lo.s64 %rd1127, %rd1126, 3449720151; | |
and.b64 %rd1128, %rd1127, 4294967295; | |
xor.b64 %rd1129, %rd1128, %rd1125; | |
xor.b64 %rd1130, %rd1129, 2654435769; | |
mul.lo.s64 %rd2549, %rd1130, 3528531795; | |
xor.b64 %rd2539, %rd1127, %rd159; | |
// Constant table for this arm; the other arm writes the same registers with
// different values, so the code after the join is shared.
mov.u32 %r324, -845247145; | |
mov.u32 %r323, -616729560; | |
mov.u64 %rd2556, 3041712726; | |
mov.u64 %rd2555, 1401181199; | |
mov.u64 %rd2554, 2835769497; | |
mov.u64 %rd2553, 1684936478; | |
mov.u64 %rd2552, 2027808484; | |
mov.u64 %rd2551, 387276957; | |
mov.u64 %rd2550, 842468239; | |
mov.u64 %rd2548, 3986602516; | |
mov.u64 %rd2547, 1013904242; | |
mov.u64 %rd2545, 3668340011; | |
mov.u64 %rd2544, 3144134277; | |
mov.u64 %rd2542, 3449720151; | |
mov.u64 %rd2541, 1993301258; | |
mov.u64 %rd2540, 3528531795; | |
bra.uni LBB30_20; | |
// Arm executed when %p21 is true. It writes the same registers as the
// fall-through arm (%rd2539, %rd2543, %rd2546, %rd2549 plus the constant
// table), but with the two multipliers 3449720151 / 3528531795 exchanged and
// 3144134277 (0xBB67AE85) used in place of 2654435769 in the first xor.
LBB30_19: | |
// %rd1096 = %rd2464 + 1 if the counter add carried (%p71), else %rd2464.
selp.u64 %rd1095, 1, 0, %p71; | |
add.s64 %rd1096, %rd2464, %rd1095; | |
and.b64 %rd1097, %rd1096, 4294967295; | |
mul.lo.s64 %rd2543, %rd1097, 3449720151; | |
xor.b64 %rd1098, %rd2543, %rd159; | |
shr.u64 %rd1099, %rd1098, 32; | |
mul.lo.s64 %rd2546, %rd1099, 3528531795; | |
shr.u64 %rd1100, %rd2546, 32; | |
mul.lo.s64 %rd1102, %rd2426, 3528531795; | |
and.b64 %rd1103, %rd1102, 4294967295; | |
xor.b64 %rd1104, %rd1103, %rd1100; | |
xor.b64 %rd1105, %rd1104, 3144134277; | |
mul.lo.s64 %rd2549, %rd1105, 3449720151; | |
xor.b64 %rd2539, %rd1096, %rd1102; | |
// Constant table for this arm (falls through into LBB30_20).
mov.u32 %r324, -766435501; | |
mov.u32 %r323, -239350328; | |
mov.u64 %rd2556, 1684936478; | |
mov.u64 %rd2555, 534103459; | |
mov.u64 %rd2554, 387276957; | |
mov.u64 %rd2553, 3041712726; | |
mov.u64 %rd2552, 3986602516; | |
mov.u64 %rd2551, 2835769497; | |
mov.u64 %rd2550, 3668340011; | |
mov.u64 %rd2548, 2027808484; | |
mov.u64 %rd2547, 1993301258; | |
mov.u64 %rd2545, 842468239; | |
mov.u64 %rd2544, 2654435769; | |
mov.u64 %rd2542, 3528531795; | |
mov.u64 %rd2541, 1013904242; | |
mov.u64 %rd2540, 3449720151; | |
// Join of the two seeding arms. Straight-line chain of steps of the form
// "take a 32-bit half of an earlier product, xor it with the upper half of
// another product and a table constant, multiply by %rd2540 or %rd2542".
// Multipliers and xor constants were chosen by whichever arm ran.
// NOTE(review): structure and constants resemble unrolled Philox4x32 rounds -
// confirm against the generator that emitted this kernel.
LBB30_20: | |
shr.u64 %rd1131, %rd2549, 32; | |
shr.u64 %rd1132, %rd2539, 32; | |
mul.lo.s64 %rd1133, %rd1132, %rd2540; | |
and.b64 %rd1134, %rd1133, 4294967295; | |
xor.b64 %rd1135, %rd1134, %rd1131; | |
xor.b64 %rd1136, %rd1135, %rd2541; | |
mul.lo.s64 %rd1137, %rd1136, %rd2542; | |
shr.u64 %rd1138, %rd1137, 32; | |
shr.u64 %rd1139, %rd1133, 32; | |
and.b64 %rd1140, %rd2543, 4294967295; | |
xor.b64 %rd1141, %rd1140, %rd1139; | |
xor.b64 %rd1142, %rd1141, %rd2544; | |
mul.lo.s64 %rd1143, %rd1142, %rd2542; | |
and.b64 %rd1144, %rd1143, 4294967295; | |
xor.b64 %rd1145, %rd1144, %rd1138; | |
xor.b64 %rd1146, %rd1145, %rd2545; | |
mul.lo.s64 %rd1147, %rd1146, %rd2540; | |
shr.u64 %rd1148, %rd1147, 32; | |
shr.u64 %rd1149, %rd1143, 32; | |
and.b64 %rd1150, %rd2546, 4294967295; | |
xor.b64 %rd1151, %rd1150, %rd1149; | |
xor.b64 %rd1152, %rd1151, %rd2547; | |
mul.lo.s64 %rd1153, %rd1152, %rd2540; | |
and.b64 %rd1154, %rd1153, 4294967295; | |
xor.b64 %rd1155, %rd1154, %rd1148; | |
xor.b64 %rd1156, %rd1155, %rd2548; | |
mul.lo.s64 %rd1157, %rd1156, %rd2542; | |
shr.u64 %rd1158, %rd1157, 32; | |
shr.u64 %rd1159, %rd1153, 32; | |
and.b64 %rd1160, %rd2549, 4294967295; | |
xor.b64 %rd1161, %rd1160, %rd1159; | |
xor.b64 %rd1162, %rd1161, %rd2550; | |
mul.lo.s64 %rd1163, %rd1162, %rd2542; | |
and.b64 %rd1164, %rd1163, 4294967295; | |
xor.b64 %rd1165, %rd1164, %rd1158; | |
xor.b64 %rd1166, %rd1165, %rd2551; | |
mul.lo.s64 %rd1167, %rd1166, %rd2540; | |
shr.u64 %rd1168, %rd1167, 32; | |
shr.u64 %rd1169, %rd1163, 32; | |
and.b64 %rd1170, %rd1137, 4294967295; | |
xor.b64 %rd1171, %rd1170, %rd1169; | |
xor.b64 %rd1172, %rd1171, %rd2552; | |
mul.lo.s64 %rd1173, %rd1172, %rd2540; | |
and.b64 %rd1174, %rd1173, 4294967295; | |
xor.b64 %rd1175, %rd1174, %rd1168; | |
xor.b64 %rd1176, %rd1175, %rd2553; | |
mul.lo.s64 %rd1177, %rd1176, %rd2542; | |
shr.u64 %rd1178, %rd1177, 32; | |
shr.u64 %rd1179, %rd1173, 32; | |
and.b64 %rd1180, %rd1147, 4294967295; | |
xor.b64 %rd1181, %rd1180, %rd1179; | |
xor.b64 %rd1182, %rd1181, %rd2554; | |
mul.lo.s64 %rd1183, %rd1182, %rd2542; | |
and.b64 %rd1184, %rd1183, 4294967295; | |
xor.b64 %rd1185, %rd1184, %rd1178; | |
xor.b64 %rd1186, %rd1185, %rd2555; | |
mul.lo.s64 %rd1187, %rd1186, %rd2540; | |
shr.u64 %rd1188, %rd1187, 32; | |
shr.u64 %rd1189, %rd1183, 32; | |
xor.b64 %rd1190, %rd1157, %rd1189; | |
xor.b64 %rd1191, %rd1190, %rd2556; | |
mul.lo.s64 %rd1192, %rd1191, %rd2540; | |
// Final 32-bit mix producing the random word:
// %r127 = (%r323 ^ lo32((%rd1187 >> 32) ^ %rd1192)) * %r324
xor.b64 %rd1193, %rd1188, %rd1192; | |
cvt.u32.u64 %r125, %rd1193; | |
xor.b32 %r126, %r323, %r125; | |
mul.lo.s32 %r127, %r126, %r324; | |
// Turn the 32-bit random word %r127 into an f32 in [0, 1): keep the top 23
// bits (shift right by 9) and scale by 0f34000000 (= 2^-23); then narrow to f16.
shr.u32 %r128, %r127, 9; | |
cvt.rn.f32.u32 %f100, %r128; | |
mul.rn.f32 %f101, %f100, 0f34000000; | |
cvt.rn.f16.f32 %h46, %f101; | |
// 0x2E66 is ~0.1 as an f16 bit pattern; %p25 is set when the random value >= it.
mov.b16 %h47, 0x2E66; | |
setp.ge.f16 %p25, %h46, %h47; | |
// f16 sum of [%rd45+514] and the f32 at [%rd46+1028] (narrowed to f16), scaled
// by 0x3C72 (~1.111 in f16). The result is kept when %p25 is set, otherwise 0.
// Byte offsets 514 (2-byte items) and 1028 (4-byte items) both address item 257.
// NOTE(review): looks like dropout with rate 0.1 and a 1/0.9 rescale - confirm.
ld.global.nc.b16 %h48, [%rd45+514]; | |
ld.global.nc.f32 %f102, [%rd46+1028]; | |
cvt.rn.f16.f32 %h49, %f102; | |
add.rn.f16 %h50, %h48, %h49; | |
mov.b16 %h51, 0x3C72; | |
mul.rn.f16 %h52, %h50, %h51; | |
selp.b16 %h53, %h52, 0x0000, %p25; | |
cvt.f32.f16 %f103, %h53; | |
// %f112 = %f1*[%rd48+1028]*[%rd47+514] + ([%rd49+1028] - %f2*%f1*[%rd48+1028]) + %f103,
// each step rounded to nearest. %f1, %f2, %f3 are defined before this excerpt.
ld.global.nc.b16 %h54, [%rd47+514]; | |
cvt.f32.f16 %f104, %h54; | |
ld.global.nc.f32 %f105, [%rd48+1028]; | |
mul.rn.f32 %f106, %f1, %f105; | |
mul.rn.f32 %f107, %f106, %f104; | |
ld.global.nc.f32 %f108, [%rd49+1028]; | |
mul.rn.f32 %f109, %f2, %f106; | |
sub.rn.f32 %f110, %f108, %f109; | |
add.rn.f32 %f111, %f107, %f110; | |
add.rn.f32 %f112, %f111, %f103; | |
// Running sum of squared differences from %f3: %f9 = %f8 + (%f112 - %f3)^2.
sub.rn.f32 %f113, %f112, %f3; | |
mul.rn.f32 %f114, %f113, %f113; | |
add.rn.f32 %f9, %f8, %f114; | |
// 64-bit counter for this element: %rd186 = %rd12 + ((%r73 | 384) >> 2),
// where %r73 was computed earlier in the kernel. %rd2422 holds its low 32 bits;
// %p70 is set when the unsigned add wrapped around (carry out).
or.b32 %r130, %r73, 384; | |
shr.u32 %r131, %r130, 2; | |
cvt.u64.u32 %rd1194, %r131; | |
add.s64 %rd186, %rd12, %rd1194; | |
and.b64 %rd2422, %rd186, 4294967295; | |
setp.lt.u64 %p70, %rd186, %rd12; | |
// %p8 is computed before this excerpt; it selects which seeding arm runs.
@%p8 bra LBB30_22; | |
// Arm executed when %p8 is false. It seeds everything read after the join at
// LBB30_23: the data-dependent values %rd2557, %rd2561, %rd2564, %rd2567 and
// a table of constants (%rd2558..%rd2575, %r325..%r327).
// %rd1238 = %rd2464 + 1 if the counter add carried (%p70), else %rd2464.
// NOTE(review): 3528531795 (0xD2511F53), 3449720151 (0xCD9E8D57) and
// 2654435769 (0x9E3779B9) match the published Philox4x32 constants - confirm.
mul.lo.s64 %rd2561, %rd2422, 3528531795; | |
selp.u64 %rd1237, 1, 0, %p70; | |
add.s64 %rd1238, %rd2464, %rd1237; | |
xor.b64 %rd1239, %rd1238, %rd2561; | |
shr.u64 %rd1240, %rd1239, 32; | |
mul.lo.s64 %rd2564, %rd1240, 3449720151; | |
shr.u64 %rd1241, %rd2564, 32; | |
and.b64 %rd1242, %rd1238, 4294967295; | |
mul.lo.s64 %rd1243, %rd1242, 3449720151; | |
and.b64 %rd1244, %rd1243, 4294967295; | |
xor.b64 %rd1245, %rd1244, %rd1241; | |
xor.b64 %rd1246, %rd1245, 2654435769; | |
mul.lo.s64 %rd2567, %rd1246, 3528531795; | |
xor.b64 %rd2557, %rd1243, %rd186; | |
// Constant table for this arm; the other arm writes the same registers with
// different values, so the code after the join is shared.
mov.u32 %r327, -1879881855; | |
mov.u32 %r326, -845247145; | |
mov.u32 %r325, 534103459; | |
mov.u64 %rd2575, 3678237736; | |
mov.u64 %rd2574, 3041712726; | |
mov.u64 %rd2573, 1401181199; | |
mov.u64 %rd2572, 2835769497; | |
mov.u64 %rd2571, 1684936478; | |
mov.u64 %rd2570, 2027808484; | |
mov.u64 %rd2569, 387276957; | |
mov.u64 %rd2568, 842468239; | |
mov.u64 %rd2566, 3986602516; | |
mov.u64 %rd2565, 1013904242; | |
mov.u64 %rd2563, 3668340011; | |
mov.u64 %rd2562, 3144134277; | |
mov.u64 %rd2560, 3449720151; | |
mov.u64 %rd2559, 1993301258; | |
mov.u64 %rd2558, 3528531795; | |
bra.uni LBB30_23; | |
// Arm executed when %p8 is true. It writes the same registers as the
// fall-through arm (%rd2557, %rd2561, %rd2564, %rd2567 plus the constant
// table), but with the two multipliers 3449720151 / 3528531795 exchanged and
// 3144134277 (0xBB67AE85) used in place of 2654435769 in the first xor.
LBB30_22: | |
// %rd1211 = %rd2464 + 1 if the counter add carried (%p70), else %rd2464.
selp.u64 %rd1210, 1, 0, %p70; | |
add.s64 %rd1211, %rd2464, %rd1210; | |
and.b64 %rd1212, %rd1211, 4294967295; | |
mul.lo.s64 %rd2561, %rd1212, 3449720151; | |
xor.b64 %rd1213, %rd2561, %rd186; | |
shr.u64 %rd1214, %rd1213, 32; | |
mul.lo.s64 %rd2564, %rd1214, 3528531795; | |
shr.u64 %rd1215, %rd2564, 32; | |
mul.lo.s64 %rd1217, %rd2422, 3528531795; | |
and.b64 %rd1218, %rd1217, 4294967295; | |
xor.b64 %rd1219, %rd1218, %rd1215; | |
xor.b64 %rd1220, %rd1219, 3144134277; | |
mul.lo.s64 %rd2567, %rd1220, 3449720151; | |
xor.b64 %rd2557, %rd1211, %rd1217; | |
// Constant table for this arm (falls through into LBB30_23).
mov.u32 %r327, -1767562579; | |
mov.u32 %r326, -766435501; | |
mov.u32 %r325, 1401181199; | |
mov.u64 %rd2575, 4055616968; | |
mov.u64 %rd2574, 1684936478; | |
mov.u64 %rd2573, 534103459; | |
mov.u64 %rd2572, 387276957; | |
mov.u64 %rd2571, 3041712726; | |
mov.u64 %rd2570, 3986602516; | |
mov.u64 %rd2569, 2835769497; | |
mov.u64 %rd2568, 3668340011; | |
mov.u64 %rd2566, 2027808484; | |
mov.u64 %rd2565, 1993301258; | |
mov.u64 %rd2563, 842468239; | |
mov.u64 %rd2562, 2654435769; | |
mov.u64 %rd2560, 3528531795; | |
mov.u64 %rd2559, 1013904242; | |
mov.u64 %rd2558, 3449720151; | |
// Join of the two seeding arms. Straight-line chain of steps of the form
// "take a 32-bit half of an earlier product, xor it with the upper half of
// another product and a table constant, multiply by %rd2558 or %rd2560".
// Multipliers and xor constants were chosen by whichever arm ran.
// NOTE(review): structure and constants resemble unrolled Philox4x32 rounds -
// confirm against the generator that emitted this kernel.
LBB30_23: | |
shr.u64 %rd1247, %rd2567, 32; | |
shr.u64 %rd1248, %rd2557, 32; | |
mul.lo.s64 %rd1249, %rd1248, %rd2558; | |
and.b64 %rd1250, %rd1249, 4294967295; | |
xor.b64 %rd1251, %rd1250, %rd1247; | |
xor.b64 %rd1252, %rd1251, %rd2559; | |
mul.lo.s64 %rd1253, %rd1252, %rd2560; | |
shr.u64 %rd1254, %rd1253, 32; | |
shr.u64 %rd1255, %rd1249, 32; | |
and.b64 %rd1256, %rd2561, 4294967295; | |
xor.b64 %rd1257, %rd1256, %rd1255; | |
xor.b64 %rd1258, %rd1257, %rd2562; | |
mul.lo.s64 %rd1259, %rd1258, %rd2560; | |
and.b64 %rd1260, %rd1259, 4294967295; | |
xor.b64 %rd1261, %rd1260, %rd1254; | |
xor.b64 %rd1262, %rd1261, %rd2563; | |
mul.lo.s64 %rd1263, %rd1262, %rd2558; | |
shr.u64 %rd1264, %rd1263, 32; | |
shr.u64 %rd1265, %rd1259, 32; | |
and.b64 %rd1266, %rd2564, 4294967295; | |
xor.b64 %rd1267, %rd1266, %rd1265; | |
xor.b64 %rd1268, %rd1267, %rd2565; | |
mul.lo.s64 %rd1269, %rd1268, %rd2558; | |
and.b64 %rd1270, %rd1269, 4294967295; | |
xor.b64 %rd1271, %rd1270, %rd1264; | |
xor.b64 %rd1272, %rd1271, %rd2566; | |
mul.lo.s64 %rd1273, %rd1272, %rd2560; | |
shr.u64 %rd1274, %rd1273, 32; | |
shr.u64 %rd1275, %rd1269, 32; | |
and.b64 %rd1276, %rd2567, 4294967295; | |
xor.b64 %rd1277, %rd1276, %rd1275; | |
xor.b64 %rd1278, %rd1277, %rd2568; | |
mul.lo.s64 %rd1279, %rd1278, %rd2560; | |
and.b64 %rd1280, %rd1279, 4294967295; | |
xor.b64 %rd1281, %rd1280, %rd1274; | |
xor.b64 %rd1282, %rd1281, %rd2569; | |
mul.lo.s64 %rd1283, %rd1282, %rd2558; | |
shr.u64 %rd1284, %rd1283, 32; | |
shr.u64 %rd1285, %rd1279, 32; | |
and.b64 %rd1286, %rd1253, 4294967295; | |
xor.b64 %rd1287, %rd1286, %rd1285; | |
xor.b64 %rd1288, %rd1287, %rd2570; | |
mul.lo.s64 %rd1289, %rd1288, %rd2558; | |
and.b64 %rd1290, %rd1289, 4294967295; | |
xor.b64 %rd1291, %rd1290, %rd1284; | |
xor.b64 %rd1292, %rd1291, %rd2571; | |
mul.lo.s64 %rd1293, %rd1292, %rd2560; | |
shr.u64 %rd1294, %rd1293, 32; | |
shr.u64 %rd1295, %rd1289, 32; | |
and.b64 %rd1296, %rd1263, 4294967295; | |
xor.b64 %rd1297, %rd1296, %rd1295; | |
xor.b64 %rd1298, %rd1297, %rd2572; | |
mul.lo.s64 %rd1299, %rd1298, %rd2560; | |
and.b64 %rd1300, %rd1299, 4294967295; | |
xor.b64 %rd1301, %rd1300, %rd1294; | |
xor.b64 %rd1302, %rd1301, %rd2573; | |
mul.lo.s64 %rd1303, %rd1302, %rd2558; | |
shr.u64 %rd1304, %rd1303, 32; | |
shr.u64 %rd1305, %rd1299, 32; | |
and.b64 %rd1306, %rd1273, 4294967295; | |
xor.b64 %rd1307, %rd1306, %rd1305; | |
xor.b64 %rd1308, %rd1307, %rd2574; | |
mul.lo.s64 %rd1309, %rd1308, %rd2558; | |
and.b64 %rd1310, %rd1309, 4294967295; | |
xor.b64 %rd1311, %rd1310, %rd1304; | |
xor.b64 %rd1312, %rd1311, %rd2575; | |
mul.lo.s64 %rd1313, %rd1312, %rd2560; | |
// Final 32-bit mix producing the random word:
// %r143 = ((%r325 ^ lo32((%rd1309 >> 32) ^ %rd1283)) * %r326) ^ lo32(%rd1313 >> 32) ^ %r327
shr.u64 %rd1314, %rd1313, 32; | |
cvt.u32.u64 %r138, %rd1314; | |
shr.u64 %rd1315, %rd1309, 32; | |
xor.b64 %rd1316, %rd1315, %rd1283; | |
cvt.u32.u64 %r139, %rd1316; | |
xor.b32 %r140, %r325, %r139; | |
mul.lo.s32 %r141, %r140, %r326; | |
xor.b32 %r142, %r141, %r138; | |
xor.b32 %r143, %r142, %r327; | |
// Turn the 32-bit random word %r143 into an f32 in [0, 1): keep the top 23
// bits (shift right by 9) and scale by 0f34000000 (= 2^-23); then narrow to f16.
shr.u32 %r144, %r143, 9; | |
cvt.rn.f32.u32 %f115, %r144; | |
mul.rn.f32 %f116, %f115, 0f34000000; | |
cvt.rn.f16.f32 %h55, %f116; | |
// 0x2E66 is ~0.1 as an f16 bit pattern; %p28 is set when the random value >= it.
mov.b16 %h56, 0x2E66; | |
setp.ge.f16 %p28, %h55, %h56; | |
// f16 sum of [%rd45+768] and the f32 at [%rd46+1536] (narrowed to f16), scaled
// by 0x3C72 (~1.111 in f16). The result is kept when %p28 is set, otherwise 0.
// Byte offsets 768 (2-byte items) and 1536 (4-byte items) both address item 384.
// NOTE(review): looks like dropout with rate 0.1 and a 1/0.9 rescale - confirm.
ld.global.nc.b16 %h57, [%rd45+768]; | |
ld.global.nc.f32 %f117, [%rd46+1536]; | |
cvt.rn.f16.f32 %h58, %f117; | |
add.rn.f16 %h59, %h57, %h58; | |
mov.b16 %h60, 0x3C72; | |
mul.rn.f16 %h61, %h59, %h60; | |
selp.b16 %h62, %h61, 0x0000, %p28; | |
cvt.f32.f16 %f118, %h62; | |
// %f127 = %f1*[%rd48+1536]*[%rd47+768] + ([%rd49+1536] - %f2*%f1*[%rd48+1536]) + %f118,
// each step rounded to nearest. %f1, %f2, %f3 are defined before this excerpt.
ld.global.nc.b16 %h63, [%rd47+768]; | |
cvt.f32.f16 %f119, %h63; | |
ld.global.nc.f32 %f120, [%rd48+1536]; | |
mul.rn.f32 %f121, %f1, %f120; | |
mul.rn.f32 %f122, %f121, %f119; | |
ld.global.nc.f32 %f123, [%rd49+1536]; | |
mul.rn.f32 %f124, %f2, %f121; | |
sub.rn.f32 %f125, %f123, %f124; | |
add.rn.f32 %f126, %f122, %f125; | |
add.rn.f32 %f127, %f126, %f118; | |
// Running sum of squared differences from %f3: %f10 = %f9 + (%f127 - %f3)^2.
sub.rn.f32 %f128, %f127, %f3; | |
mul.rn.f32 %f129, %f128, %f128; | |
add.rn.f32 %f10, %f9, %f129; | |
// 64-bit counter for this element: %rd214 = %rd12 + ((%r3 | %r4 | 385) >> 2).
// %p29 = (((%r3 | 385) & 3) != 1) selects which seeding arm runs. Unlike the
// earlier iterations, the low-32-bit mask and the carry test are done inside
// each arm rather than here.
or.b32 %r145, %r3, 385; | |
or.b32 %r146, %r145, %r4; | |
and.b32 %r147, %r145, 3; | |
shr.u32 %r148, %r146, 2; | |
setp.ne.s32 %p29, %r147, 1; | |
cvt.u64.u32 %rd1317, %r148; | |
add.s64 %rd214, %rd12, %rd1317; | |
@%p29 bra LBB30_25; | |
and.b64 %rd1357, %rd214, 4294967295; | |
mul.lo.s64 %rd2580, %rd1357, 3528531795; | |
setp.lt.u64 %p31, %rd214, %rd12; | |
selp.u64 %rd1358, 1, 0, %p31; | |
add.s64 %rd1359, %rd2464, %rd1358; | |
xor.b64 %rd1360, %rd1359, %rd2580; | |
shr.u64 %rd1361, %rd1360, 32; | |
mul.lo.s64 %rd2583, %rd1361, 3449720151; | |
shr.u64 %rd1362, %rd2583, 32; | |
and.b64 %rd1363, %rd1359, 4294967295; | |
mul.lo.s64 %rd1364, %rd1363, 3449720151; | |
and.b64 %rd1365, %rd1364, 4294967295; | |
xor.b64 %rd1366, %rd1365, %rd1362; | |
xor.b64 %rd1367, %rd1366, 2654435769; | |
mul.lo.s64 %rd2586, %rd1367, 3528531795; | |
xor.b64 %rd2576, %rd1364, %rd214; | |
mov.u32 %r329, -845247145; | |
mov.u32 %r328, -616729560; | |
mov.u64 %rd2593, 3041712726; | |
mov.u64 %rd2592, 1401181199; | |
mov.u64 %rd2591, 2835769497; | |
mov.u64 %rd2590, 1684936478; | |
mov.u64 %rd2589, 2027808484; | |
mov.u64 %rd2588, 387276957; | |
mov.u64 %rd2587, 842468239; | |
mov.u64 %rd2585, 3986602516; | |
mov.u64 %rd2584, 1013904242; | |
mov.u64 %rd2582, 3668340011; | |
mov.u64 %rd2581, 3144134277; | |
mov.u64 %rd2579, 3449720151; | |
mov.u64 %rd2578, 1993301258; | |
mov.u64 %rd2577, 3528531795; | |
bra.uni LBB30_26; | |
LBB30_25: | |
setp.lt.u64 %p30, %rd214, %rd12; | |
selp.u64 %rd1332, 1, 0, %p30; | |
add.s64 %rd1333, %rd2464, %rd1332; | |
and.b64 %rd1334, %rd1333, 4294967295; | |
mul.lo.s64 %rd2580, %rd1334, 3449720151; | |
xor.b64 %rd1335, %rd2580, %rd214; | |
shr.u64 %rd1336, %rd1335, 32; | |
mul.lo.s64 %rd2583, %rd1336, 3528531795; | |
shr.u64 %rd1337, %rd2583, 32; | |
and.b64 %rd1338, %rd214, 4294967295; | |
mul.lo.s64 %rd1339, %rd1338, 3528531795; | |
and.b64 %rd1340, %rd1339, 4294967295; | |
xor.b64 %rd1341, %rd1340, %rd1337; | |
xor.b64 %rd1342, %rd1341, 3144134277; | |
mul.lo.s64 %rd2586, %rd1342, 3449720151; | |
xor.b64 %rd2576, %rd1333, %rd1339; | |
mov.u32 %r329, -766435501; | |
mov.u32 %r328, -239350328; | |
mov.u64 %rd2593, 1684936478; | |
mov.u64 %rd2592, 534103459; | |
mov.u64 %rd2591, 387276957; | |
mov.u64 %rd2590, 3041712726; | |
mov.u64 %rd2589, 3986602516; | |
mov.u64 %rd2588, 2835769497; | |
mov.u64 %rd2587, 3668340011; | |
mov.u64 %rd2585, 2027808484; | |
mov.u64 %rd2584, 1993301258; | |
mov.u64 %rd2582, 842468239; | |
mov.u64 %rd2581, 2654435769; | |
mov.u64 %rd2579, 3528531795; | |
mov.u64 %rd2578, 1013904242; | |
mov.u64 %rd2577, 3449720151; | |
LBB30_26: | |
shr.u64 %rd1368, %rd2586, 32; | |
shr.u64 %rd1369, %rd2576, 32; | |
mul.lo.s64 %rd1370, %rd1369, %rd2577; | |
and.b64 %rd1371, %rd1370, 4294967295; | |
xor.b64 %rd1372, %rd1371, %rd1368; | |
xor.b64 %rd1373, %rd1372, %rd2578; | |
mul.lo.s64 %rd1374, %rd1373, %rd2579; | |
shr.u64 %rd1375, %rd1374, 32; | |
shr.u64 %rd1376, %rd1370, 32; | |
and.b64 %rd1377, %rd2580, 4294967295; | |
xor.b64 %rd1378, %rd1377, %rd1376; | |
xor.b64 %rd1379, %rd1378, %rd2581; | |
mul.lo.s64 %rd1380, %rd1379, %rd2579; | |
and.b64 %rd1381, %rd1380, 4294967295; | |
xor.b64 %rd1382, %rd1381, %rd1375; | |
xor.b64 %rd1383, %rd1382, %rd2582; | |
mul.lo.s64 %rd1384, %rd1383, %rd2577; | |
shr.u64 %rd1385, %rd1384, 32; | |
shr.u64 %rd1386, %rd1380, 32; | |
and.b64 %rd1387, %rd2583, 4294967295; | |
xor.b64 %rd1388, %rd1387, %rd1386; | |
xor.b64 %rd1389, %rd1388, %rd2584; | |
mul.lo.s64 %rd1390, %rd1389, %rd2577; | |
and.b64 %rd1391, %rd1390, 4294967295; | |
xor.b64 %rd1392, %rd1391, %rd1385; | |
xor.b64 %rd1393, %rd1392, %rd2585; | |
mul.lo.s64 %rd1394, %rd1393, %rd2579; | |
shr.u64 %rd1395, %rd1394, 32; | |
shr.u64 %rd1396, %rd1390, 32; | |
and.b64 %rd1397, %rd2586, 4294967295; | |
xor.b64 %rd1398, %rd1397, %rd1396; | |
xor.b64 %rd1399, %rd1398, %rd2587; | |
mul.lo.s64 %rd1400, %rd1399, %rd2579; | |
and.b64 %rd1401, %rd1400, 4294967295; | |
xor.b64 %rd1402, %rd1401, %rd1395; | |
xor.b64 %rd1403, %rd1402, %rd2588; | |
mul.lo.s64 %rd1404, %rd1403, %rd2577; | |
shr.u64 %rd1405, %rd1404, 32; | |
shr.u64 %rd1406, %rd1400, 32; | |
and.b64 %rd1407, %rd1374, 4294967295; | |
xor.b64 %rd1408, %rd1407, %rd1406; | |
xor.b64 %rd1409, %rd1408, %rd2589; | |
mul.lo.s64 %rd1410, %rd1409, %rd2577; | |
and.b64 %rd1411, %rd1410, 4294967295; | |
xor.b64 %rd1412, %rd1411, %rd1405; | |
xor.b64 %rd1413, %rd1412, %rd2590; | |
mul.lo.s64 %rd1414, %rd1413, %rd2579; | |
shr.u64 %rd1415, %rd1414, 32; | |
shr.u64 %rd1416, %rd1410, 32; | |
and.b64 %rd1417, %rd1384, 4294967295; | |
xor.b64 %rd1418, %rd1417, %rd1416; | |
xor.b64 %rd1419, %rd1418, %rd2591; | |
mul.lo.s64 %rd1420, %rd1419, %rd2579; | |
and.b64 %rd1421, %rd1420, 4294967295; | |
xor.b64 %rd1422, %rd1421, %rd1415; | |
xor.b64 %rd1423, %rd1422, %rd2592; | |
mul.lo.s64 %rd1424, %rd1423, %rd2577; | |
shr.u64 %rd1425, %rd1424, 32; | |
shr.u64 %rd1426, %rd1420, 32; | |
xor.b64 %rd1427, %rd1394, %rd1426; | |
xor.b64 %rd1428, %rd1427, %rd2593; | |
mul.lo.s64 %rd1429, %rd1428, %rd2577; | |
xor.b64 %rd1430, %rd1425, %rd1429; | |
cvt.u32.u64 %r153, %rd1430; | |
xor.b32 %r154, %r328, %r153; | |
mul.lo.s32 %r155, %r154, %r329; | |
shr.u32 %r156, %r155, 9; | |
cvt.rn.f32.u32 %f130, %r156; | |
mul.rn.f32 %f131, %f130, 0f34000000; | |
cvt.rn.f16.f32 %h64, %f131; | |
mov.b16 %h65, 0x2E66; | |
setp.ge.f16 %p33, %h64, %h65; | |
ld.global.nc.b16 %h66, [%rd45+770]; | |
ld.global.nc.f32 %f132, [%rd46+1540]; | |
cvt.rn.f16.f32 %h67, %f132; | |
add.rn.f16 %h68, %h66, %h67; | |
mov.b16 %h69, 0x3C72; | |
mul.rn.f16 %h70, %h68, %h69; | |
selp.b16 %h71, %h70, 0x0000, %p33; | |
cvt.f32.f16 %f133, %h71; | |
ld.global.nc.b16 %h72, [%rd47+770]; | |
cvt.f32.f16 %f134, %h72; | |
ld.global.nc.f32 %f135, [%rd48+1540]; | |
mul.rn.f32 %f136, %f1, %f135; | |
mul.rn.f32 %f137, %f136, %f134; | |
ld.global.nc.f32 %f138, [%rd49+1540]; | |
mul.rn.f32 %f139, %f2, %f136; | |
sub.rn.f32 %f140, %f138, %f139; | |
add.rn.f32 %f141, %f137, %f140; | |
add.rn.f32 %f142, %f141, %f133; | |
sub.rn.f32 %f143, %f142, %f3; | |
mul.rn.f32 %f144, %f143, %f143; | |
add.rn.f32 %f11, %f10, %f144; | |
// Unrolled step for element offset 512: builds a 64-bit counter word, runs it | |
// through a multiply/xor mixing network, derives a predicate from the result, | |
// and adds one squared f32 term to the running sum (%f11 -> %f12). | |
// NOTE(review): the multipliers 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) | |
// and the xor constants 2654435769 (0x9E3779B9) / 3144134277 (0xBB67AE85) match the | |
// published Philox-4x32 constants; presumably an inlined Philox generator - confirm. | |
// Counter word: %rd241 = %rd12 + ((%r73 | 512) >> 2). The arm is chosen by %p8, | |
// which is computed outside this view, as are %r73 and %rd12. | |
or.b32 %r158, %r73, 512; | |
shr.u32 %r159, %r158, 2; | |
cvt.u64.u32 %rd1431, %r159; | |
add.s64 %rd241, %rd12, %rd1431; | |
@%p8 bra LBB30_28; | |
// Fall-through arm (%p8 false): the low 32 bits of %rd241 are multiplied by | |
// 3528531795 first. %p35 detects unsigned wrap-around of the add above; the | |
// resulting 0/1 carry is added to %rd2464 (defined outside this view). | |
and.b64 %rd1473, %rd241, 4294967295; | |
mul.lo.s64 %rd2598, %rd1473, 3528531795; | |
setp.lt.u64 %p35, %rd241, %rd12; | |
selp.u64 %rd1474, 1, 0, %p35; | |
add.s64 %rd1475, %rd2464, %rd1474; | |
xor.b64 %rd1476, %rd1475, %rd2598; | |
shr.u64 %rd1477, %rd1476, 32; | |
mul.lo.s64 %rd2601, %rd1477, 3449720151; | |
shr.u64 %rd1478, %rd2601, 32; | |
and.b64 %rd1479, %rd1475, 4294967295; | |
mul.lo.s64 %rd1480, %rd1479, 3449720151; | |
and.b64 %rd1481, %rd1480, 4294967295; | |
xor.b64 %rd1482, %rd1481, %rd1478; | |
xor.b64 %rd1483, %rd1482, 2654435769; | |
mul.lo.s64 %rd2604, %rd1483, 3528531795; | |
xor.b64 %rd2594, %rd1480, %rd241; | |
// Constants for this arm, consumed by the shared rounds at LBB30_29. | |
mov.u32 %r332, -1879881855; | |
mov.u32 %r331, -845247145; | |
mov.u32 %r330, 534103459; | |
mov.u64 %rd2612, 3678237736; | |
mov.u64 %rd2611, 3041712726; | |
mov.u64 %rd2610, 1401181199; | |
mov.u64 %rd2609, 2835769497; | |
mov.u64 %rd2608, 1684936478; | |
mov.u64 %rd2607, 2027808484; | |
mov.u64 %rd2606, 387276957; | |
mov.u64 %rd2605, 842468239; | |
mov.u64 %rd2603, 3986602516; | |
mov.u64 %rd2602, 1013904242; | |
mov.u64 %rd2600, 3668340011; | |
mov.u64 %rd2599, 3144134277; | |
mov.u64 %rd2597, 3449720151; | |
mov.u64 %rd2596, 1993301258; | |
mov.u64 %rd2595, 3528531795; | |
bra.uni LBB30_29; | |
// Other arm (%p8 true): mirror-image preamble. The roles of %rd241 and | |
// (%rd2464 + carry) are exchanged, the two multipliers are swapped, and a | |
// different constant table is loaded into the same registers. | |
LBB30_28: | |
setp.lt.u64 %p34, %rd241, %rd12; | |
selp.u64 %rd1447, 1, 0, %p34; | |
add.s64 %rd1448, %rd2464, %rd1447; | |
and.b64 %rd1449, %rd1448, 4294967295; | |
mul.lo.s64 %rd2598, %rd1449, 3449720151; | |
xor.b64 %rd1450, %rd2598, %rd241; | |
shr.u64 %rd1451, %rd1450, 32; | |
mul.lo.s64 %rd2601, %rd1451, 3528531795; | |
shr.u64 %rd1452, %rd2601, 32; | |
and.b64 %rd1453, %rd241, 4294967295; | |
mul.lo.s64 %rd1454, %rd1453, 3528531795; | |
and.b64 %rd1455, %rd1454, 4294967295; | |
xor.b64 %rd1456, %rd1455, %rd1452; | |
xor.b64 %rd1457, %rd1456, 3144134277; | |
mul.lo.s64 %rd2604, %rd1457, 3449720151; | |
xor.b64 %rd2594, %rd1448, %rd1454; | |
mov.u32 %r332, -1767562579; | |
mov.u32 %r331, -766435501; | |
mov.u32 %r330, 1401181199; | |
mov.u64 %rd2612, 4055616968; | |
mov.u64 %rd2611, 1684936478; | |
mov.u64 %rd2610, 534103459; | |
mov.u64 %rd2609, 387276957; | |
mov.u64 %rd2608, 3041712726; | |
mov.u64 %rd2607, 3986602516; | |
mov.u64 %rd2606, 2835769497; | |
mov.u64 %rd2605, 3668340011; | |
mov.u64 %rd2603, 2027808484; | |
mov.u64 %rd2602, 1993301258; | |
mov.u64 %rd2600, 842468239; | |
mov.u64 %rd2599, 2654435769; | |
mov.u64 %rd2597, 3528531795; | |
mov.u64 %rd2596, 1013904242; | |
mov.u64 %rd2595, 3449720151; | |
// Shared rounds: both arms join here with %rd2594-%rd2612 and %r330-%r332 set. | |
// Each step splits a 64-bit product into its high (shr 32) and low (and | |
// 4294967295) halves, xors in a table constant, and multiplies again. | |
LBB30_29: | |
shr.u64 %rd1484, %rd2604, 32; | |
shr.u64 %rd1485, %rd2594, 32; | |
mul.lo.s64 %rd1486, %rd1485, %rd2595; | |
and.b64 %rd1487, %rd1486, 4294967295; | |
xor.b64 %rd1488, %rd1487, %rd1484; | |
xor.b64 %rd1489, %rd1488, %rd2596; | |
mul.lo.s64 %rd1490, %rd1489, %rd2597; | |
shr.u64 %rd1491, %rd1490, 32; | |
shr.u64 %rd1492, %rd1486, 32; | |
and.b64 %rd1493, %rd2598, 4294967295; | |
xor.b64 %rd1494, %rd1493, %rd1492; | |
xor.b64 %rd1495, %rd1494, %rd2599; | |
mul.lo.s64 %rd1496, %rd1495, %rd2597; | |
and.b64 %rd1497, %rd1496, 4294967295; | |
xor.b64 %rd1498, %rd1497, %rd1491; | |
xor.b64 %rd1499, %rd1498, %rd2600; | |
mul.lo.s64 %rd1500, %rd1499, %rd2595; | |
shr.u64 %rd1501, %rd1500, 32; | |
shr.u64 %rd1502, %rd1496, 32; | |
and.b64 %rd1503, %rd2601, 4294967295; | |
xor.b64 %rd1504, %rd1503, %rd1502; | |
xor.b64 %rd1505, %rd1504, %rd2602; | |
mul.lo.s64 %rd1506, %rd1505, %rd2595; | |
and.b64 %rd1507, %rd1506, 4294967295; | |
xor.b64 %rd1508, %rd1507, %rd1501; | |
xor.b64 %rd1509, %rd1508, %rd2603; | |
mul.lo.s64 %rd1510, %rd1509, %rd2597; | |
shr.u64 %rd1511, %rd1510, 32; | |
shr.u64 %rd1512, %rd1506, 32; | |
and.b64 %rd1513, %rd2604, 4294967295; | |
xor.b64 %rd1514, %rd1513, %rd1512; | |
xor.b64 %rd1515, %rd1514, %rd2605; | |
mul.lo.s64 %rd1516, %rd1515, %rd2597; | |
and.b64 %rd1517, %rd1516, 4294967295; | |
xor.b64 %rd1518, %rd1517, %rd1511; | |
xor.b64 %rd1519, %rd1518, %rd2606; | |
mul.lo.s64 %rd1520, %rd1519, %rd2595; | |
shr.u64 %rd1521, %rd1520, 32; | |
shr.u64 %rd1522, %rd1516, 32; | |
and.b64 %rd1523, %rd1490, 4294967295; | |
xor.b64 %rd1524, %rd1523, %rd1522; | |
xor.b64 %rd1525, %rd1524, %rd2607; | |
mul.lo.s64 %rd1526, %rd1525, %rd2595; | |
and.b64 %rd1527, %rd1526, 4294967295; | |
xor.b64 %rd1528, %rd1527, %rd1521; | |
xor.b64 %rd1529, %rd1528, %rd2608; | |
mul.lo.s64 %rd1530, %rd1529, %rd2597; | |
shr.u64 %rd1531, %rd1530, 32; | |
shr.u64 %rd1532, %rd1526, 32; | |
and.b64 %rd1533, %rd1500, 4294967295; | |
xor.b64 %rd1534, %rd1533, %rd1532; | |
xor.b64 %rd1535, %rd1534, %rd2609; | |
mul.lo.s64 %rd1536, %rd1535, %rd2597; | |
and.b64 %rd1537, %rd1536, 4294967295; | |
xor.b64 %rd1538, %rd1537, %rd1531; | |
xor.b64 %rd1539, %rd1538, %rd2610; | |
mul.lo.s64 %rd1540, %rd1539, %rd2595; | |
shr.u64 %rd1541, %rd1540, 32; | |
shr.u64 %rd1542, %rd1536, 32; | |
and.b64 %rd1543, %rd1510, 4294967295; | |
xor.b64 %rd1544, %rd1543, %rd1542; | |
xor.b64 %rd1545, %rd1544, %rd2611; | |
mul.lo.s64 %rd1546, %rd1545, %rd2595; | |
and.b64 %rd1547, %rd1546, 4294967295; | |
xor.b64 %rd1548, %rd1547, %rd1541; | |
xor.b64 %rd1549, %rd1548, %rd2612; | |
mul.lo.s64 %rd1550, %rd1549, %rd2597; | |
// Narrow to 32 bits: combine the high half of the last product with a value | |
// mixed through the arm's three 32-bit constants (%r330, %r331, %r332). | |
shr.u64 %rd1551, %rd1550, 32; | |
cvt.u32.u64 %r166, %rd1551; | |
shr.u64 %rd1552, %rd1546, 32; | |
xor.b64 %rd1553, %rd1552, %rd1520; | |
cvt.u32.u64 %r167, %rd1553; | |
xor.b32 %r168, %r330, %r167; | |
mul.lo.s32 %r169, %r168, %r331; | |
xor.b32 %r170, %r169, %r166; | |
xor.b32 %r171, %r170, %r332; | |
// Keep the top 23 bits and scale by 0f34000000 (2^-23): an f32 in [0, 1), | |
// then rounded to f16. | |
shr.u32 %r172, %r171, 9; | |
cvt.rn.f32.u32 %f145, %r172; | |
mul.rn.f32 %f146, %f145, 0f34000000; | |
cvt.rn.f16.f32 %h73, %f146; | |
// %p36 = (value >= 0x2E66); 0x2E66 is roughly 0.1 as an f16 bit pattern. | |
mov.b16 %h74, 0x2E66; | |
setp.ge.f16 %p36, %h73, %h74; | |
// f16 sum of [%rd45+1024] and f16([%rd46+2048]), scaled by 0x3C72 (about 1.111 | |
// in f16) and replaced by zero when %p36 is false. | |
// NOTE(review): looks like dropout with rate ~0.1 and a 1/0.9 rescale - confirm. | |
ld.global.nc.b16 %h75, [%rd45+1024]; | |
ld.global.nc.f32 %f147, [%rd46+2048]; | |
cvt.rn.f16.f32 %h76, %f147; | |
add.rn.f16 %h77, %h75, %h76; | |
mov.b16 %h78, 0x3C72; | |
mul.rn.f16 %h79, %h77, %h78; | |
selp.b16 %h80, %h79, 0x0000, %p36; | |
cvt.f32.f16 %f148, %h80; | |
// f32 part: t = %f1 * [%rd48+2048]; | |
// v = t * f32([%rd47+1024]) + ([%rd49+2048] - %f2 * t) + masked term - %f3; | |
// running sum: %f12 = %f11 + v * v. (%f1, %f2, %f3 come from outside this view.) | |
ld.global.nc.b16 %h81, [%rd47+1024]; | |
cvt.f32.f16 %f149, %h81; | |
ld.global.nc.f32 %f150, [%rd48+2048]; | |
mul.rn.f32 %f151, %f1, %f150; | |
mul.rn.f32 %f152, %f151, %f149; | |
ld.global.nc.f32 %f153, [%rd49+2048]; | |
mul.rn.f32 %f154, %f2, %f151; | |
sub.rn.f32 %f155, %f153, %f154; | |
add.rn.f32 %f156, %f152, %f155; | |
add.rn.f32 %f157, %f156, %f148; | |
sub.rn.f32 %f158, %f157, %f3; | |
mul.rn.f32 %f159, %f158, %f158; | |
add.rn.f32 %f12, %f11, %f159; | |
// Unrolled step for element offset 513: builds a 64-bit counter word, runs it | |
// through a multiply/xor mixing network, derives a predicate from the result, | |
// and adds one squared f32 term to the running sum (%f12 -> %f13). | |
// NOTE(review): the multipliers 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) | |
// and the xor constants 2654435769 (0x9E3779B9) / 3144134277 (0xBB67AE85) match the | |
// published Philox-4x32 constants; presumably an inlined Philox generator - confirm. | |
// Counter word: %rd269 = %rd12 + ((%r3 | 513 | %r4) >> 2). The low two bits of | |
// (%r3 | 513) pick the branch arm. %r3, %r4, %rd12 are defined outside this view. | |
or.b32 %r173, %r3, 513; | |
or.b32 %r174, %r173, %r4; | |
and.b32 %r175, %r173, 3; | |
shr.u32 %r176, %r174, 2; | |
setp.ne.s32 %p37, %r175, 1; | |
cvt.u64.u32 %rd1554, %r176; | |
add.s64 %rd269, %rd12, %rd1554; | |
@%p37 bra LBB30_31; | |
// Arm taken when the low two bits equal 1: the low 32 bits of %rd269 are | |
// multiplied by 3528531795 first. %p39 detects unsigned wrap-around of the add | |
// above; the resulting 0/1 carry is added to %rd2464 (defined outside this view). | |
and.b64 %rd1594, %rd269, 4294967295; | |
mul.lo.s64 %rd2617, %rd1594, 3528531795; | |
setp.lt.u64 %p39, %rd269, %rd12; | |
selp.u64 %rd1595, 1, 0, %p39; | |
add.s64 %rd1596, %rd2464, %rd1595; | |
xor.b64 %rd1597, %rd1596, %rd2617; | |
shr.u64 %rd1598, %rd1597, 32; | |
mul.lo.s64 %rd2620, %rd1598, 3449720151; | |
shr.u64 %rd1599, %rd2620, 32; | |
and.b64 %rd1600, %rd1596, 4294967295; | |
mul.lo.s64 %rd1601, %rd1600, 3449720151; | |
and.b64 %rd1602, %rd1601, 4294967295; | |
xor.b64 %rd1603, %rd1602, %rd1599; | |
xor.b64 %rd1604, %rd1603, 2654435769; | |
mul.lo.s64 %rd2623, %rd1604, 3528531795; | |
xor.b64 %rd2613, %rd1601, %rd269; | |
// Constants for this arm, consumed by the shared rounds at LBB30_32. | |
mov.u32 %r334, -845247145; | |
mov.u32 %r333, -616729560; | |
mov.u64 %rd2630, 3041712726; | |
mov.u64 %rd2629, 1401181199; | |
mov.u64 %rd2628, 2835769497; | |
mov.u64 %rd2627, 1684936478; | |
mov.u64 %rd2626, 2027808484; | |
mov.u64 %rd2625, 387276957; | |
mov.u64 %rd2624, 842468239; | |
mov.u64 %rd2622, 3986602516; | |
mov.u64 %rd2621, 1013904242; | |
mov.u64 %rd2619, 3668340011; | |
mov.u64 %rd2618, 3144134277; | |
mov.u64 %rd2616, 3449720151; | |
mov.u64 %rd2615, 1993301258; | |
mov.u64 %rd2614, 3528531795; | |
bra.uni LBB30_32; | |
// Other arm: mirror-image preamble. The roles of %rd269 and (%rd2464 + carry) | |
// are exchanged, the two multipliers are swapped, and a different constant | |
// table is loaded into the same registers. | |
LBB30_31: | |
setp.lt.u64 %p38, %rd269, %rd12; | |
selp.u64 %rd1569, 1, 0, %p38; | |
add.s64 %rd1570, %rd2464, %rd1569; | |
and.b64 %rd1571, %rd1570, 4294967295; | |
mul.lo.s64 %rd2617, %rd1571, 3449720151; | |
xor.b64 %rd1572, %rd2617, %rd269; | |
shr.u64 %rd1573, %rd1572, 32; | |
mul.lo.s64 %rd2620, %rd1573, 3528531795; | |
shr.u64 %rd1574, %rd2620, 32; | |
and.b64 %rd1575, %rd269, 4294967295; | |
mul.lo.s64 %rd1576, %rd1575, 3528531795; | |
and.b64 %rd1577, %rd1576, 4294967295; | |
xor.b64 %rd1578, %rd1577, %rd1574; | |
xor.b64 %rd1579, %rd1578, 3144134277; | |
mul.lo.s64 %rd2623, %rd1579, 3449720151; | |
xor.b64 %rd2613, %rd1570, %rd1576; | |
mov.u32 %r334, -766435501; | |
mov.u32 %r333, -239350328; | |
mov.u64 %rd2630, 1684936478; | |
mov.u64 %rd2629, 534103459; | |
mov.u64 %rd2628, 387276957; | |
mov.u64 %rd2627, 3041712726; | |
mov.u64 %rd2626, 3986602516; | |
mov.u64 %rd2625, 2835769497; | |
mov.u64 %rd2624, 3668340011; | |
mov.u64 %rd2622, 2027808484; | |
mov.u64 %rd2621, 1993301258; | |
mov.u64 %rd2619, 842468239; | |
mov.u64 %rd2618, 2654435769; | |
mov.u64 %rd2616, 3528531795; | |
mov.u64 %rd2615, 1013904242; | |
mov.u64 %rd2614, 3449720151; | |
// Shared rounds: both arms join here with %rd2613-%rd2630 and %r333/%r334 set. | |
// Each step splits a 64-bit product into its high (shr 32) and low (and | |
// 4294967295) halves, xors in a table constant, and multiplies again. | |
LBB30_32: | |
shr.u64 %rd1605, %rd2623, 32; | |
shr.u64 %rd1606, %rd2613, 32; | |
mul.lo.s64 %rd1607, %rd1606, %rd2614; | |
and.b64 %rd1608, %rd1607, 4294967295; | |
xor.b64 %rd1609, %rd1608, %rd1605; | |
xor.b64 %rd1610, %rd1609, %rd2615; | |
mul.lo.s64 %rd1611, %rd1610, %rd2616; | |
shr.u64 %rd1612, %rd1611, 32; | |
shr.u64 %rd1613, %rd1607, 32; | |
and.b64 %rd1614, %rd2617, 4294967295; | |
xor.b64 %rd1615, %rd1614, %rd1613; | |
xor.b64 %rd1616, %rd1615, %rd2618; | |
mul.lo.s64 %rd1617, %rd1616, %rd2616; | |
and.b64 %rd1618, %rd1617, 4294967295; | |
xor.b64 %rd1619, %rd1618, %rd1612; | |
xor.b64 %rd1620, %rd1619, %rd2619; | |
mul.lo.s64 %rd1621, %rd1620, %rd2614; | |
shr.u64 %rd1622, %rd1621, 32; | |
shr.u64 %rd1623, %rd1617, 32; | |
and.b64 %rd1624, %rd2620, 4294967295; | |
xor.b64 %rd1625, %rd1624, %rd1623; | |
xor.b64 %rd1626, %rd1625, %rd2621; | |
mul.lo.s64 %rd1627, %rd1626, %rd2614; | |
and.b64 %rd1628, %rd1627, 4294967295; | |
xor.b64 %rd1629, %rd1628, %rd1622; | |
xor.b64 %rd1630, %rd1629, %rd2622; | |
mul.lo.s64 %rd1631, %rd1630, %rd2616; | |
shr.u64 %rd1632, %rd1631, 32; | |
shr.u64 %rd1633, %rd1627, 32; | |
and.b64 %rd1634, %rd2623, 4294967295; | |
xor.b64 %rd1635, %rd1634, %rd1633; | |
xor.b64 %rd1636, %rd1635, %rd2624; | |
mul.lo.s64 %rd1637, %rd1636, %rd2616; | |
and.b64 %rd1638, %rd1637, 4294967295; | |
xor.b64 %rd1639, %rd1638, %rd1632; | |
xor.b64 %rd1640, %rd1639, %rd2625; | |
mul.lo.s64 %rd1641, %rd1640, %rd2614; | |
shr.u64 %rd1642, %rd1641, 32; | |
shr.u64 %rd1643, %rd1637, 32; | |
and.b64 %rd1644, %rd1611, 4294967295; | |
xor.b64 %rd1645, %rd1644, %rd1643; | |
xor.b64 %rd1646, %rd1645, %rd2626; | |
mul.lo.s64 %rd1647, %rd1646, %rd2614; | |
and.b64 %rd1648, %rd1647, 4294967295; | |
xor.b64 %rd1649, %rd1648, %rd1642; | |
xor.b64 %rd1650, %rd1649, %rd2627; | |
mul.lo.s64 %rd1651, %rd1650, %rd2616; | |
shr.u64 %rd1652, %rd1651, 32; | |
shr.u64 %rd1653, %rd1647, 32; | |
and.b64 %rd1654, %rd1621, 4294967295; | |
xor.b64 %rd1655, %rd1654, %rd1653; | |
xor.b64 %rd1656, %rd1655, %rd2628; | |
mul.lo.s64 %rd1657, %rd1656, %rd2616; | |
and.b64 %rd1658, %rd1657, 4294967295; | |
xor.b64 %rd1659, %rd1658, %rd1652; | |
xor.b64 %rd1660, %rd1659, %rd2629; | |
mul.lo.s64 %rd1661, %rd1660, %rd2614; | |
shr.u64 %rd1662, %rd1661, 32; | |
shr.u64 %rd1663, %rd1657, 32; | |
xor.b64 %rd1664, %rd1631, %rd1663; | |
xor.b64 %rd1665, %rd1664, %rd2630; | |
mul.lo.s64 %rd1666, %rd1665, %rd2614; | |
// Narrow to 32 bits and finish the mix with the arm's 32-bit constants. | |
xor.b64 %rd1667, %rd1662, %rd1666; | |
cvt.u32.u64 %r181, %rd1667; | |
xor.b32 %r182, %r333, %r181; | |
mul.lo.s32 %r183, %r182, %r334; | |
// Keep the top 23 bits and scale by 0f34000000 (2^-23): an f32 in [0, 1), | |
// then rounded to f16. | |
shr.u32 %r184, %r183, 9; | |
cvt.rn.f32.u32 %f160, %r184; | |
mul.rn.f32 %f161, %f160, 0f34000000; | |
cvt.rn.f16.f32 %h82, %f161; | |
// %p41 = (value >= 0x2E66); 0x2E66 is roughly 0.1 as an f16 bit pattern. | |
mov.b16 %h83, 0x2E66; | |
setp.ge.f16 %p41, %h82, %h83; | |
// f16 sum of [%rd45+1026] and f16([%rd46+2052]), scaled by 0x3C72 (about 1.111 | |
// in f16) and replaced by zero when %p41 is false. | |
// NOTE(review): looks like dropout with rate ~0.1 and a 1/0.9 rescale - confirm. | |
ld.global.nc.b16 %h84, [%rd45+1026]; | |
ld.global.nc.f32 %f162, [%rd46+2052]; | |
cvt.rn.f16.f32 %h85, %f162; | |
add.rn.f16 %h86, %h84, %h85; | |
mov.b16 %h87, 0x3C72; | |
mul.rn.f16 %h88, %h86, %h87; | |
selp.b16 %h89, %h88, 0x0000, %p41; | |
cvt.f32.f16 %f163, %h89; | |
// f32 part: t = %f1 * [%rd48+2052]; | |
// v = t * f32([%rd47+1026]) + ([%rd49+2052] - %f2 * t) + masked term - %f3; | |
// running sum: %f13 = %f12 + v * v. (%f1, %f2, %f3 come from outside this view.) | |
ld.global.nc.b16 %h90, [%rd47+1026]; | |
cvt.f32.f16 %f164, %h90; | |
ld.global.nc.f32 %f165, [%rd48+2052]; | |
mul.rn.f32 %f166, %f1, %f165; | |
mul.rn.f32 %f167, %f166, %f164; | |
ld.global.nc.f32 %f168, [%rd49+2052]; | |
mul.rn.f32 %f169, %f2, %f166; | |
sub.rn.f32 %f170, %f168, %f169; | |
add.rn.f32 %f171, %f167, %f170; | |
add.rn.f32 %f172, %f171, %f163; | |
sub.rn.f32 %f173, %f172, %f3; | |
mul.rn.f32 %f174, %f173, %f173; | |
add.rn.f32 %f13, %f12, %f174; | |
// Unrolled step for element offset 640: builds a 64-bit counter word, runs it | |
// through a multiply/xor mixing network, derives a predicate from the result, | |
// and adds one squared f32 term to the running sum (%f13 -> %f14). | |
// NOTE(review): the multipliers 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) | |
// and the xor constants 2654435769 (0x9E3779B9) / 3144134277 (0xBB67AE85) match the | |
// published Philox-4x32 constants; presumably an inlined Philox generator - confirm. | |
// Counter word: %rd296 = %rd12 + ((%r73 | 640) >> 2). The arm is chosen by %p8, | |
// which is computed outside this view, as are %r73 and %rd12. | |
or.b32 %r186, %r73, 640; | |
shr.u32 %r187, %r186, 2; | |
cvt.u64.u32 %rd1668, %r187; | |
add.s64 %rd296, %rd12, %rd1668; | |
@%p8 bra LBB30_34; | |
// Fall-through arm (%p8 false): the low 32 bits of %rd296 are multiplied by | |
// 3528531795 first. %p43 detects unsigned wrap-around of the add above; the | |
// resulting 0/1 carry is added to %rd2464 (defined outside this view). | |
and.b64 %rd1710, %rd296, 4294967295; | |
mul.lo.s64 %rd2635, %rd1710, 3528531795; | |
setp.lt.u64 %p43, %rd296, %rd12; | |
selp.u64 %rd1711, 1, 0, %p43; | |
add.s64 %rd1712, %rd2464, %rd1711; | |
xor.b64 %rd1713, %rd1712, %rd2635; | |
shr.u64 %rd1714, %rd1713, 32; | |
mul.lo.s64 %rd2638, %rd1714, 3449720151; | |
shr.u64 %rd1715, %rd2638, 32; | |
and.b64 %rd1716, %rd1712, 4294967295; | |
mul.lo.s64 %rd1717, %rd1716, 3449720151; | |
and.b64 %rd1718, %rd1717, 4294967295; | |
xor.b64 %rd1719, %rd1718, %rd1715; | |
xor.b64 %rd1720, %rd1719, 2654435769; | |
mul.lo.s64 %rd2641, %rd1720, 3528531795; | |
xor.b64 %rd2631, %rd1717, %rd296; | |
// Constants for this arm, consumed by the shared rounds at LBB30_35. | |
mov.u32 %r337, -1879881855; | |
mov.u32 %r336, -845247145; | |
mov.u32 %r335, 534103459; | |
mov.u64 %rd2649, 3678237736; | |
mov.u64 %rd2648, 3041712726; | |
mov.u64 %rd2647, 1401181199; | |
mov.u64 %rd2646, 2835769497; | |
mov.u64 %rd2645, 1684936478; | |
mov.u64 %rd2644, 2027808484; | |
mov.u64 %rd2643, 387276957; | |
mov.u64 %rd2642, 842468239; | |
mov.u64 %rd2640, 3986602516; | |
mov.u64 %rd2639, 1013904242; | |
mov.u64 %rd2637, 3668340011; | |
mov.u64 %rd2636, 3144134277; | |
mov.u64 %rd2634, 3449720151; | |
mov.u64 %rd2633, 1993301258; | |
mov.u64 %rd2632, 3528531795; | |
bra.uni LBB30_35; | |
// Other arm (%p8 true): mirror-image preamble. The roles of %rd296 and | |
// (%rd2464 + carry) are exchanged, the two multipliers are swapped, and a | |
// different constant table is loaded into the same registers. | |
LBB30_34: | |
setp.lt.u64 %p42, %rd296, %rd12; | |
selp.u64 %rd1684, 1, 0, %p42; | |
add.s64 %rd1685, %rd2464, %rd1684; | |
and.b64 %rd1686, %rd1685, 4294967295; | |
mul.lo.s64 %rd2635, %rd1686, 3449720151; | |
xor.b64 %rd1687, %rd2635, %rd296; | |
shr.u64 %rd1688, %rd1687, 32; | |
mul.lo.s64 %rd2638, %rd1688, 3528531795; | |
shr.u64 %rd1689, %rd2638, 32; | |
and.b64 %rd1690, %rd296, 4294967295; | |
mul.lo.s64 %rd1691, %rd1690, 3528531795; | |
and.b64 %rd1692, %rd1691, 4294967295; | |
xor.b64 %rd1693, %rd1692, %rd1689; | |
xor.b64 %rd1694, %rd1693, 3144134277; | |
mul.lo.s64 %rd2641, %rd1694, 3449720151; | |
xor.b64 %rd2631, %rd1685, %rd1691; | |
mov.u32 %r337, -1767562579; | |
mov.u32 %r336, -766435501; | |
mov.u32 %r335, 1401181199; | |
mov.u64 %rd2649, 4055616968; | |
mov.u64 %rd2648, 1684936478; | |
mov.u64 %rd2647, 534103459; | |
mov.u64 %rd2646, 387276957; | |
mov.u64 %rd2645, 3041712726; | |
mov.u64 %rd2644, 3986602516; | |
mov.u64 %rd2643, 2835769497; | |
mov.u64 %rd2642, 3668340011; | |
mov.u64 %rd2640, 2027808484; | |
mov.u64 %rd2639, 1993301258; | |
mov.u64 %rd2637, 842468239; | |
mov.u64 %rd2636, 2654435769; | |
mov.u64 %rd2634, 3528531795; | |
mov.u64 %rd2633, 1013904242; | |
mov.u64 %rd2632, 3449720151; | |
// Shared rounds: both arms join here with %rd2631-%rd2649 and %r335-%r337 set. | |
// Each step splits a 64-bit product into its high (shr 32) and low (and | |
// 4294967295) halves, xors in a table constant, and multiplies again. | |
LBB30_35: | |
shr.u64 %rd1721, %rd2641, 32; | |
shr.u64 %rd1722, %rd2631, 32; | |
mul.lo.s64 %rd1723, %rd1722, %rd2632; | |
and.b64 %rd1724, %rd1723, 4294967295; | |
xor.b64 %rd1725, %rd1724, %rd1721; | |
xor.b64 %rd1726, %rd1725, %rd2633; | |
mul.lo.s64 %rd1727, %rd1726, %rd2634; | |
shr.u64 %rd1728, %rd1727, 32; | |
shr.u64 %rd1729, %rd1723, 32; | |
and.b64 %rd1730, %rd2635, 4294967295; | |
xor.b64 %rd1731, %rd1730, %rd1729; | |
xor.b64 %rd1732, %rd1731, %rd2636; | |
mul.lo.s64 %rd1733, %rd1732, %rd2634; | |
and.b64 %rd1734, %rd1733, 4294967295; | |
xor.b64 %rd1735, %rd1734, %rd1728; | |
xor.b64 %rd1736, %rd1735, %rd2637; | |
mul.lo.s64 %rd1737, %rd1736, %rd2632; | |
shr.u64 %rd1738, %rd1737, 32; | |
shr.u64 %rd1739, %rd1733, 32; | |
and.b64 %rd1740, %rd2638, 4294967295; | |
xor.b64 %rd1741, %rd1740, %rd1739; | |
xor.b64 %rd1742, %rd1741, %rd2639; | |
mul.lo.s64 %rd1743, %rd1742, %rd2632; | |
and.b64 %rd1744, %rd1743, 4294967295; | |
xor.b64 %rd1745, %rd1744, %rd1738; | |
xor.b64 %rd1746, %rd1745, %rd2640; | |
mul.lo.s64 %rd1747, %rd1746, %rd2634; | |
shr.u64 %rd1748, %rd1747, 32; | |
shr.u64 %rd1749, %rd1743, 32; | |
and.b64 %rd1750, %rd2641, 4294967295; | |
xor.b64 %rd1751, %rd1750, %rd1749; | |
xor.b64 %rd1752, %rd1751, %rd2642; | |
mul.lo.s64 %rd1753, %rd1752, %rd2634; | |
and.b64 %rd1754, %rd1753, 4294967295; | |
xor.b64 %rd1755, %rd1754, %rd1748; | |
xor.b64 %rd1756, %rd1755, %rd2643; | |
mul.lo.s64 %rd1757, %rd1756, %rd2632; | |
shr.u64 %rd1758, %rd1757, 32; | |
shr.u64 %rd1759, %rd1753, 32; | |
and.b64 %rd1760, %rd1727, 4294967295; | |
xor.b64 %rd1761, %rd1760, %rd1759; | |
xor.b64 %rd1762, %rd1761, %rd2644; | |
mul.lo.s64 %rd1763, %rd1762, %rd2632; | |
and.b64 %rd1764, %rd1763, 4294967295; | |
xor.b64 %rd1765, %rd1764, %rd1758; | |
xor.b64 %rd1766, %rd1765, %rd2645; | |
mul.lo.s64 %rd1767, %rd1766, %rd2634; | |
shr.u64 %rd1768, %rd1767, 32; | |
shr.u64 %rd1769, %rd1763, 32; | |
and.b64 %rd1770, %rd1737, 4294967295; | |
xor.b64 %rd1771, %rd1770, %rd1769; | |
xor.b64 %rd1772, %rd1771, %rd2646; | |
mul.lo.s64 %rd1773, %rd1772, %rd2634; | |
and.b64 %rd1774, %rd1773, 4294967295; | |
xor.b64 %rd1775, %rd1774, %rd1768; | |
xor.b64 %rd1776, %rd1775, %rd2647; | |
mul.lo.s64 %rd1777, %rd1776, %rd2632; | |
shr.u64 %rd1778, %rd1777, 32; | |
shr.u64 %rd1779, %rd1773, 32; | |
and.b64 %rd1780, %rd1747, 4294967295; | |
xor.b64 %rd1781, %rd1780, %rd1779; | |
xor.b64 %rd1782, %rd1781, %rd2648; | |
mul.lo.s64 %rd1783, %rd1782, %rd2632; | |
and.b64 %rd1784, %rd1783, 4294967295; | |
xor.b64 %rd1785, %rd1784, %rd1778; | |
xor.b64 %rd1786, %rd1785, %rd2649; | |
mul.lo.s64 %rd1787, %rd1786, %rd2634; | |
// Narrow to 32 bits: combine the high half of the last product with a value | |
// mixed through the arm's three 32-bit constants (%r335, %r336, %r337). | |
shr.u64 %rd1788, %rd1787, 32; | |
cvt.u32.u64 %r194, %rd1788; | |
shr.u64 %rd1789, %rd1783, 32; | |
xor.b64 %rd1790, %rd1789, %rd1757; | |
cvt.u32.u64 %r195, %rd1790; | |
xor.b32 %r196, %r335, %r195; | |
mul.lo.s32 %r197, %r196, %r336; | |
xor.b32 %r198, %r197, %r194; | |
xor.b32 %r199, %r198, %r337; | |
// Keep the top 23 bits and scale by 0f34000000 (2^-23): an f32 in [0, 1), | |
// then rounded to f16. | |
shr.u32 %r200, %r199, 9; | |
cvt.rn.f32.u32 %f175, %r200; | |
mul.rn.f32 %f176, %f175, 0f34000000; | |
cvt.rn.f16.f32 %h91, %f176; | |
// %p44 = (value >= 0x2E66); 0x2E66 is roughly 0.1 as an f16 bit pattern. | |
mov.b16 %h92, 0x2E66; | |
setp.ge.f16 %p44, %h91, %h92; | |
// f16 sum of [%rd45+1280] and f16([%rd46+2560]), scaled by 0x3C72 (about 1.111 | |
// in f16) and replaced by zero when %p44 is false. | |
// NOTE(review): looks like dropout with rate ~0.1 and a 1/0.9 rescale - confirm. | |
ld.global.nc.b16 %h93, [%rd45+1280]; | |
ld.global.nc.f32 %f177, [%rd46+2560]; | |
cvt.rn.f16.f32 %h94, %f177; | |
add.rn.f16 %h95, %h93, %h94; | |
mov.b16 %h96, 0x3C72; | |
mul.rn.f16 %h97, %h95, %h96; | |
selp.b16 %h98, %h97, 0x0000, %p44; | |
cvt.f32.f16 %f178, %h98; | |
// f32 part: t = %f1 * [%rd48+2560]; | |
// v = t * f32([%rd47+1280]) + ([%rd49+2560] - %f2 * t) + masked term - %f3; | |
// running sum: %f14 = %f13 + v * v. (%f1, %f2, %f3 come from outside this view.) | |
ld.global.nc.b16 %h99, [%rd47+1280]; | |
cvt.f32.f16 %f179, %h99; | |
ld.global.nc.f32 %f180, [%rd48+2560]; | |
mul.rn.f32 %f181, %f1, %f180; | |
mul.rn.f32 %f182, %f181, %f179; | |
ld.global.nc.f32 %f183, [%rd49+2560]; | |
mul.rn.f32 %f184, %f2, %f181; | |
sub.rn.f32 %f185, %f183, %f184; | |
add.rn.f32 %f186, %f182, %f185; | |
add.rn.f32 %f187, %f186, %f178; | |
sub.rn.f32 %f188, %f187, %f3; | |
mul.rn.f32 %f189, %f188, %f188; | |
add.rn.f32 %f14, %f13, %f189; | |
// Unrolled step for element offset 641: builds a 64-bit counter word, runs it | |
// through a multiply/xor mixing network, derives a predicate from the result, | |
// and adds one squared f32 term to the running sum (%f14 -> %f15). | |
// NOTE(review): the multipliers 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) | |
// and the xor constants 2654435769 (0x9E3779B9) / 3144134277 (0xBB67AE85) match the | |
// published Philox-4x32 constants; presumably an inlined Philox generator - confirm. | |
// Counter word: %rd324 = %rd12 + ((%r3 | 641 | %r4) >> 2). The low two bits of | |
// (%r3 | 641) pick the branch arm. %r3, %r4, %rd12 are defined outside this view. | |
or.b32 %r201, %r3, 641; | |
or.b32 %r202, %r201, %r4; | |
and.b32 %r203, %r201, 3; | |
shr.u32 %r204, %r202, 2; | |
setp.ne.s32 %p45, %r203, 1; | |
cvt.u64.u32 %rd1791, %r204; | |
add.s64 %rd324, %rd12, %rd1791; | |
@%p45 bra LBB30_37; | |
// Arm taken when the low two bits equal 1: the low 32 bits of %rd324 are | |
// multiplied by 3528531795 first. %p47 detects unsigned wrap-around of the add | |
// above; the resulting 0/1 carry is added to %rd2464 (defined outside this view). | |
and.b64 %rd1831, %rd324, 4294967295; | |
mul.lo.s64 %rd2654, %rd1831, 3528531795; | |
setp.lt.u64 %p47, %rd324, %rd12; | |
selp.u64 %rd1832, 1, 0, %p47; | |
add.s64 %rd1833, %rd2464, %rd1832; | |
xor.b64 %rd1834, %rd1833, %rd2654; | |
shr.u64 %rd1835, %rd1834, 32; | |
mul.lo.s64 %rd2657, %rd1835, 3449720151; | |
shr.u64 %rd1836, %rd2657, 32; | |
and.b64 %rd1837, %rd1833, 4294967295; | |
mul.lo.s64 %rd1838, %rd1837, 3449720151; | |
and.b64 %rd1839, %rd1838, 4294967295; | |
xor.b64 %rd1840, %rd1839, %rd1836; | |
xor.b64 %rd1841, %rd1840, 2654435769; | |
mul.lo.s64 %rd2660, %rd1841, 3528531795; | |
xor.b64 %rd2650, %rd1838, %rd324; | |
// Constants for this arm, consumed by the shared rounds at LBB30_38. | |
mov.u32 %r339, -845247145; | |
mov.u32 %r338, -616729560; | |
mov.u64 %rd2667, 3041712726; | |
mov.u64 %rd2666, 1401181199; | |
mov.u64 %rd2665, 2835769497; | |
mov.u64 %rd2664, 1684936478; | |
mov.u64 %rd2663, 2027808484; | |
mov.u64 %rd2662, 387276957; | |
mov.u64 %rd2661, 842468239; | |
mov.u64 %rd2659, 3986602516; | |
mov.u64 %rd2658, 1013904242; | |
mov.u64 %rd2656, 3668340011; | |
mov.u64 %rd2655, 3144134277; | |
mov.u64 %rd2653, 3449720151; | |
mov.u64 %rd2652, 1993301258; | |
mov.u64 %rd2651, 3528531795; | |
bra.uni LBB30_38; | |
// Other arm: mirror-image preamble. The roles of %rd324 and (%rd2464 + carry) | |
// are exchanged, the two multipliers are swapped, and a different constant | |
// table is loaded into the same registers. | |
LBB30_37: | |
setp.lt.u64 %p46, %rd324, %rd12; | |
selp.u64 %rd1806, 1, 0, %p46; | |
add.s64 %rd1807, %rd2464, %rd1806; | |
and.b64 %rd1808, %rd1807, 4294967295; | |
mul.lo.s64 %rd2654, %rd1808, 3449720151; | |
xor.b64 %rd1809, %rd2654, %rd324; | |
shr.u64 %rd1810, %rd1809, 32; | |
mul.lo.s64 %rd2657, %rd1810, 3528531795; | |
shr.u64 %rd1811, %rd2657, 32; | |
and.b64 %rd1812, %rd324, 4294967295; | |
mul.lo.s64 %rd1813, %rd1812, 3528531795; | |
and.b64 %rd1814, %rd1813, 4294967295; | |
xor.b64 %rd1815, %rd1814, %rd1811; | |
xor.b64 %rd1816, %rd1815, 3144134277; | |
mul.lo.s64 %rd2660, %rd1816, 3449720151; | |
xor.b64 %rd2650, %rd1807, %rd1813; | |
mov.u32 %r339, -766435501; | |
mov.u32 %r338, -239350328; | |
mov.u64 %rd2667, 1684936478; | |
mov.u64 %rd2666, 534103459; | |
mov.u64 %rd2665, 387276957; | |
mov.u64 %rd2664, 3041712726; | |
mov.u64 %rd2663, 3986602516; | |
mov.u64 %rd2662, 2835769497; | |
mov.u64 %rd2661, 3668340011; | |
mov.u64 %rd2659, 2027808484; | |
mov.u64 %rd2658, 1993301258; | |
mov.u64 %rd2656, 842468239; | |
mov.u64 %rd2655, 2654435769; | |
mov.u64 %rd2653, 3528531795; | |
mov.u64 %rd2652, 1013904242; | |
mov.u64 %rd2651, 3449720151; | |
// Shared rounds: both arms join here with %rd2650-%rd2667 and %r338/%r339 set. | |
// Each step splits a 64-bit product into its high (shr 32) and low (and | |
// 4294967295) halves, xors in a table constant, and multiplies again. | |
LBB30_38: | |
shr.u64 %rd1842, %rd2660, 32; | |
shr.u64 %rd1843, %rd2650, 32; | |
mul.lo.s64 %rd1844, %rd1843, %rd2651; | |
and.b64 %rd1845, %rd1844, 4294967295; | |
xor.b64 %rd1846, %rd1845, %rd1842; | |
xor.b64 %rd1847, %rd1846, %rd2652; | |
mul.lo.s64 %rd1848, %rd1847, %rd2653; | |
shr.u64 %rd1849, %rd1848, 32; | |
shr.u64 %rd1850, %rd1844, 32; | |
and.b64 %rd1851, %rd2654, 4294967295; | |
xor.b64 %rd1852, %rd1851, %rd1850; | |
xor.b64 %rd1853, %rd1852, %rd2655; | |
mul.lo.s64 %rd1854, %rd1853, %rd2653; | |
and.b64 %rd1855, %rd1854, 4294967295; | |
xor.b64 %rd1856, %rd1855, %rd1849; | |
xor.b64 %rd1857, %rd1856, %rd2656; | |
mul.lo.s64 %rd1858, %rd1857, %rd2651; | |
shr.u64 %rd1859, %rd1858, 32; | |
shr.u64 %rd1860, %rd1854, 32; | |
and.b64 %rd1861, %rd2657, 4294967295; | |
xor.b64 %rd1862, %rd1861, %rd1860; | |
xor.b64 %rd1863, %rd1862, %rd2658; | |
mul.lo.s64 %rd1864, %rd1863, %rd2651; | |
and.b64 %rd1865, %rd1864, 4294967295; | |
xor.b64 %rd1866, %rd1865, %rd1859; | |
xor.b64 %rd1867, %rd1866, %rd2659; | |
mul.lo.s64 %rd1868, %rd1867, %rd2653; | |
shr.u64 %rd1869, %rd1868, 32; | |
shr.u64 %rd1870, %rd1864, 32; | |
and.b64 %rd1871, %rd2660, 4294967295; | |
xor.b64 %rd1872, %rd1871, %rd1870; | |
xor.b64 %rd1873, %rd1872, %rd2661; | |
mul.lo.s64 %rd1874, %rd1873, %rd2653; | |
and.b64 %rd1875, %rd1874, 4294967295; | |
xor.b64 %rd1876, %rd1875, %rd1869; | |
xor.b64 %rd1877, %rd1876, %rd2662; | |
mul.lo.s64 %rd1878, %rd1877, %rd2651; | |
shr.u64 %rd1879, %rd1878, 32; | |
shr.u64 %rd1880, %rd1874, 32; | |
and.b64 %rd1881, %rd1848, 4294967295; | |
xor.b64 %rd1882, %rd1881, %rd1880; | |
xor.b64 %rd1883, %rd1882, %rd2663; | |
mul.lo.s64 %rd1884, %rd1883, %rd2651; | |
and.b64 %rd1885, %rd1884, 4294967295; | |
xor.b64 %rd1886, %rd1885, %rd1879; | |
xor.b64 %rd1887, %rd1886, %rd2664; | |
mul.lo.s64 %rd1888, %rd1887, %rd2653; | |
shr.u64 %rd1889, %rd1888, 32; | |
shr.u64 %rd1890, %rd1884, 32; | |
and.b64 %rd1891, %rd1858, 4294967295; | |
xor.b64 %rd1892, %rd1891, %rd1890; | |
xor.b64 %rd1893, %rd1892, %rd2665; | |
mul.lo.s64 %rd1894, %rd1893, %rd2653; | |
and.b64 %rd1895, %rd1894, 4294967295; | |
xor.b64 %rd1896, %rd1895, %rd1889; | |
xor.b64 %rd1897, %rd1896, %rd2666; | |
mul.lo.s64 %rd1898, %rd1897, %rd2651; | |
shr.u64 %rd1899, %rd1898, 32; | |
shr.u64 %rd1900, %rd1894, 32; | |
xor.b64 %rd1901, %rd1868, %rd1900; | |
xor.b64 %rd1902, %rd1901, %rd2667; | |
mul.lo.s64 %rd1903, %rd1902, %rd2651; | |
// Narrow to 32 bits and finish the mix with the arm's 32-bit constants. | |
xor.b64 %rd1904, %rd1899, %rd1903; | |
cvt.u32.u64 %r209, %rd1904; | |
xor.b32 %r210, %r338, %r209; | |
mul.lo.s32 %r211, %r210, %r339; | |
// Keep the top 23 bits and scale by 0f34000000 (2^-23): an f32 in [0, 1), | |
// then rounded to f16. | |
shr.u32 %r212, %r211, 9; | |
cvt.rn.f32.u32 %f190, %r212; | |
mul.rn.f32 %f191, %f190, 0f34000000; | |
cvt.rn.f16.f32 %h100, %f191; | |
// %p49 = (value >= 0x2E66); 0x2E66 is roughly 0.1 as an f16 bit pattern. | |
mov.b16 %h101, 0x2E66; | |
setp.ge.f16 %p49, %h100, %h101; | |
// f16 sum of [%rd45+1282] and f16([%rd46+2564]), scaled by 0x3C72 (about 1.111 | |
// in f16) and replaced by zero when %p49 is false. | |
// NOTE(review): looks like dropout with rate ~0.1 and a 1/0.9 rescale - confirm. | |
ld.global.nc.b16 %h102, [%rd45+1282]; | |
ld.global.nc.f32 %f192, [%rd46+2564]; | |
cvt.rn.f16.f32 %h103, %f192; | |
add.rn.f16 %h104, %h102, %h103; | |
mov.b16 %h105, 0x3C72; | |
mul.rn.f16 %h106, %h104, %h105; | |
selp.b16 %h107, %h106, 0x0000, %p49; | |
cvt.f32.f16 %f193, %h107; | |
// f32 part: t = %f1 * [%rd48+2564]; | |
// v = t * f32([%rd47+1282]) + ([%rd49+2564] - %f2 * t) + masked term - %f3; | |
// running sum: %f15 = %f14 + v * v. (%f1, %f2, %f3 come from outside this view.) | |
ld.global.nc.b16 %h108, [%rd47+1282]; | |
cvt.f32.f16 %f194, %h108; | |
ld.global.nc.f32 %f195, [%rd48+2564]; | |
mul.rn.f32 %f196, %f1, %f195; | |
mul.rn.f32 %f197, %f196, %f194; | |
ld.global.nc.f32 %f198, [%rd49+2564]; | |
mul.rn.f32 %f199, %f2, %f196; | |
sub.rn.f32 %f200, %f198, %f199; | |
add.rn.f32 %f201, %f197, %f200; | |
add.rn.f32 %f202, %f201, %f193; | |
sub.rn.f32 %f203, %f202, %f3; | |
mul.rn.f32 %f204, %f203, %f203; | |
add.rn.f32 %f15, %f14, %f204; | |
or.b32 %r214, %r73, 768; | |
shr.u32 %r215, %r214, 2; | |
cvt.u64.u32 %rd1905, %r215; | |
add.s64 %rd351, %rd12, %rd1905; | |
@%p8 bra LBB30_40; | |
and.b64 %rd1947, %rd351, 4294967295; | |
mul.lo.s64 %rd2672, %rd1947, 3528531795; | |
setp.lt.u64 %p51, %rd351, %rd12; | |
selp.u64 %rd1948, 1, 0, %p51; | |
add.s64 %rd1949, %rd2464, %rd1948; | |
xor.b64 %rd1950, %rd1949, %rd2672; | |
shr.u64 %rd1951, %rd1950, 32; | |
mul.lo.s64 %rd2675, %rd1951, 3449720151; | |
shr.u64 %rd1952, %rd2675, 32; | |
and.b64 %rd1953, %rd1949, 4294967295; | |
mul.lo.s64 %rd1954, %rd1953, 3449720151; | |
and.b64 %rd1955, %rd1954, 4294967295; | |
xor.b64 %rd1956, %rd1955, %rd1952; | |
xor.b64 %rd1957, %rd1956, 2654435769; | |
mul.lo.s64 %rd2678, %rd1957, 3528531795; | |
xor.b64 %rd2668, %rd1954, %rd351; | |
mov.u32 %r342, -1879881855; | |
mov.u32 %r341, -845247145; | |
mov.u32 %r340, 534103459; | |
mov.u64 %rd2686, 3678237736; | |
mov.u64 %rd2685, 3041712726; | |
mov.u64 %rd2684, 1401181199; | |
mov.u64 %rd2683, 2835769497; | |
mov.u64 %rd2682, 1684936478; | |
mov.u64 %rd2681, 2027808484; | |
mov.u64 %rd2680, 387276957; | |
mov.u64 %rd2679, 842468239; | |
mov.u64 %rd2677, 3986602516; | |
mov.u64 %rd2676, 1013904242; | |
mov.u64 %rd2674, 3668340011; | |
mov.u64 %rd2673, 3144134277; | |
mov.u64 %rd2671, 3449720151; | |
mov.u64 %rd2670, 1993301258; | |
mov.u64 %rd2669, 3528531795; | |
bra.uni LBB30_41; | |
LBB30_40: | |
setp.lt.u64 %p50, %rd351, %rd12; | |
selp.u64 %rd1921, 1, 0, %p50; | |
add.s64 %rd1922, %rd2464, %rd1921; | |
and.b64 %rd1923, %rd1922, 4294967295; | |
mul.lo.s64 %rd2672, %rd1923, 3449720151; | |
xor.b64 %rd1924, %rd2672, %rd351; | |
shr.u64 %rd1925, %rd1924, 32; | |
mul.lo.s64 %rd2675, %rd1925, 3528531795; | |
shr.u64 %rd1926, %rd2675, 32; | |
and.b64 %rd1927, %rd351, 4294967295; | |
mul.lo.s64 %rd1928, %rd1927, 3528531795; | |
and.b64 %rd1929, %rd1928, 4294967295; | |
xor.b64 %rd1930, %rd1929, %rd1926; | |
xor.b64 %rd1931, %rd1930, 3144134277; | |
mul.lo.s64 %rd2678, %rd1931, 3449720151; | |
xor.b64 %rd2668, %rd1922, %rd1928; | |
mov.u32 %r342, -1767562579; | |
mov.u32 %r341, -766435501; | |
mov.u32 %r340, 1401181199; | |
mov.u64 %rd2686, 4055616968; | |
mov.u64 %rd2685, 1684936478; | |
mov.u64 %rd2684, 534103459; | |
mov.u64 %rd2683, 387276957; | |
mov.u64 %rd2682, 3041712726; | |
mov.u64 %rd2681, 3986602516; | |
mov.u64 %rd2680, 2835769497; | |
mov.u64 %rd2679, 3668340011; | |
mov.u64 %rd2677, 2027808484; | |
mov.u64 %rd2676, 1993301258; | |
mov.u64 %rd2674, 842468239; | |
mov.u64 %rd2673, 2654435769; | |
mov.u64 %rd2671, 3528531795; | |
mov.u64 %rd2670, 1013904242; | |
mov.u64 %rd2669, 3449720151; | |
LBB30_41: | |
shr.u64 %rd1958, %rd2678, 32; | |
shr.u64 %rd1959, %rd2668, 32; | |
mul.lo.s64 %rd1960, %rd1959, %rd2669; | |
and.b64 %rd1961, %rd1960, 4294967295; | |
xor.b64 %rd1962, %rd1961, %rd1958; | |
xor.b64 %rd1963, %rd1962, %rd2670; | |
mul.lo.s64 %rd1964, %rd1963, %rd2671; | |
shr.u64 %rd1965, %rd1964, 32; | |
shr.u64 %rd1966, %rd1960, 32; | |
and.b64 %rd1967, %rd2672, 4294967295; | |
xor.b64 %rd1968, %rd1967, %rd1966; | |
xor.b64 %rd1969, %rd1968, %rd2673; | |
mul.lo.s64 %rd1970, %rd1969, %rd2671; | |
and.b64 %rd1971, %rd1970, 4294967295; | |
xor.b64 %rd1972, %rd1971, %rd1965; | |
xor.b64 %rd1973, %rd1972, %rd2674; | |
mul.lo.s64 %rd1974, %rd1973, %rd2669; | |
shr.u64 %rd1975, %rd1974, 32; | |
shr.u64 %rd1976, %rd1970, 32; | |
and.b64 %rd1977, %rd2675, 4294967295; | |
xor.b64 %rd1978, %rd1977, %rd1976; | |
xor.b64 %rd1979, %rd1978, %rd2676; | |
mul.lo.s64 %rd1980, %rd1979, %rd2669; | |
and.b64 %rd1981, %rd1980, 4294967295; | |
xor.b64 %rd1982, %rd1981, %rd1975; | |
xor.b64 %rd1983, %rd1982, %rd2677; | |
mul.lo.s64 %rd1984, %rd1983, %rd2671; | |
shr.u64 %rd1985, %rd1984, 32; | |
shr.u64 %rd1986, %rd1980, 32; | |
and.b64 %rd1987, %rd2678, 4294967295; | |
xor.b64 %rd1988, %rd1987, %rd1986; | |
xor.b64 %rd1989, %rd1988, %rd2679; | |
mul.lo.s64 %rd1990, %rd1989, %rd2671; | |
and.b64 %rd1991, %rd1990, 4294967295; | |
xor.b64 %rd1992, %rd1991, %rd1985; | |
xor.b64 %rd1993, %rd1992, %rd2680; | |
mul.lo.s64 %rd1994, %rd1993, %rd2669; | |
shr.u64 %rd1995, %rd1994, 32; | |
shr.u64 %rd1996, %rd1990, 32; | |
and.b64 %rd1997, %rd1964, 4294967295; | |
xor.b64 %rd1998, %rd1997, %rd1996; | |
xor.b64 %rd1999, %rd1998, %rd2681; | |
mul.lo.s64 %rd2000, %rd1999, %rd2669; | |
and.b64 %rd2001, %rd2000, 4294967295; | |
xor.b64 %rd2002, %rd2001, %rd1995; | |
xor.b64 %rd2003, %rd2002, %rd2682; | |
mul.lo.s64 %rd2004, %rd2003, %rd2671; | |
shr.u64 %rd2005, %rd2004, 32; | |
shr.u64 %rd2006, %rd2000, 32; | |
and.b64 %rd2007, %rd1974, 4294967295; | |
xor.b64 %rd2008, %rd2007, %rd2006; | |
xor.b64 %rd2009, %rd2008, %rd2683; | |
mul.lo.s64 %rd2010, %rd2009, %rd2671; | |
and.b64 %rd2011, %rd2010, 4294967295; | |
xor.b64 %rd2012, %rd2011, %rd2005; | |
xor.b64 %rd2013, %rd2012, %rd2684; | |
mul.lo.s64 %rd2014, %rd2013, %rd2669; | |
shr.u64 %rd2015, %rd2014, 32; | |
shr.u64 %rd2016, %rd2010, 32; | |
and.b64 %rd2017, %rd1984, 4294967295; | |
xor.b64 %rd2018, %rd2017, %rd2016; | |
xor.b64 %rd2019, %rd2018, %rd2685; | |
mul.lo.s64 %rd2020, %rd2019, %rd2669; | |
and.b64 %rd2021, %rd2020, 4294967295; | |
xor.b64 %rd2022, %rd2021, %rd2015; | |
xor.b64 %rd2023, %rd2022, %rd2686; | |
mul.lo.s64 %rd2024, %rd2023, %rd2671; | |
shr.u64 %rd2025, %rd2024, 32; | |
cvt.u32.u64 %r222, %rd2025; | |
shr.u64 %rd2026, %rd2020, 32; | |
xor.b64 %rd2027, %rd2026, %rd1994; | |
cvt.u32.u64 %r223, %rd2027; | |
xor.b32 %r224, %r340, %r223; | |
mul.lo.s32 %r225, %r224, %r341; | |
xor.b32 %r226, %r225, %r222; | |
xor.b32 %r227, %r226, %r342; | |
shr.u32 %r228, %r227, 9; | |
cvt.rn.f32.u32 %f205, %r228; | |
mul.rn.f32 %f206, %f205, 0f34000000; | |
cvt.rn.f16.f32 %h109, %f206; | |
mov.b16 %h110, 0x2E66; | |
setp.ge.f16 %p52, %h109, %h110; | |
ld.global.nc.b16 %h111, [%rd45+1536]; | |
ld.global.nc.f32 %f207, [%rd46+3072]; | |
cvt.rn.f16.f32 %h112, %f207; | |
add.rn.f16 %h113, %h111, %h112; | |
mov.b16 %h114, 0x3C72; | |
mul.rn.f16 %h115, %h113, %h114; | |
selp.b16 %h116, %h115, 0x0000, %p52; | |
cvt.f32.f16 %f208, %h116; | |
ld.global.nc.b16 %h117, [%rd47+1536]; | |
cvt.f32.f16 %f209, %h117; | |
ld.global.nc.f32 %f210, [%rd48+3072]; | |
mul.rn.f32 %f211, %f1, %f210; | |
mul.rn.f32 %f212, %f211, %f209; | |
ld.global.nc.f32 %f213, [%rd49+3072]; | |
mul.rn.f32 %f214, %f2, %f211; | |
sub.rn.f32 %f215, %f213, %f214; | |
add.rn.f32 %f216, %f212, %f215; | |
add.rn.f32 %f217, %f216, %f208; | |
sub.rn.f32 %f218, %f217, %f3; | |
mul.rn.f32 %f219, %f218, %f218; | |
add.rn.f32 %f16, %f15, %f219; | |
or.b32 %r229, %r3, 769; | |
or.b32 %r230, %r229, %r4; | |
and.b32 %r231, %r229, 3; | |
shr.u32 %r232, %r230, 2; | |
setp.ne.s32 %p53, %r231, 1; | |
cvt.u64.u32 %rd2028, %r232; | |
add.s64 %rd379, %rd12, %rd2028; | |
@%p53 bra LBB30_43; | |
and.b64 %rd2068, %rd379, 4294967295; | |
mul.lo.s64 %rd2691, %rd2068, 3528531795; | |
setp.lt.u64 %p55, %rd379, %rd12; | |
selp.u64 %rd2069, 1, 0, %p55; | |
add.s64 %rd2070, %rd2464, %rd2069; | |
xor.b64 %rd2071, %rd2070, %rd2691; | |
shr.u64 %rd2072, %rd2071, 32; | |
mul.lo.s64 %rd2694, %rd2072, 3449720151; | |
shr.u64 %rd2073, %rd2694, 32; | |
and.b64 %rd2074, %rd2070, 4294967295; | |
mul.lo.s64 %rd2075, %rd2074, 3449720151; | |
and.b64 %rd2076, %rd2075, 4294967295; | |
xor.b64 %rd2077, %rd2076, %rd2073; | |
xor.b64 %rd2078, %rd2077, 2654435769; | |
mul.lo.s64 %rd2697, %rd2078, 3528531795; | |
xor.b64 %rd2687, %rd2075, %rd379; | |
mov.u32 %r344, -845247145; | |
mov.u32 %r343, -616729560; | |
mov.u64 %rd2704, 3041712726; | |
mov.u64 %rd2703, 1401181199; | |
mov.u64 %rd2702, 2835769497; | |
mov.u64 %rd2701, 1684936478; | |
mov.u64 %rd2700, 2027808484; | |
mov.u64 %rd2699, 387276957; | |
mov.u64 %rd2698, 842468239; | |
mov.u64 %rd2696, 3986602516; | |
mov.u64 %rd2695, 1013904242; | |
mov.u64 %rd2693, 3668340011; | |
mov.u64 %rd2692, 3144134277; | |
mov.u64 %rd2690, 3449720151; | |
mov.u64 %rd2689, 1993301258; | |
mov.u64 %rd2688, 3528531795; | |
bra.uni LBB30_44; | |
LBB30_43: | |
setp.lt.u64 %p54, %rd379, %rd12; | |
selp.u64 %rd2043, 1, 0, %p54; | |
add.s64 %rd2044, %rd2464, %rd2043; | |
and.b64 %rd2045, %rd2044, 4294967295; | |
mul.lo.s64 %rd2691, %rd2045, 3449720151; | |
xor.b64 %rd2046, %rd2691, %rd379; | |
shr.u64 %rd2047, %rd2046, 32; | |
mul.lo.s64 %rd2694, %rd2047, 3528531795; | |
shr.u64 %rd2048, %rd2694, 32; | |
and.b64 %rd2049, %rd379, 4294967295; | |
mul.lo.s64 %rd2050, %rd2049, 3528531795; | |
and.b64 %rd2051, %rd2050, 4294967295; | |
xor.b64 %rd2052, %rd2051, %rd2048; | |
xor.b64 %rd2053, %rd2052, 3144134277; | |
mul.lo.s64 %rd2697, %rd2053, 3449720151; | |
xor.b64 %rd2687, %rd2044, %rd2050; | |
mov.u32 %r344, -766435501; | |
mov.u32 %r343, -239350328; | |
mov.u64 %rd2704, 1684936478; | |
mov.u64 %rd2703, 534103459; | |
mov.u64 %rd2702, 387276957; | |
mov.u64 %rd2701, 3041712726; | |
mov.u64 %rd2700, 3986602516; | |
mov.u64 %rd2699, 2835769497; | |
mov.u64 %rd2698, 3668340011; | |
mov.u64 %rd2696, 2027808484; | |
mov.u64 %rd2695, 1993301258; | |
mov.u64 %rd2693, 842468239; | |
mov.u64 %rd2692, 2654435769; | |
mov.u64 %rd2690, 3528531795; | |
mov.u64 %rd2689, 1013904242; | |
mov.u64 %rd2688, 3449720151; | |
LBB30_44: | |
shr.u64 %rd2079, %rd2697, 32; | |
shr.u64 %rd2080, %rd2687, 32; | |
mul.lo.s64 %rd2081, %rd2080, %rd2688; | |
and.b64 %rd2082, %rd2081, 4294967295; | |
xor.b64 %rd2083, %rd2082, %rd2079; | |
xor.b64 %rd2084, %rd2083, %rd2689; | |
mul.lo.s64 %rd2085, %rd2084, %rd2690; | |
shr.u64 %rd2086, %rd2085, 32; | |
shr.u64 %rd2087, %rd2081, 32; | |
and.b64 %rd2088, %rd2691, 4294967295; | |
xor.b64 %rd2089, %rd2088, %rd2087; | |
xor.b64 %rd2090, %rd2089, %rd2692; | |
mul.lo.s64 %rd2091, %rd2090, %rd2690; | |
and.b64 %rd2092, %rd2091, 4294967295; | |
xor.b64 %rd2093, %rd2092, %rd2086; | |
xor.b64 %rd2094, %rd2093, %rd2693; | |
mul.lo.s64 %rd2095, %rd2094, %rd2688; | |
shr.u64 %rd2096, %rd2095, 32; | |
shr.u64 %rd2097, %rd2091, 32; | |
and.b64 %rd2098, %rd2694, 4294967295; | |
xor.b64 %rd2099, %rd2098, %rd2097; | |
xor.b64 %rd2100, %rd2099, %rd2695; | |
mul.lo.s64 %rd2101, %rd2100, %rd2688; | |
and.b64 %rd2102, %rd2101, 4294967295; | |
xor.b64 %rd2103, %rd2102, %rd2096; | |
xor.b64 %rd2104, %rd2103, %rd2696; | |
mul.lo.s64 %rd2105, %rd2104, %rd2690; | |
shr.u64 %rd2106, %rd2105, 32; | |
shr.u64 %rd2107, %rd2101, 32; | |
and.b64 %rd2108, %rd2697, 4294967295; | |
xor.b64 %rd2109, %rd2108, %rd2107; | |
xor.b64 %rd2110, %rd2109, %rd2698; | |
mul.lo.s64 %rd2111, %rd2110, %rd2690; | |
and.b64 %rd2112, %rd2111, 4294967295; | |
xor.b64 %rd2113, %rd2112, %rd2106; | |
xor.b64 %rd2114, %rd2113, %rd2699; | |
mul.lo.s64 %rd2115, %rd2114, %rd2688; | |
shr.u64 %rd2116, %rd2115, 32; | |
shr.u64 %rd2117, %rd2111, 32; | |
and.b64 %rd2118, %rd2085, 4294967295; | |
xor.b64 %rd2119, %rd2118, %rd2117; | |
xor.b64 %rd2120, %rd2119, %rd2700; | |
mul.lo.s64 %rd2121, %rd2120, %rd2688; | |
and.b64 %rd2122, %rd2121, 4294967295; | |
xor.b64 %rd2123, %rd2122, %rd2116; | |
xor.b64 %rd2124, %rd2123, %rd2701; | |
mul.lo.s64 %rd2125, %rd2124, %rd2690; | |
shr.u64 %rd2126, %rd2125, 32; | |
shr.u64 %rd2127, %rd2121, 32; | |
and.b64 %rd2128, %rd2095, 4294967295; | |
xor.b64 %rd2129, %rd2128, %rd2127; | |
xor.b64 %rd2130, %rd2129, %rd2702; | |
mul.lo.s64 %rd2131, %rd2130, %rd2690; | |
and.b64 %rd2132, %rd2131, 4294967295; | |
xor.b64 %rd2133, %rd2132, %rd2126; | |
xor.b64 %rd2134, %rd2133, %rd2703; | |
mul.lo.s64 %rd2135, %rd2134, %rd2688; | |
shr.u64 %rd2136, %rd2135, 32; | |
shr.u64 %rd2137, %rd2131, 32; | |
xor.b64 %rd2138, %rd2105, %rd2137; | |
xor.b64 %rd2139, %rd2138, %rd2704; | |
mul.lo.s64 %rd2140, %rd2139, %rd2688; | |
xor.b64 %rd2141, %rd2136, %rd2140; | |
cvt.u32.u64 %r237, %rd2141; | |
xor.b32 %r238, %r343, %r237; | |
mul.lo.s32 %r239, %r238, %r344; | |
shr.u32 %r240, %r239, 9; | |
cvt.rn.f32.u32 %f220, %r240; | |
mul.rn.f32 %f221, %f220, 0f34000000; | |
cvt.rn.f16.f32 %h118, %f221; | |
mov.b16 %h119, 0x2E66; | |
setp.ge.f16 %p57, %h118, %h119; | |
ld.global.nc.b16 %h120, [%rd45+1538]; | |
ld.global.nc.f32 %f222, [%rd46+3076]; | |
cvt.rn.f16.f32 %h121, %f222; | |
add.rn.f16 %h122, %h120, %h121; | |
mov.b16 %h123, 0x3C72; | |
mul.rn.f16 %h124, %h122, %h123; | |
selp.b16 %h125, %h124, 0x0000, %p57; | |
cvt.f32.f16 %f223, %h125; | |
ld.global.nc.b16 %h126, [%rd47+1538]; | |
cvt.f32.f16 %f224, %h126; | |
ld.global.nc.f32 %f225, [%rd48+3076]; | |
mul.rn.f32 %f226, %f1, %f225; | |
mul.rn.f32 %f227, %f226, %f224; | |
ld.global.nc.f32 %f228, [%rd49+3076]; | |
mul.rn.f32 %f229, %f2, %f226; | |
sub.rn.f32 %f230, %f228, %f229; | |
add.rn.f32 %f231, %f227, %f230; | |
add.rn.f32 %f232, %f231, %f223; | |
sub.rn.f32 %f233, %f232, %f3; | |
mul.rn.f32 %f234, %f233, %f233; | |
add.rn.f32 %f17, %f16, %f234; | |
or.b32 %r242, %r73, 896; | |
shr.u32 %r243, %r242, 2; | |
cvt.u64.u32 %rd2142, %r243; | |
add.s64 %rd406, %rd12, %rd2142; | |
@%p8 bra LBB30_46; | |
mov.u32 %r347, -1879881855; | |
mov.u32 %r345, 534103459; | |
mov.u64 %rd2723, 3678237736; | |
and.b64 %rd2184, %rd406, 4294967295; | |
mul.lo.s64 %rd2709, %rd2184, 3528531795; | |
setp.lt.u64 %p59, %rd406, %rd12; | |
selp.u64 %rd2185, 1, 0, %p59; | |
add.s64 %rd2186, %rd2464, %rd2185; | |
xor.b64 %rd2187, %rd2186, %rd2709; | |
shr.u64 %rd2188, %rd2187, 32; | |
mul.lo.s64 %rd2712, %rd2188, 3449720151; | |
shr.u64 %rd2189, %rd2712, 32; | |
and.b64 %rd2190, %rd2186, 4294967295; | |
mul.lo.s64 %rd2191, %rd2190, 3449720151; | |
and.b64 %rd2192, %rd2191, 4294967295; | |
xor.b64 %rd2193, %rd2192, %rd2189; | |
xor.b64 %rd2194, %rd2193, 2654435769; | |
mul.lo.s64 %rd2715, %rd2194, 3528531795; | |
xor.b64 %rd2705, %rd2191, %rd406; | |
mov.u32 %r346, -845247145; | |
mov.u64 %rd2722, 3041712726; | |
mov.u64 %rd2721, 1401181199; | |
mov.u64 %rd2720, 2835769497; | |
mov.u64 %rd2719, 1684936478; | |
mov.u64 %rd2718, 2027808484; | |
mov.u64 %rd2717, 387276957; | |
mov.u64 %rd2716, 842468239; | |
mov.u64 %rd2714, 3986602516; | |
mov.u64 %rd2713, 1013904242; | |
mov.u64 %rd2711, 3668340011; | |
mov.u64 %rd2710, 3144134277; | |
mov.u64 %rd2708, 3449720151; | |
mov.u64 %rd2707, 1993301258; | |
mov.u64 %rd2706, 3528531795; | |
bra.uni LBB30_47; | |
LBB30_46: | |
setp.lt.u64 %p58, %rd406, %rd12; | |
selp.u64 %rd2158, 1, 0, %p58; | |
add.s64 %rd2159, %rd2464, %rd2158; | |
and.b64 %rd2160, %rd2159, 4294967295; | |
mul.lo.s64 %rd2709, %rd2160, 3449720151; | |
xor.b64 %rd2161, %rd2709, %rd406; | |
shr.u64 %rd2162, %rd2161, 32; | |
mul.lo.s64 %rd2712, %rd2162, 3528531795; | |
shr.u64 %rd2163, %rd2712, 32; | |
and.b64 %rd2164, %rd406, 4294967295; | |
mul.lo.s64 %rd2165, %rd2164, 3528531795; | |
and.b64 %rd2166, %rd2165, 4294967295; | |
xor.b64 %rd2167, %rd2166, %rd2163; | |
xor.b64 %rd2168, %rd2167, 3144134277; | |
mul.lo.s64 %rd2715, %rd2168, 3449720151; | |
xor.b64 %rd2705, %rd2159, %rd2165; | |
mov.u32 %r347, -1767562579; | |
mov.u32 %r346, -766435501; | |
mov.u32 %r345, 1401181199; | |
mov.u64 %rd2723, 4055616968; | |
mov.u64 %rd2722, 1684936478; | |
mov.u64 %rd2721, 534103459; | |
mov.u64 %rd2720, 387276957; | |
mov.u64 %rd2719, 3041712726; | |
mov.u64 %rd2718, 3986602516; | |
mov.u64 %rd2717, 2835769497; | |
mov.u64 %rd2716, 3668340011; | |
mov.u64 %rd2714, 2027808484; | |
mov.u64 %rd2713, 1993301258; | |
mov.u64 %rd2711, 842468239; | |
mov.u64 %rd2710, 2654435769; | |
mov.u64 %rd2708, 3528531795; | |
mov.u64 %rd2707, 1013904242; | |
mov.u64 %rd2706, 3449720151; | |
LBB30_47: | |
shr.u64 %rd2195, %rd2715, 32; | |
shr.u64 %rd2196, %rd2705, 32; | |
mul.lo.s64 %rd2197, %rd2196, %rd2706; | |
and.b64 %rd2198, %rd2197, 4294967295; | |
xor.b64 %rd2199, %rd2198, %rd2195; | |
xor.b64 %rd2200, %rd2199, %rd2707; | |
mul.lo.s64 %rd2201, %rd2200, %rd2708; | |
shr.u64 %rd2202, %rd2201, 32; | |
shr.u64 %rd2203, %rd2197, 32; | |
and.b64 %rd2204, %rd2709, 4294967295; | |
xor.b64 %rd2205, %rd2204, %rd2203; | |
xor.b64 %rd2206, %rd2205, %rd2710; | |
mul.lo.s64 %rd2207, %rd2206, %rd2708; | |
and.b64 %rd2208, %rd2207, 4294967295; | |
xor.b64 %rd2209, %rd2208, %rd2202; | |
xor.b64 %rd2210, %rd2209, %rd2711; | |
mul.lo.s64 %rd2211, %rd2210, %rd2706; | |
shr.u64 %rd2212, %rd2211, 32; | |
shr.u64 %rd2213, %rd2207, 32; | |
and.b64 %rd2214, %rd2712, 4294967295; | |
xor.b64 %rd2215, %rd2214, %rd2213; | |
xor.b64 %rd2216, %rd2215, %rd2713; | |
mul.lo.s64 %rd2217, %rd2216, %rd2706; | |
and.b64 %rd2218, %rd2217, 4294967295; | |
xor.b64 %rd2219, %rd2218, %rd2212; | |
xor.b64 %rd2220, %rd2219, %rd2714; | |
mul.lo.s64 %rd2221, %rd2220, %rd2708; | |
shr.u64 %rd2222, %rd2221, 32; | |
shr.u64 %rd2223, %rd2217, 32; | |
and.b64 %rd2224, %rd2715, 4294967295; | |
xor.b64 %rd2225, %rd2224, %rd2223; | |
xor.b64 %rd2226, %rd2225, %rd2716; | |
mul.lo.s64 %rd2227, %rd2226, %rd2708; | |
and.b64 %rd2228, %rd2227, 4294967295; | |
xor.b64 %rd2229, %rd2228, %rd2222; | |
xor.b64 %rd2230, %rd2229, %rd2717; | |
mul.lo.s64 %rd2231, %rd2230, %rd2706; | |
shr.u64 %rd2232, %rd2231, 32; | |
shr.u64 %rd2233, %rd2227, 32; | |
and.b64 %rd2234, %rd2201, 4294967295; | |
xor.b64 %rd2235, %rd2234, %rd2233; | |
xor.b64 %rd2236, %rd2235, %rd2718; | |
mul.lo.s64 %rd2237, %rd2236, %rd2706; | |
and.b64 %rd2238, %rd2237, 4294967295; | |
xor.b64 %rd2239, %rd2238, %rd2232; | |
xor.b64 %rd2240, %rd2239, %rd2719; | |
mul.lo.s64 %rd2241, %rd2240, %rd2708; | |
shr.u64 %rd2242, %rd2241, 32; | |
shr.u64 %rd2243, %rd2237, 32; | |
and.b64 %rd2244, %rd2211, 4294967295; | |
xor.b64 %rd2245, %rd2244, %rd2243; | |
xor.b64 %rd2246, %rd2245, %rd2720; | |
mul.lo.s64 %rd2247, %rd2246, %rd2708; | |
and.b64 %rd2248, %rd2247, 4294967295; | |
xor.b64 %rd2249, %rd2248, %rd2242; | |
xor.b64 %rd2250, %rd2249, %rd2721; | |
mul.lo.s64 %rd2251, %rd2250, %rd2706; | |
shr.u64 %rd2252, %rd2251, 32; | |
shr.u64 %rd2253, %rd2247, 32; | |
and.b64 %rd2254, %rd2221, 4294967295; | |
xor.b64 %rd2255, %rd2254, %rd2253; | |
xor.b64 %rd2256, %rd2255, %rd2722; | |
mul.lo.s64 %rd2257, %rd2256, %rd2706; | |
and.b64 %rd2258, %rd2257, 4294967295; | |
xor.b64 %rd2259, %rd2258, %rd2252; | |
xor.b64 %rd2260, %rd2259, %rd2723; | |
mul.lo.s64 %rd2261, %rd2260, %rd2708; | |
shr.u64 %rd2262, %rd2261, 32; | |
cvt.u32.u64 %r250, %rd2262; | |
shr.u64 %rd2263, %rd2257, 32; | |
xor.b64 %rd2264, %rd2263, %rd2231; | |
cvt.u32.u64 %r251, %rd2264; | |
xor.b32 %r252, %r345, %r251; | |
mul.lo.s32 %r253, %r252, %r346; | |
xor.b32 %r254, %r253, %r250; | |
xor.b32 %r255, %r254, %r347; | |
shr.u32 %r256, %r255, 9; | |
cvt.rn.f32.u32 %f235, %r256; | |
mul.rn.f32 %f236, %f235, 0f34000000; | |
cvt.rn.f16.f32 %h127, %f236; | |
mov.b16 %h128, 0x2E66; | |
setp.ge.f16 %p60, %h127, %h128; | |
ld.global.nc.b16 %h129, [%rd45+1792]; | |
ld.global.nc.f32 %f237, [%rd46+3584]; | |
cvt.rn.f16.f32 %h130, %f237; | |
add.rn.f16 %h131, %h129, %h130; | |
mov.b16 %h132, 0x3C72; | |
mul.rn.f16 %h133, %h131, %h132; | |
selp.b16 %h134, %h133, 0x0000, %p60; | |
cvt.f32.f16 %f238, %h134; | |
ld.global.nc.b16 %h135, [%rd47+1792]; | |
cvt.f32.f16 %f239, %h135; | |
ld.global.nc.f32 %f240, [%rd48+3584]; | |
mul.rn.f32 %f241, %f1, %f240; | |
mul.rn.f32 %f242, %f241, %f239; | |
ld.global.nc.f32 %f243, [%rd49+3584]; | |
mul.rn.f32 %f244, %f2, %f241; | |
sub.rn.f32 %f245, %f243, %f244; | |
add.rn.f32 %f246, %f242, %f245; | |
add.rn.f32 %f247, %f246, %f238; | |
sub.rn.f32 %f248, %f247, %f3; | |
mul.rn.f32 %f249, %f248, %f248; | |
add.rn.f32 %f18, %f17, %f249; | |
or.b32 %r257, %r3, 897; | |
or.b32 %r258, %r257, %r4; | |
and.b32 %r259, %r257, 3; | |
shr.u32 %r260, %r258, 2; | |
setp.ne.s32 %p61, %r259, 1; | |
cvt.u64.u32 %rd2265, %r260; | |
add.s64 %rd434, %rd12, %rd2265; | |
@%p61 bra LBB30_49; | |
mov.u32 %r349, -845247145; | |
mov.u64 %rd2740, 1401181199; | |
mov.u64 %rd2729, 3144134277; | |
mov.u32 %r348, -616729560; | |
and.b64 %rd2305, %rd434, 4294967295; | |
mul.lo.s64 %rd2728, %rd2305, 3528531795; | |
setp.lt.u64 %p63, %rd434, %rd12; | |
selp.u64 %rd2306, 1, 0, %p63; | |
add.s64 %rd2307, %rd2464, %rd2306; | |
xor.b64 %rd2308, %rd2307, %rd2728; | |
shr.u64 %rd2309, %rd2308, 32; | |
mul.lo.s64 %rd2731, %rd2309, 3449720151; | |
shr.u64 %rd2310, %rd2731, 32; | |
and.b64 %rd2311, %rd2307, 4294967295; | |
mul.lo.s64 %rd2312, %rd2311, 3449720151; | |
and.b64 %rd2313, %rd2312, 4294967295; | |
xor.b64 %rd2314, %rd2313, %rd2310; | |
xor.b64 %rd2315, %rd2314, 2654435769; | |
mul.lo.s64 %rd2734, %rd2315, 3528531795; | |
xor.b64 %rd2724, %rd2312, %rd434; | |
mov.u64 %rd2741, 3041712726; | |
mov.u64 %rd2739, 2835769497; | |
mov.u64 %rd2738, 1684936478; | |
mov.u64 %rd2737, 2027808484; | |
mov.u64 %rd2736, 387276957; | |
mov.u64 %rd2735, 842468239; | |
mov.u64 %rd2733, 3986602516; | |
mov.u64 %rd2732, 1013904242; | |
mov.u64 %rd2730, 3668340011; | |
mov.u64 %rd2727, 3449720151; | |
mov.u64 %rd2726, 1993301258; | |
mov.u64 %rd2725, 3528531795; | |
bra.uni LBB30_50; | |
LBB30_49: | |
setp.lt.u64 %p62, %rd434, %rd12; | |
selp.u64 %rd2280, 1, 0, %p62; | |
add.s64 %rd2281, %rd2464, %rd2280; | |
and.b64 %rd2282, %rd2281, 4294967295; | |
mul.lo.s64 %rd2728, %rd2282, 3449720151; | |
xor.b64 %rd2283, %rd2728, %rd434; | |
shr.u64 %rd2284, %rd2283, 32; | |
mul.lo.s64 %rd2731, %rd2284, 3528531795; | |
shr.u64 %rd2285, %rd2731, 32; | |
and.b64 %rd2286, %rd434, 4294967295; | |
mul.lo.s64 %rd2287, %rd2286, 3528531795; | |
and.b64 %rd2288, %rd2287, 4294967295; | |
xor.b64 %rd2289, %rd2288, %rd2285; | |
xor.b64 %rd2290, %rd2289, 3144134277; | |
mul.lo.s64 %rd2734, %rd2290, 3449720151; | |
xor.b64 %rd2724, %rd2281, %rd2287; | |
mov.u32 %r349, -766435501; | |
mov.u32 %r348, -239350328; | |
mov.u64 %rd2741, 1684936478; | |
mov.u64 %rd2740, 534103459; | |
mov.u64 %rd2739, 387276957; | |
mov.u64 %rd2738, 3041712726; | |
mov.u64 %rd2737, 3986602516; | |
mov.u64 %rd2736, 2835769497; | |
mov.u64 %rd2735, 3668340011; | |
mov.u64 %rd2733, 2027808484; | |
mov.u64 %rd2732, 1993301258; | |
mov.u64 %rd2730, 842468239; | |
mov.u64 %rd2729, 2654435769; | |
mov.u64 %rd2727, 3528531795; | |
mov.u64 %rd2726, 1013904242; | |
mov.u64 %rd2725, 3449720151; | |
LBB30_50: | |
shr.u64 %rd2316, %rd2734, 32; | |
shr.u64 %rd2317, %rd2724, 32; | |
mul.lo.s64 %rd2318, %rd2317, %rd2725; | |
and.b64 %rd2319, %rd2318, 4294967295; | |
xor.b64 %rd2320, %rd2319, %rd2316; | |
xor.b64 %rd2321, %rd2320, %rd2726; | |
mul.lo.s64 %rd2322, %rd2321, %rd2727; | |
shr.u64 %rd2323, %rd2322, 32; | |
shr.u64 %rd2324, %rd2318, 32; | |
and.b64 %rd2325, %rd2728, 4294967295; | |
xor.b64 %rd2326, %rd2325, %rd2324; | |
xor.b64 %rd2327, %rd2326, %rd2729; | |
mul.lo.s64 %rd2328, %rd2327, %rd2727; | |
and.b64 %rd2329, %rd2328, 4294967295; | |
xor.b64 %rd2330, %rd2329, %rd2323; | |
xor.b64 %rd2331, %rd2330, %rd2730; | |
mul.lo.s64 %rd2332, %rd2331, %rd2725; | |
shr.u64 %rd2333, %rd2332, 32; | |
shr.u64 %rd2334, %rd2328, 32; | |
and.b64 %rd2335, %rd2731, 4294967295; | |
xor.b64 %rd2336, %rd2335, %rd2334; | |
xor.b64 %rd2337, %rd2336, %rd2732; | |
mul.lo.s64 %rd2338, %rd2337, %rd2725; | |
and.b64 %rd2339, %rd2338, 4294967295; | |
xor.b64 %rd2340, %rd2339, %rd2333; | |
xor.b64 %rd2341, %rd2340, %rd2733; | |
mul.lo.s64 %rd2342, %rd2341, %rd2727; | |
shr.u64 %rd2343, %rd2342, 32; | |
shr.u64 %rd2344, %rd2338, 32; | |
and.b64 %rd2345, %rd2734, 4294967295; | |
xor.b64 %rd2346, %rd2345, %rd2344; | |
xor.b64 %rd2347, %rd2346, %rd2735; | |
mul.lo.s64 %rd2348, %rd2347, %rd2727; | |
and.b64 %rd2349, %rd2348, 4294967295; | |
xor.b64 %rd2350, %rd2349, %rd2343; | |
xor.b64 %rd2351, %rd2350, %rd2736; | |
mul.lo.s64 %rd2352, %rd2351, %rd2725; | |
shr.u64 %rd2353, %rd2352, 32; | |
shr.u64 %rd2354, %rd2348, 32; | |
and.b64 %rd2355, %rd2322, 4294967295; | |
xor.b64 %rd2356, %rd2355, %rd2354; | |
xor.b64 %rd2357, %rd2356, %rd2737; | |
mul.lo.s64 %rd2358, %rd2357, %rd2725; | |
and.b64 %rd2359, %rd2358, 4294967295; | |
xor.b64 %rd2360, %rd2359, %rd2353; | |
xor.b64 %rd2361, %rd2360, %rd2738; | |
mul.lo.s64 %rd2362, %rd2361, %rd2727; | |
shr.u64 %rd2363, %rd2362, 32; | |
shr.u64 %rd2364, %rd2358, 32; | |
and.b64 %rd2365, %rd2332, 4294967295; | |
xor.b64 %rd2366, %rd2365, %rd2364; | |
xor.b64 %rd2367, %rd2366, %rd2739; | |
mul.lo.s64 %rd2368, %rd2367, %rd2727; | |
and.b64 %rd2369, %rd2368, 4294967295; | |
xor.b64 %rd2370, %rd2369, %rd2363; | |
xor.b64 %rd2371, %rd2370, %rd2740; | |
mul.lo.s64 %rd2372, %rd2371, %rd2725; | |
shr.u64 %rd2373, %rd2372, 32; | |
shr.u64 %rd2374, %rd2368, 32; | |
xor.b64 %rd2375, %rd2342, %rd2374; | |
xor.b64 %rd2376, %rd2375, %rd2741; | |
mul.lo.s64 %rd2377, %rd2376, %rd2725; | |
xor.b64 %rd2378, %rd2373, %rd2377; | |
cvt.u32.u64 %r265, %rd2378; | |
xor.b32 %r266, %r348, %r265; | |
mul.lo.s32 %r267, %r266, %r349; | |
shr.u32 %r268, %r267, 9; | |
cvt.rn.f32.u32 %f250, %r268; | |
mul.rn.f32 %f251, %f250, 0f34000000; | |
cvt.rn.f16.f32 %h136, %f251; | |
mov.b16 %h137, 0x2E66; | |
setp.ge.f16 %p64, %h136, %h137; | |
ld.global.nc.b16 %h138, [%rd45+1794]; | |
ld.global.nc.f32 %f252, [%rd46+3588]; | |
cvt.rn.f16.f32 %h139, %f252; | |
add.rn.f16 %h140, %h138, %h139; | |
mov.b16 %h141, 0x3C72; | |
mul.rn.f16 %h142, %h140, %h141; | |
selp.b16 %h143, %h142, 0x0000, %p64; | |
cvt.f32.f16 %f253, %h143; | |
ld.global.nc.b16 %h144, [%rd47+1794]; | |
cvt.f32.f16 %f254, %h144; | |
ld.global.nc.f32 %f255, [%rd48+3588]; | |
mul.rn.f32 %f256, %f1, %f255; | |
mul.rn.f32 %f257, %f256, %f254; | |
ld.global.nc.f32 %f258, [%rd49+3588]; | |
mul.rn.f32 %f259, %f2, %f256; | |
sub.rn.f32 %f260, %f258, %f259; | |
add.rn.f32 %f261, %f257, %f260; | |
add.rn.f32 %f262, %f261, %f253; | |
sub.rn.f32 %f263, %f262, %f3; | |
mul.rn.f32 %f264, %f263, %f263; | |
add.rn.f32 %f265, %f18, %f264; | |
and.b32 %r46, %r1, 31; | |
shfl.sync.down.b32 %f266, %f265, 16, 31, -1; | |
add.rn.f32 %f267, %f266, %f265; | |
shfl.sync.down.b32 %f268, %f267, 8, 31, -1; | |
add.rn.f32 %f269, %f268, %f267; | |
shfl.sync.down.b32 %f270, %f269, 4, 31, -1; | |
add.rn.f32 %f271, %f270, %f269; | |
shfl.sync.down.b32 %f272, %f271, 2, 31, -1; | |
add.rn.f32 %f273, %f272, %f271; | |
shfl.sync.down.b32 %f274, %f273, 1, 31, -1; | |
shr.u32 %r47, %r1, 5; | |
setp.ne.s32 %p65, %r46, 0; | |
mov.u64 %rd2381, shared_cache_07; | |
@%p65 bra LBB30_2; | |
mul.wide.u32 %rd2380, %r47, 4; | |
add.s64 %rd462, %rd2381, %rd2380; | |
add.rn.f32 %f19, %f274, %f273; | |
st.shared.f32 [%rd462], %f19; | |
LBB30_2: | |
bar.sync 0; | |
setp.eq.s32 %p66, %r47, 0; | |
@%p66 bra LBB30_52; | |
bra.uni LBB30_3; | |
LBB30_52: | |
add.u64 %rd474, %SP, 0; | |
add.u64 %rd11, %SPL, 0; | |
mul.wide.u32 %rd2382, %r46, 4; | |
add.s64 %rd463, %rd2381, %rd2382; | |
cvta.shared.u64 %rd2384, %rd463; | |
mov.u32 %r269, 0; | |
st.local.u32 [%rd11], %r269; | |
setp.lt.u32 %p67, %r1, 2; | |
selp.b64 %rd2386, %rd2384, %rd474, %p67; | |
ld.f32 %f275, [%rd2386]; | |
shfl.sync.down.b32 %f276, %f275, 16, 31, -1; | |
add.rn.f32 %f277, %f275, %f276; | |
shfl.sync.down.b32 %f278, %f277, 8, 31, -1; | |
add.rn.f32 %f279, %f277, %f278; | |
shfl.sync.down.b32 %f280, %f279, 4, 31, -1; | |
add.rn.f32 %f281, %f279, %f280; | |
shfl.sync.down.b32 %f282, %f281, 2, 31, -1; | |
add.rn.f32 %f283, %f281, %f282; | |
shfl.sync.down.b32 %f284, %f283, 1, 31, -1; | |
add.rn.f32 %f285, %f283, %f284; | |
st.f32 [%rd2386], %f285; | |
setp.ne.s32 %p68, %r1, 0; | |
@%p68 bra LBB30_3; | |
ld.param.u64 %rd470, [fusion_2246_param_3]; | |
cvt.u64.u32 %rd44, %r2; | |
cvta.to.global.u64 %rd7, %rd470; | |
shl.b64 %rd2379, %rd44, 2; | |
add.s64 %rd461, %rd7, %rd2379; | |
ld.shared.f32 %f286, [%rd463]; | |
atom.global.add.f32 %f287, [%rd461], %f286; | |
LBB30_3: | |
ret; | |
} | |
// .globl fusion_2243 | |
// fusion_2243: straight-line elementwise kernel, 256 threads per block
// (.reqntid 256); each thread produces 4 consecutive f16 outputs.
//   idx = (ctaid.x << 10) | (tid.x << 2)   element index of lane 0
//   c   = tid.x << 2                       index into the param_8..param_12 vectors
//   row = ctaid.x                          index into the param_4..param_7 scalars
// Parameter usage visible in this body:
//   param_0  : f16 output, one v4.b16 store at element idx
//   param_1  : f16 input, one v4.b16 load at element idx
//   param_2  : f16 input, one v4.b16 load at element idx
//   param_3  : two u64 words that seed the random-bit chain below
//   param_4, param_5 : f32 scalars at [row], used by the second scale/shift
//   param_6, param_7 : f32 scalars at [row], used by the first scale/shift
//   param_8, param_12: f32 vectors at [c+lane], second scale / offset
//   param_10, param_9: f32 vectors at [c+lane], first scale / offset
//   param_11 : f32 vector at [c+lane], converted to f16 and added to param_1
//   param_13 : declared but never loaded in this body
// Per lane (f32 unless noted), with eps = 0f2B8CBCCC (~1e-12) and
// 0f3A800000 = 2^-10:
//   d   = keep ? f32( f16(p1 + f16(p11[c])) * 0x3C72 ) : 0
//   s1  = rsqrt(p6[row] * 2^-10 + eps) * p10[c]
//   y   = s1 * p2 + (p9[c] - s1 * p7[row] * 2^-10) + d
//   s2  = rsqrt(p4[row] * 2^-10 + eps) * p8[c]
//   out = f16( (p12[c] - s2 * p5[row] * 2^-10) + s2 * y )
// NOTE(review): the shape is consistent with bias-add + dropout + residual +
// two normalisations over 1024-wide rows; confirm against the producing HLO.
.visible .entry fusion_2243( | |
.param .u64 fusion_2243_param_0, | |
.param .u64 fusion_2243_param_1, | |
.param .u64 fusion_2243_param_2, | |
.param .u64 fusion_2243_param_3, | |
.param .u64 fusion_2243_param_4, | |
.param .u64 fusion_2243_param_5, | |
.param .u64 fusion_2243_param_6, | |
.param .u64 fusion_2243_param_7, | |
.param .u64 fusion_2243_param_8, | |
.param .u64 fusion_2243_param_9, | |
.param .u64 fusion_2243_param_10, | |
.param .u64 fusion_2243_param_11, | |
.param .u64 fusion_2243_param_12, | |
.param .u64 fusion_2243_param_13 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<6>; | |
.reg .b16 %h<39>; | |
.reg .b32 %hh<5>; | |
.reg .f32 %f<97>; | |
.reg .b32 %r<31>; | |
.reg .b64 %rd<162>; | |
// Load the pointer parameters and convert them to global-space addresses:
//   %rd3=param_12  %rd6=param_11  %rd9=param_10  %rd12=param_9  %rd15=param_8
//   %rd18=param_7  %rd20=param_6  %rd21=param_5  %rd22=param_4  %rd23=param_3
//   %rd24=param_2  %rd25=param_1  %rd26=param_0
ld.param.u64 %rd1, [fusion_2243_param_0]; | |
ld.param.u64 %rd2, [fusion_2243_param_12]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2243_param_1]; | |
ld.param.u64 %rd5, [fusion_2243_param_11]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2243_param_2]; | |
ld.param.u64 %rd8, [fusion_2243_param_10]; | |
cvta.to.global.u64 %rd9, %rd8; | |
ld.param.u64 %rd10, [fusion_2243_param_3]; | |
ld.param.u64 %rd11, [fusion_2243_param_9]; | |
cvta.to.global.u64 %rd12, %rd11; | |
ld.param.u64 %rd13, [fusion_2243_param_4]; | |
ld.param.u64 %rd14, [fusion_2243_param_8]; | |
cvta.to.global.u64 %rd15, %rd14; | |
ld.param.u64 %rd16, [fusion_2243_param_5]; | |
ld.param.u64 %rd17, [fusion_2243_param_7]; | |
cvta.to.global.u64 %rd18, %rd17; | |
ld.param.u64 %rd19, [fusion_2243_param_6]; | |
cvta.to.global.u64 %rd20, %rd19; | |
cvta.to.global.u64 %rd21, %rd16; | |
cvta.to.global.u64 %rd22, %rd13; | |
cvta.to.global.u64 %rd23, %rd10; | |
cvta.to.global.u64 %rd24, %rd7; | |
cvta.to.global.u64 %rd25, %rd4; | |
cvta.to.global.u64 %rd26, %rd1; | |
// Indexing: %r4 = c (tid*4), %r5 = idx, %r6/%r7 = c+1 / c+2 (low bits of c are
// zero, so `or` acts as add), %r8 = idx >> 2 = ctaid.x*256 + tid.x.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
shr.u32 %r8, %r5, 2; | |
// Read the two u64 words from param_3 and add the per-thread index to the
// first word; %p1 detects unsigned wrap-around and carries 1 into the second.
ld.global.nc.v2.u64 {%rd27, %rd28}, [%rd23]; | |
cvt.u64.u32 %rd29, %r8; | |
add.s64 %rd30, %rd27, %rd29; | |
setp.lt.u64 %p1, %rd30, %rd27; | |
and.b64 %rd31, %rd30, 4294967295; | |
mul.lo.s64 %rd32, %rd31, 3528531795; | |
selp.u64 %rd33, 1, 0, %p1; | |
add.s64 %rd34, %rd28, %rd33; | |
// Bit-mixing chain: 32-bit halves are multiplied by 3528531795 (0xD2511F53)
// or 3449720151 (0xCD9E8D57) into 64-bit products whose high/low halves are
// xored with each other and with a per-step constant.
// NOTE(review): the multipliers and the first xor constants (0x9E3779B9,
// 0xBB67AE85) match the published Philox4x32 constants; presumably this is an
// inlined Philox round sequence — confirm against the generator.
xor.b64 %rd35, %rd34, %rd32; | |
shr.u64 %rd36, %rd35, 32; | |
mul.lo.s64 %rd37, %rd36, 3449720151; | |
shr.u64 %rd38, %rd37, 32; | |
and.b64 %rd39, %rd34, 4294967295; | |
mul.lo.s64 %rd40, %rd39, 3449720151; | |
and.b64 %rd41, %rd40, 4294967295; | |
xor.b64 %rd42, %rd41, %rd38; | |
xor.b64 %rd43, %rd42, 2654435769; | |
mul.lo.s64 %rd44, %rd43, 3528531795; | |
shr.u64 %rd45, %rd44, 32; | |
xor.b64 %rd46, %rd40, %rd30; | |
shr.u64 %rd47, %rd46, 32; | |
mul.lo.s64 %rd48, %rd47, 3528531795; | |
and.b64 %rd49, %rd48, 4294967295; | |
xor.b64 %rd50, %rd49, %rd45; | |
xor.b64 %rd51, %rd50, 1993301258; | |
mul.lo.s64 %rd52, %rd51, 3449720151; | |
shr.u64 %rd53, %rd52, 32; | |
shr.u64 %rd54, %rd48, 32; | |
and.b64 %rd55, %rd32, 4294967295; | |
xor.b64 %rd56, %rd55, %rd54; | |
xor.b64 %rd57, %rd56, 3144134277; | |
mul.lo.s64 %rd58, %rd57, 3449720151; | |
and.b64 %rd59, %rd58, 4294967295; | |
xor.b64 %rd60, %rd59, %rd53; | |
xor.b64 %rd61, %rd60, 3668340011; | |
mul.lo.s64 %rd62, %rd61, 3528531795; | |
shr.u64 %rd63, %rd62, 32; | |
shr.u64 %rd64, %rd58, 32; | |
and.b64 %rd65, %rd37, 4294967295; | |
xor.b64 %rd66, %rd65, %rd64; | |
xor.b64 %rd67, %rd66, 1013904242; | |
mul.lo.s64 %rd68, %rd67, 3528531795; | |
and.b64 %rd69, %rd68, 4294967295; | |
xor.b64 %rd70, %rd69, %rd63; | |
xor.b64 %rd71, %rd70, 3986602516; | |
mul.lo.s64 %rd72, %rd71, 3449720151; | |
shr.u64 %rd73, %rd72, 32; | |
shr.u64 %rd74, %rd68, 32; | |
and.b64 %rd75, %rd44, 4294967295; | |
xor.b64 %rd76, %rd75, %rd74; | |
xor.b64 %rd77, %rd76, 842468239; | |
mul.lo.s64 %rd78, %rd77, 3449720151; | |
and.b64 %rd79, %rd78, 4294967295; | |
xor.b64 %rd80, %rd79, %rd73; | |
xor.b64 %rd81, %rd80, 387276957; | |
mul.lo.s64 %rd82, %rd81, 3528531795; | |
shr.u64 %rd83, %rd82, 32; | |
shr.u64 %rd84, %rd78, 32; | |
and.b64 %rd85, %rd52, 4294967295; | |
xor.b64 %rd86, %rd85, %rd84; | |
xor.b64 %rd87, %rd86, 2027808484; | |
mul.lo.s64 %rd88, %rd87, 3528531795; | |
and.b64 %rd89, %rd88, 4294967295; | |
shr.u64 %rd90, %rd88, 32; | |
and.b64 %rd91, %rd62, 4294967295; | |
xor.b64 %rd92, %rd91, %rd90; | |
xor.b64 %rd93, %rd92, 2835769497; | |
mul.lo.s64 %rd94, %rd93, 3449720151; | |
and.b64 %rd95, %rd94, 4294967295; | |
shr.u64 %rd96, %rd94, 32; | |
and.b64 %rd97, %rd72, 4294967295; | |
xor.b64 %rd98, %rd97, %rd96; | |
xor.b64 %rd99, %rd98, 3041712726; | |
mul.lo.s64 %rd100, %rd99, 3528531795; | |
and.b64 %rd101, %rd100, 4294967295; | |
xor.b64 %rd102, %rd89, %rd83; | |
xor.b64 %rd103, %rd102, 1684936478; | |
mul.lo.s64 %rd104, %rd103, 3449720151; | |
shr.u64 %rd105, %rd104, 32; | |
xor.b64 %rd106, %rd95, %rd105; | |
xor.b64 %rd107, %rd106, 1401181199; | |
mul.lo.s64 %rd108, %rd107, 3528531795; | |
shr.u64 %rd109, %rd108, 32; | |
xor.b64 %rd110, %rd101, %rd109; | |
xor.b64 %rd111, %rd110, 3678237736; | |
mul.lo.s64 %rd112, %rd111, 3449720151; | |
shr.u64 %rd113, %rd112, 32; | |
// Lane 0 keep-predicate: take a 32-bit word from the chain, keep its top 23
// bits (shr 9), scale by 0f34000000 (2^-23) to a value in [0,1), round to f16
// and compare >= 0x2E66 (about 0.1 in f16). %p2 is true when the lane is kept.
cvt.u32.u64 %r9, %rd113; | |
shr.u64 %rd114, %rd100, 32; | |
xor.b64 %rd115, %rd114, %rd82; | |
cvt.u32.u64 %r10, %rd115; | |
xor.b32 %r11, %r10, 534103459; | |
mul.lo.s32 %r12, %r11, -845247145; | |
xor.b32 %r13, %r12, %r9; | |
shr.u32 %r14, %r13, 9; | |
xor.b32 %r15, %r14, 4716963; | |
cvt.rn.f32.u32 %f1, %r15; | |
mul.rn.f32 %f2, %f1, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f2; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p2, %h1, %h2; | |
// Load 4 f16 values of param_1 at idx (%rd116 = idx * 2 bytes); after the
// pack/unpack moves the lanes are %h7, %h8, %h9, %h10.
mul.wide.u32 %rd116, %r5, 2; | |
add.s64 %rd117, %rd25, %rd116; | |
ld.global.nc.v4.b16 {%h3, %h4, %h5, %h6}, [%rd117]; | |
mov.b32 %hh1, {%h5, %h6}; | |
mov.b32 %hh2, {%h3, %h4}; | |
mov.b32 {%h7, %h8}, %hh2; | |
mov.b32 {%h9, %h10}, %hh1; | |
// Lane 0: d = keep ? (p1 + f16(param_11[c])) * 0x3C72 : 0, where 0x3C72 is
// about 1.111 in f16. %rd118 = c * 4 bytes is reused for every f32 vector.
mul.wide.u32 %rd118, %r4, 4; | |
add.s64 %rd119, %rd6, %rd118; | |
ld.global.nc.f32 %f3, [%rd119]; | |
cvt.rn.f16.f32 %h11, %f3; | |
add.rn.f16 %h12, %h7, %h11; | |
mov.b16 %h13, 0x3C72; | |
mul.rn.f16 %h14, %h12, %h13; | |
cvt.f32.f16 %f4, %h14; | |
selp.f32 %f5, %f4, 0f00000000, %p2; | |
// Load 4 f16 values of param_2 at idx; lanes are %h19, %h20, %h21, %h22.
add.s64 %rd120, %rd24, %rd116; | |
ld.global.nc.v4.b16 {%h15, %h16, %h17, %h18}, [%rd120]; | |
mov.b32 %hh3, {%h17, %h18}; | |
mov.b32 %hh4, {%h15, %h16}; | |
mov.b32 {%h19, %h20}, %hh4; | |
mov.b32 {%h21, %h22}, %hh3; | |
// Lane 0, first scale/shift. %rd121 = row * 4 bytes.
//   %f10 = rsqrt(param_6[row] * 2^-10 + eps)   (shared by all four lanes)
//   %f16 = param_7[row] * 2^-10                (shared by all four lanes)
//   %f20 = s1*p2 + (param_9[c] - s1*%f16) + d, with s1 = %f10 * param_10[c]
cvt.f32.f16 %f6, %h19; | |
mul.wide.u32 %rd121, %r1, 4; | |
add.s64 %rd122, %rd20, %rd121; | |
ld.global.nc.f32 %f7, [%rd122]; | |
mul.rn.f32 %f8, %f7, 0f3A800000; | |
add.rn.f32 %f9, %f8, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f10, %f9; | |
add.s64 %rd123, %rd9, %rd118; | |
ld.global.nc.f32 %f11, [%rd123]; | |
mul.rn.f32 %f12, %f10, %f11; | |
mul.rn.f32 %f13, %f12, %f6; | |
add.s64 %rd124, %rd12, %rd118; | |
ld.global.nc.f32 %f14, [%rd124]; | |
add.s64 %rd125, %rd18, %rd121; | |
ld.global.nc.f32 %f15, [%rd125]; | |
mul.rn.f32 %f16, %f15, 0f3A800000; | |
mul.rn.f32 %f17, %f12, %f16; | |
sub.rn.f32 %f18, %f14, %f17; | |
add.rn.f32 %f19, %f13, %f18; | |
add.rn.f32 %f20, %f19, %f5; | |
// Lane 0, second scale/shift applied to %f20.
//   %f24 = rsqrt(param_4[row] * 2^-10 + eps)   (shared by all four lanes)
//   %f30 = param_5[row] * 2^-10                (shared by all four lanes)
//   %h23 = f16((param_12[c] - s2*%f30) + s2*%f20), with s2 = %f24 * param_8[c]
add.s64 %rd126, %rd22, %rd121; | |
ld.global.nc.f32 %f21, [%rd126]; | |
mul.rn.f32 %f22, %f21, 0f3A800000; | |
add.rn.f32 %f23, %f22, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f24, %f23; | |
add.s64 %rd127, %rd15, %rd118; | |
ld.global.nc.f32 %f25, [%rd127]; | |
mul.rn.f32 %f26, %f24, %f25; | |
mul.rn.f32 %f27, %f26, %f20; | |
add.s64 %rd128, %rd3, %rd118; | |
ld.global.nc.f32 %f28, [%rd128]; | |
add.s64 %rd129, %rd21, %rd121; | |
ld.global.nc.f32 %f29, [%rd129]; | |
mul.rn.f32 %f30, %f29, 0f3A800000; | |
mul.rn.f32 %f31, %f26, %f30; | |
sub.rn.f32 %f32, %f28, %f31; | |
add.rn.f32 %f33, %f32, %f27; | |
cvt.rn.f16.f32 %h23, %f33; | |
// Output address in param_0 (element idx), used by the final v4 store.
add.s64 %rd130, %rd26, %rd116; | |
// Lane 1 keep-predicate (%p3): a second 32-bit word derived from the chain
// intermediates, converted and compared against %h2 the same way as lane 0.
xor.b64 %rd131, %rd72, %rd96; | |
xor.b64 %rd132, %rd131, 3041712726; | |
mul.lo.s64 %rd133, %rd132, 3528531795; | |
xor.b64 %rd134, %rd109, %rd133; | |
cvt.u32.u64 %r16, %rd134; | |
xor.b32 %r17, %r16, -616729560; | |
mul.lo.s32 %r18, %r17, -845247145; | |
shr.u32 %r19, %r18, 9; | |
cvt.rn.f32.u32 %f34, %r19; | |
mul.rn.f32 %f35, %f34, 0f34000000; | |
cvt.rn.f16.f32 %h24, %f35; | |
setp.ge.f16 %p3, %h24, %h2; | |
// Lane 1: same arithmetic as lane 0 with the vectors read at c+1
// (%rd135 = (c|1) * 4 bytes) and inputs %h8 / %h20; result in %h28.
mul.wide.u32 %rd135, %r6, 4; | |
add.s64 %rd136, %rd6, %rd135; | |
ld.global.nc.f32 %f36, [%rd136]; | |
cvt.rn.f16.f32 %h25, %f36; | |
add.rn.f16 %h26, %h8, %h25; | |
mul.rn.f16 %h27, %h26, %h13; | |
cvt.f32.f16 %f37, %h27; | |
selp.f32 %f38, %f37, 0f00000000, %p3; | |
cvt.f32.f16 %f39, %h20; | |
add.s64 %rd137, %rd9, %rd135; | |
ld.global.nc.f32 %f40, [%rd137]; | |
mul.rn.f32 %f41, %f10, %f40; | |
mul.rn.f32 %f42, %f41, %f39; | |
add.s64 %rd138, %rd12, %rd135; | |
ld.global.nc.f32 %f43, [%rd138]; | |
mul.rn.f32 %f44, %f16, %f41; | |
sub.rn.f32 %f45, %f43, %f44; | |
add.rn.f32 %f46, %f42, %f45; | |
add.rn.f32 %f47, %f46, %f38; | |
add.s64 %rd139, %rd15, %rd135; | |
ld.global.nc.f32 %f48, [%rd139]; | |
mul.rn.f32 %f49, %f24, %f48; | |
mul.rn.f32 %f50, %f49, %f47; | |
add.s64 %rd140, %rd3, %rd135; | |
ld.global.nc.f32 %f51, [%rd140]; | |
mul.rn.f32 %f52, %f30, %f49; | |
sub.rn.f32 %f53, %f51, %f52; | |
add.rn.f32 %f54, %f53, %f50; | |
cvt.rn.f16.f32 %h28, %f54; | |
// Lane 2 keep-predicate (%p4): third 32-bit word from the chain.
and.b64 %rd141, %rd104, 4294967295; | |
and.b64 %rd142, %rd82, 4294967295; | |
xor.b64 %rd143, %rd142, %rd114; | |
xor.b64 %rd144, %rd143, 534103459; | |
mul.lo.s64 %rd145, %rd144, 3449720151; | |
shr.u64 %rd146, %rd145, 32; | |
xor.b64 %rd147, %rd141, %rd146; | |
xor.b64 %rd148, %rd147, 4055616968; | |
mul.lo.s64 %rd149, %rd148, 3528531795; | |
shr.u64 %rd150, %rd149, 32; | |
cvt.u32.u64 %r20, %rd150; | |
xor.b64 %rd151, %rd105, %rd94; | |
cvt.u32.u64 %r21, %rd151; | |
xor.b32 %r22, %r21, 1401181199; | |
mul.lo.s32 %r23, %r22, -766435501; | |
xor.b32 %r24, %r23, %r20; | |
shr.u32 %r25, %r24, 9; | |
xor.b32 %r26, %r25, 4936337; | |
cvt.rn.f32.u32 %f55, %r26; | |
mul.rn.f32 %f56, %f55, 0f34000000; | |
cvt.rn.f16.f32 %h29, %f56; | |
setp.ge.f16 %p4, %h29, %h2; | |
// Lane 2: same arithmetic with the vectors read at c+2
// (%rd152 = (c|2) * 4 bytes) and inputs %h9 / %h21; result in %h33.
mul.wide.u32 %rd152, %r7, 4; | |
add.s64 %rd153, %rd6, %rd152; | |
ld.global.nc.f32 %f57, [%rd153]; | |
cvt.rn.f16.f32 %h30, %f57; | |
add.rn.f16 %h31, %h9, %h30; | |
mul.rn.f16 %h32, %h31, %h13; | |
cvt.f32.f16 %f58, %h32; | |
selp.f32 %f59, %f58, 0f00000000, %p4; | |
cvt.f32.f16 %f60, %h21; | |
add.s64 %rd154, %rd9, %rd152; | |
ld.global.nc.f32 %f61, [%rd154]; | |
mul.rn.f32 %f62, %f10, %f61; | |
mul.rn.f32 %f63, %f62, %f60; | |
add.s64 %rd155, %rd12, %rd152; | |
ld.global.nc.f32 %f64, [%rd155]; | |
mul.rn.f32 %f65, %f16, %f62; | |
sub.rn.f32 %f66, %f64, %f65; | |
add.rn.f32 %f67, %f63, %f66; | |
add.rn.f32 %f68, %f67, %f59; | |
add.s64 %rd156, %rd15, %rd152; | |
ld.global.nc.f32 %f69, [%rd156]; | |
mul.rn.f32 %f70, %f24, %f69; | |
mul.rn.f32 %f71, %f70, %f68; | |
add.s64 %rd157, %rd3, %rd152; | |
ld.global.nc.f32 %f72, [%rd157]; | |
mul.rn.f32 %f73, %f30, %f70; | |
sub.rn.f32 %f74, %f72, %f73; | |
add.rn.f32 %f75, %f74, %f71; | |
cvt.rn.f16.f32 %h33, %f75; | |
// Lane 3 keep-predicate (%p5): fourth 32-bit word from the chain.
xor.b64 %rd158, %rd83, %rd88; | |
xor.b64 %rd159, %rd158, 1684936478; | |
mul.lo.s64 %rd160, %rd159, 3449720151; | |
xor.b64 %rd161, %rd146, %rd160; | |
cvt.u32.u64 %r27, %rd161; | |
xor.b32 %r28, %r27, -239350328; | |
mul.lo.s32 %r29, %r28, -766435501; | |
shr.u32 %r30, %r29, 9; | |
cvt.rn.f32.u32 %f76, %r30; | |
mul.rn.f32 %f77, %f76, 0f34000000; | |
cvt.rn.f16.f32 %h34, %f77; | |
setp.ge.f16 %p5, %h34, %h2; | |
// Lane 3: same arithmetic; the vectors are read at c+3 by reusing the lane-0
// addresses with a +12 byte offset, inputs %h10 / %h22; result in %h38.
ld.global.nc.f32 %f78, [%rd119+12]; | |
cvt.rn.f16.f32 %h35, %f78; | |
add.rn.f16 %h36, %h10, %h35; | |
mul.rn.f16 %h37, %h36, %h13; | |
cvt.f32.f16 %f79, %h37; | |
selp.f32 %f80, %f79, 0f00000000, %p5; | |
cvt.f32.f16 %f81, %h22; | |
ld.global.nc.f32 %f82, [%rd123+12]; | |
mul.rn.f32 %f83, %f10, %f82; | |
mul.rn.f32 %f84, %f83, %f81; | |
ld.global.nc.f32 %f85, [%rd124+12]; | |
mul.rn.f32 %f86, %f16, %f83; | |
sub.rn.f32 %f87, %f85, %f86; | |
add.rn.f32 %f88, %f84, %f87; | |
add.rn.f32 %f89, %f88, %f80; | |
ld.global.nc.f32 %f90, [%rd127+12]; | |
mul.rn.f32 %f91, %f24, %f90; | |
mul.rn.f32 %f92, %f91, %f89; | |
ld.global.nc.f32 %f93, [%rd128+12]; | |
mul.rn.f32 %f94, %f30, %f91; | |
sub.rn.f32 %f95, %f93, %f94; | |
add.rn.f32 %f96, %f95, %f92; | |
cvt.rn.f16.f32 %h38, %f96; | |
// Write the four f16 results to param_0 with a single 8-byte vector store.
st.global.v4.b16 [%rd130], {%h23, %h28, %h33, %h38}; | |
ret; | |
} | |
// .globl fusion_2706 | |
// fusion_2706: converts f32 values to f16, 4 elements per thread,
// 256 threads per block (.reqntid 256).
//   param_0 : f32 source, read as one v4.f32 at element idx
//   param_1 : f16 destination, written as one v4.b16 at element idx
//   param_2 : declared but never loaded in this body
// idx = (ctaid.x << 10) | (tid.x << 2); tid.x*4 < 1024, so the `or` acts as
// an add and each block covers 1024 consecutive elements.
.visible .entry fusion_2706( | |
.param .u64 fusion_2706_param_0, | |
.param .u64 fusion_2706_param_1, | |
.param .u64 fusion_2706_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<5>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<6>; | |
.reg .b64 %rd<9>; | |
// %rd4 = param_0 (source), %rd3 = param_1 (destination), both as global addresses.
ld.param.u64 %rd1, [fusion_2706_param_0]; | |
ld.param.u64 %rd2, [fusion_2706_param_1]; | |
cvta.to.global.u64 %rd3, %rd2; | |
cvta.to.global.u64 %rd4, %rd1; | |
// %r5 = idx, the first of this thread's 4 elements.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
// 16-byte vector load of 4 f32 values (source byte offset = idx * 4).
mul.wide.u32 %rd5, %r5, 4; | |
add.s64 %rd6, %rd4, %rd5; | |
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6]; | |
// Round-to-nearest conversion of each lane; destination byte offset = idx * 2.
cvt.rn.f16.f32 %h1, %f1; | |
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd3, %rd7; | |
cvt.rn.f16.f32 %h2, %f2; | |
cvt.rn.f16.f32 %h3, %f3; | |
cvt.rn.f16.f32 %h4, %f4; | |
// 8-byte vector store of the 4 f16 results.
st.global.v4.b16 [%rd8], {%h1, %h2, %h3, %h4}; | |
ret; | |
} | |
// .globl fusion_2239 | |
// fusion_2239: gathers f16 values from param_1, adds an f32 vector from
// param_2 (rounded to f16 first), and writes 4 contiguous f16 results per
// thread; 256 threads per block (.reqntid 256).
//   param_0 : f16 output, one v4.b16 store at element idx
//   param_1 : f16 input, 4 scalar loads at element (m * 1024 + g * 64 + col)
//   param_2 : f32 input, 4 scalar loads at element (g * 64 + col)
//   param_3 : declared but never loaded in this body
// with idx = (ctaid.x << 10) | (tid.x << 2)
//      col = low 6 bits of (idx + lane), lane = 0..3
//      m   = bits 6..14 of idx  (bfe 6, 9)
//      g   = ctaid.x >> 5       (equals idx >> 15)
// NOTE(review): output order is (g, m, col) while param_1 is read as
// (m, g, col), which looks like a transpose of the two outer dimensions fused
// with a bias add — confirm against the producing HLO.
.visible .entry fusion_2239( | |
.param .u64 fusion_2239_param_0, | |
.param .u64 fusion_2239_param_1, | |
.param .u64 fusion_2239_param_2, | |
.param .u64 fusion_2239_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<13>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<20>; | |
.reg .b64 %rd<29>; | |
// %rd6 = param_0, %rd5 = param_1, %rd3 = param_2 (global addresses).
ld.param.u64 %rd1, [fusion_2239_param_0]; | |
ld.param.u64 %rd2, [fusion_2239_param_2]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2239_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// %r5 = idx; %r6/%r9/%r11/%r13 = col for lanes 0..3; %r7 = g; %r14 = m;
// %r15 = g * 64.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
and.b32 %r6, %r4, 60; | |
shr.u32 %r7, %r1, 5; | |
or.b32 %r8, %r4, 1; | |
and.b32 %r9, %r8, 61; | |
or.b32 %r10, %r4, 2; | |
and.b32 %r11, %r10, 62; | |
or.b32 %r12, %r4, 3; | |
and.b32 %r13, %r12, 63; | |
bfe.u32 %r14, %r5, 6, 9; | |
shl.b32 %r15, %r7, 6; | |
or.b32 %r16, %r6, %r15; | |
// %rd8 = param_1 + m * 2048 bytes (1024 f16 per m); lane 0 reads at
// (g*64 + col) * 2 bytes from there.
mul.wide.u32 %rd7, %r14, 2048; | |
add.s64 %rd8, %rd5, %rd7; | |
mul.wide.u32 %rd9, %r16, 2; | |
add.s64 %rd10, %rd8, %rd9; | |
ld.global.nc.b16 %h1, [%rd10]; | |
// %rd12 = param_2 + g * 256 bytes (64 f32 per g); lane 0 reads at col * 4.
mul.wide.u32 %rd11, %r7, 256; | |
add.s64 %rd12, %rd3, %rd11; | |
mul.wide.u32 %rd13, %r6, 4; | |
add.s64 %rd14, %rd12, %rd13; | |
ld.global.nc.f32 %f1, [%rd14]; | |
// The f32 addend is rounded to f16 before the add, so the sum is f16 arithmetic.
cvt.rn.f16.f32 %h2, %f1; | |
add.rn.f16 %h3, %h1, %h2; | |
// Output address in param_0: idx * 2 bytes.
mul.wide.u32 %rd15, %r5, 2; | |
add.s64 %rd16, %rd6, %rd15; | |
// Lane 1.
or.b32 %r17, %r9, %r15; | |
mul.wide.u32 %rd17, %r17, 2; | |
add.s64 %rd18, %rd8, %rd17; | |
ld.global.nc.b16 %h4, [%rd18]; | |
mul.wide.u32 %rd19, %r9, 4; | |
add.s64 %rd20, %rd12, %rd19; | |
ld.global.nc.f32 %f2, [%rd20]; | |
cvt.rn.f16.f32 %h5, %f2; | |
add.rn.f16 %h6, %h4, %h5; | |
// Lane 2.
or.b32 %r18, %r11, %r15; | |
mul.wide.u32 %rd21, %r18, 2; | |
add.s64 %rd22, %rd8, %rd21; | |
ld.global.nc.b16 %h7, [%rd22]; | |
mul.wide.u32 %rd23, %r11, 4; | |
add.s64 %rd24, %rd12, %rd23; | |
ld.global.nc.f32 %f3, [%rd24]; | |
cvt.rn.f16.f32 %h8, %f3; | |
add.rn.f16 %h9, %h7, %h8; | |
// Lane 3.
or.b32 %r19, %r13, %r15; | |
mul.wide.u32 %rd25, %r19, 2; | |
add.s64 %rd26, %rd8, %rd25; | |
ld.global.nc.b16 %h10, [%rd26]; | |
mul.wide.u32 %rd27, %r13, 4; | |
add.s64 %rd28, %rd12, %rd27; | |
ld.global.nc.f32 %f4, [%rd28]; | |
cvt.rn.f16.f32 %h11, %f4; | |
add.rn.f16 %h12, %h10, %h11; | |
// Single 8-byte vector store of the four sums.
st.global.v4.b16 [%rd16], {%h3, %h6, %h9, %h12}; | |
ret; | |
} | |
// .globl fusion_2707 | |
// fusion_2707: converts f32 values to f16, 4 elements per thread,
// 256 threads per block (.reqntid 256).
//   param_0 : f32 source, read as one v4.f32 at element idx
//   param_1 : f16 destination, written as one v4.b16 at element idx
//   param_2 : declared but never loaded in this body
// idx = (ctaid.x << 10) | (tid.x << 2); tid.x*4 < 1024, so the `or` acts as
// an add and each block covers 1024 consecutive elements.
.visible .entry fusion_2707( | |
.param .u64 fusion_2707_param_0, | |
.param .u64 fusion_2707_param_1, | |
.param .u64 fusion_2707_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<5>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<6>; | |
.reg .b64 %rd<9>; | |
// %rd4 = param_0 (source), %rd3 = param_1 (destination), both as global addresses.
ld.param.u64 %rd1, [fusion_2707_param_0]; | |
ld.param.u64 %rd2, [fusion_2707_param_1]; | |
cvta.to.global.u64 %rd3, %rd2; | |
cvta.to.global.u64 %rd4, %rd1; | |
// %r5 = idx, the first of this thread's 4 elements.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
// 16-byte vector load of 4 f32 values (source byte offset = idx * 4).
mul.wide.u32 %rd5, %r5, 4; | |
add.s64 %rd6, %rd4, %rd5; | |
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6]; | |
// Round-to-nearest conversion of each lane; destination byte offset = idx * 2.
cvt.rn.f16.f32 %h1, %f1; | |
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd3, %rd7; | |
cvt.rn.f16.f32 %h2, %f2; | |
cvt.rn.f16.f32 %h3, %f3; | |
cvt.rn.f16.f32 %h4, %f4; | |
// 8-byte vector store of the 4 f16 results.
st.global.v4.b16 [%rd8], {%h1, %h2, %h3, %h4}; | |
ret; | |
} | |
// .globl fusion_2240 | |
// fusion_2240: gathers f16 values from param_1, adds an f32 vector from
// param_2 (rounded to f16 first), and writes 4 contiguous f16 results per
// thread; 256 threads per block (.reqntid 256).
//   param_0 : f16 output, one v4.b16 store at element idx
//   param_1 : f16 input, 4 scalar loads at element (m * 1024 + g * 64 + col)
//   param_2 : f32 input, 4 scalar loads at element (g * 64 + col)
//   param_3 : declared but never loaded in this body
// with idx = (ctaid.x << 10) | (tid.x << 2)
//      col = low 6 bits of (idx + lane), lane = 0..3
//      m   = bits 6..14 of idx  (bfe 6, 9)
//      g   = ctaid.x >> 5       (equals idx >> 15)
// NOTE(review): output order is (g, m, col) while param_1 is read as
// (m, g, col), which looks like a transpose of the two outer dimensions fused
// with a bias add — confirm against the producing HLO.
.visible .entry fusion_2240( | |
.param .u64 fusion_2240_param_0, | |
.param .u64 fusion_2240_param_1, | |
.param .u64 fusion_2240_param_2, | |
.param .u64 fusion_2240_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<13>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<20>; | |
.reg .b64 %rd<29>; | |
// %rd6 = param_0, %rd5 = param_1, %rd3 = param_2 (global addresses).
ld.param.u64 %rd1, [fusion_2240_param_0]; | |
ld.param.u64 %rd2, [fusion_2240_param_2]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2240_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// %r5 = idx; %r6/%r9/%r11/%r13 = col for lanes 0..3; %r7 = g; %r14 = m;
// %r15 = g * 64.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
and.b32 %r6, %r4, 60; | |
shr.u32 %r7, %r1, 5; | |
or.b32 %r8, %r4, 1; | |
and.b32 %r9, %r8, 61; | |
or.b32 %r10, %r4, 2; | |
and.b32 %r11, %r10, 62; | |
or.b32 %r12, %r4, 3; | |
and.b32 %r13, %r12, 63; | |
bfe.u32 %r14, %r5, 6, 9; | |
shl.b32 %r15, %r7, 6; | |
or.b32 %r16, %r6, %r15; | |
// %rd8 = param_1 + m * 2048 bytes (1024 f16 per m); lane 0 reads at
// (g*64 + col) * 2 bytes from there.
mul.wide.u32 %rd7, %r14, 2048; | |
add.s64 %rd8, %rd5, %rd7; | |
mul.wide.u32 %rd9, %r16, 2; | |
add.s64 %rd10, %rd8, %rd9; | |
ld.global.nc.b16 %h1, [%rd10]; | |
// %rd12 = param_2 + g * 256 bytes (64 f32 per g); lane 0 reads at col * 4.
mul.wide.u32 %rd11, %r7, 256; | |
add.s64 %rd12, %rd3, %rd11; | |
mul.wide.u32 %rd13, %r6, 4; | |
add.s64 %rd14, %rd12, %rd13; | |
ld.global.nc.f32 %f1, [%rd14]; | |
// The f32 addend is rounded to f16 before the add, so the sum is f16 arithmetic.
cvt.rn.f16.f32 %h2, %f1; | |
add.rn.f16 %h3, %h1, %h2; | |
// Output address in param_0: idx * 2 bytes.
mul.wide.u32 %rd15, %r5, 2; | |
add.s64 %rd16, %rd6, %rd15; | |
// Lane 1.
or.b32 %r17, %r9, %r15; | |
mul.wide.u32 %rd17, %r17, 2; | |
add.s64 %rd18, %rd8, %rd17; | |
ld.global.nc.b16 %h4, [%rd18]; | |
mul.wide.u32 %rd19, %r9, 4; | |
add.s64 %rd20, %rd12, %rd19; | |
ld.global.nc.f32 %f2, [%rd20]; | |
cvt.rn.f16.f32 %h5, %f2; | |
add.rn.f16 %h6, %h4, %h5; | |
// Lane 2.
or.b32 %r18, %r11, %r15; | |
mul.wide.u32 %rd21, %r18, 2; | |
add.s64 %rd22, %rd8, %rd21; | |
ld.global.nc.b16 %h7, [%rd22]; | |
mul.wide.u32 %rd23, %r11, 4; | |
add.s64 %rd24, %rd12, %rd23; | |
ld.global.nc.f32 %f3, [%rd24]; | |
cvt.rn.f16.f32 %h8, %f3; | |
add.rn.f16 %h9, %h7, %h8; | |
// Lane 3.
or.b32 %r19, %r13, %r15; | |
mul.wide.u32 %rd25, %r19, 2; | |
add.s64 %rd26, %rd8, %rd25; | |
ld.global.nc.b16 %h10, [%rd26]; | |
mul.wide.u32 %rd27, %r13, 4; | |
add.s64 %rd28, %rd12, %rd27; | |
ld.global.nc.f32 %f4, [%rd28]; | |
cvt.rn.f16.f32 %h11, %f4; | |
add.rn.f16 %h12, %h10, %h11; | |
// Single 8-byte vector store of the four sums.
st.global.v4.b16 [%rd16], {%h3, %h6, %h9, %h12}; | |
ret; | |
} | |
// .globl fusion_2237 | |
// fusion_2237: per-block max reduction, 32 threads per block (.reqntid 32).
// Each block covers 512 consecutive f16 elements of param_0; each thread
// handles 16 of them as 8 adjacent pairs spaced 64 elements apart.
//   param_0 : f16 input, read as b32 pairs at element (ctaid.x*512 + tid.x*2 + 64*k)
//   param_1 : f32 accumulator, element ctaid.x updated with an atomic max (CAS loop)
//   param_2 : s32 input, read at element (tid.x*2 + 64*k) and the one after it
//   param_3 : declared but never loaded in this body
// Per element, in f16:  v = x - (1.0 - f16(m)) * 0x70E2, where 0x70E2 is
// 10000.0 in f16; v is then widened to f32 and folded into a running max.
// NOTE(review): the (1 - m) * 10000 term looks like an additive attention
// mask ahead of a softmax max-reduction — confirm against the producing HLO.
.visible .entry fusion_2237( | |
.param .u64 fusion_2237_param_0, | |
.param .u64 fusion_2237_param_1, | |
.param .u64 fusion_2237_param_2, | |
.param .u64 fusion_2237_param_3 | |
) | |
.reqntid 32, 1, 1 | |
{ | |
.local .align 4 .b8 __local_depot36[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<4>; | |
.reg .b16 %h<83>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<57>; | |
.reg .b32 %r<37>; | |
.reg .b64 %rd<37>; | |
// One 4-byte local slot: %rd1 is its local-space address, %rd10 its generic
// address. It later holds the -inf identity for threads other than thread 0.
mov.u64 %SPL, __local_depot36; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd4, [fusion_2237_param_0]; | |
ld.param.u64 %rd5, [fusion_2237_param_2]; | |
cvta.to.global.u64 %rd6, %rd5; | |
cvta.to.global.u64 %rd9, %rd4; | |
add.u64 %rd10, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
// %r6 = tid.x*2 (offset inside the block's span), %r8 = ctaid.x*512 + tid.x*2.
mov.u32 %r1, %tid.x; | |
mov.u32 %r5, %ctaid.x; | |
shl.b32 %r6, %r1, 1; | |
shl.b32 %r7, %r5, 9; | |
or.b32 %r8, %r7, %r6; | |
// Pair 0: two f16 values from param_0 and two s32 values from param_2.
mul.wide.u32 %rd11, %r8, 2; | |
add.s64 %rd12, %rd9, %rd11; | |
ld.global.nc.b32 %hh1, [%rd12]; | |
mov.b32 {%h1, %h2}, %hh1; | |
mul.wide.u32 %rd13, %r6, 4; | |
add.s64 %rd14, %rd6, %rd13; | |
ld.global.nc.v2.u32 {%r9, %r10}, [%rd14]; | |
// %h4 = 1.0 and %h6 = 10000.0 (f16); both are reused for every element below.
cvt.rn.f16.s32 %h3, %r9; | |
mov.b16 %h4, 0x3C00; | |
sub.rn.f16 %h5, %h4, %h3; | |
mov.b16 %h6, 0x70E2; | |
mul.rn.f16 %h7, %h5, %h6; | |
sub.rn.f16 %h8, %h1, %h7; | |
cvt.f32.f16 %f2, %h8; | |
// The running max starts from -inf (0fFF800000).
max.f32 %f3, %f2, 0fFF800000; | |
cvt.rn.f16.s32 %h9, %r10; | |
sub.rn.f16 %h10, %h4, %h9; | |
mul.rn.f16 %h11, %h10, %h6; | |
sub.rn.f16 %h12, %h2, %h11; | |
cvt.f32.f16 %f4, %h12; | |
max.f32 %f5, %f3, %f4; | |
// Pairs 1..7 repeat the pattern: the f16 pair is at %rd12 + 128*k bytes, the
// first s32 at element (tid.x*2 | 64*k), the second at %rd14 + 4 + 256*k bytes.
or.b32 %r11, %r6, 64; | |
ld.global.nc.b32 %hh2, [%rd12+128]; | |
mov.b32 {%h13, %h14}, %hh2; | |
mul.wide.u32 %rd15, %r11, 4; | |
add.s64 %rd16, %rd6, %rd15; | |
ld.global.nc.u32 %r12, [%rd16]; | |
cvt.rn.f16.s32 %h15, %r12; | |
sub.rn.f16 %h16, %h4, %h15; | |
mul.rn.f16 %h17, %h16, %h6; | |
sub.rn.f16 %h18, %h13, %h17; | |
cvt.f32.f16 %f6, %h18; | |
max.f32 %f7, %f5, %f6; | |
ld.global.nc.u32 %r13, [%rd14+260]; | |
cvt.rn.f16.s32 %h19, %r13; | |
sub.rn.f16 %h20, %h4, %h19; | |
mul.rn.f16 %h21, %h20, %h6; | |
sub.rn.f16 %h22, %h14, %h21; | |
cvt.f32.f16 %f8, %h22; | |
max.f32 %f9, %f7, %f8; | |
or.b32 %r14, %r6, 128; | |
ld.global.nc.b32 %hh3, [%rd12+256]; | |
mov.b32 {%h23, %h24}, %hh3; | |
mul.wide.u32 %rd17, %r14, 4; | |
add.s64 %rd18, %rd6, %rd17; | |
ld.global.nc.u32 %r15, [%rd18]; | |
cvt.rn.f16.s32 %h25, %r15; | |
sub.rn.f16 %h26, %h4, %h25; | |
mul.rn.f16 %h27, %h26, %h6; | |
sub.rn.f16 %h28, %h23, %h27; | |
cvt.f32.f16 %f10, %h28; | |
max.f32 %f11, %f9, %f10; | |
ld.global.nc.u32 %r16, [%rd14+516]; | |
cvt.rn.f16.s32 %h29, %r16; | |
sub.rn.f16 %h30, %h4, %h29; | |
mul.rn.f16 %h31, %h30, %h6; | |
sub.rn.f16 %h32, %h24, %h31; | |
cvt.f32.f16 %f12, %h32; | |
max.f32 %f13, %f11, %f12; | |
or.b32 %r17, %r6, 192; | |
ld.global.nc.b32 %hh4, [%rd12+384]; | |
mov.b32 {%h33, %h34}, %hh4; | |
mul.wide.u32 %rd19, %r17, 4; | |
add.s64 %rd20, %rd6, %rd19; | |
ld.global.nc.u32 %r18, [%rd20]; | |
cvt.rn.f16.s32 %h35, %r18; | |
sub.rn.f16 %h36, %h4, %h35; | |
mul.rn.f16 %h37, %h36, %h6; | |
sub.rn.f16 %h38, %h33, %h37; | |
cvt.f32.f16 %f14, %h38; | |
max.f32 %f15, %f13, %f14; | |
ld.global.nc.u32 %r19, [%rd14+772]; | |
cvt.rn.f16.s32 %h39, %r19; | |
sub.rn.f16 %h40, %h4, %h39; | |
mul.rn.f16 %h41, %h40, %h6; | |
sub.rn.f16 %h42, %h34, %h41; | |
cvt.f32.f16 %f16, %h42; | |
max.f32 %f17, %f15, %f16; | |
or.b32 %r20, %r6, 256; | |
ld.global.nc.b32 %hh5, [%rd12+512]; | |
mov.b32 {%h43, %h44}, %hh5; | |
mul.wide.u32 %rd21, %r20, 4; | |
add.s64 %rd22, %rd6, %rd21; | |
ld.global.nc.u32 %r21, [%rd22]; | |
cvt.rn.f16.s32 %h45, %r21; | |
sub.rn.f16 %h46, %h4, %h45; | |
mul.rn.f16 %h47, %h46, %h6; | |
sub.rn.f16 %h48, %h43, %h47; | |
cvt.f32.f16 %f18, %h48; | |
max.f32 %f19, %f17, %f18; | |
ld.global.nc.u32 %r22, [%rd14+1028]; | |
cvt.rn.f16.s32 %h49, %r22; | |
sub.rn.f16 %h50, %h4, %h49; | |
mul.rn.f16 %h51, %h50, %h6; | |
sub.rn.f16 %h52, %h44, %h51; | |
cvt.f32.f16 %f20, %h52; | |
max.f32 %f21, %f19, %f20; | |
or.b32 %r23, %r6, 320; | |
ld.global.nc.b32 %hh6, [%rd12+640]; | |
mov.b32 {%h53, %h54}, %hh6; | |
mul.wide.u32 %rd23, %r23, 4; | |
add.s64 %rd24, %rd6, %rd23; | |
ld.global.nc.u32 %r24, [%rd24]; | |
cvt.rn.f16.s32 %h55, %r24; | |
sub.rn.f16 %h56, %h4, %h55; | |
mul.rn.f16 %h57, %h56, %h6; | |
sub.rn.f16 %h58, %h53, %h57; | |
cvt.f32.f16 %f22, %h58; | |
max.f32 %f23, %f21, %f22; | |
ld.global.nc.u32 %r25, [%rd14+1284]; | |
cvt.rn.f16.s32 %h59, %r25; | |
sub.rn.f16 %h60, %h4, %h59; | |
mul.rn.f16 %h61, %h60, %h6; | |
sub.rn.f16 %h62, %h54, %h61; | |
cvt.f32.f16 %f24, %h62; | |
max.f32 %f25, %f23, %f24; | |
or.b32 %r26, %r6, 384; | |
ld.global.nc.b32 %hh7, [%rd12+768]; | |
mov.b32 {%h63, %h64}, %hh7; | |
mul.wide.u32 %rd25, %r26, 4; | |
add.s64 %rd26, %rd6, %rd25; | |
ld.global.nc.u32 %r27, [%rd26]; | |
cvt.rn.f16.s32 %h65, %r27; | |
sub.rn.f16 %h66, %h4, %h65; | |
mul.rn.f16 %h67, %h66, %h6; | |
sub.rn.f16 %h68, %h63, %h67; | |
cvt.f32.f16 %f26, %h68; | |
max.f32 %f27, %f25, %f26; | |
ld.global.nc.u32 %r28, [%rd14+1540]; | |
cvt.rn.f16.s32 %h69, %r28; | |
sub.rn.f16 %h70, %h4, %h69; | |
mul.rn.f16 %h71, %h70, %h6; | |
sub.rn.f16 %h72, %h64, %h71; | |
cvt.f32.f16 %f28, %h72; | |
max.f32 %f29, %f27, %f28; | |
or.b32 %r29, %r6, 448; | |
ld.global.nc.b32 %hh8, [%rd12+896]; | |
mov.b32 {%h73, %h74}, %hh8; | |
mul.wide.u32 %rd27, %r29, 4; | |
add.s64 %rd28, %rd6, %rd27; | |
ld.global.nc.u32 %r30, [%rd28]; | |
cvt.rn.f16.s32 %h75, %r30; | |
sub.rn.f16 %h76, %h4, %h75; | |
mul.rn.f16 %h77, %h76, %h6; | |
sub.rn.f16 %h78, %h73, %h77; | |
cvt.f32.f16 %f30, %h78; | |
max.f32 %f31, %f29, %f30; | |
ld.global.nc.u32 %r31, [%rd14+1796]; | |
cvt.rn.f16.s32 %h79, %r31; | |
sub.rn.f16 %h80, %h4, %h79; | |
mul.rn.f16 %h81, %h80, %h6; | |
sub.rn.f16 %h82, %h74, %h81; | |
cvt.f32.f16 %f32, %h82; | |
max.f32 %f33, %f31, %f32; | |
// First warp reduction: shuffle-down by 16, 8, 4, 2, 1 with a full member
// mask (-1), combining with max. The last combine happens only in thread 0.
shfl.sync.down.b32 %f34, %f33, 16, 31, -1; | |
max.f32 %f35, %f33, %f34; | |
shfl.sync.down.b32 %f36, %f35, 8, 31, -1; | |
max.f32 %f37, %f35, %f36; | |
shfl.sync.down.b32 %f38, %f37, 4, 31, -1; | |
max.f32 %f39, %f37, %f38; | |
shfl.sync.down.b32 %f40, %f39, 2, 31, -1; | |
max.f32 %f41, %f39, %f40; | |
shfl.sync.down.b32 %f42, %f41, 1, 31, -1; | |
// %p1 = (tid.x == 0); it is reused after the barrier for the selp and the
// final branch.
setp.eq.s32 %p1, %r1, 0; | |
@%p1 bra LBB36_3; | |
bra.uni LBB36_1; | |
LBB36_3: | |
// Thread 0 publishes the warp maximum to shared_cache_08[0].
max.f32 %f1, %f41, %f42; | |
st.shared.f32 [shared_cache_08], %f1; | |
LBB36_1: | |
bar.sync 0; | |
// Second stage: thread 0 reads its shared slot through a generic address;
// every other thread reads the local slot, which is filled with -8388608
// (bit pattern 0xFF800000, i.e. -inf) so it cannot affect the maximum.
mul.wide.u32 %rd32, %r1, 4; | |
mov.u64 %rd33, shared_cache_08; | |
add.s64 %rd3, %rd33, %rd32; | |
cvta.shared.u64 %rd34, %rd3; | |
mov.u32 %r34, -8388608; | |
st.local.u32 [%rd1], %r34; | |
selp.b64 %rd36, %rd34, %rd10, %p1; | |
ld.f32 %f43, [%rd36]; | |
shfl.sync.down.b32 %f44, %f43, 16, 31, -1; | |
max.f32 %f45, %f43, %f44; | |
shfl.sync.down.b32 %f46, %f45, 8, 31, -1; | |
max.f32 %f47, %f45, %f46; | |
shfl.sync.down.b32 %f48, %f47, 4, 31, -1; | |
max.f32 %f49, %f47, %f48; | |
shfl.sync.down.b32 %f50, %f49, 2, 31, -1; | |
max.f32 %f51, %f49, %f50; | |
shfl.sync.down.b32 %f52, %f51, 1, 31, -1; | |
max.f32 %f53, %f51, %f52; | |
// Each thread writes its result back to the address it loaded from
// (shared slot for thread 0, private local slot otherwise).
st.f32 [%rd36], %f53; | |
@%p1 bra LBB36_4; | |
bra.uni LBB36_2; | |
LBB36_4: | |
// Thread 0 only: address of the accumulator element in param_1.
// (ctaid.x >> 9) * 2048 + (ctaid.x & 511) * 4 bytes = element ctaid.x.
ld.param.u64 %rd7, [fusion_2237_param_1]; | |
shr.u32 %r33, %r5, 9; | |
cvta.to.global.u64 %rd8, %rd7; | |
and.b32 %r32, %r5, 511; | |
mul.wide.u32 %rd29, %r33, 2048; | |
add.s64 %rd30, %rd8, %rd29; | |
mul.wide.u32 %rd31, %r32, 4; | |
add.s64 %rd2, %rd30, %rd31; | |
ld.global.u32 %r36, [%rd2]; | |
LBB36_5: | |
// Atomic f32 max built from compare-and-swap: compute max(old, block max),
// try to swap it in, and retry with the freshly observed value until the
// CAS returns the value that was expected.
mov.b32 %f54, %r36; | |
ld.shared.f32 %f55, [%rd3]; | |
max.f32 %f56, %f54, %f55; | |
mov.b32 %r35, %f56; | |
atom.global.cas.b32 %r4, [%rd2], %r36, %r35; | |
setp.eq.s32 %p3, %r4, %r36; | |
mov.u32 %r36, %r4; | |
@%p3 bra LBB36_2; | |
bra.uni LBB36_5; | |
LBB36_2: | |
ret; | |
} | |
// .globl fusion_2235
//
// fusion_2235: per-CTA sum of exp() terms, added atomically to param_1.
//
// Launch shape: exactly 32 threads per CTA (.reqntid 32, 1, 1).
// Each thread evaluates 16 terms, for k = 0..7 and j = 0..1:
//     x    = f16 param_0[ctaid.x*512 + 2*tid.x + 64*k + j]
//     m    = s32 param_3[2*tid.x + 64*k + j]          (not indexed by CTA)
//     c    = f32 param_2[ctaid.x]
//     term = exp( f32( x - (1.0 - f16(m)) * 10000.0 ) - c )
// The part inside f32(...) is computed in f16 (0x3C00 = 1.0, 0x70E2 = 10000.0).
// The terms are summed in f32, reduced across the warp with shfl.sync.down,
// and thread 0 adds the result to the f32 at
//     param_1 + (ctaid.x >> 9)*2048 + (ctaid.x & 511)*4
// with atom.global.add.f32. param_4 is declared but never read here.
//
// exp(d) is expanded inline: n = trunc(d * 0f3FB8AA3B), r = d + n*0fBF317200
// + n*0fB5BFBE8E, result = ex2(n) * ex2(r * 0f3FB8AA3B). The result is forced
// to 0 when d < -105.0 (0fC2D20000) and to +inf when d > 105.0 (0f42D20000).
.visible .entry fusion_2235(
	.param .u64 fusion_2235_param_0,
	.param .u64 fusion_2235_param_1,
	.param .u64 fusion_2235_param_2,
	.param .u64 fusion_2235_param_3,
	.param .u64 fusion_2235_param_4
)
.reqntid 32, 1, 1
{
	.local .align 4 .b8 __local_depot37[4];
	.reg .b64 %SP;
	.reg .b64 %SPL;
	.reg .pred %p<35>;
	.reg .b16 %h<83>;
	.reg .b32 %hh<9>;
	.reg .f32 %f<249>;
	.reg .b32 %r<32>;
	.reg .b64 %rd<41>;
	// %rd1 / %rd12 = local / generic address of the 4-byte local slot used in
	// the second reduction pass below.
	mov.u64 %SPL, __local_depot37;
	cvta.local.u64 %SP, %SPL;
	ld.param.u64 %rd4, [fusion_2235_param_0];
	ld.param.u64 %rd5, [fusion_2235_param_3];
	cvta.to.global.u64 %rd6, %rd5;
	ld.param.u64 %rd8, [fusion_2235_param_2];
	cvta.to.global.u64 %rd9, %rd8;
	cvta.to.global.u64 %rd11, %rd4;
	add.u64 %rd12, %SP, 0;
	add.u64 %rd1, %SPL, 0;
	// %r3 = 2*tid.x, %r5 = ctaid.x*512 + 2*tid.x (first f16 element of this thread).
	mov.u32 %r1, %tid.x;
	mov.u32 %r2, %ctaid.x;
	shl.b32 %r3, %r1, 1;
	shl.b32 %r4, %r2, 9;
	or.b32 %r5, %r4, %r3;
	mul.wide.u32 %rd13, %r5, 2;
	add.s64 %rd14, %rd11, %rd13;
	// k=0: one 32-bit load yields the f16 pair {%h1, %h2}; {%r6, %r7} are the
	// matching pair from param_3.
	ld.global.nc.b32 %hh1, [%rd14];
	mov.b32 {%h1, %h2}, %hh1;
	mul.wide.u32 %rd15, %r3, 4;
	add.s64 %rd16, %rd6, %rd15;
	ld.global.nc.v2.u32 {%r6, %r7}, [%rd16];
	// k=0, j=0. %h4 = 1.0 and %h6 = 10000.0 are reused by every later term;
	// %f3 = param_2[ctaid.x] likewise.
	cvt.rn.f16.s32 %h3, %r6;
	mov.b16 %h4, 0x3C00;
	sub.rn.f16 %h5, %h4, %h3;
	mov.b16 %h6, 0x70E2;
	mul.rn.f16 %h7, %h5, %h6;
	sub.rn.f16 %h8, %h1, %h7;
	cvt.f32.f16 %f2, %h8;
	mul.wide.u32 %rd17, %r2, 4;
	add.s64 %rd18, %rd9, %rd17;
	ld.global.nc.f32 %f3, [%rd18];
	sub.rn.f32 %f4, %f2, %f3;
	mul.rn.f32 %f5, %f4, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f6, %f5;
	add.rn.f32 %f7, %f6, 0f00000000;
	ex2.approx.f32 %f8, %f7;
	fma.rn.f32 %f9, %f6, 0fBF317200, %f4;
	fma.rn.f32 %f10, %f6, 0fB5BFBE8E, %f9;
	mul.rn.f32 %f11, %f10, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f12, %f11;
	mul.rn.f32 %f13, %f8, %f12;
	setp.lt.f32 %p1, %f4, 0fC2D20000;
	setp.gt.f32 %p2, %f4, 0f42D20000;
	add.rn.f32 %f14, %f13, 0f00000000;
	selp.f32 %f15, 0f00000000, %f14, %p1;
	selp.f32 %f16, 0f7F800000, %f15, %p2;
	// k=0, j=1
	cvt.rn.f16.s32 %h9, %r7;
	sub.rn.f16 %h10, %h4, %h9;
	mul.rn.f16 %h11, %h10, %h6;
	sub.rn.f16 %h12, %h2, %h11;
	cvt.f32.f16 %f17, %h12;
	sub.rn.f32 %f18, %f17, %f3;
	mul.rn.f32 %f19, %f18, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f20, %f19;
	add.rn.f32 %f21, %f20, 0f00000000;
	ex2.approx.f32 %f22, %f21;
	fma.rn.f32 %f23, %f20, 0fBF317200, %f18;
	fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23;
	mul.rn.f32 %f25, %f24, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f26, %f25;
	mul.rn.f32 %f27, %f22, %f26;
	setp.lt.f32 %p3, %f18, 0fC2D20000;
	selp.f32 %f28, 0f00000000, %f27, %p3;
	setp.gt.f32 %p4, %f18, 0f42D20000;
	selp.f32 %f29, 0f7F800000, %f28, %p4;
	add.rn.f32 %f30, %f16, %f29;
	// k=1, j=0
	or.b32 %r8, %r3, 64;
	ld.global.nc.b32 %hh2, [%rd14+128];
	mov.b32 {%h13, %h14}, %hh2;
	mul.wide.u32 %rd19, %r8, 4;
	add.s64 %rd20, %rd6, %rd19;
	ld.global.nc.u32 %r9, [%rd20];
	cvt.rn.f16.s32 %h15, %r9;
	sub.rn.f16 %h16, %h4, %h15;
	mul.rn.f16 %h17, %h16, %h6;
	sub.rn.f16 %h18, %h13, %h17;
	cvt.f32.f16 %f31, %h18;
	sub.rn.f32 %f32, %f31, %f3;
	mul.rn.f32 %f33, %f32, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f34, %f33;
	add.rn.f32 %f35, %f34, 0f00000000;
	ex2.approx.f32 %f36, %f35;
	fma.rn.f32 %f37, %f34, 0fBF317200, %f32;
	fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37;
	mul.rn.f32 %f39, %f38, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f40, %f39;
	mul.rn.f32 %f41, %f36, %f40;
	setp.lt.f32 %p5, %f32, 0fC2D20000;
	selp.f32 %f42, 0f00000000, %f41, %p5;
	setp.gt.f32 %p6, %f32, 0f42D20000;
	selp.f32 %f43, 0f7F800000, %f42, %p6;
	add.rn.f32 %f44, %f30, %f43;
	// k=1, j=1
	ld.global.nc.u32 %r10, [%rd16+260];
	cvt.rn.f16.s32 %h19, %r10;
	sub.rn.f16 %h20, %h4, %h19;
	mul.rn.f16 %h21, %h20, %h6;
	sub.rn.f16 %h22, %h14, %h21;
	cvt.f32.f16 %f45, %h22;
	sub.rn.f32 %f46, %f45, %f3;
	mul.rn.f32 %f47, %f46, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f48, %f47;
	add.rn.f32 %f49, %f48, 0f00000000;
	ex2.approx.f32 %f50, %f49;
	fma.rn.f32 %f51, %f48, 0fBF317200, %f46;
	fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51;
	mul.rn.f32 %f53, %f52, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f54, %f53;
	mul.rn.f32 %f55, %f50, %f54;
	setp.lt.f32 %p7, %f46, 0fC2D20000;
	selp.f32 %f56, 0f00000000, %f55, %p7;
	setp.gt.f32 %p8, %f46, 0f42D20000;
	selp.f32 %f57, 0f7F800000, %f56, %p8;
	add.rn.f32 %f58, %f44, %f57;
	// k=2, j=0
	or.b32 %r11, %r3, 128;
	ld.global.nc.b32 %hh3, [%rd14+256];
	mov.b32 {%h23, %h24}, %hh3;
	mul.wide.u32 %rd21, %r11, 4;
	add.s64 %rd22, %rd6, %rd21;
	ld.global.nc.u32 %r12, [%rd22];
	cvt.rn.f16.s32 %h25, %r12;
	sub.rn.f16 %h26, %h4, %h25;
	mul.rn.f16 %h27, %h26, %h6;
	sub.rn.f16 %h28, %h23, %h27;
	cvt.f32.f16 %f59, %h28;
	sub.rn.f32 %f60, %f59, %f3;
	mul.rn.f32 %f61, %f60, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f62, %f61;
	add.rn.f32 %f63, %f62, 0f00000000;
	ex2.approx.f32 %f64, %f63;
	fma.rn.f32 %f65, %f62, 0fBF317200, %f60;
	fma.rn.f32 %f66, %f62, 0fB5BFBE8E, %f65;
	mul.rn.f32 %f67, %f66, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f68, %f67;
	mul.rn.f32 %f69, %f64, %f68;
	setp.lt.f32 %p9, %f60, 0fC2D20000;
	selp.f32 %f70, 0f00000000, %f69, %p9;
	setp.gt.f32 %p10, %f60, 0f42D20000;
	selp.f32 %f71, 0f7F800000, %f70, %p10;
	add.rn.f32 %f72, %f58, %f71;
	// k=2, j=1
	ld.global.nc.u32 %r13, [%rd16+516];
	cvt.rn.f16.s32 %h29, %r13;
	sub.rn.f16 %h30, %h4, %h29;
	mul.rn.f16 %h31, %h30, %h6;
	sub.rn.f16 %h32, %h24, %h31;
	cvt.f32.f16 %f73, %h32;
	sub.rn.f32 %f74, %f73, %f3;
	mul.rn.f32 %f75, %f74, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f76, %f75;
	add.rn.f32 %f77, %f76, 0f00000000;
	ex2.approx.f32 %f78, %f77;
	fma.rn.f32 %f79, %f76, 0fBF317200, %f74;
	fma.rn.f32 %f80, %f76, 0fB5BFBE8E, %f79;
	mul.rn.f32 %f81, %f80, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f82, %f81;
	mul.rn.f32 %f83, %f78, %f82;
	setp.lt.f32 %p11, %f74, 0fC2D20000;
	selp.f32 %f84, 0f00000000, %f83, %p11;
	setp.gt.f32 %p12, %f74, 0f42D20000;
	selp.f32 %f85, 0f7F800000, %f84, %p12;
	add.rn.f32 %f86, %f72, %f85;
	// k=3, j=0
	or.b32 %r14, %r3, 192;
	ld.global.nc.b32 %hh4, [%rd14+384];
	mov.b32 {%h33, %h34}, %hh4;
	mul.wide.u32 %rd23, %r14, 4;
	add.s64 %rd24, %rd6, %rd23;
	ld.global.nc.u32 %r15, [%rd24];
	cvt.rn.f16.s32 %h35, %r15;
	sub.rn.f16 %h36, %h4, %h35;
	mul.rn.f16 %h37, %h36, %h6;
	sub.rn.f16 %h38, %h33, %h37;
	cvt.f32.f16 %f87, %h38;
	sub.rn.f32 %f88, %f87, %f3;
	mul.rn.f32 %f89, %f88, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f90, %f89;
	add.rn.f32 %f91, %f90, 0f00000000;
	ex2.approx.f32 %f92, %f91;
	fma.rn.f32 %f93, %f90, 0fBF317200, %f88;
	fma.rn.f32 %f94, %f90, 0fB5BFBE8E, %f93;
	mul.rn.f32 %f95, %f94, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f96, %f95;
	mul.rn.f32 %f97, %f92, %f96;
	setp.lt.f32 %p13, %f88, 0fC2D20000;
	selp.f32 %f98, 0f00000000, %f97, %p13;
	setp.gt.f32 %p14, %f88, 0f42D20000;
	selp.f32 %f99, 0f7F800000, %f98, %p14;
	add.rn.f32 %f100, %f86, %f99;
	// k=3, j=1
	ld.global.nc.u32 %r16, [%rd16+772];
	cvt.rn.f16.s32 %h39, %r16;
	sub.rn.f16 %h40, %h4, %h39;
	mul.rn.f16 %h41, %h40, %h6;
	sub.rn.f16 %h42, %h34, %h41;
	cvt.f32.f16 %f101, %h42;
	sub.rn.f32 %f102, %f101, %f3;
	mul.rn.f32 %f103, %f102, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f104, %f103;
	add.rn.f32 %f105, %f104, 0f00000000;
	ex2.approx.f32 %f106, %f105;
	fma.rn.f32 %f107, %f104, 0fBF317200, %f102;
	fma.rn.f32 %f108, %f104, 0fB5BFBE8E, %f107;
	mul.rn.f32 %f109, %f108, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f110, %f109;
	mul.rn.f32 %f111, %f106, %f110;
	setp.lt.f32 %p15, %f102, 0fC2D20000;
	selp.f32 %f112, 0f00000000, %f111, %p15;
	setp.gt.f32 %p16, %f102, 0f42D20000;
	selp.f32 %f113, 0f7F800000, %f112, %p16;
	add.rn.f32 %f114, %f100, %f113;
	// k=4, j=0
	or.b32 %r17, %r3, 256;
	ld.global.nc.b32 %hh5, [%rd14+512];
	mov.b32 {%h43, %h44}, %hh5;
	mul.wide.u32 %rd25, %r17, 4;
	add.s64 %rd26, %rd6, %rd25;
	ld.global.nc.u32 %r18, [%rd26];
	cvt.rn.f16.s32 %h45, %r18;
	sub.rn.f16 %h46, %h4, %h45;
	mul.rn.f16 %h47, %h46, %h6;
	sub.rn.f16 %h48, %h43, %h47;
	cvt.f32.f16 %f115, %h48;
	sub.rn.f32 %f116, %f115, %f3;
	mul.rn.f32 %f117, %f116, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f118, %f117;
	add.rn.f32 %f119, %f118, 0f00000000;
	ex2.approx.f32 %f120, %f119;
	fma.rn.f32 %f121, %f118, 0fBF317200, %f116;
	fma.rn.f32 %f122, %f118, 0fB5BFBE8E, %f121;
	mul.rn.f32 %f123, %f122, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f124, %f123;
	mul.rn.f32 %f125, %f120, %f124;
	setp.lt.f32 %p17, %f116, 0fC2D20000;
	selp.f32 %f126, 0f00000000, %f125, %p17;
	setp.gt.f32 %p18, %f116, 0f42D20000;
	selp.f32 %f127, 0f7F800000, %f126, %p18;
	add.rn.f32 %f128, %f114, %f127;
	// k=4, j=1
	ld.global.nc.u32 %r19, [%rd16+1028];
	cvt.rn.f16.s32 %h49, %r19;
	sub.rn.f16 %h50, %h4, %h49;
	mul.rn.f16 %h51, %h50, %h6;
	sub.rn.f16 %h52, %h44, %h51;
	cvt.f32.f16 %f129, %h52;
	sub.rn.f32 %f130, %f129, %f3;
	mul.rn.f32 %f131, %f130, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f132, %f131;
	add.rn.f32 %f133, %f132, 0f00000000;
	ex2.approx.f32 %f134, %f133;
	fma.rn.f32 %f135, %f132, 0fBF317200, %f130;
	fma.rn.f32 %f136, %f132, 0fB5BFBE8E, %f135;
	mul.rn.f32 %f137, %f136, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f138, %f137;
	mul.rn.f32 %f139, %f134, %f138;
	setp.lt.f32 %p19, %f130, 0fC2D20000;
	selp.f32 %f140, 0f00000000, %f139, %p19;
	setp.gt.f32 %p20, %f130, 0f42D20000;
	selp.f32 %f141, 0f7F800000, %f140, %p20;
	add.rn.f32 %f142, %f128, %f141;
	// k=5, j=0
	or.b32 %r20, %r3, 320;
	ld.global.nc.b32 %hh6, [%rd14+640];
	mov.b32 {%h53, %h54}, %hh6;
	mul.wide.u32 %rd27, %r20, 4;
	add.s64 %rd28, %rd6, %rd27;
	ld.global.nc.u32 %r21, [%rd28];
	cvt.rn.f16.s32 %h55, %r21;
	sub.rn.f16 %h56, %h4, %h55;
	mul.rn.f16 %h57, %h56, %h6;
	sub.rn.f16 %h58, %h53, %h57;
	cvt.f32.f16 %f143, %h58;
	sub.rn.f32 %f144, %f143, %f3;
	mul.rn.f32 %f145, %f144, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f146, %f145;
	add.rn.f32 %f147, %f146, 0f00000000;
	ex2.approx.f32 %f148, %f147;
	fma.rn.f32 %f149, %f146, 0fBF317200, %f144;
	fma.rn.f32 %f150, %f146, 0fB5BFBE8E, %f149;
	mul.rn.f32 %f151, %f150, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f152, %f151;
	mul.rn.f32 %f153, %f148, %f152;
	setp.lt.f32 %p21, %f144, 0fC2D20000;
	selp.f32 %f154, 0f00000000, %f153, %p21;
	setp.gt.f32 %p22, %f144, 0f42D20000;
	selp.f32 %f155, 0f7F800000, %f154, %p22;
	add.rn.f32 %f156, %f142, %f155;
	// k=5, j=1
	ld.global.nc.u32 %r22, [%rd16+1284];
	cvt.rn.f16.s32 %h59, %r22;
	sub.rn.f16 %h60, %h4, %h59;
	mul.rn.f16 %h61, %h60, %h6;
	sub.rn.f16 %h62, %h54, %h61;
	cvt.f32.f16 %f157, %h62;
	sub.rn.f32 %f158, %f157, %f3;
	mul.rn.f32 %f159, %f158, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f160, %f159;
	add.rn.f32 %f161, %f160, 0f00000000;
	ex2.approx.f32 %f162, %f161;
	fma.rn.f32 %f163, %f160, 0fBF317200, %f158;
	fma.rn.f32 %f164, %f160, 0fB5BFBE8E, %f163;
	mul.rn.f32 %f165, %f164, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f166, %f165;
	mul.rn.f32 %f167, %f162, %f166;
	setp.lt.f32 %p23, %f158, 0fC2D20000;
	selp.f32 %f168, 0f00000000, %f167, %p23;
	setp.gt.f32 %p24, %f158, 0f42D20000;
	selp.f32 %f169, 0f7F800000, %f168, %p24;
	add.rn.f32 %f170, %f156, %f169;
	// k=6, j=0
	or.b32 %r23, %r3, 384;
	ld.global.nc.b32 %hh7, [%rd14+768];
	mov.b32 {%h63, %h64}, %hh7;
	mul.wide.u32 %rd29, %r23, 4;
	add.s64 %rd30, %rd6, %rd29;
	ld.global.nc.u32 %r24, [%rd30];
	cvt.rn.f16.s32 %h65, %r24;
	sub.rn.f16 %h66, %h4, %h65;
	mul.rn.f16 %h67, %h66, %h6;
	sub.rn.f16 %h68, %h63, %h67;
	cvt.f32.f16 %f171, %h68;
	sub.rn.f32 %f172, %f171, %f3;
	mul.rn.f32 %f173, %f172, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f174, %f173;
	add.rn.f32 %f175, %f174, 0f00000000;
	ex2.approx.f32 %f176, %f175;
	fma.rn.f32 %f177, %f174, 0fBF317200, %f172;
	fma.rn.f32 %f178, %f174, 0fB5BFBE8E, %f177;
	mul.rn.f32 %f179, %f178, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f180, %f179;
	mul.rn.f32 %f181, %f176, %f180;
	setp.lt.f32 %p25, %f172, 0fC2D20000;
	selp.f32 %f182, 0f00000000, %f181, %p25;
	setp.gt.f32 %p26, %f172, 0f42D20000;
	selp.f32 %f183, 0f7F800000, %f182, %p26;
	add.rn.f32 %f184, %f170, %f183;
	// k=6, j=1
	ld.global.nc.u32 %r25, [%rd16+1540];
	cvt.rn.f16.s32 %h69, %r25;
	sub.rn.f16 %h70, %h4, %h69;
	mul.rn.f16 %h71, %h70, %h6;
	sub.rn.f16 %h72, %h64, %h71;
	cvt.f32.f16 %f185, %h72;
	sub.rn.f32 %f186, %f185, %f3;
	mul.rn.f32 %f187, %f186, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f188, %f187;
	add.rn.f32 %f189, %f188, 0f00000000;
	ex2.approx.f32 %f190, %f189;
	fma.rn.f32 %f191, %f188, 0fBF317200, %f186;
	fma.rn.f32 %f192, %f188, 0fB5BFBE8E, %f191;
	mul.rn.f32 %f193, %f192, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f194, %f193;
	mul.rn.f32 %f195, %f190, %f194;
	setp.lt.f32 %p27, %f186, 0fC2D20000;
	selp.f32 %f196, 0f00000000, %f195, %p27;
	setp.gt.f32 %p28, %f186, 0f42D20000;
	selp.f32 %f197, 0f7F800000, %f196, %p28;
	add.rn.f32 %f198, %f184, %f197;
	// k=7, j=0
	or.b32 %r26, %r3, 448;
	ld.global.nc.b32 %hh8, [%rd14+896];
	mov.b32 {%h73, %h74}, %hh8;
	mul.wide.u32 %rd31, %r26, 4;
	add.s64 %rd32, %rd6, %rd31;
	ld.global.nc.u32 %r27, [%rd32];
	cvt.rn.f16.s32 %h75, %r27;
	sub.rn.f16 %h76, %h4, %h75;
	mul.rn.f16 %h77, %h76, %h6;
	sub.rn.f16 %h78, %h73, %h77;
	cvt.f32.f16 %f199, %h78;
	sub.rn.f32 %f200, %f199, %f3;
	mul.rn.f32 %f201, %f200, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f202, %f201;
	add.rn.f32 %f203, %f202, 0f00000000;
	ex2.approx.f32 %f204, %f203;
	fma.rn.f32 %f205, %f202, 0fBF317200, %f200;
	fma.rn.f32 %f206, %f202, 0fB5BFBE8E, %f205;
	mul.rn.f32 %f207, %f206, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f208, %f207;
	mul.rn.f32 %f209, %f204, %f208;
	setp.lt.f32 %p29, %f200, 0fC2D20000;
	selp.f32 %f210, 0f00000000, %f209, %p29;
	setp.gt.f32 %p30, %f200, 0f42D20000;
	selp.f32 %f211, 0f7F800000, %f210, %p30;
	add.rn.f32 %f212, %f198, %f211;
	// k=7, j=1
	ld.global.nc.u32 %r28, [%rd16+1796];
	cvt.rn.f16.s32 %h79, %r28;
	sub.rn.f16 %h80, %h4, %h79;
	mul.rn.f16 %h81, %h80, %h6;
	sub.rn.f16 %h82, %h74, %h81;
	cvt.f32.f16 %f213, %h82;
	sub.rn.f32 %f214, %f213, %f3;
	mul.rn.f32 %f215, %f214, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f216, %f215;
	add.rn.f32 %f217, %f216, 0f00000000;
	ex2.approx.f32 %f218, %f217;
	fma.rn.f32 %f219, %f216, 0fBF317200, %f214;
	fma.rn.f32 %f220, %f216, 0fB5BFBE8E, %f219;
	mul.rn.f32 %f221, %f220, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f222, %f221;
	mul.rn.f32 %f223, %f218, %f222;
	setp.lt.f32 %p31, %f214, 0fC2D20000;
	selp.f32 %f224, 0f00000000, %f223, %p31;
	setp.gt.f32 %p32, %f214, 0f42D20000;
	selp.f32 %f225, 0f7F800000, %f224, %p32;
	add.rn.f32 %f226, %f212, %f225;
	// First pass: warp reduction of the per-thread sum, offsets 16, 8, 4, 2, 1.
	// The final add for offset 1 is done only by thread 0 (LBB37_3).
	shfl.sync.down.b32 %f227, %f226, 16, 31, -1;
	add.rn.f32 %f228, %f227, %f226;
	shfl.sync.down.b32 %f229, %f228, 8, 31, -1;
	add.rn.f32 %f230, %f229, %f228;
	shfl.sync.down.b32 %f231, %f230, 4, 31, -1;
	add.rn.f32 %f232, %f231, %f230;
	shfl.sync.down.b32 %f233, %f232, 2, 31, -1;
	add.rn.f32 %f234, %f233, %f232;
	shfl.sync.down.b32 %f235, %f234, 1, 31, -1;
	setp.eq.s32 %p33, %r1, 0;
	@%p33 bra LBB37_3;
	bra.uni LBB37_1;
LBB37_3:
	// Thread 0: publish the warp total in shared_cache_09[0].
	add.rn.f32 %f1, %f235, %f234;
	st.shared.f32 [shared_cache_09], %f1;
LBB37_1:
	bar.sync 0;
	// Second pass: thread 0 operates on shared_cache_09[0] through a generic
	// pointer; every other thread operates on its own local slot, which is
	// set to 0.0 first, so it contributes zero to the sum.
	mul.wide.u32 %rd36, %r1, 4;
	mov.u64 %rd37, shared_cache_09;
	add.s64 %rd3, %rd37, %rd36;
	cvta.shared.u64 %rd38, %rd3;
	mov.u32 %r31, 0;
	st.local.u32 [%rd1], %r31;
	selp.b64 %rd40, %rd38, %rd12, %p33;
	ld.f32 %f236, [%rd40];
	shfl.sync.down.b32 %f237, %f236, 16, 31, -1;
	add.rn.f32 %f238, %f236, %f237;
	shfl.sync.down.b32 %f239, %f238, 8, 31, -1;
	add.rn.f32 %f240, %f238, %f239;
	shfl.sync.down.b32 %f241, %f240, 4, 31, -1;
	add.rn.f32 %f242, %f240, %f241;
	shfl.sync.down.b32 %f243, %f242, 2, 31, -1;
	add.rn.f32 %f244, %f242, %f243;
	shfl.sync.down.b32 %f245, %f244, 1, 31, -1;
	add.rn.f32 %f246, %f244, %f245;
	st.f32 [%rd40], %f246;
	@%p33 bra LBB37_4;
	bra.uni LBB37_2;
LBB37_4:
	// Thread 0: accumulate the CTA total into
	// param_1 + (ctaid.x >> 9)*2048 + (ctaid.x & 511)*4.
	ld.param.u64 %rd7, [fusion_2235_param_1];
	shr.u32 %r30, %r2, 9;
	cvta.to.global.u64 %rd10, %rd7;
	and.b32 %r29, %r2, 511;
	mul.wide.u32 %rd33, %r30, 2048;
	add.s64 %rd34, %rd10, %rd33;
	mul.wide.u32 %rd35, %r29, 4;
	add.s64 %rd2, %rd34, %rd35;
	ld.shared.f32 %f247, [%rd3];
	atom.global.add.f32 %f248, [%rd2], %f247;
LBB37_2:
	ret;
}
// .globl fusion_2234
//
// fusion_2234: element-wise exp(...) / d, four consecutive outputs per thread.
//
// Launch shape: 256 threads per CTA (.reqntid 256, 1, 1).
// With base = ctaid.x*1024 + 4*tid.x and i = base + j, j = 0..3:
//     x   = f16 param_0[i]                      (one v4.b16 load)
//     m   = s32 param_4[(4*tid.x + j) & 511]
//     c   = f32 param_3[base >> 9]
//     d   = f32 param_2[base >> 9]
//     out = f32 param_1[i]                      (one v4.f32 store)
//     out = exp( f32( x - (1.0 - f16(m)) * 10000.0 ) - c ) / d
// The part inside f32(...) is computed in f16 (0x3C00 = 1.0, 0x70E2 = 10000.0);
// the division is div.full.f32. param_5 is declared but never read here.
//
// exp(t) is expanded inline: n = trunc(t * 0f3FB8AA3B), r = t + n*0fBF317200
// + n*0fB5BFBE8E, result = ex2(n) * ex2(r * 0f3FB8AA3B). The result is forced
// to 0 when t < -105.0 (0fC2D20000) and to +inf when t > 105.0 (0f42D20000).
.visible .entry fusion_2234(
	.param .u64 fusion_2234_param_0,
	.param .u64 fusion_2234_param_1,
	.param .u64 fusion_2234_param_2,
	.param .u64 fusion_2234_param_3,
	.param .u64 fusion_2234_param_4,
	.param .u64 fusion_2234_param_5
)
.reqntid 256, 1, 1
{
	.reg .pred %p<9>;
	.reg .b16 %h<27>;
	.reg .b32 %hh<3>;
	.reg .f32 %f<59>;
	.reg .b32 %r<18>;
	.reg .b64 %rd<26>;
	ld.param.u64 %rd1, [fusion_2234_param_0];
	ld.param.u64 %rd2, [fusion_2234_param_4];
	cvta.to.global.u64 %rd3, %rd2;
	ld.param.u64 %rd4, [fusion_2234_param_1];
	ld.param.u64 %rd5, [fusion_2234_param_3];
	cvta.to.global.u64 %rd6, %rd5;
	ld.param.u64 %rd7, [fusion_2234_param_2];
	cvta.to.global.u64 %rd8, %rd7;
	cvta.to.global.u64 %rd9, %rd4;
	cvta.to.global.u64 %rd10, %rd1;
	// %r5 = base element index; %r13/%r12/%r11/%r10 = (4*tid.x + j) & 511 for
	// j = 0..3; %r9 = base >> 9.
	mov.u32 %r1, %ctaid.x;
	mov.u32 %r2, %tid.x;
	shl.b32 %r3, %r1, 10;
	shl.b32 %r4, %r2, 2;
	or.b32 %r5, %r4, %r3;
	or.b32 %r6, %r4, 1;
	or.b32 %r7, %r4, 2;
	or.b32 %r8, %r4, 3;
	shr.u32 %r9, %r5, 9;
	and.b32 %r10, %r8, 511;
	and.b32 %r11, %r7, 510;
	and.b32 %r12, %r6, 509;
	and.b32 %r13, %r4, 508;
	mul.wide.u32 %rd11, %r5, 2;
	add.s64 %rd12, %rd10, %rd11;
	// Load four f16 inputs; the pack/unpack pairs leave %h5..%h8 = %h1..%h4.
	ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12];
	mov.b32 %hh1, {%h3, %h4};
	mov.b32 %hh2, {%h1, %h2};
	mov.b32 {%h5, %h6}, %hh2;
	mov.b32 {%h7, %h8}, %hh1;
	// j=0. %h10 = 1.0, %h12 = 10000.0, %f2 = c and %f15 = d are reused below.
	mul.wide.u32 %rd13, %r13, 4;
	add.s64 %rd14, %rd3, %rd13;
	ld.global.nc.u32 %r14, [%rd14];
	cvt.rn.f16.s32 %h9, %r14;
	mov.b16 %h10, 0x3C00;
	sub.rn.f16 %h11, %h10, %h9;
	mov.b16 %h12, 0x70E2;
	mul.rn.f16 %h13, %h11, %h12;
	sub.rn.f16 %h14, %h5, %h13;
	cvt.f32.f16 %f1, %h14;
	mul.wide.u32 %rd15, %r9, 4;
	add.s64 %rd16, %rd6, %rd15;
	ld.global.nc.f32 %f2, [%rd16];
	sub.rn.f32 %f3, %f1, %f2;
	mul.rn.f32 %f4, %f3, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f5, %f4;
	add.rn.f32 %f6, %f5, 0f00000000;
	ex2.approx.f32 %f7, %f6;
	fma.rn.f32 %f8, %f5, 0fBF317200, %f3;
	fma.rn.f32 %f9, %f5, 0fB5BFBE8E, %f8;
	mul.rn.f32 %f10, %f9, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f11, %f10;
	mul.rn.f32 %f12, %f7, %f11;
	setp.lt.f32 %p1, %f3, 0fC2D20000;
	selp.f32 %f13, 0f00000000, %f12, %p1;
	setp.gt.f32 %p2, %f3, 0f42D20000;
	selp.f32 %f14, 0f7F800000, %f13, %p2;
	add.s64 %rd17, %rd8, %rd15;
	ld.global.nc.f32 %f15, [%rd17];
	div.full.f32 %f16, %f14, %f15;
	// %rd19 = address of the four f32 outputs.
	mul.wide.u32 %rd18, %r5, 4;
	add.s64 %rd19, %rd9, %rd18;
	// j=1
	mul.wide.u32 %rd20, %r12, 4;
	add.s64 %rd21, %rd3, %rd20;
	ld.global.nc.u32 %r15, [%rd21];
	cvt.rn.f16.s32 %h15, %r15;
	sub.rn.f16 %h16, %h10, %h15;
	mul.rn.f16 %h17, %h16, %h12;
	sub.rn.f16 %h18, %h6, %h17;
	cvt.f32.f16 %f17, %h18;
	sub.rn.f32 %f18, %f17, %f2;
	mul.rn.f32 %f19, %f18, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f20, %f19;
	add.rn.f32 %f21, %f20, 0f00000000;
	ex2.approx.f32 %f22, %f21;
	fma.rn.f32 %f23, %f20, 0fBF317200, %f18;
	fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23;
	mul.rn.f32 %f25, %f24, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f26, %f25;
	mul.rn.f32 %f27, %f22, %f26;
	setp.lt.f32 %p3, %f18, 0fC2D20000;
	selp.f32 %f28, 0f00000000, %f27, %p3;
	setp.gt.f32 %p4, %f18, 0f42D20000;
	selp.f32 %f29, 0f7F800000, %f28, %p4;
	div.full.f32 %f30, %f29, %f15;
	// j=2
	mul.wide.u32 %rd22, %r11, 4;
	add.s64 %rd23, %rd3, %rd22;
	ld.global.nc.u32 %r16, [%rd23];
	cvt.rn.f16.s32 %h19, %r16;
	sub.rn.f16 %h20, %h10, %h19;
	mul.rn.f16 %h21, %h20, %h12;
	sub.rn.f16 %h22, %h7, %h21;
	cvt.f32.f16 %f31, %h22;
	sub.rn.f32 %f32, %f31, %f2;
	mul.rn.f32 %f33, %f32, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f34, %f33;
	add.rn.f32 %f35, %f34, 0f00000000;
	ex2.approx.f32 %f36, %f35;
	fma.rn.f32 %f37, %f34, 0fBF317200, %f32;
	fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37;
	mul.rn.f32 %f39, %f38, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f40, %f39;
	mul.rn.f32 %f41, %f36, %f40;
	setp.lt.f32 %p5, %f32, 0fC2D20000;
	selp.f32 %f42, 0f00000000, %f41, %p5;
	setp.gt.f32 %p6, %f32, 0f42D20000;
	selp.f32 %f43, 0f7F800000, %f42, %p6;
	div.full.f32 %f44, %f43, %f15;
	// j=3
	mul.wide.u32 %rd24, %r10, 4;
	add.s64 %rd25, %rd3, %rd24;
	ld.global.nc.u32 %r17, [%rd25];
	cvt.rn.f16.s32 %h23, %r17;
	sub.rn.f16 %h24, %h10, %h23;
	mul.rn.f16 %h25, %h24, %h12;
	sub.rn.f16 %h26, %h8, %h25;
	cvt.f32.f16 %f45, %h26;
	sub.rn.f32 %f46, %f45, %f2;
	mul.rn.f32 %f47, %f46, 0f3FB8AA3B;
	cvt.rzi.f32.f32 %f48, %f47;
	add.rn.f32 %f49, %f48, 0f00000000;
	ex2.approx.f32 %f50, %f49;
	fma.rn.f32 %f51, %f48, 0fBF317200, %f46;
	fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51;
	mul.rn.f32 %f53, %f52, 0f3FB8AA3B;
	ex2.approx.ftz.f32 %f54, %f53;
	mul.rn.f32 %f55, %f50, %f54;
	setp.lt.f32 %p7, %f46, 0fC2D20000;
	selp.f32 %f56, 0f00000000, %f55, %p7;
	setp.gt.f32 %p8, %f46, 0f42D20000;
	selp.f32 %f57, 0f7F800000, %f56, %p8;
	div.full.f32 %f58, %f57, %f15;
	st.global.v4.f32 [%rd19], {%f16, %f30, %f44, %f58};
	ret;
}
// .globl rng_get_and_update_state_6
//
// rng_get_and_update_state_6: copy the 128-bit value in the global rng_state
// to param_0, then advance rng_state by 4194304.
//
//   rng_state      low u64 at +0, high u64 at +8
//   param_0        receives the value read BEFORE the update (low at +0, high at +8)
//   param_1        declared but never read here
//
// The low word is incremented by 4194304 and a carry of 1 is added to the
// high word when the low word wrapped. All accesses are plain ld/st.global
// (no atomics), and no thread index is consulted, so every launched thread
// performs the same read-modify-write.
.visible .entry rng_get_and_update_state_6(
	.param .u64 rng_get_and_update_state_6_param_0,
	.param .u64 rng_get_and_update_state_6_param_1
)
{
	.reg .pred %p<3>;
	.reg .b64 %rd<9>;
	ld.param.u64 %rd1, [rng_get_and_update_state_6_param_0];
	cvta.to.global.u64 %rd2, %rd1;
	// %rd3 = old high word, %rd4 = old low word.
	ld.global.u64 %rd3, [rng_state+8];
	ld.global.u64 %rd4, [rng_state];
	add.s64 %rd5, %rd4, 4194304;
	// Carry detection: the sum wrapped if it is smaller than either addend.
	setp.lt.u64 %p1, %rd5, %rd4;
	selp.u64 %rd6, 1, 0, %p1;
	setp.lt.u64 %p2, %rd5, 4194304;
	selp.b64 %rd7, 1, %rd6, %p2;
	add.s64 %rd8, %rd3, %rd7;
	st.global.u64 [rng_state], %rd5;
	st.global.u64 [rng_state+8], %rd8;
	// Hand the pre-update state to the caller's buffer.
	st.global.u64 [%rd2+8], %rd3;
	st.global.u64 [%rd2], %rd4;
	ret;
}
// .globl fusion_2233
//
// fusion_2233: random keep-or-zero selection applied to four consecutive
// values per thread, with f32 -> f16 conversion and scaling.
//
// Launch shape: 256 threads per CTA (.reqntid 256, 1, 1).
// With i = ctaid.x*1024 + 4*tid.x:
//   param_2  two u64 words {lo, hi}, read with one v2.u64 load; the per-thread
//            counter is lo + (i >> 2), with the carry added to hi
//   param_1  f32 input, elements i..i+3 (one v4.f32 load)
//   param_0  f16 output, elements i..i+3 (one v4.b16 store)
//   param_3  declared but never read here
//
// The counter is scrambled by a chain of mul.lo.s64 by 3528531795 (0xD2511F53)
// and 3449720151 (0xCD9E8D57), interleaved with shift/mask/xor steps against
// constants, producing four 32-bit words.
// NOTE(review): those multipliers, and 0x9E3779B9 / 0xBB67AE85 among the xor
// constants, match the Philox4x32 constants -- presumably an inlined Philox
// generator; confirm against the emitter.
//
// Each word is reduced to 23 bits (shr 9; lanes 0 and 2 then xor a constant),
// scaled by 2^-23 (0f34000000) and rounded to f16, giving u. The output is
//     u >= 0x2E66 (~0.09998)  ?  f16(in) * 0x3C72 (~1.1113)  :  0
.visible .entry fusion_2233(
	.param .u64 fusion_2233_param_0,
	.param .u64 fusion_2233_param_1,
	.param .u64 fusion_2233_param_2,
	.param .u64 fusion_2233_param_3
)
.reqntid 256, 1, 1
{
	.reg .pred %p<6>;
	.reg .b16 %h<19>;
	.reg .f32 %f<13>;
	.reg .b32 %r<29>;
	.reg .b64 %rd<119>;
	ld.param.u64 %rd1, [fusion_2233_param_0];
	ld.param.u64 %rd2, [fusion_2233_param_2];
	cvta.to.global.u64 %rd3, %rd2;
	ld.param.u64 %rd4, [fusion_2233_param_1];
	cvta.to.global.u64 %rd5, %rd4;
	cvta.to.global.u64 %rd6, %rd1;
	mov.u32 %r1, %ctaid.x;
	mov.u32 %r2, %tid.x;
	shl.b32 %r3, %r1, 10;
	shl.b32 %r4, %r2, 2;
	or.b32 %r5, %r4, %r3;
	// Per-thread counter: %rd10 = lo + (i >> 2), %rd14 = hi + carry.
	ld.global.nc.v2.u64 {%rd7, %rd8}, [%rd3];
	shr.u32 %r6, %r5, 2;
	cvt.u64.u32 %rd9, %r6;
	add.s64 %rd10, %rd7, %rd9;
	setp.lt.u64 %p1, %rd10, %rd7;
	// Mixing chain (multiply / split into high and low 32 bits / xor constant).
	and.b64 %rd11, %rd10, 4294967295;
	mul.lo.s64 %rd12, %rd11, 3528531795;
	selp.u64 %rd13, 1, 0, %p1;
	add.s64 %rd14, %rd8, %rd13;
	xor.b64 %rd15, %rd14, %rd12;
	shr.u64 %rd16, %rd15, 32;
	mul.lo.s64 %rd17, %rd16, 3449720151;
	shr.u64 %rd18, %rd17, 32;
	and.b64 %rd19, %rd14, 4294967295;
	mul.lo.s64 %rd20, %rd19, 3449720151;
	and.b64 %rd21, %rd20, 4294967295;
	xor.b64 %rd22, %rd21, %rd18;
	xor.b64 %rd23, %rd22, 2654435769;
	mul.lo.s64 %rd24, %rd23, 3528531795;
	shr.u64 %rd25, %rd24, 32;
	xor.b64 %rd26, %rd20, %rd10;
	shr.u64 %rd27, %rd26, 32;
	mul.lo.s64 %rd28, %rd27, 3528531795;
	and.b64 %rd29, %rd28, 4294967295;
	xor.b64 %rd30, %rd29, %rd25;
	xor.b64 %rd31, %rd30, 1993301258;
	mul.lo.s64 %rd32, %rd31, 3449720151;
	shr.u64 %rd33, %rd32, 32;
	shr.u64 %rd34, %rd28, 32;
	and.b64 %rd35, %rd12, 4294967295;
	xor.b64 %rd36, %rd35, %rd34;
	xor.b64 %rd37, %rd36, 3144134277;
	mul.lo.s64 %rd38, %rd37, 3449720151;
	and.b64 %rd39, %rd38, 4294967295;
	xor.b64 %rd40, %rd39, %rd33;
	xor.b64 %rd41, %rd40, 3668340011;
	mul.lo.s64 %rd42, %rd41, 3528531795;
	shr.u64 %rd43, %rd42, 32;
	shr.u64 %rd44, %rd38, 32;
	and.b64 %rd45, %rd17, 4294967295;
	xor.b64 %rd46, %rd45, %rd44;
	xor.b64 %rd47, %rd46, 1013904242;
	mul.lo.s64 %rd48, %rd47, 3528531795;
	and.b64 %rd49, %rd48, 4294967295;
	xor.b64 %rd50, %rd49, %rd43;
	xor.b64 %rd51, %rd50, 3986602516;
	mul.lo.s64 %rd52, %rd51, 3449720151;
	shr.u64 %rd53, %rd52, 32;
	shr.u64 %rd54, %rd48, 32;
	and.b64 %rd55, %rd24, 4294967295;
	xor.b64 %rd56, %rd55, %rd54;
	xor.b64 %rd57, %rd56, 842468239;
	mul.lo.s64 %rd58, %rd57, 3449720151;
	and.b64 %rd59, %rd58, 4294967295;
	xor.b64 %rd60, %rd59, %rd53;
	xor.b64 %rd61, %rd60, 387276957;
	mul.lo.s64 %rd62, %rd61, 3528531795;
	shr.u64 %rd63, %rd62, 32;
	shr.u64 %rd64, %rd58, 32;
	and.b64 %rd65, %rd32, 4294967295;
	xor.b64 %rd66, %rd65, %rd64;
	xor.b64 %rd67, %rd66, 2027808484;
	mul.lo.s64 %rd68, %rd67, 3528531795;
	and.b64 %rd69, %rd68, 4294967295;
	xor.b64 %rd70, %rd69, %rd63;
	xor.b64 %rd71, %rd70, 1684936478;
	mul.lo.s64 %rd72, %rd71, 3449720151;
	shr.u64 %rd73, %rd72, 32;
	shr.u64 %rd74, %rd68, 32;
	and.b64 %rd75, %rd42, 4294967295;
	xor.b64 %rd76, %rd75, %rd74;
	xor.b64 %rd77, %rd76, 2835769497;
	mul.lo.s64 %rd78, %rd77, 3449720151;
	and.b64 %rd79, %rd78, 4294967295;
	xor.b64 %rd80, %rd79, %rd73;
	xor.b64 %rd81, %rd80, 1401181199;
	mul.lo.s64 %rd82, %rd81, 3528531795;
	shr.u64 %rd83, %rd82, 32;
	shr.u64 %rd84, %rd78, 32;
	and.b64 %rd85, %rd52, 4294967295;
	xor.b64 %rd86, %rd85, %rd84;
	xor.b64 %rd87, %rd86, 3041712726;
	mul.lo.s64 %rd88, %rd87, 3528531795;
	and.b64 %rd89, %rd88, 4294967295;
	xor.b64 %rd90, %rd89, %rd83;
	xor.b64 %rd91, %rd90, 3678237736;
	mul.lo.s64 %rd92, %rd91, 3449720151;
	shr.u64 %rd93, %rd92, 32;
	// Lane 0: random word -> u in f16 -> keep predicate %p2.
	cvt.u32.u64 %r7, %rd93;
	shr.u64 %rd94, %rd88, 32;
	xor.b64 %rd95, %rd94, %rd62;
	cvt.u32.u64 %r8, %rd95;
	xor.b32 %r9, %r8, 534103459;
	mul.lo.s32 %r10, %r9, -845247145;
	xor.b32 %r11, %r10, %r7;
	shr.u32 %r12, %r11, 9;
	xor.b32 %r13, %r12, 4716963;
	cvt.rn.f32.u32 %f1, %r13;
	mul.rn.f32 %f2, %f1, 0f34000000;
	cvt.rn.f16.f32 %h1, %f2;
	mov.b16 %h2, 0x2E66;
	setp.ge.f16 %p2, %h1, %h2;
	// Load the four f32 inputs; %h4 = 0x3C72 is the scale used by all lanes.
	mul.wide.u32 %rd96, %r5, 4;
	add.s64 %rd97, %rd5, %rd96;
	ld.global.nc.v4.f32 {%f3, %f4, %f5, %f6}, [%rd97];
	cvt.rn.f16.f32 %h3, %f3;
	mov.b16 %h4, 0x3C72;
	mul.rn.f16 %h5, %h3, %h4;
	selp.b16 %h6, %h5, 0x0000, %p2;
	// %rd99 = address of the four f16 outputs.
	mul.wide.u32 %rd98, %r5, 2;
	add.s64 %rd99, %rd6, %rd98;
	// Lane 1
	xor.b64 %rd100, %rd84, %rd52;
	xor.b64 %rd101, %rd100, 3041712726;
	mul.lo.s64 %rd102, %rd101, 3528531795;
	xor.b64 %rd103, %rd83, %rd102;
	cvt.u32.u64 %r14, %rd103;
	xor.b32 %r15, %r14, -616729560;
	mul.lo.s32 %r16, %r15, -845247145;
	shr.u32 %r17, %r16, 9;
	cvt.rn.f32.u32 %f7, %r17;
	mul.rn.f32 %f8, %f7, 0f34000000;
	cvt.rn.f16.f32 %h7, %f8;
	setp.ge.f16 %p3, %h7, %h2;
	cvt.rn.f16.f32 %h8, %f4;
	mul.rn.f16 %h9, %h8, %h4;
	selp.b16 %h10, %h9, 0x0000, %p3;
	// Lane 2
	and.b64 %rd104, %rd62, 4294967295;
	xor.b64 %rd105, %rd104, %rd94;
	xor.b64 %rd106, %rd105, 534103459;
	mul.lo.s64 %rd107, %rd106, 3449720151;
	shr.u64 %rd108, %rd107, 32;
	and.b64 %rd109, %rd72, 4294967295;
	xor.b64 %rd110, %rd109, %rd108;
	xor.b64 %rd111, %rd110, 4055616968;
	mul.lo.s64 %rd112, %rd111, 3528531795;
	shr.u64 %rd113, %rd112, 32;
	cvt.u32.u64 %r18, %rd113;
	xor.b64 %rd114, %rd73, %rd78;
	cvt.u32.u64 %r19, %rd114;
	xor.b32 %r20, %r19, 1401181199;
	mul.lo.s32 %r21, %r20, -766435501;
	xor.b32 %r22, %r21, %r18;
	shr.u32 %r23, %r22, 9;
	xor.b32 %r24, %r23, 4936337;
	cvt.rn.f32.u32 %f9, %r24;
	mul.rn.f32 %f10, %f9, 0f34000000;
	cvt.rn.f16.f32 %h11, %f10;
	setp.ge.f16 %p4, %h11, %h2;
	cvt.rn.f16.f32 %h12, %f5;
	mul.rn.f16 %h13, %h12, %h4;
	selp.b16 %h14, %h13, 0x0000, %p4;
	// Lane 3
	xor.b64 %rd115, %rd63, %rd68;
	xor.b64 %rd116, %rd115, 1684936478;
	mul.lo.s64 %rd117, %rd116, 3449720151;
	xor.b64 %rd118, %rd108, %rd117;
	cvt.u32.u64 %r25, %rd118;
	xor.b32 %r26, %r25, -239350328;
	mul.lo.s32 %r27, %r26, -766435501;
	shr.u32 %r28, %r27, 9;
	cvt.rn.f32.u32 %f11, %r28;
	mul.rn.f32 %f12, %f11, 0f34000000;
	cvt.rn.f16.f32 %h15, %f12;
	setp.ge.f16 %p5, %h15, %h2;
	cvt.rn.f16.f32 %h16, %f6;
	mul.rn.f16 %h17, %h16, %h4;
	selp.b16 %h18, %h17, 0x0000, %p5;
	st.global.v4.b16 [%rd99], {%h6, %h10, %h14, %h18};
	ret;
}
// .globl fusion_2705
//
// fusion_2705: f32 -> f16 conversion, four consecutive elements per thread.
//
// Launch shape: 256 threads per CTA (.reqntid 256, 1, 1).
// With i = ctaid.x*1024 + 4*tid.x:
//   param_0  f32 input,  elements i..i+3 (one v4.f32 ld.global.nc)
//   param_1  f16 output, elements i..i+3 (one v4.b16 store), each cvt.rn
//   param_2  declared but never read here
// There is no bounds check; the launch grid must cover the buffers exactly.
.visible .entry fusion_2705(
	.param .u64 fusion_2705_param_0,
	.param .u64 fusion_2705_param_1,
	.param .u64 fusion_2705_param_2
)
.reqntid 256, 1, 1
{
	.reg .b16 %h<5>;
	.reg .f32 %f<5>;
	.reg .b32 %r<6>;
	.reg .b64 %rd<9>;
	ld.param.u64 %rd1, [fusion_2705_param_0];
	ld.param.u64 %rd2, [fusion_2705_param_1];
	cvta.to.global.u64 %rd3, %rd2;
	cvta.to.global.u64 %rd4, %rd1;
	// %r5 = i, the first element index handled by this thread.
	mov.u32 %r1, %ctaid.x;
	mov.u32 %r2, %tid.x;
	shl.b32 %r3, %r1, 10;
	shl.b32 %r4, %r2, 2;
	or.b32 %r5, %r4, %r3;
	// Input address: 4 bytes per element.
	mul.wide.u32 %rd5, %r5, 4;
	add.s64 %rd6, %rd4, %rd5;
	ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6];
	cvt.rn.f16.f32 %h1, %f1;
	// Output address: 2 bytes per element.
	mul.wide.u32 %rd7, %r5, 2;
	add.s64 %rd8, %rd3, %rd7;
	cvt.rn.f16.f32 %h2, %f2;
	cvt.rn.f16.f32 %h3, %f3;
	cvt.rn.f16.f32 %h4, %f4;
	st.global.v4.b16 [%rd8], {%h1, %h2, %h3, %h4};
	ret;
}
// .globl fusion_2231
//
// fusion_2231: gather f16 values, add an f32 per-column term rounded to f16,
// and write four consecutive f16 outputs per thread.
//
// Launch shape: 256 threads per CTA (.reqntid 256, 1, 1).
// With base = ctaid.x*1024 + 4*tid.x, i = base + j (j = 0..3),
//      row = (base >> 6) & 511, blk = ctaid.x >> 5, col = (4*tid.x + j) & 63:
//   param_0  f16 input, read at byte offset row*2048 + (blk*64 + col)*2
//   param_2  f32 input, read at byte offset blk*256 + col*4
//   param_1  f16 output, elements i..i+3 (one v4.b16 store)
//   param_3  declared but never read here
//   out[i] = in0[...] + f16(in2[...])        (add.rn.f16)
.visible .entry fusion_2231(
	.param .u64 fusion_2231_param_0,
	.param .u64 fusion_2231_param_1,
	.param .u64 fusion_2231_param_2,
	.param .u64 fusion_2231_param_3
)
.reqntid 256, 1, 1
{
	.reg .b16 %h<13>;
	.reg .f32 %f<5>;
	.reg .b32 %r<20>;
	.reg .b64 %rd<29>;
	ld.param.u64 %rd1, [fusion_2231_param_0];
	ld.param.u64 %rd2, [fusion_2231_param_2];
	cvta.to.global.u64 %rd3, %rd2;
	ld.param.u64 %rd4, [fusion_2231_param_1];
	cvta.to.global.u64 %rd5, %rd4;
	cvta.to.global.u64 %rd6, %rd1;
	// Index pieces: %r5 = base, %r6/%r9/%r11/%r13 = col for j = 0..3,
	// %r7 = blk, %r14 = row, %r15 = blk*64.
	mov.u32 %r1, %ctaid.x;
	mov.u32 %r2, %tid.x;
	shl.b32 %r3, %r1, 10;
	shl.b32 %r4, %r2, 2;
	or.b32 %r5, %r4, %r3;
	and.b32 %r6, %r4, 60;
	shr.u32 %r7, %r1, 5;
	or.b32 %r8, %r4, 1;
	and.b32 %r9, %r8, 61;
	or.b32 %r10, %r4, 2;
	and.b32 %r11, %r10, 62;
	or.b32 %r12, %r4, 3;
	and.b32 %r13, %r12, 63;
	bfe.u32 %r14, %r5, 6, 9;
	shl.b32 %r15, %r7, 6;
	// j=0. %rd8 = start of the input row, %rd12 = start of the f32 block.
	or.b32 %r16, %r6, %r15;
	mul.wide.u32 %rd7, %r14, 2048;
	add.s64 %rd8, %rd6, %rd7;
	mul.wide.u32 %rd9, %r16, 2;
	add.s64 %rd10, %rd8, %rd9;
	ld.global.nc.b16 %h1, [%rd10];
	mul.wide.u32 %rd11, %r7, 256;
	add.s64 %rd12, %rd3, %rd11;
	mul.wide.u32 %rd13, %r6, 4;
	add.s64 %rd14, %rd12, %rd13;
	ld.global.nc.f32 %f1, [%rd14];
	cvt.rn.f16.f32 %h2, %f1;
	add.rn.f16 %h3, %h1, %h2;
	// %rd16 = address of the four f16 outputs.
	mul.wide.u32 %rd15, %r5, 2;
	add.s64 %rd16, %rd5, %rd15;
	// j=1
	or.b32 %r17, %r9, %r15;
	mul.wide.u32 %rd17, %r17, 2;
	add.s64 %rd18, %rd8, %rd17;
	ld.global.nc.b16 %h4, [%rd18];
	mul.wide.u32 %rd19, %r9, 4;
	add.s64 %rd20, %rd12, %rd19;
	ld.global.nc.f32 %f2, [%rd20];
	cvt.rn.f16.f32 %h5, %f2;
	add.rn.f16 %h6, %h4, %h5;
	// j=2
	or.b32 %r18, %r11, %r15;
	mul.wide.u32 %rd21, %r18, 2;
	add.s64 %rd22, %rd8, %rd21;
	ld.global.nc.b16 %h7, [%rd22];
	mul.wide.u32 %rd23, %r11, 4;
	add.s64 %rd24, %rd12, %rd23;
	ld.global.nc.f32 %f3, [%rd24];
	cvt.rn.f16.f32 %h8, %f3;
	add.rn.f16 %h9, %h7, %h8;
	// j=3
	or.b32 %r19, %r13, %r15;
	mul.wide.u32 %rd25, %r19, 2;
	add.s64 %rd26, %rd8, %rd25;
	ld.global.nc.b16 %h10, [%rd26];
	mul.wide.u32 %rd27, %r13, 4;
	add.s64 %rd28, %rd12, %rd27;
	ld.global.nc.f32 %f4, [%rd28];
	cvt.rn.f16.f32 %h11, %f4;
	add.rn.f16 %h12, %h10, %h11;
	st.global.v4.b16 [%rd16], {%h3, %h6, %h9, %h12};
	ret;
}
// .globl fusion_2230 | |
// fusion_2230: gathers four consecutive 16-bit values per thread from the
// buffer in param_1 and writes them as one 8-byte vector into the buffer in
// param_0. param_2 is declared but never read by this kernel.
//   source byte offset = (tid.x >> 4) * 65536 + ctaid.x * 128 + ((tid.x * 4) & 60) * 2
//   dest   byte offset = (ctaid.x * 1024 + tid.x * 4) * 2
// NOTE(review): the addressing looks like a swap of the two outer dimensions
// of a 16-bit tensor; the grid size is not visible here - confirm with the launcher.
.visible .entry fusion_2230( | |
.param .u64 fusion_2230_param_0, | |
.param .u64 fusion_2230_param_1, | |
.param .u64 fusion_2230_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %val<4>; | |
.reg .b32 %blk, %thr, %thr4, %row, %col, %lin; | |
.reg .b64 %src, %dst, %off; | |
// Global-space views of the source (param_1) and destination (param_0).
ld.param.u64 %src, [fusion_2230_param_1]; | |
cvta.to.global.u64 %src, %src; | |
ld.param.u64 %dst, [fusion_2230_param_0]; | |
cvta.to.global.u64 %dst, %dst; | |
mov.u32 %blk, %ctaid.x; | |
mov.u32 %thr, %tid.x; | |
shl.b32 %thr4, %thr, 2; | |
// Source address: 65536-byte stride per (tid >> 4), 128-byte stride per block,
// then the 16-bit column ((tid * 4) & 60) inside that 128-byte row.
shr.u32 %row, %thr, 4; | |
mul.wide.u32 %off, %row, 65536; | |
add.s64 %src, %src, %off; | |
mul.wide.u32 %off, %blk, 128; | |
add.s64 %src, %src, %off; | |
and.b32 %col, %thr4, 60; | |
mul.wide.u32 %off, %col, 2; | |
add.s64 %src, %src, %off; | |
// %col is a multiple of 4, so the four elements sit at +0, +2, +4 and +6 bytes.
ld.global.nc.b16 %val0, [%src]; | |
ld.global.nc.b16 %val1, [%src+2]; | |
ld.global.nc.b16 %val2, [%src+4]; | |
ld.global.nc.b16 %val3, [%src+6]; | |
// Destination address: linear element index (ctaid << 10) | (tid << 2), 2 bytes each.
shl.b32 %lin, %blk, 10; | |
or.b32 %lin, %thr4, %lin; | |
mul.wide.u32 %off, %lin, 2; | |
add.s64 %dst, %dst, %off; | |
st.global.v4.b16 [%dst], {%val0, %val1, %val2, %val3}; | |
ret; | |
} | |
// .globl fusion_2704 | |
// fusion_2704: each thread reads four consecutive f32 values from the buffer
// in param_0, rounds them to f16 (round-to-nearest-even) and stores the four
// halves as one 8-byte vector into the buffer in param_1, at the same element
// index (ctaid.x << 10) | (tid.x << 2). param_2 is declared but never read.
.visible .entry fusion_2704( | |
.param .u64 fusion_2704_param_0, | |
.param .u64 fusion_2704_param_1, | |
.param .u64 fusion_2704_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %half<4>; | |
.reg .f32 %wide<4>; | |
.reg .b32 %elem, %thr4; | |
.reg .b64 %src_ptr, %dst_ptr, %off; | |
ld.param.u64 %src_ptr, [fusion_2704_param_0]; | |
cvta.to.global.u64 %src_ptr, %src_ptr; | |
ld.param.u64 %dst_ptr, [fusion_2704_param_1]; | |
cvta.to.global.u64 %dst_ptr, %dst_ptr; | |
// Linear element index handled by this thread (4 elements per thread).
mov.u32 %elem, %ctaid.x; | |
shl.b32 %elem, %elem, 10; | |
mov.u32 %thr4, %tid.x; | |
shl.b32 %thr4, %thr4, 2; | |
or.b32 %elem, %thr4, %elem; | |
// 4 bytes per source element, 2 bytes per destination element.
mul.wide.u32 %off, %elem, 4; | |
add.s64 %src_ptr, %src_ptr, %off; | |
mul.wide.u32 %off, %elem, 2; | |
add.s64 %dst_ptr, %dst_ptr, %off; | |
ld.global.nc.v4.f32 {%wide0, %wide1, %wide2, %wide3}, [%src_ptr]; | |
cvt.rn.f16.f32 %half0, %wide0; | |
cvt.rn.f16.f32 %half1, %wide1; | |
cvt.rn.f16.f32 %half2, %wide2; | |
cvt.rn.f16.f32 %half3, %wide3; | |
st.global.v4.b16 [%dst_ptr], {%half0, %half1, %half2, %half3}; | |
ret; | |
} | |
// .globl rng_get_and_update_state_4 | |
// rng_get_and_update_state_4: copies the current 128-bit value held in the
// global `rng_state` (low word at +0, high word at +8) to the 16-byte buffer
// in param_0, and advances `rng_state` by 524288 with carry into the high word.
// param_1 is declared but never read.
// NOTE(review): the read-modify-write of rng_state is not atomic; presumably
// the kernel is launched with a single thread - confirm with the launcher.
.visible .entry rng_get_and_update_state_4( | |
.param .u64 rng_get_and_update_state_4_param_0, | |
.param .u64 rng_get_and_update_state_4_param_1 | |
) | |
{ | |
.reg .pred %wrap<2>; | |
.reg .b64 %out_ptr, %old_lo, %old_hi, %new_lo, %new_hi, %carry; | |
ld.param.u64 %out_ptr, [rng_get_and_update_state_4_param_0]; | |
cvta.to.global.u64 %out_ptr, %out_ptr; | |
// Snapshot the state before any store happens.
ld.global.u64 %old_hi, [rng_state+8]; | |
ld.global.u64 %old_lo, [rng_state]; | |
add.s64 %new_lo, %old_lo, 524288; | |
// Carry out of the low word: the sum wrapped if it is below either addend.
setp.lt.u64 %wrap0, %new_lo, %old_lo; | |
setp.lt.u64 %wrap1, %new_lo, 524288; | |
or.pred %wrap0, %wrap0, %wrap1; | |
selp.u64 %carry, 1, 0, %wrap0; | |
add.s64 %new_hi, %old_hi, %carry; | |
// Publish the advanced state first, then hand the previous state to the caller.
st.global.u64 [rng_state], %new_lo; | |
st.global.u64 [rng_state+8], %new_hi; | |
st.global.u64 [%out_ptr+8], %old_hi; | |
st.global.u64 [%out_ptr], %old_lo; | |
ret; | |
} | |
// .globl fusion_2229 | |
// fusion_2229: per thread, produces four f16 outputs
//   out[i] = a[i] + (u[i] >= T ? (b[i] + f16(v[j])) * S : 0)
// where a = param_1 and b = param_2 (f16, same linear index as the output in
// param_0), v = param_4 (f32, indexed by tid.x * 4 + lane), and u[i] is a
// pseudo-random f16 derived from the two 64-bit words in param_3 plus the
// thread's group index. T is the f16 bit pattern 0x2E66 (about 0.1) and S is
// 0x3C72 (about 1.111). param_5 is declared but never read.
// NOTE(review): the multipliers 3528531795 / 3449720151 equal 0xD2511F53 /
// 0xCD9E8D57 and the xor constants step by 0x9E3779B9 / 0xBB67AE85, which
// matches a Philox-4x32 style generator feeding a dropout mask - confirm
// against the emitting compiler.
.visible .entry fusion_2229( | |
.param .u64 fusion_2229_param_0, | |
.param .u64 fusion_2229_param_1, | |
.param .u64 fusion_2229_param_2, | |
.param .u64 fusion_2229_param_3, | |
.param .u64 fusion_2229_param_4, | |
.param .u64 fusion_2229_param_5 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<6>; | |
.reg .b16 %h<43>; | |
.reg .b32 %hh<5>; | |
.reg .f32 %f<13>; | |
.reg .b32 %r<31>; | |
.reg .b64 %rd<129>; | |
// Parameter pointers converted to the global state space:
// %rd10 = param_0 (output), %rd9 = param_1, %rd8 = param_2,
// %rd6 = param_3 (two u64 words), %rd3 = param_4 (f32 vector).
ld.param.u64 %rd1, [fusion_2229_param_0]; | |
ld.param.u64 %rd2, [fusion_2229_param_4]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2229_param_1]; | |
ld.param.u64 %rd5, [fusion_2229_param_3]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2229_param_2]; | |
cvta.to.global.u64 %rd8, %rd7; | |
cvta.to.global.u64 %rd9, %rd4; | |
cvta.to.global.u64 %rd10, %rd1; | |
// %r5 = linear element index (ctaid.x << 10) | (tid.x << 2); %r4 / %r6 / %r7
// are tid.x * 4 + {0, 1, 2} and index the param_4 vector.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
// Load the four f16 values of param_1; after the pack/unpack %h5..%h8 hold
// the same values as %h1..%h4.
mul.wide.u32 %rd11, %r5, 2; | |
add.s64 %rd12, %rd9, %rd11; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12]; | |
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
// Generator input: 128-bit value (param_3 words) + group index (%r5 >> 2),
// with the carry of the low-word add propagated into the high word (%rd20).
shr.u32 %r8, %r5, 2; | |
ld.global.nc.v2.u64 {%rd13, %rd14}, [%rd6]; | |
cvt.u64.u32 %rd15, %r8; | |
add.s64 %rd16, %rd13, %rd15; | |
setp.lt.u64 %p1, %rd16, %rd13; | |
and.b64 %rd17, %rd16, 4294967295; | |
mul.lo.s64 %rd18, %rd17, 3528531795; | |
selp.u64 %rd19, 1, 0, %p1; | |
add.s64 %rd20, %rd14, %rd19; | |
// Mixing rounds: 32x32->64 multiplies (operands masked to 32 bits, products
// split into high/low halves with shr/and) xor-ed with per-round constants.
// Intermediate products are reused by all four output lanes further down.
xor.b64 %rd21, %rd20, %rd18; | |
shr.u64 %rd22, %rd21, 32; | |
mul.lo.s64 %rd23, %rd22, 3449720151; | |
shr.u64 %rd24, %rd23, 32; | |
and.b64 %rd25, %rd20, 4294967295; | |
mul.lo.s64 %rd26, %rd25, 3449720151; | |
and.b64 %rd27, %rd26, 4294967295; | |
xor.b64 %rd28, %rd27, %rd24; | |
xor.b64 %rd29, %rd28, 2654435769; | |
mul.lo.s64 %rd30, %rd29, 3528531795; | |
shr.u64 %rd31, %rd30, 32; | |
xor.b64 %rd32, %rd26, %rd16; | |
shr.u64 %rd33, %rd32, 32; | |
mul.lo.s64 %rd34, %rd33, 3528531795; | |
and.b64 %rd35, %rd34, 4294967295; | |
xor.b64 %rd36, %rd35, %rd31; | |
xor.b64 %rd37, %rd36, 1993301258; | |
mul.lo.s64 %rd38, %rd37, 3449720151; | |
shr.u64 %rd39, %rd38, 32; | |
shr.u64 %rd40, %rd34, 32; | |
and.b64 %rd41, %rd18, 4294967295; | |
xor.b64 %rd42, %rd41, %rd40; | |
xor.b64 %rd43, %rd42, 3144134277; | |
mul.lo.s64 %rd44, %rd43, 3449720151; | |
and.b64 %rd45, %rd44, 4294967295; | |
xor.b64 %rd46, %rd45, %rd39; | |
xor.b64 %rd47, %rd46, 3668340011; | |
mul.lo.s64 %rd48, %rd47, 3528531795; | |
shr.u64 %rd49, %rd48, 32; | |
shr.u64 %rd50, %rd44, 32; | |
and.b64 %rd51, %rd23, 4294967295; | |
xor.b64 %rd52, %rd51, %rd50; | |
xor.b64 %rd53, %rd52, 1013904242; | |
mul.lo.s64 %rd54, %rd53, 3528531795; | |
and.b64 %rd55, %rd54, 4294967295; | |
xor.b64 %rd56, %rd55, %rd49; | |
xor.b64 %rd57, %rd56, 3986602516; | |
mul.lo.s64 %rd58, %rd57, 3449720151; | |
shr.u64 %rd59, %rd58, 32; | |
shr.u64 %rd60, %rd54, 32; | |
and.b64 %rd61, %rd30, 4294967295; | |
xor.b64 %rd62, %rd61, %rd60; | |
xor.b64 %rd63, %rd62, 842468239; | |
mul.lo.s64 %rd64, %rd63, 3449720151; | |
and.b64 %rd65, %rd64, 4294967295; | |
xor.b64 %rd66, %rd65, %rd59; | |
xor.b64 %rd67, %rd66, 387276957; | |
mul.lo.s64 %rd68, %rd67, 3528531795; | |
shr.u64 %rd69, %rd68, 32; | |
shr.u64 %rd70, %rd64, 32; | |
and.b64 %rd71, %rd38, 4294967295; | |
xor.b64 %rd72, %rd71, %rd70; | |
xor.b64 %rd73, %rd72, 2027808484; | |
mul.lo.s64 %rd74, %rd73, 3528531795; | |
and.b64 %rd75, %rd74, 4294967295; | |
shr.u64 %rd76, %rd74, 32; | |
and.b64 %rd77, %rd48, 4294967295; | |
xor.b64 %rd78, %rd77, %rd76; | |
xor.b64 %rd79, %rd78, 2835769497; | |
mul.lo.s64 %rd80, %rd79, 3449720151; | |
and.b64 %rd81, %rd80, 4294967295; | |
shr.u64 %rd82, %rd80, 32; | |
and.b64 %rd83, %rd58, 4294967295; | |
xor.b64 %rd84, %rd83, %rd82; | |
xor.b64 %rd85, %rd84, 3041712726; | |
mul.lo.s64 %rd86, %rd85, 3528531795; | |
and.b64 %rd87, %rd86, 4294967295; | |
xor.b64 %rd88, %rd75, %rd69; | |
xor.b64 %rd89, %rd88, 1684936478; | |
mul.lo.s64 %rd90, %rd89, 3449720151; | |
shr.u64 %rd91, %rd90, 32; | |
xor.b64 %rd92, %rd81, %rd91; | |
xor.b64 %rd93, %rd92, 1401181199; | |
mul.lo.s64 %rd94, %rd93, 3528531795; | |
shr.u64 %rd95, %rd94, 32; | |
xor.b64 %rd96, %rd87, %rd95; | |
xor.b64 %rd97, %rd96, 3678237736; | |
mul.lo.s64 %rd98, %rd97, 3449720151; | |
shr.u64 %rd99, %rd98, 32; | |
// Lane 0 random word -> 23-bit integer (>> 9) -> f32 scaled by 2^-23
// (0f34000000), i.e. a value in [0, 1), then rounded to f16.
cvt.u32.u64 %r9, %rd99; | |
shr.u64 %rd100, %rd86, 32; | |
xor.b64 %rd101, %rd100, %rd68; | |
cvt.u32.u64 %r10, %rd101; | |
xor.b32 %r11, %r10, 534103459; | |
mul.lo.s32 %r12, %r11, -845247145; | |
xor.b32 %r13, %r12, %r9; | |
shr.u32 %r14, %r13, 9; | |
xor.b32 %r15, %r14, 4716963; | |
cvt.rn.f32.u32 %f1, %r15; | |
mul.rn.f32 %f2, %f1, 0f34000000; | |
cvt.rn.f16.f32 %h9, %f2; | |
// %h10 = threshold T (f16 0x2E66, about 0.1); %p2 is true when lane 0 is kept.
mov.b16 %h10, 0x2E66; | |
setp.ge.f16 %p2, %h9, %h10; | |
// Load the four f16 values of param_2 at the same linear index (%h15..%h18).
add.s64 %rd102, %rd8, %rd11; | |
ld.global.nc.v4.b16 {%h11, %h12, %h13, %h14}, [%rd102]; | |
mov.b32 %hh3, {%h13, %h14}; | |
mov.b32 %hh4, {%h11, %h12}; | |
mov.b32 {%h15, %h16}, %hh4; | |
mov.b32 {%h17, %h18}, %hh3; | |
// Lane 0 result: a + (keep ? (b + f16(v)) * S : 0), S = f16 0x3C72 (about 1.111).
mul.wide.u32 %rd103, %r4, 4; | |
add.s64 %rd104, %rd3, %rd103; | |
ld.global.nc.f32 %f3, [%rd104]; | |
cvt.rn.f16.f32 %h19, %f3; | |
add.rn.f16 %h20, %h15, %h19; | |
mov.b16 %h21, 0x3C72; | |
mul.rn.f16 %h22, %h20, %h21; | |
selp.b16 %h23, %h22, 0x0000, %p2; | |
add.rn.f16 %h24, %h5, %h23; | |
// %rd105 = output address for this thread's four results.
add.s64 %rd105, %rd10, %rd11; | |
// Lane 1: second random word, same uniform conversion and threshold test.
xor.b64 %rd106, %rd58, %rd82; | |
xor.b64 %rd107, %rd106, 3041712726; | |
mul.lo.s64 %rd108, %rd107, 3528531795; | |
xor.b64 %rd109, %rd95, %rd108; | |
cvt.u32.u64 %r16, %rd109; | |
xor.b32 %r17, %r16, -616729560; | |
mul.lo.s32 %r18, %r17, -845247145; | |
shr.u32 %r19, %r18, 9; | |
cvt.rn.f32.u32 %f4, %r19; | |
mul.rn.f32 %f5, %f4, 0f34000000; | |
cvt.rn.f16.f32 %h25, %f5; | |
setp.ge.f16 %p3, %h25, %h10; | |
mul.wide.u32 %rd110, %r6, 4; | |
add.s64 %rd111, %rd3, %rd110; | |
ld.global.nc.f32 %f6, [%rd111]; | |
cvt.rn.f16.f32 %h26, %f6; | |
add.rn.f16 %h27, %h16, %h26; | |
mul.rn.f16 %h28, %h27, %h21; | |
selp.b16 %h29, %h28, 0x0000, %p3; | |
add.rn.f16 %h30, %h6, %h29; | |
// Lane 2: third random word.
and.b64 %rd112, %rd90, 4294967295; | |
and.b64 %rd113, %rd68, 4294967295; | |
xor.b64 %rd114, %rd113, %rd100; | |
xor.b64 %rd115, %rd114, 534103459; | |
mul.lo.s64 %rd116, %rd115, 3449720151; | |
shr.u64 %rd117, %rd116, 32; | |
xor.b64 %rd118, %rd112, %rd117; | |
xor.b64 %rd119, %rd118, 4055616968; | |
mul.lo.s64 %rd120, %rd119, 3528531795; | |
shr.u64 %rd121, %rd120, 32; | |
cvt.u32.u64 %r20, %rd121; | |
xor.b64 %rd122, %rd91, %rd80; | |
cvt.u32.u64 %r21, %rd122; | |
xor.b32 %r22, %r21, 1401181199; | |
mul.lo.s32 %r23, %r22, -766435501; | |
xor.b32 %r24, %r23, %r20; | |
shr.u32 %r25, %r24, 9; | |
xor.b32 %r26, %r25, 4936337; | |
cvt.rn.f32.u32 %f7, %r26; | |
mul.rn.f32 %f8, %f7, 0f34000000; | |
cvt.rn.f16.f32 %h31, %f8; | |
setp.ge.f16 %p4, %h31, %h10; | |
mul.wide.u32 %rd123, %r7, 4; | |
add.s64 %rd124, %rd3, %rd123; | |
ld.global.nc.f32 %f9, [%rd124]; | |
cvt.rn.f16.f32 %h32, %f9; | |
add.rn.f16 %h33, %h17, %h32; | |
mul.rn.f16 %h34, %h33, %h21; | |
selp.b16 %h35, %h34, 0x0000, %p4; | |
add.rn.f16 %h36, %h7, %h35; | |
// Lane 3: fourth random word; its param_4 element is read at %rd104 + 12,
// i.e. index tid.x * 4 + 3.
xor.b64 %rd125, %rd69, %rd74; | |
xor.b64 %rd126, %rd125, 1684936478; | |
mul.lo.s64 %rd127, %rd126, 3449720151; | |
xor.b64 %rd128, %rd117, %rd127; | |
cvt.u32.u64 %r27, %rd128; | |
xor.b32 %r28, %r27, -239350328; | |
mul.lo.s32 %r29, %r28, -766435501; | |
shr.u32 %r30, %r29, 9; | |
cvt.rn.f32.u32 %f10, %r30; | |
mul.rn.f32 %f11, %f10, 0f34000000; | |
cvt.rn.f16.f32 %h37, %f11; | |
setp.ge.f16 %p5, %h37, %h10; | |
ld.global.nc.f32 %f12, [%rd104+12]; | |
cvt.rn.f16.f32 %h38, %f12; | |
add.rn.f16 %h39, %h18, %h38; | |
mul.rn.f16 %h40, %h39, %h21; | |
selp.b16 %h41, %h40, 0x0000, %p5; | |
add.rn.f16 %h42, %h8, %h41; | |
// Store all four lanes as one 8-byte vector.
st.global.v4.b16 [%rd105], {%h24, %h30, %h36, %h42}; | |
ret; | |
} | |
// .globl fusion_2228 | |
// fusion_2228: block-wide sum. Each block of 64 threads reads 1024 f16 values
// (2048 bytes starting at param_0 + ctaid.x * 2048), accumulates them in f32,
// reduces across the block and atomically adds the block total to the f32 at
// param_1 + ctaid.x * 4. param_2 is declared but never read.
// Steps: 16 values per thread -> shuffle reduction inside each warp ->
// one partial per warp in shared_cache_010 -> second shuffle reduction in
// warp 0 -> atomic add by thread 0.
.visible .entry fusion_2228( | |
.param .u64 fusion_2228_param_0, | |
.param .u64 fusion_2228_param_1, | |
.param .u64 fusion_2228_param_2 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
// 4-byte local slot; it is zeroed below and read by the threads of warp 0
// that have no warp partial to load.
.local .align 4 .b8 __local_depot47[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<5>; | |
.reg .b16 %h<17>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<56>; | |
.reg .b32 %r<7>; | |
.reg .b64 %rd<22>; | |
mov.u64 %SPL, __local_depot47; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd5, [fusion_2228_param_0]; | |
cvta.to.global.u64 %rd8, %rd5; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r4, %ctaid.x; | |
// %rd13 = input base + ctaid.x * 2048 + tid.x * 4: this thread's first f16 pair.
shl.b32 %r5, %r1, 1; | |
mul.wide.u32 %rd10, %r4, 2048; | |
add.s64 %rd11, %rd8, %rd10; | |
mul.wide.u32 %rd12, %r5, 2; | |
add.s64 %rd13, %rd11, %rd12; | |
// Eight 32-bit loads, 256 bytes apart; each carries two f16 values that are
// widened to f32 and added to a running sum that starts from 0.0.
ld.global.nc.b32 %hh1, [%rd13]; | |
mov.b32 {%h1, %h2}, %hh1; | |
cvt.f32.f16 %f2, %h1; | |
add.rn.f32 %f3, %f2, 0f00000000; | |
cvt.f32.f16 %f4, %h2; | |
add.rn.f32 %f5, %f3, %f4; | |
ld.global.nc.b32 %hh2, [%rd13+256]; | |
mov.b32 {%h3, %h4}, %hh2; | |
cvt.f32.f16 %f6, %h3; | |
add.rn.f32 %f7, %f5, %f6; | |
cvt.f32.f16 %f8, %h4; | |
add.rn.f32 %f9, %f7, %f8; | |
ld.global.nc.b32 %hh3, [%rd13+512]; | |
mov.b32 {%h5, %h6}, %hh3; | |
cvt.f32.f16 %f10, %h5; | |
add.rn.f32 %f11, %f9, %f10; | |
cvt.f32.f16 %f12, %h6; | |
add.rn.f32 %f13, %f11, %f12; | |
ld.global.nc.b32 %hh4, [%rd13+768]; | |
mov.b32 {%h7, %h8}, %hh4; | |
cvt.f32.f16 %f14, %h7; | |
add.rn.f32 %f15, %f13, %f14; | |
cvt.f32.f16 %f16, %h8; | |
add.rn.f32 %f17, %f15, %f16; | |
ld.global.nc.b32 %hh5, [%rd13+1024]; | |
mov.b32 {%h9, %h10}, %hh5; | |
cvt.f32.f16 %f18, %h9; | |
add.rn.f32 %f19, %f17, %f18; | |
cvt.f32.f16 %f20, %h10; | |
add.rn.f32 %f21, %f19, %f20; | |
ld.global.nc.b32 %hh6, [%rd13+1280]; | |
mov.b32 {%h11, %h12}, %hh6; | |
cvt.f32.f16 %f22, %h11; | |
add.rn.f32 %f23, %f21, %f22; | |
cvt.f32.f16 %f24, %h12; | |
add.rn.f32 %f25, %f23, %f24; | |
ld.global.nc.b32 %hh7, [%rd13+1536]; | |
mov.b32 {%h13, %h14}, %hh7; | |
cvt.f32.f16 %f26, %h13; | |
add.rn.f32 %f27, %f25, %f26; | |
cvt.f32.f16 %f28, %h14; | |
add.rn.f32 %f29, %f27, %f28; | |
ld.global.nc.b32 %hh8, [%rd13+1792]; | |
mov.b32 {%h15, %h16}, %hh8; | |
cvt.f32.f16 %f30, %h15; | |
add.rn.f32 %f31, %f29, %f30; | |
cvt.f32.f16 %f32, %h16; | |
add.rn.f32 %f33, %f31, %f32; | |
// Warp-level reduction: shuffle-down by 16, 8, 4, 2, 1 with a full member
// mask (-1). %r2 = lane (tid.x & 31), %r3 = warp index (tid.x >> 5).
and.b32 %r2, %r1, 31; | |
shfl.sync.down.b32 %f34, %f33, 16, 31, -1; | |
add.rn.f32 %f35, %f34, %f33; | |
shfl.sync.down.b32 %f36, %f35, 8, 31, -1; | |
add.rn.f32 %f37, %f36, %f35; | |
shfl.sync.down.b32 %f38, %f37, 4, 31, -1; | |
add.rn.f32 %f39, %f38, %f37; | |
shfl.sync.down.b32 %f40, %f39, 2, 31, -1; | |
add.rn.f32 %f41, %f40, %f39; | |
shfl.sync.down.b32 %f42, %f41, 1, 31, -1; | |
shr.u32 %r3, %r1, 5; | |
setp.eq.s32 %p1, %r2, 0; | |
mov.u64 %rd16, shared_cache_010; | |
@%p1 bra LBB47_3; | |
bra.uni LBB47_1; | |
// Lane 0 of each warp finishes the last add and stores the warp partial at
// shared_cache_010[warp index].
LBB47_3: | |
mul.wide.u32 %rd15, %r3, 4; | |
add.s64 %rd3, %rd16, %rd15; | |
add.rn.f32 %f1, %f42, %f41; | |
st.shared.f32 [%rd3], %f1; | |
// Block barrier: every warp partial is visible before warp 0 reads them.
LBB47_1: | |
bar.sync 0; | |
setp.eq.s32 %p2, %r3, 0; | |
@%p2 bra LBB47_4; | |
bra.uni LBB47_2; | |
// Only warp 0 continues. Threads with tid.x < 2 (one per warp of the 64-thread
// block) read their warp's partial through a generic pointer to shared memory;
// the remaining lanes read the zeroed local slot instead.
LBB47_4: | |
add.u64 %rd9, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
mul.wide.u32 %rd17, %r2, 4; | |
add.s64 %rd4, %rd16, %rd17; | |
cvta.shared.u64 %rd19, %rd4; | |
mov.u32 %r6, 0; | |
st.local.u32 [%rd1], %r6; | |
setp.lt.u32 %p3, %r1, 2; | |
selp.b64 %rd21, %rd19, %rd9, %p3; | |
ld.f32 %f43, [%rd21]; | |
shfl.sync.down.b32 %f44, %f43, 16, 31, -1; | |
add.rn.f32 %f45, %f43, %f44; | |
shfl.sync.down.b32 %f46, %f45, 8, 31, -1; | |
add.rn.f32 %f47, %f45, %f46; | |
shfl.sync.down.b32 %f48, %f47, 4, 31, -1; | |
add.rn.f32 %f49, %f47, %f48; | |
shfl.sync.down.b32 %f50, %f49, 2, 31, -1; | |
add.rn.f32 %f51, %f49, %f50; | |
shfl.sync.down.b32 %f52, %f51, 1, 31, -1; | |
add.rn.f32 %f53, %f51, %f52; | |
// Each lane writes its reduced value back to the slot it loaded from, so
// thread 0 leaves the block total in shared_cache_010[0].
st.f32 [%rd21], %f53; | |
setp.ne.s32 %p4, %r1, 0; | |
@%p4 bra LBB47_2; | |
// Thread 0 accumulates the block total into the output element for this block.
ld.param.u64 %rd6, [fusion_2228_param_1]; | |
cvta.to.global.u64 %rd7, %rd6; | |
mul.wide.u32 %rd14, %r4, 4; | |
add.s64 %rd2, %rd7, %rd14; | |
ld.shared.f32 %f54, [%rd4]; | |
atom.global.add.f32 %f55, [%rd2], %f54; | |
LBB47_2: | |
ret; | |
} | |
// .globl fusion_2225 | |
// fusion_2225: block-wide sum of squared deviations. Each block of 64 threads
// reads 1024 f16 values x (2048 bytes at param_0 + ctaid.x * 2048), computes
// (x - m)^2 in f32 with m = param_2[ctaid.x] * 2^-10, reduces across the block
// and atomically adds the block total to the f32 at param_1 + ctaid.x * 4.
// param_3 is declared but never read.
// NOTE(review): 2^-10 = 1/1024 matches the element count per block, so
// param_2 presumably holds the per-block sum and m is the mean - confirm.
.visible .entry fusion_2225( | |
.param .u64 fusion_2225_param_0, | |
.param .u64 fusion_2225_param_1, | |
.param .u64 fusion_2225_param_2, | |
.param .u64 fusion_2225_param_3 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
// 4-byte local slot; it is zeroed below and read by the threads of warp 0
// that have no warp partial to load.
.local .align 4 .b8 __local_depot48[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<5>; | |
.reg .b16 %h<17>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<90>; | |
.reg .b32 %r<7>; | |
.reg .b64 %rd<25>; | |
mov.u64 %SPL, __local_depot48; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd5, [fusion_2225_param_0]; | |
ld.param.u64 %rd6, [fusion_2225_param_2]; | |
cvta.to.global.u64 %rd7, %rd6; | |
cvta.to.global.u64 %rd10, %rd5; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r4, %ctaid.x; | |
// %rd15 = input base + ctaid.x * 2048 + tid.x * 4: this thread's first f16 pair.
shl.b32 %r5, %r1, 1; | |
mul.wide.u32 %rd12, %r4, 2048; | |
add.s64 %rd13, %rd10, %rd12; | |
mul.wide.u32 %rd14, %r5, 2; | |
add.s64 %rd15, %rd13, %rd14; | |
ld.global.nc.b32 %hh1, [%rd15]; | |
mov.b32 {%h1, %h2}, %hh1; | |
cvt.f32.f16 %f2, %h1; | |
// %f4 = param_2[ctaid.x] * 2^-10 (0f3A800000); subtracted from every element.
// %rd16 (ctaid.x * 4) is reused later as the output byte offset.
mul.wide.u32 %rd16, %r4, 4; | |
add.s64 %rd17, %rd7, %rd16; | |
ld.global.nc.f32 %f3, [%rd17]; | |
mul.rn.f32 %f4, %f3, 0f3A800000; | |
// Running sum of (x - %f4)^2, starting from 0.0; eight 32-bit loads that are
// 256 bytes apart supply two f16 values each.
sub.rn.f32 %f5, %f2, %f4; | |
mul.rn.f32 %f6, %f5, %f5; | |
add.rn.f32 %f7, %f6, 0f00000000; | |
cvt.f32.f16 %f8, %h2; | |
sub.rn.f32 %f9, %f8, %f4; | |
mul.rn.f32 %f10, %f9, %f9; | |
add.rn.f32 %f11, %f7, %f10; | |
ld.global.nc.b32 %hh2, [%rd15+256]; | |
mov.b32 {%h3, %h4}, %hh2; | |
cvt.f32.f16 %f12, %h3; | |
sub.rn.f32 %f13, %f12, %f4; | |
mul.rn.f32 %f14, %f13, %f13; | |
add.rn.f32 %f15, %f11, %f14; | |
cvt.f32.f16 %f16, %h4; | |
sub.rn.f32 %f17, %f16, %f4; | |
mul.rn.f32 %f18, %f17, %f17; | |
add.rn.f32 %f19, %f15, %f18; | |
ld.global.nc.b32 %hh3, [%rd15+512]; | |
mov.b32 {%h5, %h6}, %hh3; | |
cvt.f32.f16 %f20, %h5; | |
sub.rn.f32 %f21, %f20, %f4; | |
mul.rn.f32 %f22, %f21, %f21; | |
add.rn.f32 %f23, %f19, %f22; | |
cvt.f32.f16 %f24, %h6; | |
sub.rn.f32 %f25, %f24, %f4; | |
mul.rn.f32 %f26, %f25, %f25; | |
add.rn.f32 %f27, %f23, %f26; | |
ld.global.nc.b32 %hh4, [%rd15+768]; | |
mov.b32 {%h7, %h8}, %hh4; | |
cvt.f32.f16 %f28, %h7; | |
sub.rn.f32 %f29, %f28, %f4; | |
mul.rn.f32 %f30, %f29, %f29; | |
add.rn.f32 %f31, %f27, %f30; | |
cvt.f32.f16 %f32, %h8; | |
sub.rn.f32 %f33, %f32, %f4; | |
mul.rn.f32 %f34, %f33, %f33; | |
add.rn.f32 %f35, %f31, %f34; | |
ld.global.nc.b32 %hh5, [%rd15+1024]; | |
mov.b32 {%h9, %h10}, %hh5; | |
cvt.f32.f16 %f36, %h9; | |
sub.rn.f32 %f37, %f36, %f4; | |
mul.rn.f32 %f38, %f37, %f37; | |
add.rn.f32 %f39, %f35, %f38; | |
cvt.f32.f16 %f40, %h10; | |
sub.rn.f32 %f41, %f40, %f4; | |
mul.rn.f32 %f42, %f41, %f41; | |
add.rn.f32 %f43, %f39, %f42; | |
ld.global.nc.b32 %hh6, [%rd15+1280]; | |
mov.b32 {%h11, %h12}, %hh6; | |
cvt.f32.f16 %f44, %h11; | |
sub.rn.f32 %f45, %f44, %f4; | |
mul.rn.f32 %f46, %f45, %f45; | |
add.rn.f32 %f47, %f43, %f46; | |
cvt.f32.f16 %f48, %h12; | |
sub.rn.f32 %f49, %f48, %f4; | |
mul.rn.f32 %f50, %f49, %f49; | |
add.rn.f32 %f51, %f47, %f50; | |
ld.global.nc.b32 %hh7, [%rd15+1536]; | |
mov.b32 {%h13, %h14}, %hh7; | |
cvt.f32.f16 %f52, %h13; | |
sub.rn.f32 %f53, %f52, %f4; | |
mul.rn.f32 %f54, %f53, %f53; | |
add.rn.f32 %f55, %f51, %f54; | |
cvt.f32.f16 %f56, %h14; | |
sub.rn.f32 %f57, %f56, %f4; | |
mul.rn.f32 %f58, %f57, %f57; | |
add.rn.f32 %f59, %f55, %f58; | |
ld.global.nc.b32 %hh8, [%rd15+1792]; | |
mov.b32 {%h15, %h16}, %hh8; | |
cvt.f32.f16 %f60, %h15; | |
sub.rn.f32 %f61, %f60, %f4; | |
mul.rn.f32 %f62, %f61, %f61; | |
add.rn.f32 %f63, %f59, %f62; | |
cvt.f32.f16 %f64, %h16; | |
sub.rn.f32 %f65, %f64, %f4; | |
mul.rn.f32 %f66, %f65, %f65; | |
add.rn.f32 %f67, %f63, %f66; | |
// Warp-level reduction: shuffle-down by 16, 8, 4, 2, 1 with a full member
// mask (-1). %r2 = lane (tid.x & 31), %r3 = warp index (tid.x >> 5).
and.b32 %r2, %r1, 31; | |
shfl.sync.down.b32 %f68, %f67, 16, 31, -1; | |
add.rn.f32 %f69, %f68, %f67; | |
shfl.sync.down.b32 %f70, %f69, 8, 31, -1; | |
add.rn.f32 %f71, %f70, %f69; | |
shfl.sync.down.b32 %f72, %f71, 4, 31, -1; | |
add.rn.f32 %f73, %f72, %f71; | |
shfl.sync.down.b32 %f74, %f73, 2, 31, -1; | |
add.rn.f32 %f75, %f74, %f73; | |
shfl.sync.down.b32 %f76, %f75, 1, 31, -1; | |
shr.u32 %r3, %r1, 5; | |
setp.eq.s32 %p1, %r2, 0; | |
mov.u64 %rd19, shared_cache_011; | |
@%p1 bra LBB48_3; | |
bra.uni LBB48_1; | |
// Lane 0 of each warp finishes the last add and stores the warp partial at
// shared_cache_011[warp index].
LBB48_3: | |
mul.wide.u32 %rd18, %r3, 4; | |
add.s64 %rd3, %rd19, %rd18; | |
add.rn.f32 %f1, %f76, %f75; | |
st.shared.f32 [%rd3], %f1; | |
// Block barrier: every warp partial is visible before warp 0 reads them.
LBB48_1: | |
bar.sync 0; | |
setp.eq.s32 %p2, %r3, 0; | |
@%p2 bra LBB48_4; | |
bra.uni LBB48_2; | |
// Only warp 0 continues. Threads with tid.x < 2 read their warp's partial
// through a generic pointer to shared memory; the remaining lanes read the
// zeroed local slot instead.
LBB48_4: | |
add.u64 %rd11, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
mul.wide.u32 %rd20, %r2, 4; | |
add.s64 %rd4, %rd19, %rd20; | |
cvta.shared.u64 %rd22, %rd4; | |
mov.u32 %r6, 0; | |
st.local.u32 [%rd1], %r6; | |
setp.lt.u32 %p3, %r1, 2; | |
selp.b64 %rd24, %rd22, %rd11, %p3; | |
ld.f32 %f77, [%rd24]; | |
shfl.sync.down.b32 %f78, %f77, 16, 31, -1; | |
add.rn.f32 %f79, %f77, %f78; | |
shfl.sync.down.b32 %f80, %f79, 8, 31, -1; | |
add.rn.f32 %f81, %f79, %f80; | |
shfl.sync.down.b32 %f82, %f81, 4, 31, -1; | |
add.rn.f32 %f83, %f81, %f82; | |
shfl.sync.down.b32 %f84, %f83, 2, 31, -1; | |
add.rn.f32 %f85, %f83, %f84; | |
shfl.sync.down.b32 %f86, %f85, 1, 31, -1; | |
add.rn.f32 %f87, %f85, %f86; | |
// Each lane writes its reduced value back to the slot it loaded from, so
// thread 0 leaves the block total in shared_cache_011[0].
st.f32 [%rd24], %f87; | |
setp.ne.s32 %p4, %r1, 0; | |
@%p4 bra LBB48_2; | |
// Thread 0 accumulates the block total into param_1[ctaid.x].
ld.param.u64 %rd8, [fusion_2225_param_1]; | |
cvta.to.global.u64 %rd9, %rd8; | |
add.s64 %rd2, %rd9, %rd16; | |
ld.shared.f32 %f88, [%rd4]; | |
atom.global.add.f32 %f89, [%rd2], %f88; | |
LBB48_2: | |
ret; | |
} | |
// .globl fusion_2221 | |
// fusion_2221: per thread, produces four f16 outputs in param_0 from four f16
// inputs x in param_1 (same linear index (ctaid.x << 10) | (tid.x << 2)):
//   r   = rsqrt.approx(param_2[ctaid.x] * 2^-10 + eps)     eps = 0f2B8CBCCC (about 1e-12)
//   m   = param_3[ctaid.x] * 2^-10
//   s_j = r * param_5[j]                                    j = tid.x * 4 + lane
//   out = f16( s_j * x + (param_4[j] - s_j * m) )
// param_6 is declared but never read.
// NOTE(review): this has the shape of a normalization with per-row statistics
// (param_2 / param_3) and per-column scale / offset (param_5 / param_4) -
// confirm the operand roles against the producing graph.
.visible .entry fusion_2221( | |
.param .u64 fusion_2221_param_0, | |
.param .u64 fusion_2221_param_1, | |
.param .u64 fusion_2221_param_2, | |
.param .u64 fusion_2221_param_3, | |
.param .u64 fusion_2221_param_4, | |
.param .u64 fusion_2221_param_5, | |
.param .u64 fusion_2221_param_6 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<13>; | |
.reg .b32 %hh<3>; | |
.reg .f32 %f<39>; | |
.reg .b32 %r<8>; | |
.reg .b64 %rd<28>; | |
// Global-space pointers: %rd12 = param_0 (output), %rd11 = param_1 (x),
// %rd10 = param_2, %rd9 = param_3, %rd6 = param_4, %rd3 = param_5.
ld.param.u64 %rd1, [fusion_2221_param_0]; | |
ld.param.u64 %rd2, [fusion_2221_param_5]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2221_param_1]; | |
ld.param.u64 %rd5, [fusion_2221_param_4]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2221_param_2]; | |
ld.param.u64 %rd8, [fusion_2221_param_3]; | |
cvta.to.global.u64 %rd9, %rd8; | |
cvta.to.global.u64 %rd10, %rd7; | |
cvta.to.global.u64 %rd11, %rd4; | |
cvta.to.global.u64 %rd12, %rd1; | |
// %r5 = linear element index; %r4 / %r6 / %r7 = tid.x * 4 + {0, 1, 2}.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
// Load four f16 inputs; after the pack/unpack %h5..%h8 equal %h1..%h4.
mul.wide.u32 %rd13, %r5, 2; | |
add.s64 %rd14, %rd11, %rd13; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd14]; | |
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
cvt.f32.f16 %f1, %h5; | |
// %f5 = rsqrt.approx(param_2[ctaid.x] / 1024 + eps); shared by all four lanes.
mul.wide.u32 %rd15, %r1, 4; | |
add.s64 %rd16, %rd10, %rd15; | |
ld.global.nc.f32 %f2, [%rd16]; | |
mul.rn.f32 %f3, %f2, 0f3A800000; | |
add.rn.f32 %f4, %f3, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f5, %f4; | |
// Lane 0: %f7 = r * param_5[j]; %f11 = param_3[ctaid.x] / 1024 (shared by all lanes).
mul.wide.u32 %rd17, %r4, 4; | |
add.s64 %rd18, %rd3, %rd17; | |
ld.global.nc.f32 %f6, [%rd18]; | |
mul.rn.f32 %f7, %f5, %f6; | |
mul.rn.f32 %f8, %f7, %f1; | |
add.s64 %rd19, %rd6, %rd17; | |
ld.global.nc.f32 %f9, [%rd19]; | |
add.s64 %rd20, %rd9, %rd15; | |
ld.global.nc.f32 %f10, [%rd20]; | |
mul.rn.f32 %f11, %f10, 0f3A800000; | |
mul.rn.f32 %f12, %f7, %f11; | |
sub.rn.f32 %f13, %f9, %f12; | |
add.rn.f32 %f14, %f8, %f13; | |
cvt.rn.f16.f32 %h9, %f14; | |
// %rd21 = output address for this thread's four results.
add.s64 %rd21, %rd12, %rd13; | |
// Lane 1 (j = tid.x * 4 + 1).
cvt.f32.f16 %f15, %h6; | |
mul.wide.u32 %rd22, %r6, 4; | |
add.s64 %rd23, %rd3, %rd22; | |
ld.global.nc.f32 %f16, [%rd23]; | |
mul.rn.f32 %f17, %f5, %f16; | |
mul.rn.f32 %f18, %f17, %f15; | |
add.s64 %rd24, %rd6, %rd22; | |
ld.global.nc.f32 %f19, [%rd24]; | |
mul.rn.f32 %f20, %f11, %f17; | |
sub.rn.f32 %f21, %f19, %f20; | |
add.rn.f32 %f22, %f18, %f21; | |
cvt.rn.f16.f32 %h10, %f22; | |
// Lane 2 (j = tid.x * 4 + 2).
cvt.f32.f16 %f23, %h7; | |
mul.wide.u32 %rd25, %r7, 4; | |
add.s64 %rd26, %rd3, %rd25; | |
ld.global.nc.f32 %f24, [%rd26]; | |
mul.rn.f32 %f25, %f5, %f24; | |
mul.rn.f32 %f26, %f25, %f23; | |
add.s64 %rd27, %rd6, %rd25; | |
ld.global.nc.f32 %f27, [%rd27]; | |
mul.rn.f32 %f28, %f11, %f25; | |
sub.rn.f32 %f29, %f27, %f28; | |
add.rn.f32 %f30, %f26, %f29; | |
cvt.rn.f16.f32 %h11, %f30; | |
// Lane 3: the per-column operands are read 12 bytes past the lane-0 addresses,
// i.e. at j = tid.x * 4 + 3.
cvt.f32.f16 %f31, %h8; | |
ld.global.nc.f32 %f32, [%rd18+12]; | |
mul.rn.f32 %f33, %f5, %f32; | |
mul.rn.f32 %f34, %f33, %f31; | |
ld.global.nc.f32 %f35, [%rd19+12]; | |
mul.rn.f32 %f36, %f11, %f33; | |
sub.rn.f32 %f37, %f35, %f36; | |
add.rn.f32 %f38, %f34, %f37; | |
cvt.rn.f16.f32 %h12, %f38; | |
st.global.v4.b16 [%rd21], {%h9, %h10, %h11, %h12}; | |
ret; | |
} | |
// .globl convert_1525 | |
// convert_1525: f32 -> f16 conversion (round-to-nearest-even) from the buffer
// in param_1 to the buffer in param_0. param_2 is declared but never read.
// Each thread converts up to seven groups of four consecutive elements. Group
// k starts at element index e + k * 655360, where
// e = (ctaid.x << 9) | (tid.x << 2). Groups 0..4 are unconditional; groups 5
// and 6 are executed only while their first element index is <= 4194303.
.visible .entry convert_1525( | |
.param .u64 convert_1525_param_0, | |
.param .u64 convert_1525_param_1, | |
.param .u64 convert_1525_param_2 | |
) | |
.reqntid 128, 1, 1 | |
{ | |
.reg .pred %in_range; | |
.reg .b16 %hv<4>; | |
.reg .f32 %fv<4>; | |
.reg .b32 %blk_base, %thr_off, %elem, %probe; | |
.reg .b64 %src, %dst, %off; | |
ld.param.u64 %src, [convert_1525_param_1]; | |
cvta.to.global.u64 %src, %src; | |
ld.param.u64 %dst, [convert_1525_param_0]; | |
cvta.to.global.u64 %dst, %dst; | |
// First element handled by this thread.
mov.u32 %blk_base, %ctaid.x; | |
shl.b32 %blk_base, %blk_base, 9; | |
mov.u32 %thr_off, %tid.x; | |
shl.b32 %thr_off, %thr_off, 2; | |
or.b32 %elem, %blk_base, %thr_off; | |
// 4 bytes per source element, 2 bytes per destination element.
mul.wide.u32 %off, %elem, 4; | |
add.s64 %src, %src, %off; | |
mul.wide.u32 %off, %elem, 2; | |
add.s64 %dst, %dst, %off; | |
// Group 0.
ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src]; | |
cvt.rn.f16.f32 %hv0, %fv0; | |
cvt.rn.f16.f32 %hv1, %fv1; | |
cvt.rn.f16.f32 %hv2, %fv2; | |
cvt.rn.f16.f32 %hv3, %fv3; | |
st.global.v4.b16 [%dst], {%hv0, %hv1, %hv2, %hv3}; | |
// Group 1.
ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src+2621440]; | |
cvt.rn.f16.f32 %hv0, %fv0; | |
cvt.rn.f16.f32 %hv1, %fv1; | |
cvt.rn.f16.f32 %hv2, %fv2; | |
cvt.rn.f16.f32 %hv3, %fv3; | |
st.global.v4.b16 [%dst+1310720], {%hv0, %hv1, %hv2, %hv3}; | |
// Group 2.
ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src+5242880]; | |
cvt.rn.f16.f32 %hv0, %fv0; | |
cvt.rn.f16.f32 %hv1, %fv1; | |
cvt.rn.f16.f32 %hv2, %fv2; | |
cvt.rn.f16.f32 %hv3, %fv3; | |
st.global.v4.b16 [%dst+2621440], {%hv0, %hv1, %hv2, %hv3}; | |
// Group 3.
ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src+7864320]; | |
cvt.rn.f16.f32 %hv0, %fv0; | |
cvt.rn.f16.f32 %hv1, %fv1; | |
cvt.rn.f16.f32 %hv2, %fv2; | |
cvt.rn.f16.f32 %hv3, %fv3; | |
st.global.v4.b16 [%dst+3932160], {%hv0, %hv1, %hv2, %hv3}; | |
// Group 4.
ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src+10485760]; | |
cvt.rn.f16.f32 %hv0, %fv0; | |
cvt.rn.f16.f32 %hv1, %fv1; | |
cvt.rn.f16.f32 %hv2, %fv2; | |
cvt.rn.f16.f32 %hv3, %fv3; | |
st.global.v4.b16 [%dst+5242880], {%hv0, %hv1, %hv2, %hv3}; | |
// Group 5: only when element index e + 3276800 is still inside the 4194304-element range.
add.s32 %probe, %elem, 3276800; | |
setp.le.u32 %in_range, %probe, 4194303; | |
@%in_range ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src+13107200]; | |
@%in_range cvt.rn.f16.f32 %hv0, %fv0; | |
@%in_range cvt.rn.f16.f32 %hv1, %fv1; | |
@%in_range cvt.rn.f16.f32 %hv2, %fv2; | |
@%in_range cvt.rn.f16.f32 %hv3, %fv3; | |
@%in_range st.global.v4.b16 [%dst+6553600], {%hv0, %hv1, %hv2, %hv3}; | |
// Group 6: same guard for element index ((ctaid.x << 9) + 3932160) | (tid.x << 2).
add.s32 %probe, %blk_base, 3932160; | |
or.b32 %probe, %probe, %thr_off; | |
setp.le.u32 %in_range, %probe, 4194303; | |
@%in_range ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src+15728640]; | |
@%in_range cvt.rn.f16.f32 %hv0, %fv0; | |
@%in_range cvt.rn.f16.f32 %hv1, %fv1; | |
@%in_range cvt.rn.f16.f32 %hv2, %fv2; | |
@%in_range cvt.rn.f16.f32 %hv3, %fv3; | |
@%in_range st.global.v4.b16 [%dst+7864320], {%hv0, %hv1, %hv2, %hv3}; | |
ret; | |
} | |
// .globl fusion_2216 | |
// fusion_2216: per thread, produces four f16 outputs in param_0 from four f16
// inputs in param_1 (linear index i = (ctaid.x << 10) | (tid.x << 2)) and an
// f32 vector in param_2 indexed by (i + lane) & 4095:
//   y   = f32( x + f16(param_2[(i + lane) & 4095]) )
//   z   = (y + 0.044715 * y^3) * 0.797885          (0f3D372713, 0f3F4C422A)
//   t   = |z| < 4e-4 ? z : P(c) / Q(c), c = clamp(z, -9, 9); forced to +/-1
//         (by the sign bit of z) when |z| >= 20
//   out = f16( 0.5 * (1 + t) * y )
// param_3 is declared but never read.
// NOTE(review): the formula matches the tanh-based GELU approximation, with
// P/Q presumably a rational approximation of tanh - confirm against the
// emitting compiler.
.visible .entry fusion_2216( | |
.param .u64 fusion_2216_param_0, | |
.param .u64 fusion_2216_param_1, | |
.param .u64 fusion_2216_param_2, | |
.param .u64 fusion_2216_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<21>; | |
.reg .b16 %h<21>; | |
.reg .b32 %hh<3>; | |
.reg .f32 %f<150>; | |
.reg .b32 %r<25>; | |
.reg .b64 %rd<18>; | |
// Global-space pointers: %rd6 = param_0 (output), %rd5 = param_1 (x),
// %rd3 = param_2 (f32 vector).
ld.param.u64 %rd1, [fusion_2216_param_0]; | |
ld.param.u64 %rd2, [fusion_2216_param_2]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2216_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// %r5 = linear element index; %r12 / %r11 / %r10 / %r9 = (index + 0..3) & 4095,
// the positions read from the param_2 vector.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r5, 1; | |
or.b32 %r7, %r5, 2; | |
or.b32 %r8, %r5, 3; | |
and.b32 %r9, %r8, 4095; | |
and.b32 %r10, %r7, 4094; | |
and.b32 %r11, %r6, 4093; | |
and.b32 %r12, %r5, 4092; | |
// Load four f16 inputs; after the pack/unpack %h5..%h8 equal %h1..%h4.
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd5, %rd7; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd8]; | |
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
// ---- Lane 0 ----
// y (%f2) = f32(x + f16(vector element)); the add is done in f16.
mul.wide.u32 %rd9, %r12, 4; | |
add.s64 %rd10, %rd3, %rd9; | |
ld.global.nc.f32 %f1, [%rd10]; | |
cvt.rn.f16.f32 %h9, %f1; | |
add.rn.f16 %h10, %h5, %h9; | |
cvt.f32.f16 %f2, %h10; | |
// z (%f7) = (y + 0.044715 * y^3) * 0.797885.
mul.rn.f32 %f3, %f2, %f2; | |
mul.rn.f32 %f4, %f3, %f2; | |
mul.rn.f32 %f5, %f4, 0f3D372713; | |
add.rn.f32 %f6, %f5, %f2; | |
mul.rn.f32 %f7, %f6, 0f3F4C422A; | |
// %p1: |z| < 0f39D1B717 (about 4e-4), in which case z itself is used.
abs.f32 %f8, %f7; | |
setp.lt.f32 %p1, %f8, 0f39D1B717; | |
// c (%f10) = z clamped to [-9, 9]; %f11 = c^2.
setp.lt.f32 %p2, %f7, 0fC1100000; | |
selp.f32 %f9, 0fC1100000, %f7, %p2; | |
setp.gt.f32 %p3, %f9, 0f41100000; | |
selp.f32 %f10, 0f41100000, %f9, %p3; | |
mul.rn.f32 %f11, %f10, %f10; | |
// Numerator: c times a polynomial in c^2, evaluated by nested multiply-add.
mul.rn.f32 %f12, %f11, 0f259F25C0; | |
mov.f32 %f13, 0f2A61337E; | |
sub.rn.f32 %f14, %f13, %f12; | |
mul.rn.f32 %f15, %f11, %f14; | |
add.rn.f32 %f16, %f15, 0fAEBD37FF; | |
mul.rn.f32 %f17, %f11, %f16; | |
add.rn.f32 %f18, %f17, 0f335C0041; | |
mul.rn.f32 %f19, %f11, %f18; | |
add.rn.f32 %f20, %f19, 0f3779434A; | |
mul.rn.f32 %f21, %f11, %f20; | |
add.rn.f32 %f22, %f21, 0f3A270DED; | |
mul.rn.f32 %f23, %f11, %f22; | |
add.rn.f32 %f24, %f23, 0f3BA059DC; | |
mul.rn.f32 %f25, %f10, %f24; | |
// Denominator: polynomial in c^2.
mul.rn.f32 %f26, %f11, 0f35A0D3D8; | |
add.rn.f32 %f27, %f26, 0f38F895D6; | |
mul.rn.f32 %f28, %f11, %f27; | |
add.rn.f32 %f29, %f28, 0f3B14AA05; | |
mul.rn.f32 %f30, %f11, %f29; | |
add.rn.f32 %f31, %f30, 0f3BA059DD; | |
div.full.f32 %f32, %f25, %f31; | |
selp.f32 %f33, %f7, %f32, %p1; | |
// %f34 = -1 or +1 from the sign bit of z; used unless |z| < 20 or the
// comparison is unordered (setp.ltu).
mov.b32 %r13, %f7; | |
shr.u32 %r14, %r13, 31; | |
and.b32 %r15, %r14, 1; | |
setp.eq.b32 %p4, %r15, 1; | |
selp.f32 %f34, 0fBF800000, 0f3F800000, %p4; | |
setp.ltu.f32 %p5, %f8, 0f41A00000; | |
selp.f32 %f35, %f33, %f34, %p5; | |
// out = f16(0.5 * (1 + t) * y).
add.rn.f32 %f36, %f35, 0f3F800000; | |
mul.rn.f32 %f37, %f36, 0f3F000000; | |
mul.rn.f32 %f38, %f37, %f2; | |
cvt.rn.f16.f32 %h11, %f38; | |
// %rd11 = output address for this thread's four results.
add.s64 %rd11, %rd6, %rd7; | |
// ---- Lane 1: same computation on %h6 with vector index %r11 ----
mul.wide.u32 %rd12, %r11, 4; | |
add.s64 %rd13, %rd3, %rd12; | |
ld.global.nc.f32 %f39, [%rd13]; | |
cvt.rn.f16.f32 %h12, %f39; | |
add.rn.f16 %h13, %h6, %h12; | |
cvt.f32.f16 %f40, %h13; | |
mul.rn.f32 %f41, %f40, %f40; | |
mul.rn.f32 %f42, %f41, %f40; | |
mul.rn.f32 %f43, %f42, 0f3D372713; | |
add.rn.f32 %f44, %f43, %f40; | |
mul.rn.f32 %f45, %f44, 0f3F4C422A; | |
abs.f32 %f46, %f45; | |
setp.lt.f32 %p6, %f46, 0f39D1B717; | |
setp.lt.f32 %p7, %f45, 0fC1100000; | |
selp.f32 %f47, 0fC1100000, %f45, %p7; | |
setp.gt.f32 %p8, %f47, 0f41100000; | |
selp.f32 %f48, 0f41100000, %f47, %p8; | |
mul.rn.f32 %f49, %f48, %f48; | |
mul.rn.f32 %f50, %f49, 0f259F25C0; | |
sub.rn.f32 %f51, %f13, %f50; | |
mul.rn.f32 %f52, %f49, %f51; | |
add.rn.f32 %f53, %f52, 0fAEBD37FF; | |
mul.rn.f32 %f54, %f49, %f53; | |
add.rn.f32 %f55, %f54, 0f335C0041; | |
mul.rn.f32 %f56, %f49, %f55; | |
add.rn.f32 %f57, %f56, 0f3779434A; | |
mul.rn.f32 %f58, %f49, %f57; | |
add.rn.f32 %f59, %f58, 0f3A270DED; | |
mul.rn.f32 %f60, %f49, %f59; | |
add.rn.f32 %f61, %f60, 0f3BA059DC; | |
mul.rn.f32 %f62, %f48, %f61; | |
mul.rn.f32 %f63, %f49, 0f35A0D3D8; | |
add.rn.f32 %f64, %f63, 0f38F895D6; | |
mul.rn.f32 %f65, %f49, %f64; | |
add.rn.f32 %f66, %f65, 0f3B14AA05; | |
mul.rn.f32 %f67, %f49, %f66; | |
add.rn.f32 %f68, %f67, 0f3BA059DD; | |
div.full.f32 %f69, %f62, %f68; | |
selp.f32 %f70, %f45, %f69, %p6; | |
mov.b32 %r16, %f45; | |
shr.u32 %r17, %r16, 31; | |
and.b32 %r18, %r17, 1; | |
setp.eq.b32 %p9, %r18, 1; | |
selp.f32 %f71, 0fBF800000, 0f3F800000, %p9; | |
setp.ltu.f32 %p10, %f46, 0f41A00000; | |
selp.f32 %f72, %f70, %f71, %p10; | |
add.rn.f32 %f73, %f72, 0f3F800000; | |
mul.rn.f32 %f74, %f73, 0f3F000000; | |
mul.rn.f32 %f75, %f74, %f40; | |
cvt.rn.f16.f32 %h14, %f75; | |
// ---- Lane 2: same computation on %h7 with vector index %r10 ----
mul.wide.u32 %rd14, %r10, 4; | |
add.s64 %rd15, %rd3, %rd14; | |
ld.global.nc.f32 %f76, [%rd15]; | |
cvt.rn.f16.f32 %h15, %f76; | |
add.rn.f16 %h16, %h7, %h15; | |
cvt.f32.f16 %f77, %h16; | |
mul.rn.f32 %f78, %f77, %f77; | |
mul.rn.f32 %f79, %f78, %f77; | |
mul.rn.f32 %f80, %f79, 0f3D372713; | |
add.rn.f32 %f81, %f80, %f77; | |
mul.rn.f32 %f82, %f81, 0f3F4C422A; | |
abs.f32 %f83, %f82; | |
setp.lt.f32 %p11, %f83, 0f39D1B717; | |
setp.lt.f32 %p12, %f82, 0fC1100000; | |
selp.f32 %f84, 0fC1100000, %f82, %p12; | |
setp.gt.f32 %p13, %f84, 0f41100000; | |
selp.f32 %f85, 0f41100000, %f84, %p13; | |
mul.rn.f32 %f86, %f85, %f85; | |
mul.rn.f32 %f87, %f86, 0f259F25C0; | |
sub.rn.f32 %f88, %f13, %f87; | |
mul.rn.f32 %f89, %f86, %f88; | |
add.rn.f32 %f90, %f89, 0fAEBD37FF; | |
mul.rn.f32 %f91, %f86, %f90; | |
add.rn.f32 %f92, %f91, 0f335C0041; | |
mul.rn.f32 %f93, %f86, %f92; | |
add.rn.f32 %f94, %f93, 0f3779434A; | |
mul.rn.f32 %f95, %f86, %f94; | |
add.rn.f32 %f96, %f95, 0f3A270DED; | |
mul.rn.f32 %f97, %f86, %f96; | |
add.rn.f32 %f98, %f97, 0f3BA059DC; | |
mul.rn.f32 %f99, %f85, %f98; | |
mul.rn.f32 %f100, %f86, 0f35A0D3D8; | |
add.rn.f32 %f101, %f100, 0f38F895D6; | |
mul.rn.f32 %f102, %f86, %f101; | |
add.rn.f32 %f103, %f102, 0f3B14AA05; | |
mul.rn.f32 %f104, %f86, %f103; | |
add.rn.f32 %f105, %f104, 0f3BA059DD; | |
div.full.f32 %f106, %f99, %f105; | |
selp.f32 %f107, %f82, %f106, %p11; | |
mov.b32 %r19, %f82; | |
shr.u32 %r20, %r19, 31; | |
and.b32 %r21, %r20, 1; | |
setp.eq.b32 %p14, %r21, 1; | |
selp.f32 %f108, 0fBF800000, 0f3F800000, %p14; | |
setp.ltu.f32 %p15, %f83, 0f41A00000; | |
selp.f32 %f109, %f107, %f108, %p15; | |
add.rn.f32 %f110, %f109, 0f3F800000; | |
mul.rn.f32 %f111, %f110, 0f3F000000; | |
mul.rn.f32 %f112, %f111, %f77; | |
cvt.rn.f16.f32 %h17, %f112; | |
// ---- Lane 3: same computation on %h8 with vector index %r9 ----
mul.wide.u32 %rd16, %r9, 4; | |
add.s64 %rd17, %rd3, %rd16; | |
ld.global.nc.f32 %f113, [%rd17]; | |
cvt.rn.f16.f32 %h18, %f113; | |
add.rn.f16 %h19, %h8, %h18; | |
cvt.f32.f16 %f114, %h19; | |
mul.rn.f32 %f115, %f114, %f114; | |
mul.rn.f32 %f116, %f115, %f114; | |
mul.rn.f32 %f117, %f116, 0f3D372713; | |
add.rn.f32 %f118, %f117, %f114; | |
mul.rn.f32 %f119, %f118, 0f3F4C422A; | |
abs.f32 %f120, %f119; | |
setp.lt.f32 %p16, %f120, 0f39D1B717; | |
setp.lt.f32 %p17, %f119, 0fC1100000; | |
selp.f32 %f121, 0fC1100000, %f119, %p17; | |
setp.gt.f32 %p18, %f121, 0f41100000; | |
selp.f32 %f122, 0f41100000, %f121, %p18; | |
mul.rn.f32 %f123, %f122, %f122; | |
mul.rn.f32 %f124, %f123, 0f259F25C0; | |
sub.rn.f32 %f125, %f13, %f124; | |
mul.rn.f32 %f126, %f123, %f125; | |
add.rn.f32 %f127, %f126, 0fAEBD37FF; | |
mul.rn.f32 %f128, %f123, %f127; | |
add.rn.f32 %f129, %f128, 0f335C0041; | |
mul.rn.f32 %f130, %f123, %f129; | |
add.rn.f32 %f131, %f130, 0f3779434A; | |
mul.rn.f32 %f132, %f123, %f131; | |
add.rn.f32 %f133, %f132, 0f3A270DED; | |
mul.rn.f32 %f134, %f123, %f133; | |
add.rn.f32 %f135, %f134, 0f3BA059DC; | |
mul.rn.f32 %f136, %f122, %f135; | |
mul.rn.f32 %f137, %f123, 0f35A0D3D8; | |
add.rn.f32 %f138, %f137, 0f38F895D6; | |
mul.rn.f32 %f139, %f123, %f138; | |
add.rn.f32 %f140, %f139, 0f3B14AA05; | |
mul.rn.f32 %f141, %f123, %f140; | |
add.rn.f32 %f142, %f141, 0f3BA059DD; | |
div.full.f32 %f143, %f136, %f142; | |
selp.f32 %f144, %f119, %f143, %p16; | |
mov.b32 %r22, %f119; | |
shr.u32 %r23, %r22, 31; | |
and.b32 %r24, %r23, 1; | |
setp.eq.b32 %p19, %r24, 1; | |
selp.f32 %f145, 0fBF800000, 0f3F800000, %p19; | |
setp.ltu.f32 %p20, %f120, 0f41A00000; | |
selp.f32 %f146, %f144, %f145, %p20; | |
add.rn.f32 %f147, %f146, 0f3F800000; | |
mul.rn.f32 %f148, %f147, 0f3F000000; | |
mul.rn.f32 %f149, %f148, %f114; | |
cvt.rn.f16.f32 %h20, %f149; | |
// Store all four lanes as one 8-byte vector.
st.global.v4.b16 [%rd11], {%h11, %h14, %h17, %h20}; | |
ret; | |
} | |
// .globl convert_1527 | |
//
// convert_1527: element-wise f32 -> f16 conversion kernel.
//
// Parameters (all .u64, treated as generic addresses and converted to global):
//   param_0  destination base; written as packed 16-bit values (2 bytes/element)
//   param_1  source base; read as f32 (4 bytes/element)
//   param_2  declared but never read in this body
//
// Launch contract: .reqntid requires exactly 128 threads per CTA in x.
//
// Each thread handles 4 consecutive elements per step, for up to 7 steps.
// Consecutive steps are 655360 elements apart (2621440 source bytes,
// 1310720 destination bytes). The first five steps are unconditional; the
// last two are skipped when their element index would exceed 4194303.
// NOTE(review): the guards imply an array of 4194304 elements and the step
// distance suggests a 1280-CTA grid -- launch dimensions are not visible
// here, confirm against the host-side launch.
.visible .entry convert_1527( | |
.param .u64 convert_1527_param_0, | |
.param .u64 convert_1527_param_1, | |
.param .u64 convert_1527_param_2 | |
) | |
.reqntid 128, 1, 1 | |
{ | |
.reg .pred %p<3>; | |
.reg .b16 %h<29>; | |
.reg .f32 %f<29>; | |
.reg .b32 %r<9>; | |
.reg .b64 %rd<9>; | |
// %rd6 = destination base (param_0), %rd5 = source base (param_1).
ld.param.u64 %rd3, [convert_1527_param_0]; | |
ld.param.u64 %rd4, [convert_1527_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd3; | |
// Base element index: %r5 = (ctaid.x << 9) | (tid.x << 2).
// With tid.x < 128 the two terms occupy disjoint bits, so the OR is
// equivalent to ctaid.x * 512 + tid.x * 4.
mov.u32 %r3, %ctaid.x; | |
mov.u32 %r4, %tid.x; | |
shl.b32 %r1, %r3, 9; | |
shl.b32 %r2, %r4, 2; | |
or.b32 %r5, %r1, %r2; | |
// %rd1 = source address of this thread's first element (index * 4 bytes).
mul.wide.u32 %rd7, %r5, 4; | |
add.s64 %rd1, %rd5, %rd7; | |
// Step 0: load four f32 through the non-coherent (.nc) path, convert each
// to f16 with round-to-nearest-even, store the four halves as one vector.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; | |
cvt.rn.f16.f32 %h1, %f1; | |
// %rd2 = destination address of the same element (index * 2 bytes).
mul.wide.u32 %rd8, %r5, 2; | |
add.s64 %rd2, %rd6, %rd8; | |
cvt.rn.f16.f32 %h2, %f2; | |
cvt.rn.f16.f32 %h3, %f3; | |
cvt.rn.f16.f32 %h4, %f4; | |
st.global.v4.b16 [%rd2], {%h1, %h2, %h3, %h4}; | |
// Step 1: elements at index + 655360.
ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+2621440]; | |
cvt.rn.f16.f32 %h5, %f5; | |
cvt.rn.f16.f32 %h6, %f6; | |
cvt.rn.f16.f32 %h7, %f7; | |
cvt.rn.f16.f32 %h8, %f8; | |
st.global.v4.b16 [%rd2+1310720], {%h5, %h6, %h7, %h8}; | |
// Step 2: elements at index + 1310720.
ld.global.nc.v4.f32 {%f9, %f10, %f11, %f12}, [%rd1+5242880]; | |
cvt.rn.f16.f32 %h9, %f9; | |
cvt.rn.f16.f32 %h10, %f10; | |
cvt.rn.f16.f32 %h11, %f11; | |
cvt.rn.f16.f32 %h12, %f12; | |
st.global.v4.b16 [%rd2+2621440], {%h9, %h10, %h11, %h12}; | |
// Step 3: elements at index + 1966080.
ld.global.nc.v4.f32 {%f13, %f14, %f15, %f16}, [%rd1+7864320]; | |
cvt.rn.f16.f32 %h13, %f13; | |
cvt.rn.f16.f32 %h14, %f14; | |
cvt.rn.f16.f32 %h15, %f15; | |
cvt.rn.f16.f32 %h16, %f16; | |
st.global.v4.b16 [%rd2+3932160], {%h13, %h14, %h15, %h16}; | |
// Step 4: elements at index + 2621440.
ld.global.nc.v4.f32 {%f17, %f18, %f19, %f20}, [%rd1+10485760]; | |
cvt.rn.f16.f32 %h17, %f17; | |
cvt.rn.f16.f32 %h18, %f18; | |
cvt.rn.f16.f32 %h19, %f19; | |
cvt.rn.f16.f32 %h20, %f20; | |
st.global.v4.b16 [%rd2+5242880], {%h17, %h18, %h19, %h20}; | |
// Step 5 (guarded): skipped when index + 3276800 exceeds 4194303.
add.s32 %r6, %r5, 3276800; | |
setp.gt.u32 %p1, %r6, 4194303; | |
@%p1 bra LBB52_2; | |
ld.global.nc.v4.f32 {%f21, %f22, %f23, %f24}, [%rd1+13107200]; | |
cvt.rn.f16.f32 %h21, %f21; | |
cvt.rn.f16.f32 %h22, %f22; | |
cvt.rn.f16.f32 %h23, %f23; | |
cvt.rn.f16.f32 %h24, %f24; | |
st.global.v4.b16 [%rd2+6553600], {%h21, %h22, %h23, %h24}; | |
LBB52_2: | |
// Step 6 (guarded): the index is rebuilt as ((ctaid.x << 9) + 3932160) | (tid.x << 2)
// and the step is skipped when it exceeds 4194303.
add.s32 %r7, %r1, 3932160; | |
or.b32 %r8, %r7, %r2; | |
setp.gt.u32 %p2, %r8, 4194303; | |
@%p2 bra LBB52_4; | |
ld.global.nc.v4.f32 {%f25, %f26, %f27, %f28}, [%rd1+15728640]; | |
cvt.rn.f16.f32 %h25, %f25; | |
cvt.rn.f16.f32 %h26, %f26; | |
cvt.rn.f16.f32 %h27, %f27; | |
cvt.rn.f16.f32 %h28, %f28; | |
st.global.v4.b16 [%rd2+7864320], {%h25, %h26, %h27, %h28}; | |
LBB52_4: | |
ret; | |
} | |
// .globl rng_get_and_update_state_5 | |
//
// rng_get_and_update_state_5: reads the 16-byte module-level `rng_state`,
// advances it by 524288 as a 128-bit unsigned value (two 64-bit words with
// carry), and writes the value it held BEFORE the increment to the output
// buffer.
//
// Parameters:
//   param_0  output address (converted to global); receives two u64 words:
//            the old low word at +0 and the old high word at +8
//   param_1  declared but never read in this body
//
// NOTE(review): the update is a plain load/add/store sequence with no atomics
// and no thread-index guard, so every thread that runs this kernel performs
// the same read-modify-write. Presumably it is launched with a single
// thread -- confirm against the host-side launch.
// NOTE(review): 524288 (2^19) is presumably the number of counter values
// reserved for the consumer of this state -- confirm.
.visible .entry rng_get_and_update_state_5( | |
.param .u64 rng_get_and_update_state_5_param_0, | |
.param .u64 rng_get_and_update_state_5_param_1 | |
) | |
{ | |
.reg .pred %p<3>; | |
.reg .b64 %rd<9>; | |
// %rd2 = output buffer address in the global space.
ld.param.u64 %rd1, [rng_get_and_update_state_5_param_0]; | |
cvta.to.global.u64 %rd2, %rd1; | |
// Current state: %rd3 = high word (bytes 8..15), %rd4 = low word (bytes 0..7).
ld.global.u64 %rd3, [rng_state+8]; | |
ld.global.u64 %rd4, [rng_state]; | |
// New low word; the addition may wrap around 2^64.
add.s64 %rd5, %rd4, 524288; | |
// Carry detection: the sum wrapped iff it is smaller than either addend.
// Both tests below detect that same condition, so %rd7 ends up as 1 when
// the low word overflowed and 0 otherwise.
setp.lt.u64 %p1, %rd5, %rd4; | |
selp.u64 %rd6, 1, 0, %p1; | |
setp.lt.u64 %p2, %rd5, 524288; | |
selp.b64 %rd7, 1, %rd6, %p2; | |
// Propagate the carry into the high word.
add.s64 %rd8, %rd3, %rd7; | |
// Store the advanced state back to rng_state.
st.global.u64 [rng_state], %rd5; | |
st.global.u64 [rng_state+8], %rd8; | |
// Publish the pre-increment state to the caller's buffer.
st.global.u64 [%rd2+8], %rd3; | |
st.global.u64 [%rd2], %rd4; | |
ret; | |
} | |
// .globl fusion_2214 | |
.visible .entry fusion_2214( | |
.param .u64 fusion_2214_param_0, | |
.param .u64 fusion_2214_param_1, | |
.param .u64 fusion_2214_param_2, | |
.param .u64 fusion_2214_param_3, | |
.param .u64 fusion_2214_param_4, | |
.param .u64 fusion_2214_param_5, | |
.param .u64 fusion_2214_param_6, | |
.param .u64 fusion_2214_param_7, | |
.param .u64 fusion_2214_param_8, | |
.param .u64 fusion_2214_param_9 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
.local .align 4 .b8 __local_depot54[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<75>; | |
.reg .b16 %h<145>; | |
.reg .f32 %f<254>; | |
.reg .b32 %r<350>; | |
.reg .b64 %rd<2739>; | |
mov.u64 %SPL, __local_depot54; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd463, [fusion_2214_param_0]; | |
ld.param.u64 %rd464, [fusion_2214_param_8]; | |
cvta.to.global.u64 %rd1, %rd464; | |
ld.param.u64 %rd465, [fusion_2214_param_1]; | |
ld.param.u64 %rd466, [fusion_2214_param_7]; | |
cvta.to.global.u64 %rd2, %rd466; | |
ld.param.u64 %rd467, [fusion_2214_param_2]; | |
ld.param.u64 %rd468, [fusion_2214_param_6]; | |
cvta.to.global.u64 %rd3, %rd468; | |
ld.param.u64 %rd470, [fusion_2214_param_5]; | |
cvta.to.global.u64 %rd4, %rd470; | |
ld.param.u64 %rd471, [fusion_2214_param_4]; | |
cvta.to.global.u64 %rd5, %rd471; | |
cvta.to.global.u64 %rd7, %rd467; | |
cvta.to.global.u64 %rd8, %rd465; | |
cvta.to.global.u64 %rd9, %rd463; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r2, %ctaid.x; | |
shl.b32 %r3, %r1, 1; | |
shl.b32 %r4, %r2, 10; | |
or.b32 %r48, %r4, %r3; | |
shr.u32 %r49, %r48, 2; | |
and.b32 %r5, %r1, 1; | |
setp.eq.s32 %p1, %r5, 0; | |
ld.global.nc.u64 %rd11, [%rd7]; | |
cvt.u64.u32 %rd473, %r49; | |
add.s64 %rd12, %rd11, %rd473; | |
setp.lt.u64 %p69, %rd12, %rd11; | |
and.b64 %rd2384, %rd12, 4294967295; | |
@%p1 bra LBB54_1; | |
bra.uni LBB54_4; | |
LBB54_1: | |
mul.lo.s64 %rd2446, %rd2384, 3528531795; | |
ld.global.nc.u64 %rd2461, [%rd7+8]; | |
selp.u64 %rd516, 1, 0, %p69; | |
add.s64 %rd517, %rd2461, %rd516; | |
xor.b64 %rd518, %rd517, %rd2446; | |
shr.u64 %rd519, %rd518, 32; | |
mul.lo.s64 %rd2449, %rd519, 3449720151; | |
shr.u64 %rd520, %rd2449, 32; | |
and.b64 %rd521, %rd517, 4294967295; | |
mul.lo.s64 %rd522, %rd521, 3449720151; | |
and.b64 %rd523, %rd522, 4294967295; | |
xor.b64 %rd524, %rd523, %rd520; | |
xor.b64 %rd525, %rd524, 2654435769; | |
mul.lo.s64 %rd2452, %rd525, 3528531795; | |
xor.b64 %rd2442, %rd522, %rd12; | |
mov.u32 %r312, -1879881855; | |
mov.u32 %r311, -845247145; | |
mov.u32 %r310, 534103459; | |
mov.u64 %rd2460, 3678237736; | |
mov.u64 %rd2459, 3041712726; | |
mov.u64 %rd2458, 1401181199; | |
mov.u64 %rd2457, 2835769497; | |
mov.u64 %rd2456, 1684936478; | |
mov.u64 %rd2455, 2027808484; | |
mov.u64 %rd2454, 387276957; | |
mov.u64 %rd2453, 842468239; | |
mov.u64 %rd2451, 3986602516; | |
mov.u64 %rd2450, 1013904242; | |
mov.u64 %rd2448, 3668340011; | |
mov.u64 %rd2447, 3144134277; | |
mov.u64 %rd2445, 3449720151; | |
mov.u64 %rd2444, 1993301258; | |
mov.u64 %rd2443, 3528531795; | |
bra.uni LBB54_5; | |
LBB54_4: | |
mov.u32 %r311, -766435501; | |
mov.u64 %rd2459, 1684936478; | |
mov.u64 %rd2458, 534103459; | |
mov.u64 %rd2457, 387276957; | |
mov.u64 %rd2456, 3041712726; | |
mov.u64 %rd2455, 3986602516; | |
mov.u64 %rd2454, 2835769497; | |
mov.u64 %rd2453, 3668340011; | |
mov.u64 %rd2451, 2027808484; | |
mov.u64 %rd2450, 1993301258; | |
mov.u64 %rd2448, 842468239; | |
mov.u64 %rd2447, 2654435769; | |
mov.u64 %rd2445, 3528531795; | |
mov.u64 %rd2444, 1013904242; | |
mov.u64 %rd2443, 3449720151; | |
mov.u32 %r312, -1767562579; | |
mov.u32 %r310, 1401181199; | |
mov.u64 %rd2460, 4055616968; | |
ld.global.nc.u64 %rd2461, [%rd7+8]; | |
selp.u64 %rd489, 1, 0, %p69; | |
add.s64 %rd490, %rd2461, %rd489; | |
and.b64 %rd491, %rd490, 4294967295; | |
mul.lo.s64 %rd2446, %rd491, 3449720151; | |
xor.b64 %rd492, %rd2446, %rd12; | |
shr.u64 %rd493, %rd492, 32; | |
mul.lo.s64 %rd2449, %rd493, 3528531795; | |
shr.u64 %rd494, %rd2449, 32; | |
mul.lo.s64 %rd496, %rd2384, 3528531795; | |
and.b64 %rd497, %rd496, 4294967295; | |
xor.b64 %rd498, %rd497, %rd494; | |
xor.b64 %rd499, %rd498, 3144134277; | |
mul.lo.s64 %rd2452, %rd499, 3449720151; | |
xor.b64 %rd2442, %rd490, %rd496; | |
LBB54_5: | |
shr.u64 %rd526, %rd2452, 32; | |
shr.u64 %rd527, %rd2442, 32; | |
mul.lo.s64 %rd528, %rd527, %rd2443; | |
and.b64 %rd529, %rd528, 4294967295; | |
xor.b64 %rd530, %rd529, %rd526; | |
xor.b64 %rd531, %rd530, %rd2444; | |
mul.lo.s64 %rd532, %rd531, %rd2445; | |
shr.u64 %rd533, %rd532, 32; | |
shr.u64 %rd534, %rd528, 32; | |
and.b64 %rd535, %rd2446, 4294967295; | |
xor.b64 %rd536, %rd535, %rd534; | |
xor.b64 %rd537, %rd536, %rd2447; | |
mul.lo.s64 %rd538, %rd537, %rd2445; | |
and.b64 %rd539, %rd538, 4294967295; | |
xor.b64 %rd540, %rd539, %rd533; | |
xor.b64 %rd541, %rd540, %rd2448; | |
mul.lo.s64 %rd542, %rd541, %rd2443; | |
shr.u64 %rd543, %rd542, 32; | |
shr.u64 %rd544, %rd538, 32; | |
and.b64 %rd545, %rd2449, 4294967295; | |
xor.b64 %rd546, %rd545, %rd544; | |
xor.b64 %rd547, %rd546, %rd2450; | |
mul.lo.s64 %rd548, %rd547, %rd2443; | |
and.b64 %rd549, %rd548, 4294967295; | |
xor.b64 %rd550, %rd549, %rd543; | |
xor.b64 %rd551, %rd550, %rd2451; | |
mul.lo.s64 %rd552, %rd551, %rd2445; | |
shr.u64 %rd553, %rd552, 32; | |
shr.u64 %rd554, %rd548, 32; | |
and.b64 %rd555, %rd2452, 4294967295; | |
xor.b64 %rd556, %rd555, %rd554; | |
xor.b64 %rd557, %rd556, %rd2453; | |
mul.lo.s64 %rd558, %rd557, %rd2445; | |
and.b64 %rd559, %rd558, 4294967295; | |
xor.b64 %rd560, %rd559, %rd553; | |
xor.b64 %rd561, %rd560, %rd2454; | |
mul.lo.s64 %rd562, %rd561, %rd2443; | |
shr.u64 %rd563, %rd562, 32; | |
shr.u64 %rd564, %rd558, 32; | |
and.b64 %rd565, %rd532, 4294967295; | |
xor.b64 %rd566, %rd565, %rd564; | |
xor.b64 %rd567, %rd566, %rd2455; | |
mul.lo.s64 %rd568, %rd567, %rd2443; | |
and.b64 %rd569, %rd568, 4294967295; | |
xor.b64 %rd570, %rd569, %rd563; | |
xor.b64 %rd571, %rd570, %rd2456; | |
mul.lo.s64 %rd572, %rd571, %rd2445; | |
shr.u64 %rd573, %rd572, 32; | |
shr.u64 %rd574, %rd568, 32; | |
and.b64 %rd575, %rd542, 4294967295; | |
xor.b64 %rd576, %rd575, %rd574; | |
xor.b64 %rd577, %rd576, %rd2457; | |
mul.lo.s64 %rd578, %rd577, %rd2445; | |
and.b64 %rd579, %rd578, 4294967295; | |
xor.b64 %rd580, %rd579, %rd573; | |
xor.b64 %rd581, %rd580, %rd2458; | |
mul.lo.s64 %rd582, %rd581, %rd2443; | |
shr.u64 %rd583, %rd582, 32; | |
shr.u64 %rd584, %rd578, 32; | |
and.b64 %rd585, %rd552, 4294967295; | |
xor.b64 %rd586, %rd585, %rd584; | |
xor.b64 %rd587, %rd586, %rd2459; | |
mul.lo.s64 %rd588, %rd587, %rd2443; | |
and.b64 %rd589, %rd588, 4294967295; | |
xor.b64 %rd590, %rd589, %rd583; | |
xor.b64 %rd591, %rd590, %rd2460; | |
mul.lo.s64 %rd592, %rd591, %rd2445; | |
shr.u64 %rd593, %rd592, 32; | |
cvt.u32.u64 %r56, %rd593; | |
shr.u64 %rd594, %rd588, 32; | |
xor.b64 %rd595, %rd594, %rd562; | |
cvt.u32.u64 %r57, %rd595; | |
xor.b32 %r58, %r310, %r57; | |
mul.lo.s32 %r59, %r58, %r311; | |
xor.b32 %r60, %r59, %r56; | |
xor.b32 %r61, %r60, %r312; | |
shr.u32 %r62, %r61, 9; | |
cvt.rn.f32.u32 %f19, %r62; | |
mul.rn.f32 %f20, %f19, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f20; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p4, %h1, %h2; | |
mul.wide.u32 %rd596, %r2, 2048; | |
add.s64 %rd597, %rd9, %rd596; | |
mul.wide.u32 %rd598, %r3, 2; | |
add.s64 %rd44, %rd597, %rd598; | |
ld.global.nc.b16 %h3, [%rd44]; | |
mul.wide.u32 %rd599, %r3, 4; | |
add.s64 %rd45, %rd1, %rd599; | |
ld.global.nc.f32 %f21, [%rd45]; | |
cvt.rn.f16.f32 %h4, %f21; | |
add.rn.f16 %h5, %h3, %h4; | |
mov.b16 %h6, 0x3C72; | |
mul.rn.f16 %h7, %h5, %h6; | |
selp.b16 %h8, %h7, 0x0000, %p4; | |
cvt.f32.f16 %f22, %h8; | |
add.s64 %rd600, %rd8, %rd596; | |
add.s64 %rd46, %rd600, %rd598; | |
ld.global.nc.b16 %h9, [%rd46]; | |
cvt.f32.f16 %f23, %h9; | |
mul.wide.u32 %rd601, %r2, 4; | |
add.s64 %rd602, %rd5, %rd601; | |
ld.global.nc.f32 %f24, [%rd602]; | |
mul.rn.f32 %f25, %f24, 0f3A800000; | |
add.rn.f32 %f26, %f25, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f1, %f26; | |
add.s64 %rd47, %rd2, %rd599; | |
ld.global.nc.f32 %f27, [%rd47]; | |
mul.rn.f32 %f28, %f1, %f27; | |
mul.rn.f32 %f29, %f28, %f23; | |
add.s64 %rd48, %rd3, %rd599; | |
ld.global.nc.f32 %f30, [%rd48]; | |
add.s64 %rd603, %rd4, %rd601; | |
ld.global.nc.f32 %f31, [%rd603]; | |
mul.rn.f32 %f2, %f31, 0f3A800000; | |
mul.rn.f32 %f32, %f28, %f2; | |
sub.rn.f32 %f33, %f30, %f32; | |
add.rn.f32 %f34, %f29, %f33; | |
add.rn.f32 %f35, %f34, %f22; | |
add.rn.f32 %f3, %f35, 0f00000000; | |
or.b32 %r63, %r3, 1; | |
and.b32 %r64, %r63, 3; | |
setp.ne.s32 %p5, %r64, 1; | |
@%p5 bra LBB54_7; | |
mul.lo.s64 %rd2466, %rd2384, 3528531795; | |
selp.u64 %rd644, 1, 0, %p69; | |
add.s64 %rd645, %rd2461, %rd644; | |
xor.b64 %rd646, %rd645, %rd2466; | |
shr.u64 %rd647, %rd646, 32; | |
mul.lo.s64 %rd2469, %rd647, 3449720151; | |
shr.u64 %rd648, %rd2469, 32; | |
and.b64 %rd649, %rd645, 4294967295; | |
mul.lo.s64 %rd650, %rd649, 3449720151; | |
and.b64 %rd651, %rd650, 4294967295; | |
xor.b64 %rd652, %rd651, %rd648; | |
xor.b64 %rd653, %rd652, 2654435769; | |
mul.lo.s64 %rd2472, %rd653, 3528531795; | |
xor.b64 %rd2462, %rd650, %rd12; | |
mov.u32 %r314, -845247145; | |
mov.u32 %r313, -616729560; | |
mov.u64 %rd2479, 3041712726; | |
mov.u64 %rd2478, 1401181199; | |
mov.u64 %rd2477, 2835769497; | |
mov.u64 %rd2476, 1684936478; | |
mov.u64 %rd2475, 2027808484; | |
mov.u64 %rd2474, 387276957; | |
mov.u64 %rd2473, 842468239; | |
mov.u64 %rd2471, 3986602516; | |
mov.u64 %rd2470, 1013904242; | |
mov.u64 %rd2468, 3668340011; | |
mov.u64 %rd2467, 3144134277; | |
mov.u64 %rd2465, 3449720151; | |
mov.u64 %rd2464, 1993301258; | |
mov.u64 %rd2463, 3528531795; | |
bra.uni LBB54_8; | |
LBB54_7: | |
mov.u32 %r313, -239350328; | |
selp.u64 %rd618, 1, 0, %p69; | |
add.s64 %rd619, %rd2461, %rd618; | |
and.b64 %rd620, %rd619, 4294967295; | |
mul.lo.s64 %rd2466, %rd620, 3449720151; | |
xor.b64 %rd621, %rd2466, %rd12; | |
shr.u64 %rd622, %rd621, 32; | |
mul.lo.s64 %rd2469, %rd622, 3528531795; | |
shr.u64 %rd623, %rd2469, 32; | |
mul.lo.s64 %rd625, %rd2384, 3528531795; | |
and.b64 %rd626, %rd625, 4294967295; | |
xor.b64 %rd627, %rd626, %rd623; | |
xor.b64 %rd628, %rd627, 3144134277; | |
mul.lo.s64 %rd2472, %rd628, 3449720151; | |
xor.b64 %rd2462, %rd619, %rd625; | |
mov.u32 %r314, -766435501; | |
mov.u64 %rd2479, 1684936478; | |
mov.u64 %rd2478, 534103459; | |
mov.u64 %rd2477, 387276957; | |
mov.u64 %rd2476, 3041712726; | |
mov.u64 %rd2475, 3986602516; | |
mov.u64 %rd2474, 2835769497; | |
mov.u64 %rd2473, 3668340011; | |
mov.u64 %rd2471, 2027808484; | |
mov.u64 %rd2470, 1993301258; | |
mov.u64 %rd2468, 842468239; | |
mov.u64 %rd2467, 2654435769; | |
mov.u64 %rd2465, 3528531795; | |
mov.u64 %rd2464, 1013904242; | |
mov.u64 %rd2463, 3449720151; | |
LBB54_8: | |
setp.ne.s32 %p8, %r5, 0; | |
shr.u64 %rd654, %rd2472, 32; | |
shr.u64 %rd655, %rd2462, 32; | |
mul.lo.s64 %rd656, %rd655, %rd2463; | |
and.b64 %rd657, %rd656, 4294967295; | |
xor.b64 %rd658, %rd657, %rd654; | |
xor.b64 %rd659, %rd658, %rd2464; | |
mul.lo.s64 %rd660, %rd659, %rd2465; | |
shr.u64 %rd661, %rd660, 32; | |
shr.u64 %rd662, %rd656, 32; | |
and.b64 %rd663, %rd2466, 4294967295; | |
xor.b64 %rd664, %rd663, %rd662; | |
xor.b64 %rd665, %rd664, %rd2467; | |
mul.lo.s64 %rd666, %rd665, %rd2465; | |
and.b64 %rd667, %rd666, 4294967295; | |
xor.b64 %rd668, %rd667, %rd661; | |
xor.b64 %rd669, %rd668, %rd2468; | |
mul.lo.s64 %rd670, %rd669, %rd2463; | |
shr.u64 %rd671, %rd670, 32; | |
shr.u64 %rd672, %rd666, 32; | |
and.b64 %rd673, %rd2469, 4294967295; | |
xor.b64 %rd674, %rd673, %rd672; | |
xor.b64 %rd675, %rd674, %rd2470; | |
mul.lo.s64 %rd676, %rd675, %rd2463; | |
and.b64 %rd677, %rd676, 4294967295; | |
xor.b64 %rd678, %rd677, %rd671; | |
xor.b64 %rd679, %rd678, %rd2471; | |
mul.lo.s64 %rd680, %rd679, %rd2465; | |
shr.u64 %rd681, %rd680, 32; | |
shr.u64 %rd682, %rd676, 32; | |
and.b64 %rd683, %rd2472, 4294967295; | |
xor.b64 %rd684, %rd683, %rd682; | |
xor.b64 %rd685, %rd684, %rd2473; | |
mul.lo.s64 %rd686, %rd685, %rd2465; | |
and.b64 %rd687, %rd686, 4294967295; | |
xor.b64 %rd688, %rd687, %rd681; | |
xor.b64 %rd689, %rd688, %rd2474; | |
mul.lo.s64 %rd690, %rd689, %rd2463; | |
shr.u64 %rd691, %rd690, 32; | |
shr.u64 %rd692, %rd686, 32; | |
and.b64 %rd693, %rd660, 4294967295; | |
xor.b64 %rd694, %rd693, %rd692; | |
xor.b64 %rd695, %rd694, %rd2475; | |
mul.lo.s64 %rd696, %rd695, %rd2463; | |
and.b64 %rd697, %rd696, 4294967295; | |
xor.b64 %rd698, %rd697, %rd691; | |
xor.b64 %rd699, %rd698, %rd2476; | |
mul.lo.s64 %rd700, %rd699, %rd2465; | |
shr.u64 %rd701, %rd700, 32; | |
shr.u64 %rd702, %rd696, 32; | |
and.b64 %rd703, %rd670, 4294967295; | |
xor.b64 %rd704, %rd703, %rd702; | |
xor.b64 %rd705, %rd704, %rd2477; | |
mul.lo.s64 %rd706, %rd705, %rd2465; | |
and.b64 %rd707, %rd706, 4294967295; | |
xor.b64 %rd708, %rd707, %rd701; | |
xor.b64 %rd709, %rd708, %rd2478; | |
mul.lo.s64 %rd710, %rd709, %rd2463; | |
shr.u64 %rd711, %rd710, 32; | |
shr.u64 %rd712, %rd706, 32; | |
xor.b64 %rd713, %rd680, %rd712; | |
xor.b64 %rd714, %rd713, %rd2479; | |
mul.lo.s64 %rd715, %rd714, %rd2463; | |
xor.b64 %rd716, %rd711, %rd715; | |
cvt.u32.u64 %r69, %rd716; | |
xor.b32 %r70, %r313, %r69; | |
mul.lo.s32 %r71, %r70, %r314; | |
shr.u32 %r72, %r71, 9; | |
cvt.rn.f32.u32 %f36, %r72; | |
mul.rn.f32 %f37, %f36, 0f34000000; | |
cvt.rn.f16.f32 %h10, %f37; | |
mov.b16 %h11, 0x2E66; | |
setp.ge.f16 %p9, %h10, %h11; | |
ld.global.nc.b16 %h12, [%rd44+2]; | |
ld.global.nc.f32 %f38, [%rd45+4]; | |
cvt.rn.f16.f32 %h13, %f38; | |
add.rn.f16 %h14, %h12, %h13; | |
mov.b16 %h15, 0x3C72; | |
mul.rn.f16 %h16, %h14, %h15; | |
selp.b16 %h17, %h16, 0x0000, %p9; | |
cvt.f32.f16 %f39, %h17; | |
ld.global.nc.b16 %h18, [%rd46+2]; | |
cvt.f32.f16 %f40, %h18; | |
ld.global.nc.f32 %f41, [%rd47+4]; | |
mul.rn.f32 %f42, %f1, %f41; | |
mul.rn.f32 %f43, %f42, %f40; | |
ld.global.nc.f32 %f44, [%rd48+4]; | |
mul.rn.f32 %f45, %f2, %f42; | |
sub.rn.f32 %f46, %f44, %f45; | |
add.rn.f32 %f47, %f43, %f46; | |
add.rn.f32 %f48, %f47, %f39; | |
add.rn.f32 %f4, %f3, %f48; | |
or.b32 %r73, %r3, %r4; | |
or.b32 %r74, %r73, 128; | |
shr.u32 %r75, %r74, 2; | |
cvt.u64.u32 %rd717, %r75; | |
add.s64 %rd75, %rd11, %rd717; | |
and.b64 %rd2433, %rd75, 4294967295; | |
setp.lt.u64 %p74, %rd75, %rd11; | |
@%p8 bra LBB54_10; | |
mul.lo.s64 %rd2484, %rd2433, 3528531795; | |
selp.u64 %rd760, 1, 0, %p74; | |
add.s64 %rd761, %rd2461, %rd760; | |
xor.b64 %rd762, %rd761, %rd2484; | |
shr.u64 %rd763, %rd762, 32; | |
mul.lo.s64 %rd2487, %rd763, 3449720151; | |
shr.u64 %rd764, %rd2487, 32; | |
and.b64 %rd765, %rd761, 4294967295; | |
mul.lo.s64 %rd766, %rd765, 3449720151; | |
and.b64 %rd767, %rd766, 4294967295; | |
xor.b64 %rd768, %rd767, %rd764; | |
xor.b64 %rd769, %rd768, 2654435769; | |
mul.lo.s64 %rd2490, %rd769, 3528531795; | |
xor.b64 %rd2480, %rd766, %rd75; | |
mov.u32 %r317, -1879881855; | |
mov.u32 %r316, -845247145; | |
mov.u32 %r315, 534103459; | |
mov.u64 %rd2498, 3678237736; | |
mov.u64 %rd2497, 3041712726; | |
mov.u64 %rd2496, 1401181199; | |
mov.u64 %rd2495, 2835769497; | |
mov.u64 %rd2494, 1684936478; | |
mov.u64 %rd2493, 2027808484; | |
mov.u64 %rd2492, 387276957; | |
mov.u64 %rd2491, 842468239; | |
mov.u64 %rd2489, 3986602516; | |
mov.u64 %rd2488, 1013904242; | |
mov.u64 %rd2486, 3668340011; | |
mov.u64 %rd2485, 3144134277; | |
mov.u64 %rd2483, 3449720151; | |
mov.u64 %rd2482, 1993301258; | |
mov.u64 %rd2481, 3528531795; | |
bra.uni LBB54_11; | |
LBB54_10: | |
selp.u64 %rd733, 1, 0, %p74; | |
add.s64 %rd734, %rd2461, %rd733; | |
and.b64 %rd735, %rd734, 4294967295; | |
mul.lo.s64 %rd2484, %rd735, 3449720151; | |
xor.b64 %rd736, %rd2484, %rd75; | |
shr.u64 %rd737, %rd736, 32; | |
mul.lo.s64 %rd2487, %rd737, 3528531795; | |
shr.u64 %rd738, %rd2487, 32; | |
mul.lo.s64 %rd740, %rd2433, 3528531795; | |
and.b64 %rd741, %rd740, 4294967295; | |
xor.b64 %rd742, %rd741, %rd738; | |
xor.b64 %rd743, %rd742, 3144134277; | |
mul.lo.s64 %rd2490, %rd743, 3449720151; | |
xor.b64 %rd2480, %rd734, %rd740; | |
mov.u32 %r317, -1767562579; | |
mov.u32 %r316, -766435501; | |
mov.u32 %r315, 1401181199; | |
mov.u64 %rd2498, 4055616968; | |
mov.u64 %rd2497, 1684936478; | |
mov.u64 %rd2496, 534103459; | |
mov.u64 %rd2495, 387276957; | |
mov.u64 %rd2494, 3041712726; | |
mov.u64 %rd2493, 3986602516; | |
mov.u64 %rd2492, 2835769497; | |
mov.u64 %rd2491, 3668340011; | |
mov.u64 %rd2489, 2027808484; | |
mov.u64 %rd2488, 1993301258; | |
mov.u64 %rd2486, 842468239; | |
mov.u64 %rd2485, 2654435769; | |
mov.u64 %rd2483, 3528531795; | |
mov.u64 %rd2482, 1013904242; | |
mov.u64 %rd2481, 3449720151; | |
LBB54_11: | |
shr.u64 %rd770, %rd2490, 32; | |
shr.u64 %rd771, %rd2480, 32; | |
mul.lo.s64 %rd772, %rd771, %rd2481; | |
and.b64 %rd773, %rd772, 4294967295; | |
xor.b64 %rd774, %rd773, %rd770; | |
xor.b64 %rd775, %rd774, %rd2482; | |
mul.lo.s64 %rd776, %rd775, %rd2483; | |
shr.u64 %rd777, %rd776, 32; | |
shr.u64 %rd778, %rd772, 32; | |
and.b64 %rd779, %rd2484, 4294967295; | |
xor.b64 %rd780, %rd779, %rd778; | |
xor.b64 %rd781, %rd780, %rd2485; | |
mul.lo.s64 %rd782, %rd781, %rd2483; | |
and.b64 %rd783, %rd782, 4294967295; | |
xor.b64 %rd784, %rd783, %rd777; | |
xor.b64 %rd785, %rd784, %rd2486; | |
mul.lo.s64 %rd786, %rd785, %rd2481; | |
shr.u64 %rd787, %rd786, 32; | |
shr.u64 %rd788, %rd782, 32; | |
and.b64 %rd789, %rd2487, 4294967295; | |
xor.b64 %rd790, %rd789, %rd788; | |
xor.b64 %rd791, %rd790, %rd2488; | |
mul.lo.s64 %rd792, %rd791, %rd2481; | |
and.b64 %rd793, %rd792, 4294967295; | |
xor.b64 %rd794, %rd793, %rd787; | |
xor.b64 %rd795, %rd794, %rd2489; | |
mul.lo.s64 %rd796, %rd795, %rd2483; | |
shr.u64 %rd797, %rd796, 32; | |
shr.u64 %rd798, %rd792, 32; | |
and.b64 %rd799, %rd2490, 4294967295; | |
xor.b64 %rd800, %rd799, %rd798; | |
xor.b64 %rd801, %rd800, %rd2491; | |
mul.lo.s64 %rd802, %rd801, %rd2483; | |
and.b64 %rd803, %rd802, 4294967295; | |
xor.b64 %rd804, %rd803, %rd797; | |
xor.b64 %rd805, %rd804, %rd2492; | |
mul.lo.s64 %rd806, %rd805, %rd2481; | |
shr.u64 %rd807, %rd806, 32; | |
shr.u64 %rd808, %rd802, 32; | |
and.b64 %rd809, %rd776, 4294967295; | |
xor.b64 %rd810, %rd809, %rd808; | |
xor.b64 %rd811, %rd810, %rd2493; | |
mul.lo.s64 %rd812, %rd811, %rd2481; | |
and.b64 %rd813, %rd812, 4294967295; | |
xor.b64 %rd814, %rd813, %rd807; | |
xor.b64 %rd815, %rd814, %rd2494; | |
mul.lo.s64 %rd816, %rd815, %rd2483; | |
shr.u64 %rd817, %rd816, 32; | |
shr.u64 %rd818, %rd812, 32; | |
and.b64 %rd819, %rd786, 4294967295; | |
xor.b64 %rd820, %rd819, %rd818; | |
xor.b64 %rd821, %rd820, %rd2495; | |
mul.lo.s64 %rd822, %rd821, %rd2483; | |
and.b64 %rd823, %rd822, 4294967295; | |
xor.b64 %rd824, %rd823, %rd817; | |
xor.b64 %rd825, %rd824, %rd2496; | |
mul.lo.s64 %rd826, %rd825, %rd2481; | |
shr.u64 %rd827, %rd826, 32; | |
shr.u64 %rd828, %rd822, 32; | |
and.b64 %rd829, %rd796, 4294967295; | |
xor.b64 %rd830, %rd829, %rd828; | |
xor.b64 %rd831, %rd830, %rd2497; | |
mul.lo.s64 %rd832, %rd831, %rd2481; | |
and.b64 %rd833, %rd832, 4294967295; | |
xor.b64 %rd834, %rd833, %rd827; | |
xor.b64 %rd835, %rd834, %rd2498; | |
mul.lo.s64 %rd836, %rd835, %rd2483; | |
shr.u64 %rd837, %rd836, 32; | |
cvt.u32.u64 %r82, %rd837; | |
shr.u64 %rd838, %rd832, 32; | |
xor.b64 %rd839, %rd838, %rd806; | |
cvt.u32.u64 %r83, %rd839; | |
xor.b32 %r84, %r315, %r83; | |
mul.lo.s32 %r85, %r84, %r316; | |
xor.b32 %r86, %r85, %r82; | |
xor.b32 %r87, %r86, %r317; | |
shr.u32 %r88, %r87, 9; | |
cvt.rn.f32.u32 %f49, %r88; | |
mul.rn.f32 %f50, %f49, 0f34000000; | |
cvt.rn.f16.f32 %h19, %f50; | |
mov.b16 %h20, 0x2E66; | |
setp.ge.f16 %p12, %h19, %h20; | |
ld.global.nc.b16 %h21, [%rd44+256]; | |
ld.global.nc.f32 %f51, [%rd45+512]; | |
cvt.rn.f16.f32 %h22, %f51; | |
add.rn.f16 %h23, %h21, %h22; | |
mov.b16 %h24, 0x3C72; | |
mul.rn.f16 %h25, %h23, %h24; | |
selp.b16 %h26, %h25, 0x0000, %p12; | |
cvt.f32.f16 %f52, %h26; | |
ld.global.nc.b16 %h27, [%rd46+256]; | |
cvt.f32.f16 %f53, %h27; | |
ld.global.nc.f32 %f54, [%rd47+512]; | |
mul.rn.f32 %f55, %f1, %f54; | |
mul.rn.f32 %f56, %f55, %f53; | |
ld.global.nc.f32 %f57, [%rd48+512]; | |
mul.rn.f32 %f58, %f2, %f55; | |
sub.rn.f32 %f59, %f57, %f58; | |
add.rn.f32 %f60, %f56, %f59; | |
add.rn.f32 %f61, %f60, %f52; | |
add.rn.f32 %f5, %f4, %f61; | |
or.b32 %r89, %r3, 129; | |
or.b32 %r90, %r89, %r4; | |
and.b32 %r91, %r89, 3; | |
shr.u32 %r92, %r90, 2; | |
setp.ne.s32 %p13, %r91, 1; | |
cvt.u64.u32 %rd840, %r92; | |
add.s64 %rd103, %rd11, %rd840; | |
and.b64 %rd2430, %rd103, 4294967295; | |
setp.lt.u64 %p73, %rd103, %rd11; | |
@%p13 bra LBB54_13; | |
mul.lo.s64 %rd2503, %rd2430, 3528531795; | |
selp.u64 %rd881, 1, 0, %p73; | |
add.s64 %rd882, %rd2461, %rd881; | |
xor.b64 %rd883, %rd882, %rd2503; | |
shr.u64 %rd884, %rd883, 32; | |
mul.lo.s64 %rd2506, %rd884, 3449720151; | |
shr.u64 %rd885, %rd2506, 32; | |
and.b64 %rd886, %rd882, 4294967295; | |
mul.lo.s64 %rd887, %rd886, 3449720151; | |
and.b64 %rd888, %rd887, 4294967295; | |
xor.b64 %rd889, %rd888, %rd885; | |
xor.b64 %rd890, %rd889, 2654435769; | |
mul.lo.s64 %rd2509, %rd890, 3528531795; | |
xor.b64 %rd2499, %rd887, %rd103; | |
mov.u32 %r319, -845247145; | |
mov.u32 %r318, -616729560; | |
mov.u64 %rd2516, 3041712726; | |
mov.u64 %rd2515, 1401181199; | |
mov.u64 %rd2514, 2835769497; | |
mov.u64 %rd2513, 1684936478; | |
mov.u64 %rd2512, 2027808484; | |
mov.u64 %rd2511, 387276957; | |
mov.u64 %rd2510, 842468239; | |
mov.u64 %rd2508, 3986602516; | |
mov.u64 %rd2507, 1013904242; | |
mov.u64 %rd2505, 3668340011; | |
mov.u64 %rd2504, 3144134277; | |
mov.u64 %rd2502, 3449720151; | |
mov.u64 %rd2501, 1993301258; | |
mov.u64 %rd2500, 3528531795; | |
bra.uni LBB54_14; | |
LBB54_13: | |
selp.u64 %rd855, 1, 0, %p73; | |
add.s64 %rd856, %rd2461, %rd855; | |
and.b64 %rd857, %rd856, 4294967295; | |
mul.lo.s64 %rd2503, %rd857, 3449720151; | |
xor.b64 %rd858, %rd2503, %rd103; | |
shr.u64 %rd859, %rd858, 32; | |
mul.lo.s64 %rd2506, %rd859, 3528531795; | |
shr.u64 %rd860, %rd2506, 32; | |
mul.lo.s64 %rd862, %rd2430, 3528531795; | |
and.b64 %rd863, %rd862, 4294967295; | |
xor.b64 %rd864, %rd863, %rd860; | |
xor.b64 %rd865, %rd864, 3144134277; | |
mul.lo.s64 %rd2509, %rd865, 3449720151; | |
xor.b64 %rd2499, %rd856, %rd862; | |
mov.u32 %r319, -766435501; | |
mov.u32 %r318, -239350328; | |
mov.u64 %rd2516, 1684936478; | |
mov.u64 %rd2515, 534103459; | |
mov.u64 %rd2514, 387276957; | |
mov.u64 %rd2513, 3041712726; | |
mov.u64 %rd2512, 3986602516; | |
mov.u64 %rd2511, 2835769497; | |
mov.u64 %rd2510, 3668340011; | |
mov.u64 %rd2508, 2027808484; | |
mov.u64 %rd2507, 1993301258; | |
mov.u64 %rd2505, 842468239; | |
mov.u64 %rd2504, 2654435769; | |
mov.u64 %rd2502, 3528531795; | |
mov.u64 %rd2501, 1013904242; | |
mov.u64 %rd2500, 3449720151; | |
LBB54_14: | |
shr.u64 %rd891, %rd2509, 32; | |
shr.u64 %rd892, %rd2499, 32; | |
mul.lo.s64 %rd893, %rd892, %rd2500; | |
and.b64 %rd894, %rd893, 4294967295; | |
xor.b64 %rd895, %rd894, %rd891; | |
xor.b64 %rd896, %rd895, %rd2501; | |
mul.lo.s64 %rd897, %rd896, %rd2502; | |
shr.u64 %rd898, %rd897, 32; | |
shr.u64 %rd899, %rd893, 32; | |
and.b64 %rd900, %rd2503, 4294967295; | |
xor.b64 %rd901, %rd900, %rd899; | |
xor.b64 %rd902, %rd901, %rd2504; | |
mul.lo.s64 %rd903, %rd902, %rd2502; | |
and.b64 %rd904, %rd903, 4294967295; | |
xor.b64 %rd905, %rd904, %rd898; | |
xor.b64 %rd906, %rd905, %rd2505; | |
mul.lo.s64 %rd907, %rd906, %rd2500; | |
shr.u64 %rd908, %rd907, 32; | |
shr.u64 %rd909, %rd903, 32; | |
and.b64 %rd910, %rd2506, 4294967295; | |
xor.b64 %rd911, %rd910, %rd909; | |
xor.b64 %rd912, %rd911, %rd2507; | |
mul.lo.s64 %rd913, %rd912, %rd2500; | |
and.b64 %rd914, %rd913, 4294967295; | |
xor.b64 %rd915, %rd914, %rd908; | |
xor.b64 %rd916, %rd915, %rd2508; | |
mul.lo.s64 %rd917, %rd916, %rd2502; | |
shr.u64 %rd918, %rd917, 32; | |
shr.u64 %rd919, %rd913, 32; | |
and.b64 %rd920, %rd2509, 4294967295; | |
xor.b64 %rd921, %rd920, %rd919; | |
xor.b64 %rd922, %rd921, %rd2510; | |
mul.lo.s64 %rd923, %rd922, %rd2502; | |
and.b64 %rd924, %rd923, 4294967295; | |
xor.b64 %rd925, %rd924, %rd918; | |
xor.b64 %rd926, %rd925, %rd2511; | |
mul.lo.s64 %rd927, %rd926, %rd2500; | |
shr.u64 %rd928, %rd927, 32; | |
shr.u64 %rd929, %rd923, 32; | |
and.b64 %rd930, %rd897, 4294967295; | |
xor.b64 %rd931, %rd930, %rd929; | |
xor.b64 %rd932, %rd931, %rd2512; | |
mul.lo.s64 %rd933, %rd932, %rd2500; | |
and.b64 %rd934, %rd933, 4294967295; | |
xor.b64 %rd935, %rd934, %rd928; | |
xor.b64 %rd936, %rd935, %rd2513; | |
mul.lo.s64 %rd937, %rd936, %rd2502; | |
shr.u64 %rd938, %rd937, 32; | |
shr.u64 %rd939, %rd933, 32; | |
and.b64 %rd940, %rd907, 4294967295; | |
xor.b64 %rd941, %rd940, %rd939; | |
xor.b64 %rd942, %rd941, %rd2514; | |
mul.lo.s64 %rd943, %rd942, %rd2502; | |
and.b64 %rd944, %rd943, 4294967295; | |
xor.b64 %rd945, %rd944, %rd938; | |
xor.b64 %rd946, %rd945, %rd2515; | |
mul.lo.s64 %rd947, %rd946, %rd2500; | |
shr.u64 %rd948, %rd947, 32; | |
shr.u64 %rd949, %rd943, 32; | |
xor.b64 %rd950, %rd917, %rd949; | |
xor.b64 %rd951, %rd950, %rd2516; | |
mul.lo.s64 %rd952, %rd951, %rd2500; | |
xor.b64 %rd953, %rd948, %rd952; | |
cvt.u32.u64 %r97, %rd953; | |
xor.b32 %r98, %r318, %r97; | |
mul.lo.s32 %r99, %r98, %r319; | |
shr.u32 %r100, %r99, 9; | |
cvt.rn.f32.u32 %f62, %r100; | |
mul.rn.f32 %f63, %f62, 0f34000000; | |
cvt.rn.f16.f32 %h28, %f63; | |
mov.b16 %h29, 0x2E66; | |
setp.ge.f16 %p17, %h28, %h29; | |
ld.global.nc.b16 %h30, [%rd44+258]; | |
ld.global.nc.f32 %f64, [%rd45+516]; | |
cvt.rn.f16.f32 %h31, %f64; | |
add.rn.f16 %h32, %h30, %h31; | |
mov.b16 %h33, 0x3C72; | |
mul.rn.f16 %h34, %h32, %h33; | |
selp.b16 %h35, %h34, 0x0000, %p17; | |
cvt.f32.f16 %f65, %h35; | |
ld.global.nc.b16 %h36, [%rd46+258]; | |
cvt.f32.f16 %f66, %h36; | |
ld.global.nc.f32 %f67, [%rd47+516]; | |
mul.rn.f32 %f68, %f1, %f67; | |
mul.rn.f32 %f69, %f68, %f66; | |
ld.global.nc.f32 %f70, [%rd48+516]; | |
mul.rn.f32 %f71, %f2, %f68; | |
sub.rn.f32 %f72, %f70, %f71; | |
add.rn.f32 %f73, %f69, %f72; | |
add.rn.f32 %f74, %f73, %f65; | |
add.rn.f32 %f6, %f5, %f74; | |
or.b32 %r102, %r73, 256; | |
shr.u32 %r103, %r102, 2; | |
cvt.u64.u32 %rd954, %r103; | |
add.s64 %rd130, %rd11, %rd954; | |
and.b64 %rd2426, %rd130, 4294967295; | |
setp.lt.u64 %p72, %rd130, %rd11; | |
@%p8 bra LBB54_16; | |
// Fall-through arm (reached when %p8 is false); ends with a jump to LBB54_17. | |
// Performs the first multiply/xor steps on %rd130 (its low 32 bits are already | |
// in %rd2426) and on %rd2461 plus the wrap flag %p72, then loads this arm's | |
// multipliers and xor constants for the rounds executed at LBB54_17. | |
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the | |
// Philox4x32 multipliers, and the xor constants are multiples of 0x9E3779B9 | |
// or 0xBB67AE85 modulo 2^32, so this is presumably an unrolled Philox | |
// generator -- confirm against the code that emitted this kernel. | |
mul.lo.s64 %rd2521, %rd2426, 3528531795; | |
// %rd998 = %rd2461 + (%p72 ? 1 : 0); %p72 was set from %rd130 < %rd11 (unsigned). | |
selp.u64 %rd997, 1, 0, %p72; | |
add.s64 %rd998, %rd2461, %rd997; | |
xor.b64 %rd999, %rd998, %rd2521; | |
shr.u64 %rd1000, %rd999, 32; | |
mul.lo.s64 %rd2524, %rd1000, 3449720151; | |
shr.u64 %rd1001, %rd2524, 32; | |
and.b64 %rd1002, %rd998, 4294967295; | |
mul.lo.s64 %rd1003, %rd1002, 3449720151; | |
and.b64 %rd1004, %rd1003, 4294967295; | |
xor.b64 %rd1005, %rd1004, %rd1001; | |
xor.b64 %rd1006, %rd1005, 2654435769; | |
mul.lo.s64 %rd2527, %rd1006, 3528531795; | |
xor.b64 %rd2517, %rd1003, %rd130; | |
// Constants consumed at LBB54_17; LBB54_16 writes different values to the | |
// same registers. The negative 32-bit immediates are the two's-complement | |
// spellings of 0x8FF34781, 0xCD9E8D57 (as -845247145) respectively. | |
mov.u32 %r322, -1879881855; | |
mov.u32 %r321, -845247145; | |
mov.u32 %r320, 534103459; | |
mov.u64 %rd2535, 3678237736; | |
mov.u64 %rd2534, 3041712726; | |
mov.u64 %rd2533, 1401181199; | |
mov.u64 %rd2532, 2835769497; | |
mov.u64 %rd2531, 1684936478; | |
mov.u64 %rd2530, 2027808484; | |
mov.u64 %rd2529, 387276957; | |
mov.u64 %rd2528, 842468239; | |
mov.u64 %rd2526, 3986602516; | |
mov.u64 %rd2525, 1013904242; | |
mov.u64 %rd2523, 3668340011; | |
mov.u64 %rd2522, 3144134277; | |
mov.u64 %rd2520, 3449720151; | |
mov.u64 %rd2519, 1993301258; | |
mov.u64 %rd2518, 3528531795; | |
bra.uni LBB54_17; | |
// LBB54_16: arm taken when %p8 is true; falls through into LBB54_17. | |
// Same shape as the arm that precedes this label, but the two multipliers | |
// (3449720151 / 3528531795) and the two xor-constant sequences trade places, | |
// so LBB54_17 runs the same instruction stream over a different set of | |
// constants. Inputs are %rd130, %rd2426 (low 32 bits of %rd130), %rd2461 and | |
// the wrap flag %p72, all computed before the branch. | |
// NOTE(review): constants match Philox4x32 (multipliers 0xD2511F53 / | |
// 0xCD9E8D57, key steps 0x9E3779B9 / 0xBB67AE85) -- presumably this arm | |
// produces a different output word of the same block; confirm. | |
LBB54_16: | |
// %rd971 = %rd2461 + (%p72 ? 1 : 0) | |
selp.u64 %rd970, 1, 0, %p72; | |
add.s64 %rd971, %rd2461, %rd970; | |
and.b64 %rd972, %rd971, 4294967295; | |
mul.lo.s64 %rd2521, %rd972, 3449720151; | |
xor.b64 %rd973, %rd2521, %rd130; | |
shr.u64 %rd974, %rd973, 32; | |
mul.lo.s64 %rd2524, %rd974, 3528531795; | |
shr.u64 %rd975, %rd2524, 32; | |
mul.lo.s64 %rd977, %rd2426, 3528531795; | |
and.b64 %rd978, %rd977, 4294967295; | |
xor.b64 %rd979, %rd978, %rd975; | |
xor.b64 %rd980, %rd979, 3144134277; | |
mul.lo.s64 %rd2527, %rd980, 3449720151; | |
xor.b64 %rd2517, %rd971, %rd977; | |
// Constants consumed at LBB54_17 for this arm. | |
mov.u32 %r322, -1767562579; | |
mov.u32 %r321, -766435501; | |
mov.u32 %r320, 1401181199; | |
mov.u64 %rd2535, 4055616968; | |
mov.u64 %rd2534, 1684936478; | |
mov.u64 %rd2533, 534103459; | |
mov.u64 %rd2532, 387276957; | |
mov.u64 %rd2531, 3041712726; | |
mov.u64 %rd2530, 3986602516; | |
mov.u64 %rd2529, 2835769497; | |
mov.u64 %rd2528, 3668340011; | |
mov.u64 %rd2526, 2027808484; | |
mov.u64 %rd2525, 1993301258; | |
mov.u64 %rd2523, 842468239; | |
mov.u64 %rd2522, 2654435769; | |
mov.u64 %rd2520, 3528531795; | |
mov.u64 %rd2519, 1013904242; | |
mov.u64 %rd2518, 3449720151; | |
// LBB54_17: join point of LBB54_16 and the arm before it. | |
// 1. Runs the remaining mixing rounds: each step splits a 64-bit product into | |
//    32-bit halves (shr 32 / and 0xFFFFFFFF), xors in one of the constants | |
//    %rd2519..%rd2535 and multiplies by %rd2518 or %rd2520. | |
// 2. Reduces the result to a 23-bit integer and scales it into [0, 1). | |
// 3. Uses that draw to keep or zero one element read at byte offset +512 | |
//    (16-bit loads) / +1024 (32-bit loads) and adds the element's term to the | |
//    running sum (%f6 -> %f7). | |
// 4. Prepares %rd158, %rd2423, %p71 and %p21 for the next stage and branches. | |
LBB54_17: | |
shr.u64 %rd1007, %rd2527, 32; | |
shr.u64 %rd1008, %rd2517, 32; | |
mul.lo.s64 %rd1009, %rd1008, %rd2518; | |
and.b64 %rd1010, %rd1009, 4294967295; | |
xor.b64 %rd1011, %rd1010, %rd1007; | |
xor.b64 %rd1012, %rd1011, %rd2519; | |
mul.lo.s64 %rd1013, %rd1012, %rd2520; | |
shr.u64 %rd1014, %rd1013, 32; | |
shr.u64 %rd1015, %rd1009, 32; | |
and.b64 %rd1016, %rd2521, 4294967295; | |
xor.b64 %rd1017, %rd1016, %rd1015; | |
xor.b64 %rd1018, %rd1017, %rd2522; | |
mul.lo.s64 %rd1019, %rd1018, %rd2520; | |
and.b64 %rd1020, %rd1019, 4294967295; | |
xor.b64 %rd1021, %rd1020, %rd1014; | |
xor.b64 %rd1022, %rd1021, %rd2523; | |
mul.lo.s64 %rd1023, %rd1022, %rd2518; | |
shr.u64 %rd1024, %rd1023, 32; | |
shr.u64 %rd1025, %rd1019, 32; | |
and.b64 %rd1026, %rd2524, 4294967295; | |
xor.b64 %rd1027, %rd1026, %rd1025; | |
xor.b64 %rd1028, %rd1027, %rd2525; | |
mul.lo.s64 %rd1029, %rd1028, %rd2518; | |
and.b64 %rd1030, %rd1029, 4294967295; | |
xor.b64 %rd1031, %rd1030, %rd1024; | |
xor.b64 %rd1032, %rd1031, %rd2526; | |
mul.lo.s64 %rd1033, %rd1032, %rd2520; | |
shr.u64 %rd1034, %rd1033, 32; | |
shr.u64 %rd1035, %rd1029, 32; | |
and.b64 %rd1036, %rd2527, 4294967295; | |
xor.b64 %rd1037, %rd1036, %rd1035; | |
xor.b64 %rd1038, %rd1037, %rd2528; | |
mul.lo.s64 %rd1039, %rd1038, %rd2520; | |
and.b64 %rd1040, %rd1039, 4294967295; | |
xor.b64 %rd1041, %rd1040, %rd1034; | |
xor.b64 %rd1042, %rd1041, %rd2529; | |
mul.lo.s64 %rd1043, %rd1042, %rd2518; | |
shr.u64 %rd1044, %rd1043, 32; | |
shr.u64 %rd1045, %rd1039, 32; | |
and.b64 %rd1046, %rd1013, 4294967295; | |
xor.b64 %rd1047, %rd1046, %rd1045; | |
xor.b64 %rd1048, %rd1047, %rd2530; | |
mul.lo.s64 %rd1049, %rd1048, %rd2518; | |
and.b64 %rd1050, %rd1049, 4294967295; | |
xor.b64 %rd1051, %rd1050, %rd1044; | |
xor.b64 %rd1052, %rd1051, %rd2531; | |
mul.lo.s64 %rd1053, %rd1052, %rd2520; | |
shr.u64 %rd1054, %rd1053, 32; | |
shr.u64 %rd1055, %rd1049, 32; | |
and.b64 %rd1056, %rd1023, 4294967295; | |
xor.b64 %rd1057, %rd1056, %rd1055; | |
xor.b64 %rd1058, %rd1057, %rd2532; | |
mul.lo.s64 %rd1059, %rd1058, %rd2520; | |
and.b64 %rd1060, %rd1059, 4294967295; | |
xor.b64 %rd1061, %rd1060, %rd1054; | |
xor.b64 %rd1062, %rd1061, %rd2533; | |
mul.lo.s64 %rd1063, %rd1062, %rd2518; | |
shr.u64 %rd1064, %rd1063, 32; | |
shr.u64 %rd1065, %rd1059, 32; | |
and.b64 %rd1066, %rd1033, 4294967295; | |
xor.b64 %rd1067, %rd1066, %rd1065; | |
xor.b64 %rd1068, %rd1067, %rd2534; | |
mul.lo.s64 %rd1069, %rd1068, %rd2518; | |
and.b64 %rd1070, %rd1069, 4294967295; | |
xor.b64 %rd1071, %rd1070, %rd1064; | |
xor.b64 %rd1072, %rd1071, %rd2535; | |
mul.lo.s64 %rd1073, %rd1072, %rd2520; | |
// Final round is done in 32-bit registers; %r115 is the selected random word. | |
shr.u64 %rd1074, %rd1073, 32; | |
cvt.u32.u64 %r110, %rd1074; | |
shr.u64 %rd1075, %rd1069, 32; | |
xor.b64 %rd1076, %rd1075, %rd1043; | |
cvt.u32.u64 %r111, %rd1076; | |
xor.b32 %r112, %r320, %r111; | |
mul.lo.s32 %r113, %r112, %r321; | |
xor.b32 %r114, %r113, %r110; | |
xor.b32 %r115, %r114, %r322; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f76 lies in [0, 1). | |
shr.u32 %r116, %r115, 9; | |
cvt.rn.f32.u32 %f75, %r116; | |
mul.rn.f32 %f76, %f75, 0f34000000; | |
// %p20 = f16(%f76) >= 0x2E66 (f16 bit pattern, about 0.09998). | |
cvt.rn.f16.f32 %h37, %f76; | |
mov.b16 %h38, 0x2E66; | |
setp.ge.f16 %p20, %h37, %h38; | |
// %h44 = %p20 ? (%h39 + f16(%f77)) * 0x3C72 : 0, with 0x3C72 about 1.1113 in f16. | |
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm. | |
ld.global.nc.b16 %h39, [%rd44+512]; | |
ld.global.nc.f32 %f77, [%rd45+1024]; | |
cvt.rn.f16.f32 %h40, %f77; | |
add.rn.f16 %h41, %h39, %h40; | |
mov.b16 %h42, 0x3C72; | |
mul.rn.f16 %h43, %h41, %h42; | |
selp.b16 %h44, %h43, 0x0000, %p20; | |
cvt.f32.f16 %f78, %h44; | |
// %f87 = %f81 * f32(%h45) + (%f83 - %f2 * %f81) + %f78, where %f81 = %f1 * %f80. | |
ld.global.nc.b16 %h45, [%rd46+512]; | |
cvt.f32.f16 %f79, %h45; | |
ld.global.nc.f32 %f80, [%rd47+1024]; | |
mul.rn.f32 %f81, %f1, %f80; | |
mul.rn.f32 %f82, %f81, %f79; | |
ld.global.nc.f32 %f83, [%rd48+1024]; | |
mul.rn.f32 %f84, %f2, %f81; | |
sub.rn.f32 %f85, %f83, %f84; | |
add.rn.f32 %f86, %f82, %f85; | |
add.rn.f32 %f87, %f86, %f78; | |
// Accumulate this element's term into the running sum. | |
add.rn.f32 %f7, %f6, %f87; | |
// Next stage: %rd158 = %rd11 + ((%r3 | 257 | %r4) >> 2); %p71 flags unsigned | |
// wrap of that add; %p21 = ((%r3 | 257) & 3) != 1 selects the arm. | |
or.b32 %r117, %r3, 257; | |
or.b32 %r118, %r117, %r4; | |
and.b32 %r119, %r117, 3; | |
shr.u32 %r120, %r118, 2; | |
setp.ne.s32 %p21, %r119, 1; | |
cvt.u64.u32 %rd1077, %r120; | |
add.s64 %rd158, %rd11, %rd1077; | |
and.b64 %rd2423, %rd158, 4294967295; | |
setp.lt.u64 %p71, %rd158, %rd11; | |
@%p21 bra LBB54_19; | |
// Fall-through arm (reached when %p21 is false); ends with a jump to LBB54_20. | |
// Performs the first multiply/xor steps on %rd158 (low 32 bits in %rd2423) | |
// and on %rd2461 plus the wrap flag %p71, then loads this arm's multipliers | |
// and xor constants for the rounds executed at LBB54_20. | |
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the | |
// Philox4x32 multipliers and the xor constants are multiples of 0x9E3779B9 or | |
// 0xBB67AE85 modulo 2^32 -- presumably unrolled Philox rounds; confirm. | |
mul.lo.s64 %rd2540, %rd2423, 3528531795; | |
// %rd1119 = %rd2461 + (%p71 ? 1 : 0) | |
selp.u64 %rd1118, 1, 0, %p71; | |
add.s64 %rd1119, %rd2461, %rd1118; | |
xor.b64 %rd1120, %rd1119, %rd2540; | |
shr.u64 %rd1121, %rd1120, 32; | |
mul.lo.s64 %rd2543, %rd1121, 3449720151; | |
shr.u64 %rd1122, %rd2543, 32; | |
and.b64 %rd1123, %rd1119, 4294967295; | |
mul.lo.s64 %rd1124, %rd1123, 3449720151; | |
and.b64 %rd1125, %rd1124, 4294967295; | |
xor.b64 %rd1126, %rd1125, %rd1122; | |
xor.b64 %rd1127, %rd1126, 2654435769; | |
mul.lo.s64 %rd2546, %rd1127, 3528531795; | |
xor.b64 %rd2536, %rd1124, %rd158; | |
// Constants consumed at LBB54_20; LBB54_19 writes different values to the | |
// same registers. | |
mov.u32 %r324, -845247145; | |
mov.u32 %r323, -616729560; | |
mov.u64 %rd2553, 3041712726; | |
mov.u64 %rd2552, 1401181199; | |
mov.u64 %rd2551, 2835769497; | |
mov.u64 %rd2550, 1684936478; | |
mov.u64 %rd2549, 2027808484; | |
mov.u64 %rd2548, 387276957; | |
mov.u64 %rd2547, 842468239; | |
mov.u64 %rd2545, 3986602516; | |
mov.u64 %rd2544, 1013904242; | |
mov.u64 %rd2542, 3668340011; | |
mov.u64 %rd2541, 3144134277; | |
mov.u64 %rd2539, 3449720151; | |
mov.u64 %rd2538, 1993301258; | |
mov.u64 %rd2537, 3528531795; | |
bra.uni LBB54_20; | |
// LBB54_19: arm taken when %p21 is true; falls through into LBB54_20. | |
// Same shape as the arm that precedes this label, with the two multipliers | |
// (3449720151 / 3528531795) and the two xor-constant sequences exchanged. | |
// Inputs are %rd158, %rd2423 (low 32 bits of %rd158), %rd2461 and the wrap | |
// flag %p71, all computed before the branch. | |
LBB54_19: | |
// %rd1093 = %rd2461 + (%p71 ? 1 : 0) | |
selp.u64 %rd1092, 1, 0, %p71; | |
add.s64 %rd1093, %rd2461, %rd1092; | |
and.b64 %rd1094, %rd1093, 4294967295; | |
mul.lo.s64 %rd2540, %rd1094, 3449720151; | |
xor.b64 %rd1095, %rd2540, %rd158; | |
shr.u64 %rd1096, %rd1095, 32; | |
mul.lo.s64 %rd2543, %rd1096, 3528531795; | |
shr.u64 %rd1097, %rd2543, 32; | |
mul.lo.s64 %rd1099, %rd2423, 3528531795; | |
and.b64 %rd1100, %rd1099, 4294967295; | |
xor.b64 %rd1101, %rd1100, %rd1097; | |
xor.b64 %rd1102, %rd1101, 3144134277; | |
mul.lo.s64 %rd2546, %rd1102, 3449720151; | |
xor.b64 %rd2536, %rd1093, %rd1099; | |
// Constants consumed at LBB54_20 for this arm. | |
mov.u32 %r324, -766435501; | |
mov.u32 %r323, -239350328; | |
mov.u64 %rd2553, 1684936478; | |
mov.u64 %rd2552, 534103459; | |
mov.u64 %rd2551, 387276957; | |
mov.u64 %rd2550, 3041712726; | |
mov.u64 %rd2549, 3986602516; | |
mov.u64 %rd2548, 2835769497; | |
mov.u64 %rd2547, 3668340011; | |
mov.u64 %rd2545, 2027808484; | |
mov.u64 %rd2544, 1993301258; | |
mov.u64 %rd2542, 842468239; | |
mov.u64 %rd2541, 2654435769; | |
mov.u64 %rd2539, 3528531795; | |
mov.u64 %rd2538, 1013904242; | |
mov.u64 %rd2537, 3449720151; | |
// LBB54_20: join point of LBB54_19 and the arm before it. | |
// Runs the remaining multiply/xor rounds with the multipliers in %rd2537 / | |
// %rd2539 and the xor constants in %rd2538..%rd2553, reduces the result to a | |
// 23-bit draw in [0, 1), and uses it to keep or zero the element read at byte | |
// offset +514 (16-bit loads) / +1028 (32-bit loads). That element's term is | |
// then added to the running sum (%f7 -> %f8). The tail prepares %rd185, | |
// %rd2419 and %p70 for the next stage and branches on %p8. | |
LBB54_20: | |
shr.u64 %rd1128, %rd2546, 32; | |
shr.u64 %rd1129, %rd2536, 32; | |
mul.lo.s64 %rd1130, %rd1129, %rd2537; | |
and.b64 %rd1131, %rd1130, 4294967295; | |
xor.b64 %rd1132, %rd1131, %rd1128; | |
xor.b64 %rd1133, %rd1132, %rd2538; | |
mul.lo.s64 %rd1134, %rd1133, %rd2539; | |
shr.u64 %rd1135, %rd1134, 32; | |
shr.u64 %rd1136, %rd1130, 32; | |
and.b64 %rd1137, %rd2540, 4294967295; | |
xor.b64 %rd1138, %rd1137, %rd1136; | |
xor.b64 %rd1139, %rd1138, %rd2541; | |
mul.lo.s64 %rd1140, %rd1139, %rd2539; | |
and.b64 %rd1141, %rd1140, 4294967295; | |
xor.b64 %rd1142, %rd1141, %rd1135; | |
xor.b64 %rd1143, %rd1142, %rd2542; | |
mul.lo.s64 %rd1144, %rd1143, %rd2537; | |
shr.u64 %rd1145, %rd1144, 32; | |
shr.u64 %rd1146, %rd1140, 32; | |
and.b64 %rd1147, %rd2543, 4294967295; | |
xor.b64 %rd1148, %rd1147, %rd1146; | |
xor.b64 %rd1149, %rd1148, %rd2544; | |
mul.lo.s64 %rd1150, %rd1149, %rd2537; | |
and.b64 %rd1151, %rd1150, 4294967295; | |
xor.b64 %rd1152, %rd1151, %rd1145; | |
xor.b64 %rd1153, %rd1152, %rd2545; | |
mul.lo.s64 %rd1154, %rd1153, %rd2539; | |
shr.u64 %rd1155, %rd1154, 32; | |
shr.u64 %rd1156, %rd1150, 32; | |
and.b64 %rd1157, %rd2546, 4294967295; | |
xor.b64 %rd1158, %rd1157, %rd1156; | |
xor.b64 %rd1159, %rd1158, %rd2547; | |
mul.lo.s64 %rd1160, %rd1159, %rd2539; | |
and.b64 %rd1161, %rd1160, 4294967295; | |
xor.b64 %rd1162, %rd1161, %rd1155; | |
xor.b64 %rd1163, %rd1162, %rd2548; | |
mul.lo.s64 %rd1164, %rd1163, %rd2537; | |
shr.u64 %rd1165, %rd1164, 32; | |
shr.u64 %rd1166, %rd1160, 32; | |
and.b64 %rd1167, %rd1134, 4294967295; | |
xor.b64 %rd1168, %rd1167, %rd1166; | |
xor.b64 %rd1169, %rd1168, %rd2549; | |
mul.lo.s64 %rd1170, %rd1169, %rd2537; | |
and.b64 %rd1171, %rd1170, 4294967295; | |
xor.b64 %rd1172, %rd1171, %rd1165; | |
xor.b64 %rd1173, %rd1172, %rd2550; | |
mul.lo.s64 %rd1174, %rd1173, %rd2539; | |
shr.u64 %rd1175, %rd1174, 32; | |
shr.u64 %rd1176, %rd1170, 32; | |
and.b64 %rd1177, %rd1144, 4294967295; | |
xor.b64 %rd1178, %rd1177, %rd1176; | |
xor.b64 %rd1179, %rd1178, %rd2551; | |
mul.lo.s64 %rd1180, %rd1179, %rd2539; | |
and.b64 %rd1181, %rd1180, 4294967295; | |
xor.b64 %rd1182, %rd1181, %rd1175; | |
xor.b64 %rd1183, %rd1182, %rd2552; | |
mul.lo.s64 %rd1184, %rd1183, %rd2537; | |
shr.u64 %rd1185, %rd1184, 32; | |
shr.u64 %rd1186, %rd1180, 32; | |
xor.b64 %rd1187, %rd1154, %rd1186; | |
xor.b64 %rd1188, %rd1187, %rd2553; | |
mul.lo.s64 %rd1189, %rd1188, %rd2537; | |
// Last steps are done in 32 bits; %r127 is the selected random word. | |
xor.b64 %rd1190, %rd1185, %rd1189; | |
cvt.u32.u64 %r125, %rd1190; | |
xor.b32 %r126, %r323, %r125; | |
mul.lo.s32 %r127, %r126, %r324; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f89 lies in [0, 1). | |
shr.u32 %r128, %r127, 9; | |
cvt.rn.f32.u32 %f88, %r128; | |
mul.rn.f32 %f89, %f88, 0f34000000; | |
// %p25 = f16(%f89) >= 0x2E66 (f16 bit pattern, about 0.09998). | |
cvt.rn.f16.f32 %h46, %f89; | |
mov.b16 %h47, 0x2E66; | |
setp.ge.f16 %p25, %h46, %h47; | |
// %h53 = %p25 ? (%h48 + f16(%f90)) * 0x3C72 : 0, with 0x3C72 about 1.1113 in f16. | |
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm. | |
ld.global.nc.b16 %h48, [%rd44+514]; | |
ld.global.nc.f32 %f90, [%rd45+1028]; | |
cvt.rn.f16.f32 %h49, %f90; | |
add.rn.f16 %h50, %h48, %h49; | |
mov.b16 %h51, 0x3C72; | |
mul.rn.f16 %h52, %h50, %h51; | |
selp.b16 %h53, %h52, 0x0000, %p25; | |
cvt.f32.f16 %f91, %h53; | |
// %f100 = %f94 * f32(%h54) + (%f96 - %f2 * %f94) + %f91, where %f94 = %f1 * %f93. | |
ld.global.nc.b16 %h54, [%rd46+514]; | |
cvt.f32.f16 %f92, %h54; | |
ld.global.nc.f32 %f93, [%rd47+1028]; | |
mul.rn.f32 %f94, %f1, %f93; | |
mul.rn.f32 %f95, %f94, %f92; | |
ld.global.nc.f32 %f96, [%rd48+1028]; | |
mul.rn.f32 %f97, %f2, %f94; | |
sub.rn.f32 %f98, %f96, %f97; | |
add.rn.f32 %f99, %f95, %f98; | |
add.rn.f32 %f100, %f99, %f91; | |
// Accumulate this element's term into the running sum. | |
add.rn.f32 %f8, %f7, %f100; | |
// Next stage: %rd185 = %rd11 + ((%r73 | 384) >> 2); %p70 flags unsigned wrap | |
// of that add; the arm is selected by %p8. | |
or.b32 %r130, %r73, 384; | |
shr.u32 %r131, %r130, 2; | |
cvt.u64.u32 %rd1191, %r131; | |
add.s64 %rd185, %rd11, %rd1191; | |
and.b64 %rd2419, %rd185, 4294967295; | |
setp.lt.u64 %p70, %rd185, %rd11; | |
@%p8 bra LBB54_22; | |
// Fall-through arm (reached when %p8 is false); ends with a jump to LBB54_23. | |
// Performs the first multiply/xor steps on %rd185 (low 32 bits in %rd2419) | |
// and on %rd2461 plus the wrap flag %p70, then loads this arm's multipliers | |
// and xor constants for the rounds executed at LBB54_23. | |
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the | |
// Philox4x32 multipliers and the xor constants are multiples of 0x9E3779B9 or | |
// 0xBB67AE85 modulo 2^32 -- presumably unrolled Philox rounds; confirm. | |
mul.lo.s64 %rd2558, %rd2419, 3528531795; | |
// %rd1235 = %rd2461 + (%p70 ? 1 : 0) | |
selp.u64 %rd1234, 1, 0, %p70; | |
add.s64 %rd1235, %rd2461, %rd1234; | |
xor.b64 %rd1236, %rd1235, %rd2558; | |
shr.u64 %rd1237, %rd1236, 32; | |
mul.lo.s64 %rd2561, %rd1237, 3449720151; | |
shr.u64 %rd1238, %rd2561, 32; | |
and.b64 %rd1239, %rd1235, 4294967295; | |
mul.lo.s64 %rd1240, %rd1239, 3449720151; | |
and.b64 %rd1241, %rd1240, 4294967295; | |
xor.b64 %rd1242, %rd1241, %rd1238; | |
xor.b64 %rd1243, %rd1242, 2654435769; | |
mul.lo.s64 %rd2564, %rd1243, 3528531795; | |
xor.b64 %rd2554, %rd1240, %rd185; | |
// Constants consumed at LBB54_23; LBB54_22 writes different values to the | |
// same registers. | |
mov.u32 %r327, -1879881855; | |
mov.u32 %r326, -845247145; | |
mov.u32 %r325, 534103459; | |
mov.u64 %rd2572, 3678237736; | |
mov.u64 %rd2571, 3041712726; | |
mov.u64 %rd2570, 1401181199; | |
mov.u64 %rd2569, 2835769497; | |
mov.u64 %rd2568, 1684936478; | |
mov.u64 %rd2567, 2027808484; | |
mov.u64 %rd2566, 387276957; | |
mov.u64 %rd2565, 842468239; | |
mov.u64 %rd2563, 3986602516; | |
mov.u64 %rd2562, 1013904242; | |
mov.u64 %rd2560, 3668340011; | |
mov.u64 %rd2559, 3144134277; | |
mov.u64 %rd2557, 3449720151; | |
mov.u64 %rd2556, 1993301258; | |
mov.u64 %rd2555, 3528531795; | |
bra.uni LBB54_23; | |
// LBB54_22: arm taken when %p8 is true; falls through into LBB54_23. | |
// Same shape as the arm that precedes this label, with the two multipliers | |
// (3449720151 / 3528531795) and the two xor-constant sequences exchanged. | |
// Inputs are %rd185, %rd2419 (low 32 bits of %rd185), %rd2461 and the wrap | |
// flag %p70, all computed before the branch. | |
LBB54_22: | |
// %rd1208 = %rd2461 + (%p70 ? 1 : 0) | |
selp.u64 %rd1207, 1, 0, %p70; | |
add.s64 %rd1208, %rd2461, %rd1207; | |
and.b64 %rd1209, %rd1208, 4294967295; | |
mul.lo.s64 %rd2558, %rd1209, 3449720151; | |
xor.b64 %rd1210, %rd2558, %rd185; | |
shr.u64 %rd1211, %rd1210, 32; | |
mul.lo.s64 %rd2561, %rd1211, 3528531795; | |
shr.u64 %rd1212, %rd2561, 32; | |
mul.lo.s64 %rd1214, %rd2419, 3528531795; | |
and.b64 %rd1215, %rd1214, 4294967295; | |
xor.b64 %rd1216, %rd1215, %rd1212; | |
xor.b64 %rd1217, %rd1216, 3144134277; | |
mul.lo.s64 %rd2564, %rd1217, 3449720151; | |
xor.b64 %rd2554, %rd1208, %rd1214; | |
// Constants consumed at LBB54_23 for this arm. | |
mov.u32 %r327, -1767562579; | |
mov.u32 %r326, -766435501; | |
mov.u32 %r325, 1401181199; | |
mov.u64 %rd2572, 4055616968; | |
mov.u64 %rd2571, 1684936478; | |
mov.u64 %rd2570, 534103459; | |
mov.u64 %rd2569, 387276957; | |
mov.u64 %rd2568, 3041712726; | |
mov.u64 %rd2567, 3986602516; | |
mov.u64 %rd2566, 2835769497; | |
mov.u64 %rd2565, 3668340011; | |
mov.u64 %rd2563, 2027808484; | |
mov.u64 %rd2562, 1993301258; | |
mov.u64 %rd2560, 842468239; | |
mov.u64 %rd2559, 2654435769; | |
mov.u64 %rd2557, 3528531795; | |
mov.u64 %rd2556, 1013904242; | |
mov.u64 %rd2555, 3449720151; | |
// LBB54_23: join point of LBB54_22 and the arm before it. | |
// Runs the remaining multiply/xor rounds with the multipliers in %rd2555 / | |
// %rd2557 and the xor constants in %rd2556..%rd2572, reduces the result to a | |
// 23-bit draw in [0, 1), and uses it to keep or zero the element read at byte | |
// offset +768 (16-bit loads) / +1536 (32-bit loads). That element's term is | |
// then added to the running sum (%f8 -> %f9). The tail prepares %rd213 and | |
// %p29 for the next stage and branches. | |
LBB54_23: | |
shr.u64 %rd1244, %rd2564, 32; | |
shr.u64 %rd1245, %rd2554, 32; | |
mul.lo.s64 %rd1246, %rd1245, %rd2555; | |
and.b64 %rd1247, %rd1246, 4294967295; | |
xor.b64 %rd1248, %rd1247, %rd1244; | |
xor.b64 %rd1249, %rd1248, %rd2556; | |
mul.lo.s64 %rd1250, %rd1249, %rd2557; | |
shr.u64 %rd1251, %rd1250, 32; | |
shr.u64 %rd1252, %rd1246, 32; | |
and.b64 %rd1253, %rd2558, 4294967295; | |
xor.b64 %rd1254, %rd1253, %rd1252; | |
xor.b64 %rd1255, %rd1254, %rd2559; | |
mul.lo.s64 %rd1256, %rd1255, %rd2557; | |
and.b64 %rd1257, %rd1256, 4294967295; | |
xor.b64 %rd1258, %rd1257, %rd1251; | |
xor.b64 %rd1259, %rd1258, %rd2560; | |
mul.lo.s64 %rd1260, %rd1259, %rd2555; | |
shr.u64 %rd1261, %rd1260, 32; | |
shr.u64 %rd1262, %rd1256, 32; | |
and.b64 %rd1263, %rd2561, 4294967295; | |
xor.b64 %rd1264, %rd1263, %rd1262; | |
xor.b64 %rd1265, %rd1264, %rd2562; | |
mul.lo.s64 %rd1266, %rd1265, %rd2555; | |
and.b64 %rd1267, %rd1266, 4294967295; | |
xor.b64 %rd1268, %rd1267, %rd1261; | |
xor.b64 %rd1269, %rd1268, %rd2563; | |
mul.lo.s64 %rd1270, %rd1269, %rd2557; | |
shr.u64 %rd1271, %rd1270, 32; | |
shr.u64 %rd1272, %rd1266, 32; | |
and.b64 %rd1273, %rd2564, 4294967295; | |
xor.b64 %rd1274, %rd1273, %rd1272; | |
xor.b64 %rd1275, %rd1274, %rd2565; | |
mul.lo.s64 %rd1276, %rd1275, %rd2557; | |
and.b64 %rd1277, %rd1276, 4294967295; | |
xor.b64 %rd1278, %rd1277, %rd1271; | |
xor.b64 %rd1279, %rd1278, %rd2566; | |
mul.lo.s64 %rd1280, %rd1279, %rd2555; | |
shr.u64 %rd1281, %rd1280, 32; | |
shr.u64 %rd1282, %rd1276, 32; | |
and.b64 %rd1283, %rd1250, 4294967295; | |
xor.b64 %rd1284, %rd1283, %rd1282; | |
xor.b64 %rd1285, %rd1284, %rd2567; | |
mul.lo.s64 %rd1286, %rd1285, %rd2555; | |
and.b64 %rd1287, %rd1286, 4294967295; | |
xor.b64 %rd1288, %rd1287, %rd1281; | |
xor.b64 %rd1289, %rd1288, %rd2568; | |
mul.lo.s64 %rd1290, %rd1289, %rd2557; | |
shr.u64 %rd1291, %rd1290, 32; | |
shr.u64 %rd1292, %rd1286, 32; | |
and.b64 %rd1293, %rd1260, 4294967295; | |
xor.b64 %rd1294, %rd1293, %rd1292; | |
xor.b64 %rd1295, %rd1294, %rd2569; | |
mul.lo.s64 %rd1296, %rd1295, %rd2557; | |
and.b64 %rd1297, %rd1296, 4294967295; | |
xor.b64 %rd1298, %rd1297, %rd1291; | |
xor.b64 %rd1299, %rd1298, %rd2570; | |
mul.lo.s64 %rd1300, %rd1299, %rd2555; | |
shr.u64 %rd1301, %rd1300, 32; | |
shr.u64 %rd1302, %rd1296, 32; | |
and.b64 %rd1303, %rd1270, 4294967295; | |
xor.b64 %rd1304, %rd1303, %rd1302; | |
xor.b64 %rd1305, %rd1304, %rd2571; | |
mul.lo.s64 %rd1306, %rd1305, %rd2555; | |
and.b64 %rd1307, %rd1306, 4294967295; | |
xor.b64 %rd1308, %rd1307, %rd1301; | |
xor.b64 %rd1309, %rd1308, %rd2572; | |
mul.lo.s64 %rd1310, %rd1309, %rd2557; | |
// Final round is done in 32-bit registers; %r143 is the selected random word. | |
shr.u64 %rd1311, %rd1310, 32; | |
cvt.u32.u64 %r138, %rd1311; | |
shr.u64 %rd1312, %rd1306, 32; | |
xor.b64 %rd1313, %rd1312, %rd1280; | |
cvt.u32.u64 %r139, %rd1313; | |
xor.b32 %r140, %r325, %r139; | |
mul.lo.s32 %r141, %r140, %r326; | |
xor.b32 %r142, %r141, %r138; | |
xor.b32 %r143, %r142, %r327; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f102 lies in [0, 1). | |
shr.u32 %r144, %r143, 9; | |
cvt.rn.f32.u32 %f101, %r144; | |
mul.rn.f32 %f102, %f101, 0f34000000; | |
// %p28 = f16(%f102) >= 0x2E66 (f16 bit pattern, about 0.09998). | |
cvt.rn.f16.f32 %h55, %f102; | |
mov.b16 %h56, 0x2E66; | |
setp.ge.f16 %p28, %h55, %h56; | |
// %h62 = %p28 ? (%h57 + f16(%f103)) * 0x3C72 : 0, with 0x3C72 about 1.1113 in f16. | |
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm. | |
ld.global.nc.b16 %h57, [%rd44+768]; | |
ld.global.nc.f32 %f103, [%rd45+1536]; | |
cvt.rn.f16.f32 %h58, %f103; | |
add.rn.f16 %h59, %h57, %h58; | |
mov.b16 %h60, 0x3C72; | |
mul.rn.f16 %h61, %h59, %h60; | |
selp.b16 %h62, %h61, 0x0000, %p28; | |
cvt.f32.f16 %f104, %h62; | |
// %f113 = %f107 * f32(%h63) + (%f109 - %f2 * %f107) + %f104, where %f107 = %f1 * %f106. | |
ld.global.nc.b16 %h63, [%rd46+768]; | |
cvt.f32.f16 %f105, %h63; | |
ld.global.nc.f32 %f106, [%rd47+1536]; | |
mul.rn.f32 %f107, %f1, %f106; | |
mul.rn.f32 %f108, %f107, %f105; | |
ld.global.nc.f32 %f109, [%rd48+1536]; | |
mul.rn.f32 %f110, %f2, %f107; | |
sub.rn.f32 %f111, %f109, %f110; | |
add.rn.f32 %f112, %f108, %f111; | |
add.rn.f32 %f113, %f112, %f104; | |
// Accumulate this element's term into the running sum. | |
add.rn.f32 %f9, %f8, %f113; | |
// Next stage: %rd213 = %rd11 + ((%r3 | 385 | %r4) >> 2); | |
// %p29 = ((%r3 | 385) & 3) != 1 selects the arm. | |
or.b32 %r145, %r3, 385; | |
or.b32 %r146, %r145, %r4; | |
and.b32 %r147, %r145, 3; | |
shr.u32 %r148, %r146, 2; | |
setp.ne.s32 %p29, %r147, 1; | |
cvt.u64.u32 %rd1314, %r148; | |
add.s64 %rd213, %rd11, %rd1314; | |
@%p29 bra LBB54_25; | |
// Fall-through arm (reached when %p29 is false); ends with a jump to LBB54_26. | |
// Performs the first multiply/xor steps on %rd213 and on %rd2461 plus a | |
// carry, then loads this arm's multipliers and xor constants for the rounds | |
// executed at LBB54_26. | |
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the | |
// Philox4x32 multipliers and the xor constants are multiples of 0x9E3779B9 or | |
// 0xBB67AE85 modulo 2^32 -- presumably unrolled Philox rounds; confirm. | |
and.b64 %rd1354, %rd213, 4294967295; | |
mul.lo.s64 %rd2577, %rd1354, 3528531795; | |
// %p31 is true when the add that produced %rd213 (= %rd11 + offset) wrapped; | |
// %rd1356 = %rd2461 + (%p31 ? 1 : 0). | |
setp.lt.u64 %p31, %rd213, %rd11; | |
selp.u64 %rd1355, 1, 0, %p31; | |
add.s64 %rd1356, %rd2461, %rd1355; | |
xor.b64 %rd1357, %rd1356, %rd2577; | |
shr.u64 %rd1358, %rd1357, 32; | |
mul.lo.s64 %rd2580, %rd1358, 3449720151; | |
shr.u64 %rd1359, %rd2580, 32; | |
and.b64 %rd1360, %rd1356, 4294967295; | |
mul.lo.s64 %rd1361, %rd1360, 3449720151; | |
and.b64 %rd1362, %rd1361, 4294967295; | |
xor.b64 %rd1363, %rd1362, %rd1359; | |
xor.b64 %rd1364, %rd1363, 2654435769; | |
mul.lo.s64 %rd2583, %rd1364, 3528531795; | |
xor.b64 %rd2573, %rd1361, %rd213; | |
// Constants consumed at LBB54_26; LBB54_25 writes different values to the | |
// same registers. | |
mov.u32 %r329, -845247145; | |
mov.u32 %r328, -616729560; | |
mov.u64 %rd2590, 3041712726; | |
mov.u64 %rd2589, 1401181199; | |
mov.u64 %rd2588, 2835769497; | |
mov.u64 %rd2587, 1684936478; | |
mov.u64 %rd2586, 2027808484; | |
mov.u64 %rd2585, 387276957; | |
mov.u64 %rd2584, 842468239; | |
mov.u64 %rd2582, 3986602516; | |
mov.u64 %rd2581, 1013904242; | |
mov.u64 %rd2579, 3668340011; | |
mov.u64 %rd2578, 3144134277; | |
mov.u64 %rd2576, 3449720151; | |
mov.u64 %rd2575, 1993301258; | |
mov.u64 %rd2574, 3528531795; | |
bra.uni LBB54_26; | |
// LBB54_25: arm taken when %p29 is true; falls through into LBB54_26. | |
// Same shape as the arm that precedes this label, with the two multipliers | |
// (3449720151 / 3528531795) and the two xor-constant sequences exchanged. | |
// Inputs are %rd213, %rd11 and %rd2461, all computed before the branch. | |
LBB54_25: | |
// %p30 is true when the add that produced %rd213 wrapped; | |
// %rd1330 = %rd2461 + (%p30 ? 1 : 0). | |
setp.lt.u64 %p30, %rd213, %rd11; | |
selp.u64 %rd1329, 1, 0, %p30; | |
add.s64 %rd1330, %rd2461, %rd1329; | |
and.b64 %rd1331, %rd1330, 4294967295; | |
mul.lo.s64 %rd2577, %rd1331, 3449720151; | |
xor.b64 %rd1332, %rd2577, %rd213; | |
shr.u64 %rd1333, %rd1332, 32; | |
mul.lo.s64 %rd2580, %rd1333, 3528531795; | |
shr.u64 %rd1334, %rd2580, 32; | |
and.b64 %rd1335, %rd213, 4294967295; | |
mul.lo.s64 %rd1336, %rd1335, 3528531795; | |
and.b64 %rd1337, %rd1336, 4294967295; | |
xor.b64 %rd1338, %rd1337, %rd1334; | |
xor.b64 %rd1339, %rd1338, 3144134277; | |
mul.lo.s64 %rd2583, %rd1339, 3449720151; | |
xor.b64 %rd2573, %rd1330, %rd1336; | |
// Constants consumed at LBB54_26 for this arm. | |
mov.u32 %r329, -766435501; | |
mov.u32 %r328, -239350328; | |
mov.u64 %rd2590, 1684936478; | |
mov.u64 %rd2589, 534103459; | |
mov.u64 %rd2588, 387276957; | |
mov.u64 %rd2587, 3041712726; | |
mov.u64 %rd2586, 3986602516; | |
mov.u64 %rd2585, 2835769497; | |
mov.u64 %rd2584, 3668340011; | |
mov.u64 %rd2582, 2027808484; | |
mov.u64 %rd2581, 1993301258; | |
mov.u64 %rd2579, 842468239; | |
mov.u64 %rd2578, 2654435769; | |
mov.u64 %rd2576, 3528531795; | |
mov.u64 %rd2575, 1013904242; | |
mov.u64 %rd2574, 3449720151; | |
// LBB54_26: join point of LBB54_25 and the arm before it. | |
// Runs the remaining multiply/xor rounds with the multipliers in %rd2574 / | |
// %rd2576 and the xor constants in %rd2575..%rd2590, reduces the result to a | |
// 23-bit draw in [0, 1), and uses it to keep or zero the element read at byte | |
// offset +770 (16-bit loads) / +1540 (32-bit loads). That element's term is | |
// then added to the running sum (%f9 -> %f10). The tail prepares %rd240 for | |
// the next stage and branches on %p8. | |
LBB54_26: | |
shr.u64 %rd1365, %rd2583, 32; | |
shr.u64 %rd1366, %rd2573, 32; | |
mul.lo.s64 %rd1367, %rd1366, %rd2574; | |
and.b64 %rd1368, %rd1367, 4294967295; | |
xor.b64 %rd1369, %rd1368, %rd1365; | |
xor.b64 %rd1370, %rd1369, %rd2575; | |
mul.lo.s64 %rd1371, %rd1370, %rd2576; | |
shr.u64 %rd1372, %rd1371, 32; | |
shr.u64 %rd1373, %rd1367, 32; | |
and.b64 %rd1374, %rd2577, 4294967295; | |
xor.b64 %rd1375, %rd1374, %rd1373; | |
xor.b64 %rd1376, %rd1375, %rd2578; | |
mul.lo.s64 %rd1377, %rd1376, %rd2576; | |
and.b64 %rd1378, %rd1377, 4294967295; | |
xor.b64 %rd1379, %rd1378, %rd1372; | |
xor.b64 %rd1380, %rd1379, %rd2579; | |
mul.lo.s64 %rd1381, %rd1380, %rd2574; | |
shr.u64 %rd1382, %rd1381, 32; | |
shr.u64 %rd1383, %rd1377, 32; | |
and.b64 %rd1384, %rd2580, 4294967295; | |
xor.b64 %rd1385, %rd1384, %rd1383; | |
xor.b64 %rd1386, %rd1385, %rd2581; | |
mul.lo.s64 %rd1387, %rd1386, %rd2574; | |
and.b64 %rd1388, %rd1387, 4294967295; | |
xor.b64 %rd1389, %rd1388, %rd1382; | |
xor.b64 %rd1390, %rd1389, %rd2582; | |
mul.lo.s64 %rd1391, %rd1390, %rd2576; | |
shr.u64 %rd1392, %rd1391, 32; | |
shr.u64 %rd1393, %rd1387, 32; | |
and.b64 %rd1394, %rd2583, 4294967295; | |
xor.b64 %rd1395, %rd1394, %rd1393; | |
xor.b64 %rd1396, %rd1395, %rd2584; | |
mul.lo.s64 %rd1397, %rd1396, %rd2576; | |
and.b64 %rd1398, %rd1397, 4294967295; | |
xor.b64 %rd1399, %rd1398, %rd1392; | |
xor.b64 %rd1400, %rd1399, %rd2585; | |
mul.lo.s64 %rd1401, %rd1400, %rd2574; | |
shr.u64 %rd1402, %rd1401, 32; | |
shr.u64 %rd1403, %rd1397, 32; | |
and.b64 %rd1404, %rd1371, 4294967295; | |
xor.b64 %rd1405, %rd1404, %rd1403; | |
xor.b64 %rd1406, %rd1405, %rd2586; | |
mul.lo.s64 %rd1407, %rd1406, %rd2574; | |
and.b64 %rd1408, %rd1407, 4294967295; | |
xor.b64 %rd1409, %rd1408, %rd1402; | |
xor.b64 %rd1410, %rd1409, %rd2587; | |
mul.lo.s64 %rd1411, %rd1410, %rd2576; | |
shr.u64 %rd1412, %rd1411, 32; | |
shr.u64 %rd1413, %rd1407, 32; | |
and.b64 %rd1414, %rd1381, 4294967295; | |
xor.b64 %rd1415, %rd1414, %rd1413; | |
xor.b64 %rd1416, %rd1415, %rd2588; | |
mul.lo.s64 %rd1417, %rd1416, %rd2576; | |
and.b64 %rd1418, %rd1417, 4294967295; | |
xor.b64 %rd1419, %rd1418, %rd1412; | |
xor.b64 %rd1420, %rd1419, %rd2589; | |
mul.lo.s64 %rd1421, %rd1420, %rd2574; | |
shr.u64 %rd1422, %rd1421, 32; | |
shr.u64 %rd1423, %rd1417, 32; | |
xor.b64 %rd1424, %rd1391, %rd1423; | |
xor.b64 %rd1425, %rd1424, %rd2590; | |
mul.lo.s64 %rd1426, %rd1425, %rd2574; | |
// Last steps are done in 32 bits; %r155 is the selected random word. | |
xor.b64 %rd1427, %rd1422, %rd1426; | |
cvt.u32.u64 %r153, %rd1427; | |
xor.b32 %r154, %r328, %r153; | |
mul.lo.s32 %r155, %r154, %r329; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f115 lies in [0, 1). | |
shr.u32 %r156, %r155, 9; | |
cvt.rn.f32.u32 %f114, %r156; | |
mul.rn.f32 %f115, %f114, 0f34000000; | |
// %p33 = f16(%f115) >= 0x2E66 (f16 bit pattern, about 0.09998). | |
cvt.rn.f16.f32 %h64, %f115; | |
mov.b16 %h65, 0x2E66; | |
setp.ge.f16 %p33, %h64, %h65; | |
// %h71 = %p33 ? (%h66 + f16(%f116)) * 0x3C72 : 0, with 0x3C72 about 1.1113 in f16. | |
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm. | |
ld.global.nc.b16 %h66, [%rd44+770]; | |
ld.global.nc.f32 %f116, [%rd45+1540]; | |
cvt.rn.f16.f32 %h67, %f116; | |
add.rn.f16 %h68, %h66, %h67; | |
mov.b16 %h69, 0x3C72; | |
mul.rn.f16 %h70, %h68, %h69; | |
selp.b16 %h71, %h70, 0x0000, %p33; | |
cvt.f32.f16 %f117, %h71; | |
// %f126 = %f120 * f32(%h72) + (%f122 - %f2 * %f120) + %f117, where %f120 = %f1 * %f119. | |
ld.global.nc.b16 %h72, [%rd46+770]; | |
cvt.f32.f16 %f118, %h72; | |
ld.global.nc.f32 %f119, [%rd47+1540]; | |
mul.rn.f32 %f120, %f1, %f119; | |
mul.rn.f32 %f121, %f120, %f118; | |
ld.global.nc.f32 %f122, [%rd48+1540]; | |
mul.rn.f32 %f123, %f2, %f120; | |
sub.rn.f32 %f124, %f122, %f123; | |
add.rn.f32 %f125, %f121, %f124; | |
add.rn.f32 %f126, %f125, %f117; | |
// Accumulate this element's term into the running sum. | |
add.rn.f32 %f10, %f9, %f126; | |
// Next stage: %rd240 = %rd11 + ((%r73 | 512) >> 2); the arm is selected by %p8. | |
or.b32 %r158, %r73, 512; | |
shr.u32 %r159, %r158, 2; | |
cvt.u64.u32 %rd1428, %r159; | |
add.s64 %rd240, %rd11, %rd1428; | |
@%p8 bra LBB54_28; | |
// Fall-through arm (reached when %p8 is false); ends with a jump to LBB54_29. | |
// Performs the first multiply/xor steps on %rd240 and on %rd2461 plus a | |
// carry, then loads this arm's multipliers and xor constants for the rounds | |
// executed at LBB54_29. | |
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the | |
// Philox4x32 multipliers and the xor constants are multiples of 0x9E3779B9 or | |
// 0xBB67AE85 modulo 2^32 -- presumably unrolled Philox rounds; confirm. | |
and.b64 %rd1470, %rd240, 4294967295; | |
mul.lo.s64 %rd2595, %rd1470, 3528531795; | |
// %p35 is true when the add that produced %rd240 (= %rd11 + offset) wrapped; | |
// %rd1472 = %rd2461 + (%p35 ? 1 : 0). | |
setp.lt.u64 %p35, %rd240, %rd11; | |
selp.u64 %rd1471, 1, 0, %p35; | |
add.s64 %rd1472, %rd2461, %rd1471; | |
xor.b64 %rd1473, %rd1472, %rd2595; | |
shr.u64 %rd1474, %rd1473, 32; | |
mul.lo.s64 %rd2598, %rd1474, 3449720151; | |
shr.u64 %rd1475, %rd2598, 32; | |
and.b64 %rd1476, %rd1472, 4294967295; | |
mul.lo.s64 %rd1477, %rd1476, 3449720151; | |
and.b64 %rd1478, %rd1477, 4294967295; | |
xor.b64 %rd1479, %rd1478, %rd1475; | |
xor.b64 %rd1480, %rd1479, 2654435769; | |
mul.lo.s64 %rd2601, %rd1480, 3528531795; | |
xor.b64 %rd2591, %rd1477, %rd240; | |
// Constants consumed at LBB54_29; LBB54_28 writes different values to the | |
// same registers. | |
mov.u32 %r332, -1879881855; | |
mov.u32 %r331, -845247145; | |
mov.u32 %r330, 534103459; | |
mov.u64 %rd2609, 3678237736; | |
mov.u64 %rd2608, 3041712726; | |
mov.u64 %rd2607, 1401181199; | |
mov.u64 %rd2606, 2835769497; | |
mov.u64 %rd2605, 1684936478; | |
mov.u64 %rd2604, 2027808484; | |
mov.u64 %rd2603, 387276957; | |
mov.u64 %rd2602, 842468239; | |
mov.u64 %rd2600, 3986602516; | |
mov.u64 %rd2599, 1013904242; | |
mov.u64 %rd2597, 3668340011; | |
mov.u64 %rd2596, 3144134277; | |
mov.u64 %rd2594, 3449720151; | |
mov.u64 %rd2593, 1993301258; | |
mov.u64 %rd2592, 3528531795; | |
bra.uni LBB54_29; | |
// LBB54_28: arm taken when %p8 is true; falls through into LBB54_29. | |
// Same shape as the arm that precedes this label, with the two multipliers | |
// (3449720151 / 3528531795) and the two xor-constant sequences exchanged. | |
// Inputs are %rd240, %rd11 and %rd2461, all computed before the branch. | |
LBB54_28: | |
// %p34 is true when the add that produced %rd240 wrapped; | |
// %rd1445 = %rd2461 + (%p34 ? 1 : 0). | |
setp.lt.u64 %p34, %rd240, %rd11; | |
selp.u64 %rd1444, 1, 0, %p34; | |
add.s64 %rd1445, %rd2461, %rd1444; | |
and.b64 %rd1446, %rd1445, 4294967295; | |
mul.lo.s64 %rd2595, %rd1446, 3449720151; | |
xor.b64 %rd1447, %rd2595, %rd240; | |
shr.u64 %rd1448, %rd1447, 32; | |
mul.lo.s64 %rd2598, %rd1448, 3528531795; | |
shr.u64 %rd1449, %rd2598, 32; | |
and.b64 %rd1450, %rd240, 4294967295; | |
mul.lo.s64 %rd1451, %rd1450, 3528531795; | |
and.b64 %rd1452, %rd1451, 4294967295; | |
xor.b64 %rd1453, %rd1452, %rd1449; | |
xor.b64 %rd1454, %rd1453, 3144134277; | |
mul.lo.s64 %rd2601, %rd1454, 3449720151; | |
xor.b64 %rd2591, %rd1445, %rd1451; | |
// Constants consumed at LBB54_29 for this arm. | |
mov.u32 %r332, -1767562579; | |
mov.u32 %r331, -766435501; | |
mov.u32 %r330, 1401181199; | |
mov.u64 %rd2609, 4055616968; | |
mov.u64 %rd2608, 1684936478; | |
mov.u64 %rd2607, 534103459; | |
mov.u64 %rd2606, 387276957; | |
mov.u64 %rd2605, 3041712726; | |
mov.u64 %rd2604, 3986602516; | |
mov.u64 %rd2603, 2835769497; | |
mov.u64 %rd2602, 3668340011; | |
mov.u64 %rd2600, 2027808484; | |
mov.u64 %rd2599, 1993301258; | |
mov.u64 %rd2597, 842468239; | |
mov.u64 %rd2596, 2654435769; | |
mov.u64 %rd2594, 3528531795; | |
mov.u64 %rd2593, 1013904242; | |
mov.u64 %rd2592, 3449720151; | |
// LBB54_29: join point of LBB54_28 and the arm before it. | |
// Runs the remaining multiply/xor rounds with the multipliers in %rd2592 / | |
// %rd2594 and the xor constants in %rd2593..%rd2609, reduces the result to a | |
// 23-bit draw in [0, 1), and uses it to keep or zero the element read at byte | |
// offset +1024 (16-bit loads) / +2048 (32-bit loads). That element's term is | |
// then added to the running sum (%f10 -> %f11). The tail prepares %rd268 and | |
// %p37 for the next stage and branches. | |
LBB54_29: | |
shr.u64 %rd1481, %rd2601, 32; | |
shr.u64 %rd1482, %rd2591, 32; | |
mul.lo.s64 %rd1483, %rd1482, %rd2592; | |
and.b64 %rd1484, %rd1483, 4294967295; | |
xor.b64 %rd1485, %rd1484, %rd1481; | |
xor.b64 %rd1486, %rd1485, %rd2593; | |
mul.lo.s64 %rd1487, %rd1486, %rd2594; | |
shr.u64 %rd1488, %rd1487, 32; | |
shr.u64 %rd1489, %rd1483, 32; | |
and.b64 %rd1490, %rd2595, 4294967295; | |
xor.b64 %rd1491, %rd1490, %rd1489; | |
xor.b64 %rd1492, %rd1491, %rd2596; | |
mul.lo.s64 %rd1493, %rd1492, %rd2594; | |
and.b64 %rd1494, %rd1493, 4294967295; | |
xor.b64 %rd1495, %rd1494, %rd1488; | |
xor.b64 %rd1496, %rd1495, %rd2597; | |
mul.lo.s64 %rd1497, %rd1496, %rd2592; | |
shr.u64 %rd1498, %rd1497, 32; | |
shr.u64 %rd1499, %rd1493, 32; | |
and.b64 %rd1500, %rd2598, 4294967295; | |
xor.b64 %rd1501, %rd1500, %rd1499; | |
xor.b64 %rd1502, %rd1501, %rd2599; | |
mul.lo.s64 %rd1503, %rd1502, %rd2592; | |
and.b64 %rd1504, %rd1503, 4294967295; | |
xor.b64 %rd1505, %rd1504, %rd1498; | |
xor.b64 %rd1506, %rd1505, %rd2600; | |
mul.lo.s64 %rd1507, %rd1506, %rd2594; | |
shr.u64 %rd1508, %rd1507, 32; | |
shr.u64 %rd1509, %rd1503, 32; | |
and.b64 %rd1510, %rd2601, 4294967295; | |
xor.b64 %rd1511, %rd1510, %rd1509; | |
xor.b64 %rd1512, %rd1511, %rd2602; | |
mul.lo.s64 %rd1513, %rd1512, %rd2594; | |
and.b64 %rd1514, %rd1513, 4294967295; | |
xor.b64 %rd1515, %rd1514, %rd1508; | |
xor.b64 %rd1516, %rd1515, %rd2603; | |
mul.lo.s64 %rd1517, %rd1516, %rd2592; | |
shr.u64 %rd1518, %rd1517, 32; | |
shr.u64 %rd1519, %rd1513, 32; | |
and.b64 %rd1520, %rd1487, 4294967295; | |
xor.b64 %rd1521, %rd1520, %rd1519; | |
xor.b64 %rd1522, %rd1521, %rd2604; | |
mul.lo.s64 %rd1523, %rd1522, %rd2592; | |
and.b64 %rd1524, %rd1523, 4294967295; | |
xor.b64 %rd1525, %rd1524, %rd1518; | |
xor.b64 %rd1526, %rd1525, %rd2605; | |
mul.lo.s64 %rd1527, %rd1526, %rd2594; | |
shr.u64 %rd1528, %rd1527, 32; | |
shr.u64 %rd1529, %rd1523, 32; | |
and.b64 %rd1530, %rd1497, 4294967295; | |
xor.b64 %rd1531, %rd1530, %rd1529; | |
xor.b64 %rd1532, %rd1531, %rd2606; | |
mul.lo.s64 %rd1533, %rd1532, %rd2594; | |
and.b64 %rd1534, %rd1533, 4294967295; | |
xor.b64 %rd1535, %rd1534, %rd1528; | |
xor.b64 %rd1536, %rd1535, %rd2607; | |
mul.lo.s64 %rd1537, %rd1536, %rd2592; | |
shr.u64 %rd1538, %rd1537, 32; | |
shr.u64 %rd1539, %rd1533, 32; | |
and.b64 %rd1540, %rd1507, 4294967295; | |
xor.b64 %rd1541, %rd1540, %rd1539; | |
xor.b64 %rd1542, %rd1541, %rd2608; | |
mul.lo.s64 %rd1543, %rd1542, %rd2592; | |
and.b64 %rd1544, %rd1543, 4294967295; | |
xor.b64 %rd1545, %rd1544, %rd1538; | |
xor.b64 %rd1546, %rd1545, %rd2609; | |
mul.lo.s64 %rd1547, %rd1546, %rd2594; | |
// Final round is done in 32-bit registers; %r171 is the selected random word. | |
shr.u64 %rd1548, %rd1547, 32; | |
cvt.u32.u64 %r166, %rd1548; | |
shr.u64 %rd1549, %rd1543, 32; | |
xor.b64 %rd1550, %rd1549, %rd1517; | |
cvt.u32.u64 %r167, %rd1550; | |
xor.b32 %r168, %r330, %r167; | |
mul.lo.s32 %r169, %r168, %r331; | |
xor.b32 %r170, %r169, %r166; | |
xor.b32 %r171, %r170, %r332; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f128 lies in [0, 1). | |
shr.u32 %r172, %r171, 9; | |
cvt.rn.f32.u32 %f127, %r172; | |
mul.rn.f32 %f128, %f127, 0f34000000; | |
// %p36 = f16(%f128) >= 0x2E66 (f16 bit pattern, about 0.09998). | |
cvt.rn.f16.f32 %h73, %f128; | |
mov.b16 %h74, 0x2E66; | |
setp.ge.f16 %p36, %h73, %h74; | |
// %h80 = %p36 ? (%h75 + f16(%f129)) * 0x3C72 : 0, with 0x3C72 about 1.1113 in f16. | |
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm. | |
ld.global.nc.b16 %h75, [%rd44+1024]; | |
ld.global.nc.f32 %f129, [%rd45+2048]; | |
cvt.rn.f16.f32 %h76, %f129; | |
add.rn.f16 %h77, %h75, %h76; | |
mov.b16 %h78, 0x3C72; | |
mul.rn.f16 %h79, %h77, %h78; | |
selp.b16 %h80, %h79, 0x0000, %p36; | |
cvt.f32.f16 %f130, %h80; | |
// %f139 = %f133 * f32(%h81) + (%f135 - %f2 * %f133) + %f130, where %f133 = %f1 * %f132. | |
ld.global.nc.b16 %h81, [%rd46+1024]; | |
cvt.f32.f16 %f131, %h81; | |
ld.global.nc.f32 %f132, [%rd47+2048]; | |
mul.rn.f32 %f133, %f1, %f132; | |
mul.rn.f32 %f134, %f133, %f131; | |
ld.global.nc.f32 %f135, [%rd48+2048]; | |
mul.rn.f32 %f136, %f2, %f133; | |
sub.rn.f32 %f137, %f135, %f136; | |
add.rn.f32 %f138, %f134, %f137; | |
add.rn.f32 %f139, %f138, %f130; | |
// Accumulate this element's term into the running sum. | |
add.rn.f32 %f11, %f10, %f139; | |
// Next stage: %rd268 = %rd11 + ((%r3 | 513 | %r4) >> 2); | |
// %p37 = ((%r3 | 513) & 3) != 1 selects the arm. | |
or.b32 %r173, %r3, 513; | |
or.b32 %r174, %r173, %r4; | |
and.b32 %r175, %r173, 3; | |
shr.u32 %r176, %r174, 2; | |
setp.ne.s32 %p37, %r175, 1; | |
cvt.u64.u32 %rd1551, %r176; | |
add.s64 %rd268, %rd11, %rd1551; | |
@%p37 bra LBB54_31; | |
and.b64 %rd1591, %rd268, 4294967295; | |
mul.lo.s64 %rd2614, %rd1591, 3528531795; | |
setp.lt.u64 %p39, %rd268, %rd11; | |
selp.u64 %rd1592, 1, 0, %p39; | |
add.s64 %rd1593, %rd2461, %rd1592; | |
xor.b64 %rd1594, %rd1593, %rd2614; | |
shr.u64 %rd1595, %rd1594, 32; | |
mul.lo.s64 %rd2617, %rd1595, 3449720151; | |
shr.u64 %rd1596, %rd2617, 32; | |
and.b64 %rd1597, %rd1593, 4294967295; | |
mul.lo.s64 %rd1598, %rd1597, 3449720151; | |
and.b64 %rd1599, %rd1598, 4294967295; | |
xor.b64 %rd1600, %rd1599, %rd1596; | |
xor.b64 %rd1601, %rd1600, 2654435769; | |
mul.lo.s64 %rd2620, %rd1601, 3528531795; | |
xor.b64 %rd2610, %rd1598, %rd268; | |
mov.u32 %r334, -845247145; | |
mov.u32 %r333, -616729560; | |
mov.u64 %rd2627, 3041712726; | |
mov.u64 %rd2626, 1401181199; | |
mov.u64 %rd2625, 2835769497; | |
mov.u64 %rd2624, 1684936478; | |
mov.u64 %rd2623, 2027808484; | |
mov.u64 %rd2622, 387276957; | |
mov.u64 %rd2621, 842468239; | |
mov.u64 %rd2619, 3986602516; | |
mov.u64 %rd2618, 1013904242; | |
mov.u64 %rd2616, 3668340011; | |
mov.u64 %rd2615, 3144134277; | |
mov.u64 %rd2613, 3449720151; | |
mov.u64 %rd2612, 1993301258; | |
mov.u64 %rd2611, 3528531795; | |
bra.uni LBB54_32; | |
LBB54_31: | |
setp.lt.u64 %p38, %rd268, %rd11; | |
selp.u64 %rd1566, 1, 0, %p38; | |
add.s64 %rd1567, %rd2461, %rd1566; | |
and.b64 %rd1568, %rd1567, 4294967295; | |
mul.lo.s64 %rd2614, %rd1568, 3449720151; | |
xor.b64 %rd1569, %rd2614, %rd268; | |
shr.u64 %rd1570, %rd1569, 32; | |
mul.lo.s64 %rd2617, %rd1570, 3528531795; | |
shr.u64 %rd1571, %rd2617, 32; | |
and.b64 %rd1572, %rd268, 4294967295; | |
mul.lo.s64 %rd1573, %rd1572, 3528531795; | |
and.b64 %rd1574, %rd1573, 4294967295; | |
xor.b64 %rd1575, %rd1574, %rd1571; | |
xor.b64 %rd1576, %rd1575, 3144134277; | |
mul.lo.s64 %rd2620, %rd1576, 3449720151; | |
xor.b64 %rd2610, %rd1567, %rd1573; | |
mov.u32 %r334, -766435501; | |
mov.u32 %r333, -239350328; | |
mov.u64 %rd2627, 1684936478; | |
mov.u64 %rd2626, 534103459; | |
mov.u64 %rd2625, 387276957; | |
mov.u64 %rd2624, 3041712726; | |
mov.u64 %rd2623, 3986602516; | |
mov.u64 %rd2622, 2835769497; | |
mov.u64 %rd2621, 3668340011; | |
mov.u64 %rd2619, 2027808484; | |
mov.u64 %rd2618, 1993301258; | |
mov.u64 %rd2616, 842468239; | |
mov.u64 %rd2615, 2654435769; | |
mov.u64 %rd2613, 3528531795; | |
mov.u64 %rd2612, 1013904242; | |
mov.u64 %rd2611, 3449720151; | |
LBB54_32: | |
shr.u64 %rd1602, %rd2620, 32; | |
shr.u64 %rd1603, %rd2610, 32; | |
mul.lo.s64 %rd1604, %rd1603, %rd2611; | |
and.b64 %rd1605, %rd1604, 4294967295; | |
xor.b64 %rd1606, %rd1605, %rd1602; | |
xor.b64 %rd1607, %rd1606, %rd2612; | |
mul.lo.s64 %rd1608, %rd1607, %rd2613; | |
shr.u64 %rd1609, %rd1608, 32; | |
shr.u64 %rd1610, %rd1604, 32; | |
and.b64 %rd1611, %rd2614, 4294967295; | |
xor.b64 %rd1612, %rd1611, %rd1610; | |
xor.b64 %rd1613, %rd1612, %rd2615; | |
mul.lo.s64 %rd1614, %rd1613, %rd2613; | |
and.b64 %rd1615, %rd1614, 4294967295; | |
xor.b64 %rd1616, %rd1615, %rd1609; | |
xor.b64 %rd1617, %rd1616, %rd2616; | |
mul.lo.s64 %rd1618, %rd1617, %rd2611; | |
shr.u64 %rd1619, %rd1618, 32; | |
shr.u64 %rd1620, %rd1614, 32; | |
and.b64 %rd1621, %rd2617, 4294967295; | |
xor.b64 %rd1622, %rd1621, %rd1620; | |
xor.b64 %rd1623, %rd1622, %rd2618; | |
mul.lo.s64 %rd1624, %rd1623, %rd2611; | |
and.b64 %rd1625, %rd1624, 4294967295; | |
xor.b64 %rd1626, %rd1625, %rd1619; | |
xor.b64 %rd1627, %rd1626, %rd2619; | |
mul.lo.s64 %rd1628, %rd1627, %rd2613; | |
shr.u64 %rd1629, %rd1628, 32; | |
shr.u64 %rd1630, %rd1624, 32; | |
and.b64 %rd1631, %rd2620, 4294967295; | |
xor.b64 %rd1632, %rd1631, %rd1630; | |
xor.b64 %rd1633, %rd1632, %rd2621; | |
mul.lo.s64 %rd1634, %rd1633, %rd2613; | |
and.b64 %rd1635, %rd1634, 4294967295; | |
xor.b64 %rd1636, %rd1635, %rd1629; | |
xor.b64 %rd1637, %rd1636, %rd2622; | |
mul.lo.s64 %rd1638, %rd1637, %rd2611; | |
shr.u64 %rd1639, %rd1638, 32; | |
shr.u64 %rd1640, %rd1634, 32; | |
and.b64 %rd1641, %rd1608, 4294967295; | |
xor.b64 %rd1642, %rd1641, %rd1640; | |
xor.b64 %rd1643, %rd1642, %rd2623; | |
mul.lo.s64 %rd1644, %rd1643, %rd2611; | |
and.b64 %rd1645, %rd1644, 4294967295; | |
xor.b64 %rd1646, %rd1645, %rd1639; | |
xor.b64 %rd1647, %rd1646, %rd2624; | |
mul.lo.s64 %rd1648, %rd1647, %rd2613; | |
shr.u64 %rd1649, %rd1648, 32; | |
shr.u64 %rd1650, %rd1644, 32; | |
and.b64 %rd1651, %rd1618, 4294967295; | |
xor.b64 %rd1652, %rd1651, %rd1650; | |
xor.b64 %rd1653, %rd1652, %rd2625; | |
mul.lo.s64 %rd1654, %rd1653, %rd2613; | |
and.b64 %rd1655, %rd1654, 4294967295; | |
xor.b64 %rd1656, %rd1655, %rd1649; | |
xor.b64 %rd1657, %rd1656, %rd2626; | |
mul.lo.s64 %rd1658, %rd1657, %rd2611; | |
shr.u64 %rd1659, %rd1658, 32; | |
shr.u64 %rd1660, %rd1654, 32; | |
xor.b64 %rd1661, %rd1628, %rd1660; | |
xor.b64 %rd1662, %rd1661, %rd2627; | |
mul.lo.s64 %rd1663, %rd1662, %rd2611; | |
xor.b64 %rd1664, %rd1659, %rd1663; | |
cvt.u32.u64 %r181, %rd1664; | |
xor.b32 %r182, %r333, %r181; | |
mul.lo.s32 %r183, %r182, %r334; | |
shr.u32 %r184, %r183, 9; | |
cvt.rn.f32.u32 %f140, %r184; | |
mul.rn.f32 %f141, %f140, 0f34000000; | |
cvt.rn.f16.f32 %h82, %f141; | |
mov.b16 %h83, 0x2E66; | |
setp.ge.f16 %p41, %h82, %h83; | |
ld.global.nc.b16 %h84, [%rd44+1026]; | |
ld.global.nc.f32 %f142, [%rd45+2052]; | |
cvt.rn.f16.f32 %h85, %f142; | |
add.rn.f16 %h86, %h84, %h85; | |
mov.b16 %h87, 0x3C72; | |
mul.rn.f16 %h88, %h86, %h87; | |
selp.b16 %h89, %h88, 0x0000, %p41; | |
cvt.f32.f16 %f143, %h89; | |
ld.global.nc.b16 %h90, [%rd46+1026]; | |
cvt.f32.f16 %f144, %h90; | |
ld.global.nc.f32 %f145, [%rd47+2052]; | |
mul.rn.f32 %f146, %f1, %f145; | |
mul.rn.f32 %f147, %f146, %f144; | |
ld.global.nc.f32 %f148, [%rd48+2052]; | |
mul.rn.f32 %f149, %f2, %f146; | |
sub.rn.f32 %f150, %f148, %f149; | |
add.rn.f32 %f151, %f147, %f150; | |
add.rn.f32 %f152, %f151, %f143; | |
add.rn.f32 %f12, %f11, %f152; | |
// ---- Unrolled step for element offset 640 ---------------------------------
// The loads below use byte offsets 1280 (= 2*640, 16-bit data) and
// 2560 (= 4*640, 32-bit data).
// (%r73 | 640) >> 2 is zero-extended and added to the 64-bit base %rd11,
// giving %rd295. The arm is selected by %p8, which is not defined in this
// span: fall through when %p8 is false, jump to LBB54_34 when it is true.
// NOTE(review): 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57,
// 2654435769 = 0x9E3779B9 and 3144134277 = 0xBB67AE85 equal the published
// Philox4x32 multiplier / key-increment constants, so this looks like an
// inlined counter-based RNG. Confirm against the code that emitted this PTX.
or.b32 %r186, %r73, 640; | |
shr.u32 %r187, %r186, 2; | |
cvt.u64.u32 %rd1665, %r187; | |
add.s64 %rd295, %rd11, %rd1665; | |
@%p8 bra LBB54_34; | |
// Fall-through arm (%p8 false).
and.b64 %rd1707, %rd295, 4294967295; | |
mul.lo.s64 %rd2632, %rd1707, 3528531795; | |
// Unsigned wrap test: the sum is below the addend only if the add above
// overflowed; that carry (0 or 1) is added to %rd2461.
setp.lt.u64 %p43, %rd295, %rd11; | |
selp.u64 %rd1708, 1, 0, %p43; | |
add.s64 %rd1709, %rd2461, %rd1708; | |
xor.b64 %rd1710, %rd1709, %rd2632; | |
shr.u64 %rd1711, %rd1710, 32; | |
mul.lo.s64 %rd2635, %rd1711, 3449720151; | |
shr.u64 %rd1712, %rd2635, 32; | |
and.b64 %rd1713, %rd1709, 4294967295; | |
mul.lo.s64 %rd1714, %rd1713, 3449720151; | |
and.b64 %rd1715, %rd1714, 4294967295; | |
xor.b64 %rd1716, %rd1715, %rd1712; | |
xor.b64 %rd1717, %rd1716, 2654435769; | |
mul.lo.s64 %rd2638, %rd1717, 3528531795; | |
xor.b64 %rd2628, %rd1714, %rd295; | |
// Per-arm constants read by LBB54_35. Both arms define the same register
// set (%rd2628-%rd2646, %r335-%r337) with different values.
mov.u32 %r337, -1879881855; | |
mov.u32 %r336, -845247145; | |
mov.u32 %r335, 534103459; | |
mov.u64 %rd2646, 3678237736; | |
mov.u64 %rd2645, 3041712726; | |
mov.u64 %rd2644, 1401181199; | |
mov.u64 %rd2643, 2835769497; | |
mov.u64 %rd2642, 1684936478; | |
mov.u64 %rd2641, 2027808484; | |
mov.u64 %rd2640, 387276957; | |
mov.u64 %rd2639, 842468239; | |
mov.u64 %rd2637, 3986602516; | |
mov.u64 %rd2636, 1013904242; | |
mov.u64 %rd2634, 3668340011; | |
mov.u64 %rd2633, 3144134277; | |
mov.u64 %rd2631, 3449720151; | |
mov.u64 %rd2630, 1993301258; | |
mov.u64 %rd2629, 3528531795; | |
bra.uni LBB54_35; | |
LBB54_34: | |
// Alternate arm (%p8 true). Same shape as the arm above, but the first
// multiply is applied to %rd2461 + carry instead of %rd295, the two
// multipliers swap places, and a different constant set is loaded.
setp.lt.u64 %p42, %rd295, %rd11; | |
selp.u64 %rd1681, 1, 0, %p42; | |
add.s64 %rd1682, %rd2461, %rd1681; | |
and.b64 %rd1683, %rd1682, 4294967295; | |
mul.lo.s64 %rd2632, %rd1683, 3449720151; | |
xor.b64 %rd1684, %rd2632, %rd295; | |
shr.u64 %rd1685, %rd1684, 32; | |
mul.lo.s64 %rd2635, %rd1685, 3528531795; | |
shr.u64 %rd1686, %rd2635, 32; | |
and.b64 %rd1687, %rd295, 4294967295; | |
mul.lo.s64 %rd1688, %rd1687, 3528531795; | |
and.b64 %rd1689, %rd1688, 4294967295; | |
xor.b64 %rd1690, %rd1689, %rd1686; | |
xor.b64 %rd1691, %rd1690, 3144134277; | |
mul.lo.s64 %rd2638, %rd1691, 3449720151; | |
xor.b64 %rd2628, %rd1682, %rd1688; | |
mov.u32 %r337, -1767562579; | |
mov.u32 %r336, -766435501; | |
mov.u32 %r335, 1401181199; | |
mov.u64 %rd2646, 4055616968; | |
mov.u64 %rd2645, 1684936478; | |
mov.u64 %rd2644, 534103459; | |
mov.u64 %rd2643, 387276957; | |
mov.u64 %rd2642, 3041712726; | |
mov.u64 %rd2641, 3986602516; | |
mov.u64 %rd2640, 2835769497; | |
mov.u64 %rd2639, 3668340011; | |
mov.u64 %rd2637, 2027808484; | |
mov.u64 %rd2636, 1993301258; | |
mov.u64 %rd2634, 842468239; | |
mov.u64 %rd2633, 2654435769; | |
mov.u64 %rd2631, 3528531795; | |
mov.u64 %rd2630, 1013904242; | |
mov.u64 %rd2629, 3449720151; | |
LBB54_35: | |
// Merge point. Shared round chain: each step multiplies by %rd2629 or
// %rd2631, splits the 64-bit product into its high half (shr 32) and low
// half (and 0xFFFFFFFF), and xors in one of the per-arm constants.
shr.u64 %rd1718, %rd2638, 32; | |
shr.u64 %rd1719, %rd2628, 32; | |
mul.lo.s64 %rd1720, %rd1719, %rd2629; | |
and.b64 %rd1721, %rd1720, 4294967295; | |
xor.b64 %rd1722, %rd1721, %rd1718; | |
xor.b64 %rd1723, %rd1722, %rd2630; | |
mul.lo.s64 %rd1724, %rd1723, %rd2631; | |
shr.u64 %rd1725, %rd1724, 32; | |
shr.u64 %rd1726, %rd1720, 32; | |
and.b64 %rd1727, %rd2632, 4294967295; | |
xor.b64 %rd1728, %rd1727, %rd1726; | |
xor.b64 %rd1729, %rd1728, %rd2633; | |
mul.lo.s64 %rd1730, %rd1729, %rd2631; | |
and.b64 %rd1731, %rd1730, 4294967295; | |
xor.b64 %rd1732, %rd1731, %rd1725; | |
xor.b64 %rd1733, %rd1732, %rd2634; | |
mul.lo.s64 %rd1734, %rd1733, %rd2629; | |
shr.u64 %rd1735, %rd1734, 32; | |
shr.u64 %rd1736, %rd1730, 32; | |
and.b64 %rd1737, %rd2635, 4294967295; | |
xor.b64 %rd1738, %rd1737, %rd1736; | |
xor.b64 %rd1739, %rd1738, %rd2636; | |
mul.lo.s64 %rd1740, %rd1739, %rd2629; | |
and.b64 %rd1741, %rd1740, 4294967295; | |
xor.b64 %rd1742, %rd1741, %rd1735; | |
xor.b64 %rd1743, %rd1742, %rd2637; | |
mul.lo.s64 %rd1744, %rd1743, %rd2631; | |
shr.u64 %rd1745, %rd1744, 32; | |
shr.u64 %rd1746, %rd1740, 32; | |
and.b64 %rd1747, %rd2638, 4294967295; | |
xor.b64 %rd1748, %rd1747, %rd1746; | |
xor.b64 %rd1749, %rd1748, %rd2639; | |
mul.lo.s64 %rd1750, %rd1749, %rd2631; | |
and.b64 %rd1751, %rd1750, 4294967295; | |
xor.b64 %rd1752, %rd1751, %rd1745; | |
xor.b64 %rd1753, %rd1752, %rd2640; | |
mul.lo.s64 %rd1754, %rd1753, %rd2629; | |
shr.u64 %rd1755, %rd1754, 32; | |
shr.u64 %rd1756, %rd1750, 32; | |
and.b64 %rd1757, %rd1724, 4294967295; | |
xor.b64 %rd1758, %rd1757, %rd1756; | |
xor.b64 %rd1759, %rd1758, %rd2641; | |
mul.lo.s64 %rd1760, %rd1759, %rd2629; | |
and.b64 %rd1761, %rd1760, 4294967295; | |
xor.b64 %rd1762, %rd1761, %rd1755; | |
xor.b64 %rd1763, %rd1762, %rd2642; | |
mul.lo.s64 %rd1764, %rd1763, %rd2631; | |
shr.u64 %rd1765, %rd1764, 32; | |
shr.u64 %rd1766, %rd1760, 32; | |
and.b64 %rd1767, %rd1734, 4294967295; | |
xor.b64 %rd1768, %rd1767, %rd1766; | |
xor.b64 %rd1769, %rd1768, %rd2643; | |
mul.lo.s64 %rd1770, %rd1769, %rd2631; | |
and.b64 %rd1771, %rd1770, 4294967295; | |
xor.b64 %rd1772, %rd1771, %rd1765; | |
xor.b64 %rd1773, %rd1772, %rd2644; | |
mul.lo.s64 %rd1774, %rd1773, %rd2629; | |
shr.u64 %rd1775, %rd1774, 32; | |
shr.u64 %rd1776, %rd1770, 32; | |
and.b64 %rd1777, %rd1744, 4294967295; | |
xor.b64 %rd1778, %rd1777, %rd1776; | |
xor.b64 %rd1779, %rd1778, %rd2645; | |
mul.lo.s64 %rd1780, %rd1779, %rd2629; | |
and.b64 %rd1781, %rd1780, 4294967295; | |
xor.b64 %rd1782, %rd1781, %rd1775; | |
xor.b64 %rd1783, %rd1782, %rd2646; | |
mul.lo.s64 %rd1784, %rd1783, %rd2631; | |
// Last step runs in 32 bits: the high word of the final product (%r194) and
// a truncated xor of earlier products (%r195) are combined with the per-arm
// constants %r335 (xor), %r336 (multiply) and %r337 (xor).
shr.u64 %rd1785, %rd1784, 32; | |
cvt.u32.u64 %r194, %rd1785; | |
shr.u64 %rd1786, %rd1780, 32; | |
xor.b64 %rd1787, %rd1786, %rd1754; | |
cvt.u32.u64 %r195, %rd1787; | |
xor.b32 %r196, %r335, %r195; | |
mul.lo.s32 %r197, %r196, %r336; | |
xor.b32 %r198, %r197, %r194; | |
xor.b32 %r199, %r198, %r337; | |
// Keep the top 23 bits (>> 9) and scale by 0f34000000 (2^-23): the f32
// result lies in [0, 1). It is then rounded to fp16.
shr.u32 %r200, %r199, 9; | |
cvt.rn.f32.u32 %f153, %r200; | |
mul.rn.f32 %f154, %f153, 0f34000000; | |
cvt.rn.f16.f32 %h91, %f154; | |
// %p44 = sample >= 0x2E66 (fp16 0.0999755859375).
// NOTE(review): a ~0.1 threshold paired with the 0x3C72 scale below
// (fp16 1.111328125, about 1/0.9) looks like dropout at rate 0.1 - confirm.
mov.b16 %h92, 0x2E66; | |
setp.ge.f16 %p44, %h91, %h92; | |
// fp16 path: ([%rd44] + f16([%rd45])) * 0x3C72, replaced by 0x0000 (+0.0)
// when %p44 is false, then widened to f32.
ld.global.nc.b16 %h93, [%rd44+1280]; | |
ld.global.nc.f32 %f155, [%rd45+2560]; | |
cvt.rn.f16.f32 %h94, %f155; | |
add.rn.f16 %h95, %h93, %h94; | |
mov.b16 %h96, 0x3C72; | |
mul.rn.f16 %h97, %h95, %h96; | |
selp.b16 %h98, %h97, 0x0000, %p44; | |
cvt.f32.f16 %f156, %h98; | |
// f32 path, with t = %f1 * [%rd47]:
//   term = t * f32([%rd46]) + ([%rd48] - %f2 * t) + (fp16 path result)
// The term is added to the running sum: %f13 = %f12 + term.
ld.global.nc.b16 %h99, [%rd46+1280]; | |
cvt.f32.f16 %f157, %h99; | |
ld.global.nc.f32 %f158, [%rd47+2560]; | |
mul.rn.f32 %f159, %f1, %f158; | |
mul.rn.f32 %f160, %f159, %f157; | |
ld.global.nc.f32 %f161, [%rd48+2560]; | |
mul.rn.f32 %f162, %f2, %f159; | |
sub.rn.f32 %f163, %f161, %f162; | |
add.rn.f32 %f164, %f160, %f163; | |
add.rn.f32 %f165, %f164, %f156; | |
add.rn.f32 %f13, %f12, %f165; | |
// ---- Unrolled step for element offset 641 ---------------------------------
// The loads below use byte offsets 1282 (= 2*641, 16-bit data) and
// 2564 (= 4*641, 32-bit data).
// index = %r3 | 641 | %r4. (index >> 2) is zero-extended and added to the
// 64-bit base %rd11, giving %rd323. The arm is selected by (%r3 | 641) & 3:
// fall through when it equals 1, otherwise jump to LBB54_37.
// NOTE(review): 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57,
// 2654435769 = 0x9E3779B9 and 3144134277 = 0xBB67AE85 equal the published
// Philox4x32 multiplier / key-increment constants, so this looks like an
// inlined counter-based RNG. Confirm against the code that emitted this PTX.
or.b32 %r201, %r3, 641; | |
or.b32 %r202, %r201, %r4; | |
and.b32 %r203, %r201, 3; | |
shr.u32 %r204, %r202, 2; | |
setp.ne.s32 %p45, %r203, 1; | |
cvt.u64.u32 %rd1788, %r204; | |
add.s64 %rd323, %rd11, %rd1788; | |
@%p45 bra LBB54_37; | |
// Fall-through arm ((%r3 | 641) & 3 == 1).
and.b64 %rd1828, %rd323, 4294967295; | |
mul.lo.s64 %rd2651, %rd1828, 3528531795; | |
// Unsigned wrap test: the sum is below the addend only if the add above
// overflowed; that carry (0 or 1) is added to %rd2461.
setp.lt.u64 %p47, %rd323, %rd11; | |
selp.u64 %rd1829, 1, 0, %p47; | |
add.s64 %rd1830, %rd2461, %rd1829; | |
xor.b64 %rd1831, %rd1830, %rd2651; | |
shr.u64 %rd1832, %rd1831, 32; | |
mul.lo.s64 %rd2654, %rd1832, 3449720151; | |
shr.u64 %rd1833, %rd2654, 32; | |
and.b64 %rd1834, %rd1830, 4294967295; | |
mul.lo.s64 %rd1835, %rd1834, 3449720151; | |
and.b64 %rd1836, %rd1835, 4294967295; | |
xor.b64 %rd1837, %rd1836, %rd1833; | |
xor.b64 %rd1838, %rd1837, 2654435769; | |
mul.lo.s64 %rd2657, %rd1838, 3528531795; | |
xor.b64 %rd2647, %rd1835, %rd323; | |
// Per-arm constants read by LBB54_38. Both arms define the same register
// set (%rd2647-%rd2664, %r338, %r339) with different values.
mov.u32 %r339, -845247145; | |
mov.u32 %r338, -616729560; | |
mov.u64 %rd2664, 3041712726; | |
mov.u64 %rd2663, 1401181199; | |
mov.u64 %rd2662, 2835769497; | |
mov.u64 %rd2661, 1684936478; | |
mov.u64 %rd2660, 2027808484; | |
mov.u64 %rd2659, 387276957; | |
mov.u64 %rd2658, 842468239; | |
mov.u64 %rd2656, 3986602516; | |
mov.u64 %rd2655, 1013904242; | |
mov.u64 %rd2653, 3668340011; | |
mov.u64 %rd2652, 3144134277; | |
mov.u64 %rd2650, 3449720151; | |
mov.u64 %rd2649, 1993301258; | |
mov.u64 %rd2648, 3528531795; | |
bra.uni LBB54_38; | |
LBB54_37: | |
// Alternate arm ((%r3 | 641) & 3 != 1). Same shape as the arm above, but the
// first multiply is applied to %rd2461 + carry instead of %rd323, the two
// multipliers swap places, and a different constant set is loaded.
setp.lt.u64 %p46, %rd323, %rd11; | |
selp.u64 %rd1803, 1, 0, %p46; | |
add.s64 %rd1804, %rd2461, %rd1803; | |
and.b64 %rd1805, %rd1804, 4294967295; | |
mul.lo.s64 %rd2651, %rd1805, 3449720151; | |
xor.b64 %rd1806, %rd2651, %rd323; | |
shr.u64 %rd1807, %rd1806, 32; | |
mul.lo.s64 %rd2654, %rd1807, 3528531795; | |
shr.u64 %rd1808, %rd2654, 32; | |
and.b64 %rd1809, %rd323, 4294967295; | |
mul.lo.s64 %rd1810, %rd1809, 3528531795; | |
and.b64 %rd1811, %rd1810, 4294967295; | |
xor.b64 %rd1812, %rd1811, %rd1808; | |
xor.b64 %rd1813, %rd1812, 3144134277; | |
mul.lo.s64 %rd2657, %rd1813, 3449720151; | |
xor.b64 %rd2647, %rd1804, %rd1810; | |
mov.u32 %r339, -766435501; | |
mov.u32 %r338, -239350328; | |
mov.u64 %rd2664, 1684936478; | |
mov.u64 %rd2663, 534103459; | |
mov.u64 %rd2662, 387276957; | |
mov.u64 %rd2661, 3041712726; | |
mov.u64 %rd2660, 3986602516; | |
mov.u64 %rd2659, 2835769497; | |
mov.u64 %rd2658, 3668340011; | |
mov.u64 %rd2656, 2027808484; | |
mov.u64 %rd2655, 1993301258; | |
mov.u64 %rd2653, 842468239; | |
mov.u64 %rd2652, 2654435769; | |
mov.u64 %rd2650, 3528531795; | |
mov.u64 %rd2649, 1013904242; | |
mov.u64 %rd2648, 3449720151; | |
LBB54_38: | |
// Merge point. Shared round chain: each step multiplies by %rd2648 or
// %rd2650, splits the 64-bit product into its high half (shr 32) and low
// half (and 0xFFFFFFFF), and xors in one of the per-arm constants.
shr.u64 %rd1839, %rd2657, 32; | |
shr.u64 %rd1840, %rd2647, 32; | |
mul.lo.s64 %rd1841, %rd1840, %rd2648; | |
and.b64 %rd1842, %rd1841, 4294967295; | |
xor.b64 %rd1843, %rd1842, %rd1839; | |
xor.b64 %rd1844, %rd1843, %rd2649; | |
mul.lo.s64 %rd1845, %rd1844, %rd2650; | |
shr.u64 %rd1846, %rd1845, 32; | |
shr.u64 %rd1847, %rd1841, 32; | |
and.b64 %rd1848, %rd2651, 4294967295; | |
xor.b64 %rd1849, %rd1848, %rd1847; | |
xor.b64 %rd1850, %rd1849, %rd2652; | |
mul.lo.s64 %rd1851, %rd1850, %rd2650; | |
and.b64 %rd1852, %rd1851, 4294967295; | |
xor.b64 %rd1853, %rd1852, %rd1846; | |
xor.b64 %rd1854, %rd1853, %rd2653; | |
mul.lo.s64 %rd1855, %rd1854, %rd2648; | |
shr.u64 %rd1856, %rd1855, 32; | |
shr.u64 %rd1857, %rd1851, 32; | |
and.b64 %rd1858, %rd2654, 4294967295; | |
xor.b64 %rd1859, %rd1858, %rd1857; | |
xor.b64 %rd1860, %rd1859, %rd2655; | |
mul.lo.s64 %rd1861, %rd1860, %rd2648; | |
and.b64 %rd1862, %rd1861, 4294967295; | |
xor.b64 %rd1863, %rd1862, %rd1856; | |
xor.b64 %rd1864, %rd1863, %rd2656; | |
mul.lo.s64 %rd1865, %rd1864, %rd2650; | |
shr.u64 %rd1866, %rd1865, 32; | |
shr.u64 %rd1867, %rd1861, 32; | |
and.b64 %rd1868, %rd2657, 4294967295; | |
xor.b64 %rd1869, %rd1868, %rd1867; | |
xor.b64 %rd1870, %rd1869, %rd2658; | |
mul.lo.s64 %rd1871, %rd1870, %rd2650; | |
and.b64 %rd1872, %rd1871, 4294967295; | |
xor.b64 %rd1873, %rd1872, %rd1866; | |
xor.b64 %rd1874, %rd1873, %rd2659; | |
mul.lo.s64 %rd1875, %rd1874, %rd2648; | |
shr.u64 %rd1876, %rd1875, 32; | |
shr.u64 %rd1877, %rd1871, 32; | |
and.b64 %rd1878, %rd1845, 4294967295; | |
xor.b64 %rd1879, %rd1878, %rd1877; | |
xor.b64 %rd1880, %rd1879, %rd2660; | |
mul.lo.s64 %rd1881, %rd1880, %rd2648; | |
and.b64 %rd1882, %rd1881, 4294967295; | |
xor.b64 %rd1883, %rd1882, %rd1876; | |
xor.b64 %rd1884, %rd1883, %rd2661; | |
mul.lo.s64 %rd1885, %rd1884, %rd2650; | |
shr.u64 %rd1886, %rd1885, 32; | |
shr.u64 %rd1887, %rd1881, 32; | |
and.b64 %rd1888, %rd1855, 4294967295; | |
xor.b64 %rd1889, %rd1888, %rd1887; | |
xor.b64 %rd1890, %rd1889, %rd2662; | |
mul.lo.s64 %rd1891, %rd1890, %rd2650; | |
and.b64 %rd1892, %rd1891, 4294967295; | |
xor.b64 %rd1893, %rd1892, %rd1886; | |
xor.b64 %rd1894, %rd1893, %rd2663; | |
mul.lo.s64 %rd1895, %rd1894, %rd2648; | |
shr.u64 %rd1896, %rd1895, 32; | |
shr.u64 %rd1897, %rd1891, 32; | |
xor.b64 %rd1898, %rd1865, %rd1897; | |
xor.b64 %rd1899, %rd1898, %rd2664; | |
mul.lo.s64 %rd1900, %rd1899, %rd2648; | |
xor.b64 %rd1901, %rd1896, %rd1900; | |
// Last step runs in 32 bits on the truncated result, using the per-arm
// constants %r338 (xor) and %r339 (multiply).
cvt.u32.u64 %r209, %rd1901; | |
xor.b32 %r210, %r338, %r209; | |
mul.lo.s32 %r211, %r210, %r339; | |
// Keep the top 23 bits (>> 9) and scale by 0f34000000 (2^-23): the f32
// result lies in [0, 1). It is then rounded to fp16.
shr.u32 %r212, %r211, 9; | |
cvt.rn.f32.u32 %f166, %r212; | |
mul.rn.f32 %f167, %f166, 0f34000000; | |
cvt.rn.f16.f32 %h100, %f167; | |
// %p49 = sample >= 0x2E66 (fp16 0.0999755859375).
// NOTE(review): a ~0.1 threshold paired with the 0x3C72 scale below
// (fp16 1.111328125, about 1/0.9) looks like dropout at rate 0.1 - confirm.
mov.b16 %h101, 0x2E66; | |
setp.ge.f16 %p49, %h100, %h101; | |
// fp16 path: ([%rd44] + f16([%rd45])) * 0x3C72, replaced by 0x0000 (+0.0)
// when %p49 is false, then widened to f32.
ld.global.nc.b16 %h102, [%rd44+1282]; | |
ld.global.nc.f32 %f168, [%rd45+2564]; | |
cvt.rn.f16.f32 %h103, %f168; | |
add.rn.f16 %h104, %h102, %h103; | |
mov.b16 %h105, 0x3C72; | |
mul.rn.f16 %h106, %h104, %h105; | |
selp.b16 %h107, %h106, 0x0000, %p49; | |
cvt.f32.f16 %f169, %h107; | |
// f32 path, with t = %f1 * [%rd47]:
//   term = t * f32([%rd46]) + ([%rd48] - %f2 * t) + (fp16 path result)
// The term is added to the running sum: %f14 = %f13 + term.
ld.global.nc.b16 %h108, [%rd46+1282]; | |
cvt.f32.f16 %f170, %h108; | |
ld.global.nc.f32 %f171, [%rd47+2564]; | |
mul.rn.f32 %f172, %f1, %f171; | |
mul.rn.f32 %f173, %f172, %f170; | |
ld.global.nc.f32 %f174, [%rd48+2564]; | |
mul.rn.f32 %f175, %f2, %f172; | |
sub.rn.f32 %f176, %f174, %f175; | |
add.rn.f32 %f177, %f173, %f176; | |
add.rn.f32 %f178, %f177, %f169; | |
add.rn.f32 %f14, %f13, %f178; | |
// ---- Unrolled step for element offset 768 ---------------------------------
// The loads below use byte offsets 1536 (= 2*768, 16-bit data) and
// 3072 (= 4*768, 32-bit data).
// (%r73 | 768) >> 2 is zero-extended and added to the 64-bit base %rd11,
// giving %rd350. The arm is selected by %p8, which is not defined in this
// span: fall through when %p8 is false, jump to LBB54_40 when it is true.
// NOTE(review): 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57,
// 2654435769 = 0x9E3779B9 and 3144134277 = 0xBB67AE85 equal the published
// Philox4x32 multiplier / key-increment constants, so this looks like an
// inlined counter-based RNG. Confirm against the code that emitted this PTX.
or.b32 %r214, %r73, 768; | |
shr.u32 %r215, %r214, 2; | |
cvt.u64.u32 %rd1902, %r215; | |
add.s64 %rd350, %rd11, %rd1902; | |
@%p8 bra LBB54_40; | |
// Fall-through arm (%p8 false).
and.b64 %rd1944, %rd350, 4294967295; | |
mul.lo.s64 %rd2669, %rd1944, 3528531795; | |
// Unsigned wrap test: the sum is below the addend only if the add above
// overflowed; that carry (0 or 1) is added to %rd2461.
setp.lt.u64 %p51, %rd350, %rd11; | |
selp.u64 %rd1945, 1, 0, %p51; | |
add.s64 %rd1946, %rd2461, %rd1945; | |
xor.b64 %rd1947, %rd1946, %rd2669; | |
shr.u64 %rd1948, %rd1947, 32; | |
mul.lo.s64 %rd2672, %rd1948, 3449720151; | |
shr.u64 %rd1949, %rd2672, 32; | |
and.b64 %rd1950, %rd1946, 4294967295; | |
mul.lo.s64 %rd1951, %rd1950, 3449720151; | |
and.b64 %rd1952, %rd1951, 4294967295; | |
xor.b64 %rd1953, %rd1952, %rd1949; | |
xor.b64 %rd1954, %rd1953, 2654435769; | |
mul.lo.s64 %rd2675, %rd1954, 3528531795; | |
xor.b64 %rd2665, %rd1951, %rd350; | |
// Per-arm constants read by LBB54_41. Both arms define the same register
// set (%rd2665-%rd2683, %r340-%r342) with different values.
mov.u32 %r342, -1879881855; | |
mov.u32 %r341, -845247145; | |
mov.u32 %r340, 534103459; | |
mov.u64 %rd2683, 3678237736; | |
mov.u64 %rd2682, 3041712726; | |
mov.u64 %rd2681, 1401181199; | |
mov.u64 %rd2680, 2835769497; | |
mov.u64 %rd2679, 1684936478; | |
mov.u64 %rd2678, 2027808484; | |
mov.u64 %rd2677, 387276957; | |
mov.u64 %rd2676, 842468239; | |
mov.u64 %rd2674, 3986602516; | |
mov.u64 %rd2673, 1013904242; | |
mov.u64 %rd2671, 3668340011; | |
mov.u64 %rd2670, 3144134277; | |
mov.u64 %rd2668, 3449720151; | |
mov.u64 %rd2667, 1993301258; | |
mov.u64 %rd2666, 3528531795; | |
bra.uni LBB54_41; | |
LBB54_40: | |
// Alternate arm (%p8 true). Same shape as the arm above, but the first
// multiply is applied to %rd2461 + carry instead of %rd350, the two
// multipliers swap places, and a different constant set is loaded.
setp.lt.u64 %p50, %rd350, %rd11; | |
selp.u64 %rd1918, 1, 0, %p50; | |
add.s64 %rd1919, %rd2461, %rd1918; | |
and.b64 %rd1920, %rd1919, 4294967295; | |
mul.lo.s64 %rd2669, %rd1920, 3449720151; | |
xor.b64 %rd1921, %rd2669, %rd350; | |
shr.u64 %rd1922, %rd1921, 32; | |
mul.lo.s64 %rd2672, %rd1922, 3528531795; | |
shr.u64 %rd1923, %rd2672, 32; | |
and.b64 %rd1924, %rd350, 4294967295; | |
mul.lo.s64 %rd1925, %rd1924, 3528531795; | |
and.b64 %rd1926, %rd1925, 4294967295; | |
xor.b64 %rd1927, %rd1926, %rd1923; | |
xor.b64 %rd1928, %rd1927, 3144134277; | |
mul.lo.s64 %rd2675, %rd1928, 3449720151; | |
xor.b64 %rd2665, %rd1919, %rd1925; | |
mov.u32 %r342, -1767562579; | |
mov.u32 %r341, -766435501; | |
mov.u32 %r340, 1401181199; | |
mov.u64 %rd2683, 4055616968; | |
mov.u64 %rd2682, 1684936478; | |
mov.u64 %rd2681, 534103459; | |
mov.u64 %rd2680, 387276957; | |
mov.u64 %rd2679, 3041712726; | |
mov.u64 %rd2678, 3986602516; | |
mov.u64 %rd2677, 2835769497; | |
mov.u64 %rd2676, 3668340011; | |
mov.u64 %rd2674, 2027808484; | |
mov.u64 %rd2673, 1993301258; | |
mov.u64 %rd2671, 842468239; | |
mov.u64 %rd2670, 2654435769; | |
mov.u64 %rd2668, 3528531795; | |
mov.u64 %rd2667, 1013904242; | |
mov.u64 %rd2666, 3449720151; | |
LBB54_41: | |
// Merge point. Shared round chain: each step multiplies by %rd2666 or
// %rd2668, splits the 64-bit product into its high half (shr 32) and low
// half (and 0xFFFFFFFF), and xors in one of the per-arm constants.
shr.u64 %rd1955, %rd2675, 32; | |
shr.u64 %rd1956, %rd2665, 32; | |
mul.lo.s64 %rd1957, %rd1956, %rd2666; | |
and.b64 %rd1958, %rd1957, 4294967295; | |
xor.b64 %rd1959, %rd1958, %rd1955; | |
xor.b64 %rd1960, %rd1959, %rd2667; | |
mul.lo.s64 %rd1961, %rd1960, %rd2668; | |
shr.u64 %rd1962, %rd1961, 32; | |
shr.u64 %rd1963, %rd1957, 32; | |
and.b64 %rd1964, %rd2669, 4294967295; | |
xor.b64 %rd1965, %rd1964, %rd1963; | |
xor.b64 %rd1966, %rd1965, %rd2670; | |
mul.lo.s64 %rd1967, %rd1966, %rd2668; | |
and.b64 %rd1968, %rd1967, 4294967295; | |
xor.b64 %rd1969, %rd1968, %rd1962; | |
xor.b64 %rd1970, %rd1969, %rd2671; | |
mul.lo.s64 %rd1971, %rd1970, %rd2666; | |
shr.u64 %rd1972, %rd1971, 32; | |
shr.u64 %rd1973, %rd1967, 32; | |
and.b64 %rd1974, %rd2672, 4294967295; | |
xor.b64 %rd1975, %rd1974, %rd1973; | |
xor.b64 %rd1976, %rd1975, %rd2673; | |
mul.lo.s64 %rd1977, %rd1976, %rd2666; | |
and.b64 %rd1978, %rd1977, 4294967295; | |
xor.b64 %rd1979, %rd1978, %rd1972; | |
xor.b64 %rd1980, %rd1979, %rd2674; | |
mul.lo.s64 %rd1981, %rd1980, %rd2668; | |
shr.u64 %rd1982, %rd1981, 32; | |
shr.u64 %rd1983, %rd1977, 32; | |
and.b64 %rd1984, %rd2675, 4294967295; | |
xor.b64 %rd1985, %rd1984, %rd1983; | |
xor.b64 %rd1986, %rd1985, %rd2676; | |
mul.lo.s64 %rd1987, %rd1986, %rd2668; | |
and.b64 %rd1988, %rd1987, 4294967295; | |
xor.b64 %rd1989, %rd1988, %rd1982; | |
xor.b64 %rd1990, %rd1989, %rd2677; | |
mul.lo.s64 %rd1991, %rd1990, %rd2666; | |
shr.u64 %rd1992, %rd1991, 32; | |
shr.u64 %rd1993, %rd1987, 32; | |
and.b64 %rd1994, %rd1961, 4294967295; | |
xor.b64 %rd1995, %rd1994, %rd1993; | |
xor.b64 %rd1996, %rd1995, %rd2678; | |
mul.lo.s64 %rd1997, %rd1996, %rd2666; | |
and.b64 %rd1998, %rd1997, 4294967295; | |
xor.b64 %rd1999, %rd1998, %rd1992; | |
xor.b64 %rd2000, %rd1999, %rd2679; | |
mul.lo.s64 %rd2001, %rd2000, %rd2668; | |
shr.u64 %rd2002, %rd2001, 32; | |
shr.u64 %rd2003, %rd1997, 32; | |
and.b64 %rd2004, %rd1971, 4294967295; | |
xor.b64 %rd2005, %rd2004, %rd2003; | |
xor.b64 %rd2006, %rd2005, %rd2680; | |
mul.lo.s64 %rd2007, %rd2006, %rd2668; | |
and.b64 %rd2008, %rd2007, 4294967295; | |
xor.b64 %rd2009, %rd2008, %rd2002; | |
xor.b64 %rd2010, %rd2009, %rd2681; | |
mul.lo.s64 %rd2011, %rd2010, %rd2666; | |
shr.u64 %rd2012, %rd2011, 32; | |
shr.u64 %rd2013, %rd2007, 32; | |
and.b64 %rd2014, %rd1981, 4294967295; | |
xor.b64 %rd2015, %rd2014, %rd2013; | |
xor.b64 %rd2016, %rd2015, %rd2682; | |
mul.lo.s64 %rd2017, %rd2016, %rd2666; | |
and.b64 %rd2018, %rd2017, 4294967295; | |
xor.b64 %rd2019, %rd2018, %rd2012; | |
xor.b64 %rd2020, %rd2019, %rd2683; | |
mul.lo.s64 %rd2021, %rd2020, %rd2668; | |
// Last step runs in 32 bits: the high word of the final product (%r222) and
// a truncated xor of earlier products (%r223) are combined with the per-arm
// constants %r340 (xor), %r341 (multiply) and %r342 (xor).
shr.u64 %rd2022, %rd2021, 32; | |
cvt.u32.u64 %r222, %rd2022; | |
shr.u64 %rd2023, %rd2017, 32; | |
xor.b64 %rd2024, %rd2023, %rd1991; | |
cvt.u32.u64 %r223, %rd2024; | |
xor.b32 %r224, %r340, %r223; | |
mul.lo.s32 %r225, %r224, %r341; | |
xor.b32 %r226, %r225, %r222; | |
xor.b32 %r227, %r226, %r342; | |
// Keep the top 23 bits (>> 9) and scale by 0f34000000 (2^-23): the f32
// result lies in [0, 1). It is then rounded to fp16.
shr.u32 %r228, %r227, 9; | |
cvt.rn.f32.u32 %f179, %r228; | |
mul.rn.f32 %f180, %f179, 0f34000000; | |
cvt.rn.f16.f32 %h109, %f180; | |
// %p52 = sample >= 0x2E66 (fp16 0.0999755859375).
// NOTE(review): a ~0.1 threshold paired with the 0x3C72 scale below
// (fp16 1.111328125, about 1/0.9) looks like dropout at rate 0.1 - confirm.
mov.b16 %h110, 0x2E66; | |
setp.ge.f16 %p52, %h109, %h110; | |
// fp16 path: ([%rd44] + f16([%rd45])) * 0x3C72, replaced by 0x0000 (+0.0)
// when %p52 is false, then widened to f32.
ld.global.nc.b16 %h111, [%rd44+1536]; | |
ld.global.nc.f32 %f181, [%rd45+3072]; | |
cvt.rn.f16.f32 %h112, %f181; | |
add.rn.f16 %h113, %h111, %h112; | |
mov.b16 %h114, 0x3C72; | |
mul.rn.f16 %h115, %h113, %h114; | |
selp.b16 %h116, %h115, 0x0000, %p52; | |
cvt.f32.f16 %f182, %h116; | |
// f32 path, with t = %f1 * [%rd47]:
//   term = t * f32([%rd46]) + ([%rd48] - %f2 * t) + (fp16 path result)
// The term is added to the running sum: %f15 = %f14 + term.
ld.global.nc.b16 %h117, [%rd46+1536]; | |
cvt.f32.f16 %f183, %h117; | |
ld.global.nc.f32 %f184, [%rd47+3072]; | |
mul.rn.f32 %f185, %f1, %f184; | |
mul.rn.f32 %f186, %f185, %f183; | |
ld.global.nc.f32 %f187, [%rd48+3072]; | |
mul.rn.f32 %f188, %f2, %f185; | |
sub.rn.f32 %f189, %f187, %f188; | |
add.rn.f32 %f190, %f186, %f189; | |
add.rn.f32 %f191, %f190, %f182; | |
add.rn.f32 %f15, %f14, %f191; | |
// ---- Unrolled step for element offset 769 ---------------------------------
// The loads below use byte offsets 1538 (= 2*769, 16-bit data) and
// 3076 (= 4*769, 32-bit data).
// index = %r3 | 769 | %r4. (index >> 2) is zero-extended and added to the
// 64-bit base %rd11, giving %rd378. The arm is selected by (%r3 | 769) & 3:
// fall through when it equals 1, otherwise jump to LBB54_43.
// NOTE(review): 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57,
// 2654435769 = 0x9E3779B9 and 3144134277 = 0xBB67AE85 equal the published
// Philox4x32 multiplier / key-increment constants, so this looks like an
// inlined counter-based RNG. Confirm against the code that emitted this PTX.
or.b32 %r229, %r3, 769; | |
or.b32 %r230, %r229, %r4; | |
and.b32 %r231, %r229, 3; | |
shr.u32 %r232, %r230, 2; | |
setp.ne.s32 %p53, %r231, 1; | |
cvt.u64.u32 %rd2025, %r232; | |
add.s64 %rd378, %rd11, %rd2025; | |
@%p53 bra LBB54_43; | |
// Fall-through arm ((%r3 | 769) & 3 == 1).
and.b64 %rd2065, %rd378, 4294967295; | |
mul.lo.s64 %rd2688, %rd2065, 3528531795; | |
// Unsigned wrap test: the sum is below the addend only if the add above
// overflowed; that carry (0 or 1) is added to %rd2461.
setp.lt.u64 %p55, %rd378, %rd11; | |
selp.u64 %rd2066, 1, 0, %p55; | |
add.s64 %rd2067, %rd2461, %rd2066; | |
xor.b64 %rd2068, %rd2067, %rd2688; | |
shr.u64 %rd2069, %rd2068, 32; | |
mul.lo.s64 %rd2691, %rd2069, 3449720151; | |
shr.u64 %rd2070, %rd2691, 32; | |
and.b64 %rd2071, %rd2067, 4294967295; | |
mul.lo.s64 %rd2072, %rd2071, 3449720151; | |
and.b64 %rd2073, %rd2072, 4294967295; | |
xor.b64 %rd2074, %rd2073, %rd2070; | |
xor.b64 %rd2075, %rd2074, 2654435769; | |
mul.lo.s64 %rd2694, %rd2075, 3528531795; | |
xor.b64 %rd2684, %rd2072, %rd378; | |
// Per-arm constants read by LBB54_44. Both arms define the same register
// set (%rd2684-%rd2701, %r343, %r344) with different values.
mov.u32 %r344, -845247145; | |
mov.u32 %r343, -616729560; | |
mov.u64 %rd2701, 3041712726; | |
mov.u64 %rd2700, 1401181199; | |
mov.u64 %rd2699, 2835769497; | |
mov.u64 %rd2698, 1684936478; | |
mov.u64 %rd2697, 2027808484; | |
mov.u64 %rd2696, 387276957; | |
mov.u64 %rd2695, 842468239; | |
mov.u64 %rd2693, 3986602516; | |
mov.u64 %rd2692, 1013904242; | |
mov.u64 %rd2690, 3668340011; | |
mov.u64 %rd2689, 3144134277; | |
mov.u64 %rd2687, 3449720151; | |
mov.u64 %rd2686, 1993301258; | |
mov.u64 %rd2685, 3528531795; | |
bra.uni LBB54_44; | |
LBB54_43: | |
// Alternate arm ((%r3 | 769) & 3 != 1). Same shape as the arm above, but the
// first multiply is applied to %rd2461 + carry instead of %rd378, the two
// multipliers swap places, and a different constant set is loaded.
setp.lt.u64 %p54, %rd378, %rd11; | |
selp.u64 %rd2040, 1, 0, %p54; | |
add.s64 %rd2041, %rd2461, %rd2040; | |
and.b64 %rd2042, %rd2041, 4294967295; | |
mul.lo.s64 %rd2688, %rd2042, 3449720151; | |
xor.b64 %rd2043, %rd2688, %rd378; | |
shr.u64 %rd2044, %rd2043, 32; | |
mul.lo.s64 %rd2691, %rd2044, 3528531795; | |
shr.u64 %rd2045, %rd2691, 32; | |
and.b64 %rd2046, %rd378, 4294967295; | |
mul.lo.s64 %rd2047, %rd2046, 3528531795; | |
and.b64 %rd2048, %rd2047, 4294967295; | |
xor.b64 %rd2049, %rd2048, %rd2045; | |
xor.b64 %rd2050, %rd2049, 3144134277; | |
mul.lo.s64 %rd2694, %rd2050, 3449720151; | |
xor.b64 %rd2684, %rd2041, %rd2047; | |
mov.u32 %r344, -766435501; | |
mov.u32 %r343, -239350328; | |
mov.u64 %rd2701, 1684936478; | |
mov.u64 %rd2700, 534103459; | |
mov.u64 %rd2699, 387276957; | |
mov.u64 %rd2698, 3041712726; | |
mov.u64 %rd2697, 3986602516; | |
mov.u64 %rd2696, 2835769497; | |
mov.u64 %rd2695, 3668340011; | |
mov.u64 %rd2693, 2027808484; | |
mov.u64 %rd2692, 1993301258; | |
mov.u64 %rd2690, 842468239; | |
mov.u64 %rd2689, 2654435769; | |
mov.u64 %rd2687, 3528531795; | |
mov.u64 %rd2686, 1013904242; | |
mov.u64 %rd2685, 3449720151; | |
LBB54_44: | |
// Merge point. Shared round chain: each step multiplies by %rd2685 or
// %rd2687, splits the 64-bit product into its high half (shr 32) and low
// half (and 0xFFFFFFFF), and xors in one of the per-arm constants.
shr.u64 %rd2076, %rd2694, 32; | |
shr.u64 %rd2077, %rd2684, 32; | |
mul.lo.s64 %rd2078, %rd2077, %rd2685; | |
and.b64 %rd2079, %rd2078, 4294967295; | |
xor.b64 %rd2080, %rd2079, %rd2076; | |
xor.b64 %rd2081, %rd2080, %rd2686; | |
mul.lo.s64 %rd2082, %rd2081, %rd2687; | |
shr.u64 %rd2083, %rd2082, 32; | |
shr.u64 %rd2084, %rd2078, 32; | |
and.b64 %rd2085, %rd2688, 4294967295; | |
xor.b64 %rd2086, %rd2085, %rd2084; | |
xor.b64 %rd2087, %rd2086, %rd2689; | |
mul.lo.s64 %rd2088, %rd2087, %rd2687; | |
and.b64 %rd2089, %rd2088, 4294967295; | |
xor.b64 %rd2090, %rd2089, %rd2083; | |
xor.b64 %rd2091, %rd2090, %rd2690; | |
mul.lo.s64 %rd2092, %rd2091, %rd2685; | |
shr.u64 %rd2093, %rd2092, 32; | |
shr.u64 %rd2094, %rd2088, 32; | |
and.b64 %rd2095, %rd2691, 4294967295; | |
xor.b64 %rd2096, %rd2095, %rd2094; | |
xor.b64 %rd2097, %rd2096, %rd2692; | |
mul.lo.s64 %rd2098, %rd2097, %rd2685; | |
and.b64 %rd2099, %rd2098, 4294967295; | |
xor.b64 %rd2100, %rd2099, %rd2093; | |
xor.b64 %rd2101, %rd2100, %rd2693; | |
mul.lo.s64 %rd2102, %rd2101, %rd2687; | |
shr.u64 %rd2103, %rd2102, 32; | |
shr.u64 %rd2104, %rd2098, 32; | |
and.b64 %rd2105, %rd2694, 4294967295; | |
xor.b64 %rd2106, %rd2105, %rd2104; | |
xor.b64 %rd2107, %rd2106, %rd2695; | |
mul.lo.s64 %rd2108, %rd2107, %rd2687; | |
and.b64 %rd2109, %rd2108, 4294967295; | |
xor.b64 %rd2110, %rd2109, %rd2103; | |
xor.b64 %rd2111, %rd2110, %rd2696; | |
mul.lo.s64 %rd2112, %rd2111, %rd2685; | |
shr.u64 %rd2113, %rd2112, 32; | |
shr.u64 %rd2114, %rd2108, 32; | |
and.b64 %rd2115, %rd2082, 4294967295; | |
xor.b64 %rd2116, %rd2115, %rd2114; | |
xor.b64 %rd2117, %rd2116, %rd2697; | |
mul.lo.s64 %rd2118, %rd2117, %rd2685; | |
and.b64 %rd2119, %rd2118, 4294967295; | |
xor.b64 %rd2120, %rd2119, %rd2113; | |
xor.b64 %rd2121, %rd2120, %rd2698; | |
mul.lo.s64 %rd2122, %rd2121, %rd2687; | |
shr.u64 %rd2123, %rd2122, 32; | |
shr.u64 %rd2124, %rd2118, 32; | |
and.b64 %rd2125, %rd2092, 4294967295; | |
xor.b64 %rd2126, %rd2125, %rd2124; | |
xor.b64 %rd2127, %rd2126, %rd2699; | |
mul.lo.s64 %rd2128, %rd2127, %rd2687; | |
and.b64 %rd2129, %rd2128, 4294967295; | |
xor.b64 %rd2130, %rd2129, %rd2123; | |
xor.b64 %rd2131, %rd2130, %rd2700; | |
mul.lo.s64 %rd2132, %rd2131, %rd2685; | |
shr.u64 %rd2133, %rd2132, 32; | |
shr.u64 %rd2134, %rd2128, 32; | |
xor.b64 %rd2135, %rd2102, %rd2134; | |
xor.b64 %rd2136, %rd2135, %rd2701; | |
mul.lo.s64 %rd2137, %rd2136, %rd2685; | |
xor.b64 %rd2138, %rd2133, %rd2137; | |
// Last step runs in 32 bits on the truncated result, using the per-arm
// constants %r343 (xor) and %r344 (multiply).
cvt.u32.u64 %r237, %rd2138; | |
xor.b32 %r238, %r343, %r237; | |
mul.lo.s32 %r239, %r238, %r344; | |
// Keep the top 23 bits (>> 9) and scale by 0f34000000 (2^-23): the f32
// result lies in [0, 1). It is then rounded to fp16.
shr.u32 %r240, %r239, 9; | |
cvt.rn.f32.u32 %f192, %r240; | |
mul.rn.f32 %f193, %f192, 0f34000000; | |
cvt.rn.f16.f32 %h118, %f193; | |
// %p57 = sample >= 0x2E66 (fp16 0.0999755859375).
// NOTE(review): a ~0.1 threshold paired with the 0x3C72 scale below
// (fp16 1.111328125, about 1/0.9) looks like dropout at rate 0.1 - confirm.
mov.b16 %h119, 0x2E66; | |
setp.ge.f16 %p57, %h118, %h119; | |
// fp16 path: ([%rd44] + f16([%rd45])) * 0x3C72, replaced by 0x0000 (+0.0)
// when %p57 is false, then widened to f32.
ld.global.nc.b16 %h120, [%rd44+1538]; | |
ld.global.nc.f32 %f194, [%rd45+3076]; | |
cvt.rn.f16.f32 %h121, %f194; | |
add.rn.f16 %h122, %h120, %h121; | |
mov.b16 %h123, 0x3C72; | |
mul.rn.f16 %h124, %h122, %h123; | |
selp.b16 %h125, %h124, 0x0000, %p57; | |
cvt.f32.f16 %f195, %h125; | |
// f32 path, with t = %f1 * [%rd47]:
//   term = t * f32([%rd46]) + ([%rd48] - %f2 * t) + (fp16 path result)
// The term is added to the running sum: %f16 = %f15 + term.
ld.global.nc.b16 %h126, [%rd46+1538]; | |
cvt.f32.f16 %f196, %h126; | |
ld.global.nc.f32 %f197, [%rd47+3076]; | |
mul.rn.f32 %f198, %f1, %f197; | |
mul.rn.f32 %f199, %f198, %f196; | |
ld.global.nc.f32 %f200, [%rd48+3076]; | |
mul.rn.f32 %f201, %f2, %f198; | |
sub.rn.f32 %f202, %f200, %f201; | |
add.rn.f32 %f203, %f199, %f202; | |
add.rn.f32 %f204, %f203, %f195; | |
add.rn.f32 %f16, %f15, %f204; | |
or.b32 %r242, %r73, 896; | |
shr.u32 %r243, %r242, 2; | |
cvt.u64.u32 %rd2139, %r243; | |
add.s64 %rd405, %rd11, %rd2139; | |
@%p8 bra LBB54_46; | |
mov.u32 %r347, -1879881855; | |
mov.u32 %r345, 534103459; | |
mov.u64 %rd2720, 3678237736; | |
and.b64 %rd2181, %rd405, 4294967295; | |
mul.lo.s64 %rd2706, %rd2181, 3528531795; | |
setp.lt.u64 %p59, %rd405, %rd11; | |
selp.u64 %rd2182, 1, 0, %p59; | |
add.s64 %rd2183, %rd2461, %rd2182; | |
xor.b64 %rd2184, %rd2183, %rd2706; | |
shr.u64 %rd2185, %rd2184, 32; | |
mul.lo.s64 %rd2709, %rd2185, 3449720151; | |
shr.u64 %rd2186, %rd2709, 32; | |
and.b64 %rd2187, %rd2183, 4294967295; | |
mul.lo.s64 %rd2188, %rd2187, 3449720151; | |
and.b64 %rd2189, %rd2188, 4294967295; | |
xor.b64 %rd2190, %rd2189, %rd2186; | |
xor.b64 %rd2191, %rd2190, 2654435769; | |
mul.lo.s64 %rd2712, %rd2191, 3528531795; | |
xor.b64 %rd2702, %rd2188, %rd405; | |
mov.u32 %r346, -845247145; | |
mov.u64 %rd2719, 3041712726; | |
mov.u64 %rd2718, 1401181199; | |
mov.u64 %rd2717, 2835769497; | |
mov.u64 %rd2716, 1684936478; | |
mov.u64 %rd2715, 2027808484; | |
mov.u64 %rd2714, 387276957; | |
mov.u64 %rd2713, 842468239; | |
mov.u64 %rd2711, 3986602516; | |
mov.u64 %rd2710, 1013904242; | |
mov.u64 %rd2708, 3668340011; | |
mov.u64 %rd2707, 3144134277; | |
mov.u64 %rd2705, 3449720151; | |
mov.u64 %rd2704, 1993301258; | |
mov.u64 %rd2703, 3528531795; | |
bra.uni LBB54_47; | |
LBB54_46: | |
setp.lt.u64 %p58, %rd405, %rd11; | |
selp.u64 %rd2155, 1, 0, %p58; | |
add.s64 %rd2156, %rd2461, %rd2155; | |
and.b64 %rd2157, %rd2156, 4294967295; | |
mul.lo.s64 %rd2706, %rd2157, 3449720151; | |
xor.b64 %rd2158, %rd2706, %rd405; | |
shr.u64 %rd2159, %rd2158, 32; | |
mul.lo.s64 %rd2709, %rd2159, 3528531795; | |
shr.u64 %rd2160, %rd2709, 32; | |
and.b64 %rd2161, %rd405, 4294967295; | |
mul.lo.s64 %rd2162, %rd2161, 3528531795; | |
and.b64 %rd2163, %rd2162, 4294967295; | |
xor.b64 %rd2164, %rd2163, %rd2160; | |
xor.b64 %rd2165, %rd2164, 3144134277; | |
mul.lo.s64 %rd2712, %rd2165, 3449720151; | |
xor.b64 %rd2702, %rd2156, %rd2162; | |
mov.u32 %r347, -1767562579; | |
mov.u32 %r346, -766435501; | |
mov.u32 %r345, 1401181199; | |
mov.u64 %rd2720, 4055616968; | |
mov.u64 %rd2719, 1684936478; | |
mov.u64 %rd2718, 534103459; | |
mov.u64 %rd2717, 387276957; | |
mov.u64 %rd2716, 3041712726; | |
mov.u64 %rd2715, 3986602516; | |
mov.u64 %rd2714, 2835769497; | |
mov.u64 %rd2713, 3668340011; | |
mov.u64 %rd2711, 2027808484; | |
mov.u64 %rd2710, 1993301258; | |
mov.u64 %rd2708, 842468239; | |
mov.u64 %rd2707, 2654435769; | |
mov.u64 %rd2705, 3528531795; | |
mov.u64 %rd2704, 1013904242; | |
mov.u64 %rd2703, 3449720151; | |
LBB54_47: | |
shr.u64 %rd2192, %rd2712, 32; | |
shr.u64 %rd2193, %rd2702, 32; | |
mul.lo.s64 %rd2194, %rd2193, %rd2703; | |
and.b64 %rd2195, %rd2194, 4294967295; | |
xor.b64 %rd2196, %rd2195, %rd2192; | |
xor.b64 %rd2197, %rd2196, %rd2704; | |
mul.lo.s64 %rd2198, %rd2197, %rd2705; | |
shr.u64 %rd2199, %rd2198, 32; | |
shr.u64 %rd2200, %rd2194, 32; | |
and.b64 %rd2201, %rd2706, 4294967295; | |
xor.b64 %rd2202, %rd2201, %rd2200; | |
xor.b64 %rd2203, %rd2202, %rd2707; | |
mul.lo.s64 %rd2204, %rd2203, %rd2705; | |
and.b64 %rd2205, %rd2204, 4294967295; | |
xor.b64 %rd2206, %rd2205, %rd2199; | |
xor.b64 %rd2207, %rd2206, %rd2708; | |
mul.lo.s64 %rd2208, %rd2207, %rd2703; | |
shr.u64 %rd2209, %rd2208, 32; | |
shr.u64 %rd2210, %rd2204, 32; | |
and.b64 %rd2211, %rd2709, 4294967295; | |
xor.b64 %rd2212, %rd2211, %rd2210; | |
xor.b64 %rd2213, %rd2212, %rd2710; | |
mul.lo.s64 %rd2214, %rd2213, %rd2703; | |
and.b64 %rd2215, %rd2214, 4294967295; | |
xor.b64 %rd2216, %rd2215, %rd2209; | |
xor.b64 %rd2217, %rd2216, %rd2711; | |
mul.lo.s64 %rd2218, %rd2217, %rd2705; | |
shr.u64 %rd2219, %rd2218, 32; | |
shr.u64 %rd2220, %rd2214, 32; | |
and.b64 %rd2221, %rd2712, 4294967295; | |
xor.b64 %rd2222, %rd2221, %rd2220; | |
xor.b64 %rd2223, %rd2222, %rd2713; | |
mul.lo.s64 %rd2224, %rd2223, %rd2705; | |
and.b64 %rd2225, %rd2224, 4294967295; | |
xor.b64 %rd2226, %rd2225, %rd2219; | |
xor.b64 %rd2227, %rd2226, %rd2714; | |
mul.lo.s64 %rd2228, %rd2227, %rd2703; | |
shr.u64 %rd2229, %rd2228, 32; | |
shr.u64 %rd2230, %rd2224, 32; | |
and.b64 %rd2231, %rd2198, 4294967295; | |
xor.b64 %rd2232, %rd2231, %rd2230; | |
xor.b64 %rd2233, %rd2232, %rd2715; | |
mul.lo.s64 %rd2234, %rd2233, %rd2703; | |
and.b64 %rd2235, %rd2234, 4294967295; | |
xor.b64 %rd2236, %rd2235, %rd2229; | |
xor.b64 %rd2237, %rd2236, %rd2716; | |
mul.lo.s64 %rd2238, %rd2237, %rd2705; | |
shr.u64 %rd2239, %rd2238, 32; | |
shr.u64 %rd2240, %rd2234, 32; | |
and.b64 %rd2241, %rd2208, 4294967295; | |
xor.b64 %rd2242, %rd2241, %rd2240; | |
xor.b64 %rd2243, %rd2242, %rd2717; | |
mul.lo.s64 %rd2244, %rd2243, %rd2705; | |
and.b64 %rd2245, %rd2244, 4294967295; | |
xor.b64 %rd2246, %rd2245, %rd2239; | |
xor.b64 %rd2247, %rd2246, %rd2718; | |
mul.lo.s64 %rd2248, %rd2247, %rd2703; | |
shr.u64 %rd2249, %rd2248, 32; | |
shr.u64 %rd2250, %rd2244, 32; | |
and.b64 %rd2251, %rd2218, 4294967295; | |
xor.b64 %rd2252, %rd2251, %rd2250; | |
xor.b64 %rd2253, %rd2252, %rd2719; | |
mul.lo.s64 %rd2254, %rd2253, %rd2703; | |
and.b64 %rd2255, %rd2254, 4294967295; | |
xor.b64 %rd2256, %rd2255, %rd2249; | |
xor.b64 %rd2257, %rd2256, %rd2720; | |
mul.lo.s64 %rd2258, %rd2257, %rd2705; | |
shr.u64 %rd2259, %rd2258, 32; | |
cvt.u32.u64 %r250, %rd2259; | |
shr.u64 %rd2260, %rd2254, 32; | |
xor.b64 %rd2261, %rd2260, %rd2228; | |
cvt.u32.u64 %r251, %rd2261; | |
xor.b32 %r252, %r345, %r251; | |
mul.lo.s32 %r253, %r252, %r346; | |
xor.b32 %r254, %r253, %r250; | |
xor.b32 %r255, %r254, %r347; | |
shr.u32 %r256, %r255, 9; | |
cvt.rn.f32.u32 %f205, %r256; | |
mul.rn.f32 %f206, %f205, 0f34000000; | |
cvt.rn.f16.f32 %h127, %f206; | |
mov.b16 %h128, 0x2E66; | |
setp.ge.f16 %p60, %h127, %h128; | |
ld.global.nc.b16 %h129, [%rd44+1792]; | |
ld.global.nc.f32 %f207, [%rd45+3584]; | |
cvt.rn.f16.f32 %h130, %f207; | |
add.rn.f16 %h131, %h129, %h130; | |
mov.b16 %h132, 0x3C72; | |
mul.rn.f16 %h133, %h131, %h132; | |
selp.b16 %h134, %h133, 0x0000, %p60; | |
cvt.f32.f16 %f208, %h134; | |
ld.global.nc.b16 %h135, [%rd46+1792]; | |
cvt.f32.f16 %f209, %h135; | |
ld.global.nc.f32 %f210, [%rd47+3584]; | |
mul.rn.f32 %f211, %f1, %f210; | |
mul.rn.f32 %f212, %f211, %f209; | |
ld.global.nc.f32 %f213, [%rd48+3584]; | |
mul.rn.f32 %f214, %f2, %f211; | |
sub.rn.f32 %f215, %f213, %f214; | |
add.rn.f32 %f216, %f212, %f215; | |
add.rn.f32 %f217, %f216, %f208; | |
add.rn.f32 %f17, %f16, %f217; | |
or.b32 %r257, %r3, 897; | |
or.b32 %r258, %r257, %r4; | |
and.b32 %r259, %r257, 3; | |
shr.u32 %r260, %r258, 2; | |
setp.ne.s32 %p61, %r259, 1; | |
cvt.u64.u32 %rd2262, %r260; | |
add.s64 %rd433, %rd11, %rd2262; | |
@%p61 bra LBB54_49; | |
mov.u32 %r349, -845247145; | |
mov.u64 %rd2737, 1401181199; | |
mov.u64 %rd2726, 3144134277; | |
mov.u32 %r348, -616729560; | |
and.b64 %rd2302, %rd433, 4294967295; | |
mul.lo.s64 %rd2725, %rd2302, 3528531795; | |
setp.lt.u64 %p63, %rd433, %rd11; | |
selp.u64 %rd2303, 1, 0, %p63; | |
add.s64 %rd2304, %rd2461, %rd2303; | |
xor.b64 %rd2305, %rd2304, %rd2725; | |
shr.u64 %rd2306, %rd2305, 32; | |
mul.lo.s64 %rd2728, %rd2306, 3449720151; | |
shr.u64 %rd2307, %rd2728, 32; | |
and.b64 %rd2308, %rd2304, 4294967295; | |
mul.lo.s64 %rd2309, %rd2308, 3449720151; | |
and.b64 %rd2310, %rd2309, 4294967295; | |
xor.b64 %rd2311, %rd2310, %rd2307; | |
xor.b64 %rd2312, %rd2311, 2654435769; | |
mul.lo.s64 %rd2731, %rd2312, 3528531795; | |
xor.b64 %rd2721, %rd2309, %rd433; | |
mov.u64 %rd2738, 3041712726; | |
mov.u64 %rd2736, 2835769497; | |
mov.u64 %rd2735, 1684936478; | |
mov.u64 %rd2734, 2027808484; | |
mov.u64 %rd2733, 387276957; | |
mov.u64 %rd2732, 842468239; | |
mov.u64 %rd2730, 3986602516; | |
mov.u64 %rd2729, 1013904242; | |
mov.u64 %rd2727, 3668340011; | |
mov.u64 %rd2724, 3449720151; | |
mov.u64 %rd2723, 1993301258; | |
mov.u64 %rd2722, 3528531795; | |
bra.uni LBB54_50; | |
LBB54_49: | |
setp.lt.u64 %p62, %rd433, %rd11; | |
selp.u64 %rd2277, 1, 0, %p62; | |
add.s64 %rd2278, %rd2461, %rd2277; | |
and.b64 %rd2279, %rd2278, 4294967295; | |
mul.lo.s64 %rd2725, %rd2279, 3449720151; | |
xor.b64 %rd2280, %rd2725, %rd433; | |
shr.u64 %rd2281, %rd2280, 32; | |
mul.lo.s64 %rd2728, %rd2281, 3528531795; | |
shr.u64 %rd2282, %rd2728, 32; | |
and.b64 %rd2283, %rd433, 4294967295; | |
mul.lo.s64 %rd2284, %rd2283, 3528531795; | |
and.b64 %rd2285, %rd2284, 4294967295; | |
xor.b64 %rd2286, %rd2285, %rd2282; | |
xor.b64 %rd2287, %rd2286, 3144134277; | |
mul.lo.s64 %rd2731, %rd2287, 3449720151; | |
xor.b64 %rd2721, %rd2278, %rd2284; | |
mov.u32 %r349, -766435501; | |
mov.u32 %r348, -239350328; | |
mov.u64 %rd2738, 1684936478; | |
mov.u64 %rd2737, 534103459; | |
mov.u64 %rd2736, 387276957; | |
mov.u64 %rd2735, 3041712726; | |
mov.u64 %rd2734, 3986602516; | |
mov.u64 %rd2733, 2835769497; | |
mov.u64 %rd2732, 3668340011; | |
mov.u64 %rd2730, 2027808484; | |
mov.u64 %rd2729, 1993301258; | |
mov.u64 %rd2727, 842468239; | |
mov.u64 %rd2726, 2654435769; | |
mov.u64 %rd2724, 3528531795; | |
mov.u64 %rd2723, 1013904242; | |
mov.u64 %rd2722, 3449720151; | |
LBB54_50: | |
shr.u64 %rd2313, %rd2731, 32; | |
shr.u64 %rd2314, %rd2721, 32; | |
mul.lo.s64 %rd2315, %rd2314, %rd2722; | |
and.b64 %rd2316, %rd2315, 4294967295; | |
xor.b64 %rd2317, %rd2316, %rd2313; | |
xor.b64 %rd2318, %rd2317, %rd2723; | |
mul.lo.s64 %rd2319, %rd2318, %rd2724; | |
shr.u64 %rd2320, %rd2319, 32; | |
shr.u64 %rd2321, %rd2315, 32; | |
and.b64 %rd2322, %rd2725, 4294967295; | |
xor.b64 %rd2323, %rd2322, %rd2321; | |
xor.b64 %rd2324, %rd2323, %rd2726; | |
mul.lo.s64 %rd2325, %rd2324, %rd2724; | |
and.b64 %rd2326, %rd2325, 4294967295; | |
xor.b64 %rd2327, %rd2326, %rd2320; | |
xor.b64 %rd2328, %rd2327, %rd2727; | |
mul.lo.s64 %rd2329, %rd2328, %rd2722; | |
shr.u64 %rd2330, %rd2329, 32; | |
shr.u64 %rd2331, %rd2325, 32; | |
and.b64 %rd2332, %rd2728, 4294967295; | |
xor.b64 %rd2333, %rd2332, %rd2331; | |
xor.b64 %rd2334, %rd2333, %rd2729; | |
mul.lo.s64 %rd2335, %rd2334, %rd2722; | |
and.b64 %rd2336, %rd2335, 4294967295; | |
xor.b64 %rd2337, %rd2336, %rd2330; | |
xor.b64 %rd2338, %rd2337, %rd2730; | |
mul.lo.s64 %rd2339, %rd2338, %rd2724; | |
shr.u64 %rd2340, %rd2339, 32; | |
shr.u64 %rd2341, %rd2335, 32; | |
and.b64 %rd2342, %rd2731, 4294967295; | |
xor.b64 %rd2343, %rd2342, %rd2341; | |
xor.b64 %rd2344, %rd2343, %rd2732; | |
mul.lo.s64 %rd2345, %rd2344, %rd2724; | |
and.b64 %rd2346, %rd2345, 4294967295; | |
xor.b64 %rd2347, %rd2346, %rd2340; | |
xor.b64 %rd2348, %rd2347, %rd2733; | |
mul.lo.s64 %rd2349, %rd2348, %rd2722; | |
shr.u64 %rd2350, %rd2349, 32; | |
shr.u64 %rd2351, %rd2345, 32; | |
and.b64 %rd2352, %rd2319, 4294967295; | |
xor.b64 %rd2353, %rd2352, %rd2351; | |
xor.b64 %rd2354, %rd2353, %rd2734; | |
mul.lo.s64 %rd2355, %rd2354, %rd2722; | |
and.b64 %rd2356, %rd2355, 4294967295; | |
xor.b64 %rd2357, %rd2356, %rd2350; | |
xor.b64 %rd2358, %rd2357, %rd2735; | |
mul.lo.s64 %rd2359, %rd2358, %rd2724; | |
shr.u64 %rd2360, %rd2359, 32; | |
shr.u64 %rd2361, %rd2355, 32; | |
and.b64 %rd2362, %rd2329, 4294967295; | |
xor.b64 %rd2363, %rd2362, %rd2361; | |
xor.b64 %rd2364, %rd2363, %rd2736; | |
mul.lo.s64 %rd2365, %rd2364, %rd2724; | |
and.b64 %rd2366, %rd2365, 4294967295; | |
xor.b64 %rd2367, %rd2366, %rd2360; | |
xor.b64 %rd2368, %rd2367, %rd2737; | |
mul.lo.s64 %rd2369, %rd2368, %rd2722; | |
shr.u64 %rd2370, %rd2369, 32; | |
shr.u64 %rd2371, %rd2365, 32; | |
xor.b64 %rd2372, %rd2339, %rd2371; | |
xor.b64 %rd2373, %rd2372, %rd2738; | |
mul.lo.s64 %rd2374, %rd2373, %rd2722; | |
xor.b64 %rd2375, %rd2370, %rd2374; | |
cvt.u32.u64 %r265, %rd2375; | |
xor.b32 %r266, %r348, %r265; | |
mul.lo.s32 %r267, %r266, %r349; | |
shr.u32 %r268, %r267, 9; | |
cvt.rn.f32.u32 %f218, %r268; | |
mul.rn.f32 %f219, %f218, 0f34000000; | |
cvt.rn.f16.f32 %h136, %f219; | |
mov.b16 %h137, 0x2E66; | |
setp.ge.f16 %p64, %h136, %h137; | |
ld.global.nc.b16 %h138, [%rd44+1794]; | |
ld.global.nc.f32 %f220, [%rd45+3588]; | |
cvt.rn.f16.f32 %h139, %f220; | |
add.rn.f16 %h140, %h138, %h139; | |
mov.b16 %h141, 0x3C72; | |
mul.rn.f16 %h142, %h140, %h141; | |
selp.b16 %h143, %h142, 0x0000, %p64; | |
cvt.f32.f16 %f221, %h143; | |
ld.global.nc.b16 %h144, [%rd46+1794]; | |
cvt.f32.f16 %f222, %h144; | |
ld.global.nc.f32 %f223, [%rd47+3588]; | |
mul.rn.f32 %f224, %f1, %f223; | |
mul.rn.f32 %f225, %f224, %f222; | |
ld.global.nc.f32 %f226, [%rd48+3588]; | |
mul.rn.f32 %f227, %f2, %f224; | |
sub.rn.f32 %f228, %f226, %f227; | |
add.rn.f32 %f229, %f225, %f228; | |
add.rn.f32 %f230, %f229, %f221; | |
add.rn.f32 %f231, %f17, %f230; | |
and.b32 %r46, %r1, 31; | |
shfl.sync.down.b32 %f232, %f231, 16, 31, -1; | |
add.rn.f32 %f233, %f232, %f231; | |
shfl.sync.down.b32 %f234, %f233, 8, 31, -1; | |
add.rn.f32 %f235, %f234, %f233; | |
shfl.sync.down.b32 %f236, %f235, 4, 31, -1; | |
add.rn.f32 %f237, %f236, %f235; | |
shfl.sync.down.b32 %f238, %f237, 2, 31, -1; | |
add.rn.f32 %f239, %f238, %f237; | |
shfl.sync.down.b32 %f240, %f239, 1, 31, -1; | |
shr.u32 %r47, %r1, 5; | |
setp.ne.s32 %p65, %r46, 0; | |
mov.u64 %rd2378, shared_cache_012; | |
@%p65 bra LBB54_2; | |
mul.wide.u32 %rd2377, %r47, 4; | |
add.s64 %rd461, %rd2378, %rd2377; | |
add.rn.f32 %f18, %f240, %f239; | |
st.shared.f32 [%rd461], %f18; | |
LBB54_2: | |
bar.sync 0; | |
setp.eq.s32 %p66, %r47, 0; | |
@%p66 bra LBB54_52; | |
bra.uni LBB54_3; | |
LBB54_52: | |
add.u64 %rd472, %SP, 0; | |
add.u64 %rd10, %SPL, 0; | |
mul.wide.u32 %rd2379, %r46, 4; | |
add.s64 %rd462, %rd2378, %rd2379; | |
cvta.shared.u64 %rd2381, %rd462; | |
mov.u32 %r269, 0; | |
st.local.u32 [%rd10], %r269; | |
setp.lt.u32 %p67, %r1, 2; | |
selp.b64 %rd2383, %rd2381, %rd472, %p67; | |
ld.f32 %f241, [%rd2383]; | |
shfl.sync.down.b32 %f242, %f241, 16, 31, -1; | |
add.rn.f32 %f243, %f241, %f242; | |
shfl.sync.down.b32 %f244, %f243, 8, 31, -1; | |
add.rn.f32 %f245, %f243, %f244; | |
shfl.sync.down.b32 %f246, %f245, 4, 31, -1; | |
add.rn.f32 %f247, %f245, %f246; | |
shfl.sync.down.b32 %f248, %f247, 2, 31, -1; | |
add.rn.f32 %f249, %f247, %f248; | |
shfl.sync.down.b32 %f250, %f249, 1, 31, -1; | |
add.rn.f32 %f251, %f249, %f250; | |
st.f32 [%rd2383], %f251; | |
setp.ne.s32 %p68, %r1, 0; | |
@%p68 bra LBB54_3; | |
ld.param.u64 %rd469, [fusion_2214_param_3]; | |
cvt.u64.u32 %rd43, %r2; | |
cvta.to.global.u64 %rd6, %rd469; | |
shl.b64 %rd2376, %rd43, 2; | |
add.s64 %rd460, %rd6, %rd2376; | |
ld.shared.f32 %f252, [%rd462]; | |
atom.global.add.f32 %f253, [%rd460], %f252; | |
LBB54_3: | |
ret; | |
} | |
// .globl fusion_2212 | |
.visible .entry fusion_2212( | |
.param .u64 fusion_2212_param_0, | |
.param .u64 fusion_2212_param_1, | |
.param .u64 fusion_2212_param_2, | |
.param .u64 fusion_2212_param_3, | |
.param .u64 fusion_2212_param_4, | |
.param .u64 fusion_2212_param_5, | |
.param .u64 fusion_2212_param_6, | |
.param .u64 fusion_2212_param_7, | |
.param .u64 fusion_2212_param_8, | |
.param .u64 fusion_2212_param_9, | |
.param .u64 fusion_2212_param_10 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
.local .align 4 .b8 __local_depot55[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<75>; | |
.reg .b16 %h<145>; | |
.reg .f32 %f<288>; | |
.reg .b32 %r<350>; | |
.reg .b64 %rd<2742>; | |
mov.u64 %SPL, __local_depot55; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd464, [fusion_2212_param_0]; | |
ld.param.u64 %rd465, [fusion_2212_param_9]; | |
cvta.to.global.u64 %rd1, %rd465; | |
ld.param.u64 %rd466, [fusion_2212_param_1]; | |
ld.param.u64 %rd467, [fusion_2212_param_8]; | |
cvta.to.global.u64 %rd2, %rd467; | |
ld.param.u64 %rd468, [fusion_2212_param_2]; | |
ld.param.u64 %rd469, [fusion_2212_param_7]; | |
cvta.to.global.u64 %rd3, %rd469; | |
ld.param.u64 %rd471, [fusion_2212_param_6]; | |
cvta.to.global.u64 %rd4, %rd471; | |
ld.param.u64 %rd472, [fusion_2212_param_4]; | |
ld.param.u64 %rd473, [fusion_2212_param_5]; | |
cvta.to.global.u64 %rd5, %rd473; | |
cvta.to.global.u64 %rd6, %rd472; | |
cvta.to.global.u64 %rd8, %rd468; | |
cvta.to.global.u64 %rd9, %rd466; | |
cvta.to.global.u64 %rd10, %rd464; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r2, %ctaid.x; | |
shl.b32 %r3, %r1, 1; | |
shl.b32 %r4, %r2, 10; | |
or.b32 %r48, %r4, %r3; | |
shr.u32 %r49, %r48, 2; | |
and.b32 %r5, %r1, 1; | |
setp.eq.s32 %p1, %r5, 0; | |
ld.global.nc.u64 %rd12, [%rd8]; | |
cvt.u64.u32 %rd475, %r49; | |
add.s64 %rd13, %rd12, %rd475; | |
setp.lt.u64 %p69, %rd13, %rd12; | |
and.b64 %rd2387, %rd13, 4294967295; | |
@%p1 bra LBB55_1; | |
bra.uni LBB55_4; | |
LBB55_1: | |
mul.lo.s64 %rd2449, %rd2387, 3528531795; | |
ld.global.nc.u64 %rd2464, [%rd8+8]; | |
selp.u64 %rd518, 1, 0, %p69; | |
add.s64 %rd519, %rd2464, %rd518; | |
xor.b64 %rd520, %rd519, %rd2449; | |
shr.u64 %rd521, %rd520, 32; | |
mul.lo.s64 %rd2452, %rd521, 3449720151; | |
shr.u64 %rd522, %rd2452, 32; | |
and.b64 %rd523, %rd519, 4294967295; | |
mul.lo.s64 %rd524, %rd523, 3449720151; | |
and.b64 %rd525, %rd524, 4294967295; | |
xor.b64 %rd526, %rd525, %rd522; | |
xor.b64 %rd527, %rd526, 2654435769; | |
mul.lo.s64 %rd2455, %rd527, 3528531795; | |
xor.b64 %rd2445, %rd524, %rd13; | |
mov.u32 %r312, -1879881855; | |
mov.u32 %r311, -845247145; | |
mov.u32 %r310, 534103459; | |
mov.u64 %rd2463, 3678237736; | |
mov.u64 %rd2462, 3041712726; | |
mov.u64 %rd2461, 1401181199; | |
mov.u64 %rd2460, 2835769497; | |
mov.u64 %rd2459, 1684936478; | |
mov.u64 %rd2458, 2027808484; | |
mov.u64 %rd2457, 387276957; | |
mov.u64 %rd2456, 842468239; | |
mov.u64 %rd2454, 3986602516; | |
mov.u64 %rd2453, 1013904242; | |
mov.u64 %rd2451, 3668340011; | |
mov.u64 %rd2450, 3144134277; | |
mov.u64 %rd2448, 3449720151; | |
mov.u64 %rd2447, 1993301258; | |
mov.u64 %rd2446, 3528531795; | |
bra.uni LBB55_5; | |
LBB55_4: | |
mov.u32 %r311, -766435501; | |
mov.u64 %rd2462, 1684936478; | |
mov.u64 %rd2461, 534103459; | |
mov.u64 %rd2460, 387276957; | |
mov.u64 %rd2459, 3041712726; | |
mov.u64 %rd2458, 3986602516; | |
mov.u64 %rd2457, 2835769497; | |
mov.u64 %rd2456, 3668340011; | |
mov.u64 %rd2454, 2027808484; | |
mov.u64 %rd2453, 1993301258; | |
mov.u64 %rd2451, 842468239; | |
mov.u64 %rd2450, 2654435769; | |
mov.u64 %rd2448, 3528531795; | |
mov.u64 %rd2447, 1013904242; | |
mov.u64 %rd2446, 3449720151; | |
mov.u32 %r312, -1767562579; | |
mov.u32 %r310, 1401181199; | |
mov.u64 %rd2463, 4055616968; | |
ld.global.nc.u64 %rd2464, [%rd8+8]; | |
selp.u64 %rd491, 1, 0, %p69; | |
add.s64 %rd492, %rd2464, %rd491; | |
and.b64 %rd493, %rd492, 4294967295; | |
mul.lo.s64 %rd2449, %rd493, 3449720151; | |
xor.b64 %rd494, %rd2449, %rd13; | |
shr.u64 %rd495, %rd494, 32; | |
mul.lo.s64 %rd2452, %rd495, 3528531795; | |
shr.u64 %rd496, %rd2452, 32; | |
mul.lo.s64 %rd498, %rd2387, 3528531795; | |
and.b64 %rd499, %rd498, 4294967295; | |
xor.b64 %rd500, %rd499, %rd496; | |
xor.b64 %rd501, %rd500, 3144134277; | |
mul.lo.s64 %rd2455, %rd501, 3449720151; | |
xor.b64 %rd2445, %rd492, %rd498; | |
LBB55_5: | |
shr.u64 %rd528, %rd2455, 32; | |
shr.u64 %rd529, %rd2445, 32; | |
mul.lo.s64 %rd530, %rd529, %rd2446; | |
and.b64 %rd531, %rd530, 4294967295; | |
xor.b64 %rd532, %rd531, %rd528; | |
xor.b64 %rd533, %rd532, %rd2447; | |
mul.lo.s64 %rd534, %rd533, %rd2448; | |
shr.u64 %rd535, %rd534, 32; | |
shr.u64 %rd536, %rd530, 32; | |
and.b64 %rd537, %rd2449, 4294967295; | |
xor.b64 %rd538, %rd537, %rd536; | |
xor.b64 %rd539, %rd538, %rd2450; | |
mul.lo.s64 %rd540, %rd539, %rd2448; | |
and.b64 %rd541, %rd540, 4294967295; | |
xor.b64 %rd542, %rd541, %rd535; | |
xor.b64 %rd543, %rd542, %rd2451; | |
mul.lo.s64 %rd544, %rd543, %rd2446; | |
shr.u64 %rd545, %rd544, 32; | |
shr.u64 %rd546, %rd540, 32; | |
and.b64 %rd547, %rd2452, 4294967295; | |
xor.b64 %rd548, %rd547, %rd546; | |
xor.b64 %rd549, %rd548, %rd2453; | |
mul.lo.s64 %rd550, %rd549, %rd2446; | |
and.b64 %rd551, %rd550, 4294967295; | |
xor.b64 %rd552, %rd551, %rd545; | |
xor.b64 %rd553, %rd552, %rd2454; | |
mul.lo.s64 %rd554, %rd553, %rd2448; | |
shr.u64 %rd555, %rd554, 32; | |
shr.u64 %rd556, %rd550, 32; | |
and.b64 %rd557, %rd2455, 4294967295; | |
xor.b64 %rd558, %rd557, %rd556; | |
xor.b64 %rd559, %rd558, %rd2456; | |
mul.lo.s64 %rd560, %rd559, %rd2448; | |
and.b64 %rd561, %rd560, 4294967295; | |
xor.b64 %rd562, %rd561, %rd555; | |
xor.b64 %rd563, %rd562, %rd2457; | |
mul.lo.s64 %rd564, %rd563, %rd2446; | |
shr.u64 %rd565, %rd564, 32; | |
shr.u64 %rd566, %rd560, 32; | |
and.b64 %rd567, %rd534, 4294967295; | |
xor.b64 %rd568, %rd567, %rd566; | |
xor.b64 %rd569, %rd568, %rd2458; | |
mul.lo.s64 %rd570, %rd569, %rd2446; | |
and.b64 %rd571, %rd570, 4294967295; | |
xor.b64 %rd572, %rd571, %rd565; | |
xor.b64 %rd573, %rd572, %rd2459; | |
mul.lo.s64 %rd574, %rd573, %rd2448; | |
shr.u64 %rd575, %rd574, 32; | |
shr.u64 %rd576, %rd570, 32; | |
and.b64 %rd577, %rd544, 4294967295; | |
xor.b64 %rd578, %rd577, %rd576; | |
xor.b64 %rd579, %rd578, %rd2460; | |
mul.lo.s64 %rd580, %rd579, %rd2448; | |
and.b64 %rd581, %rd580, 4294967295; | |
xor.b64 %rd582, %rd581, %rd575; | |
xor.b64 %rd583, %rd582, %rd2461; | |
mul.lo.s64 %rd584, %rd583, %rd2446; | |
shr.u64 %rd585, %rd584, 32; | |
shr.u64 %rd586, %rd580, 32; | |
and.b64 %rd587, %rd554, 4294967295; | |
xor.b64 %rd588, %rd587, %rd586; | |
xor.b64 %rd589, %rd588, %rd2462; | |
mul.lo.s64 %rd590, %rd589, %rd2446; | |
and.b64 %rd591, %rd590, 4294967295; | |
xor.b64 %rd592, %rd591, %rd585; | |
xor.b64 %rd593, %rd592, %rd2463; | |
mul.lo.s64 %rd594, %rd593, %rd2448; | |
shr.u64 %rd595, %rd594, 32; | |
cvt.u32.u64 %r56, %rd595; | |
shr.u64 %rd596, %rd590, 32; | |
xor.b64 %rd597, %rd596, %rd564; | |
cvt.u32.u64 %r57, %rd597; | |
xor.b32 %r58, %r310, %r57; | |
mul.lo.s32 %r59, %r58, %r311; | |
xor.b32 %r60, %r59, %r56; | |
xor.b32 %r61, %r60, %r312; | |
shr.u32 %r62, %r61, 9; | |
cvt.rn.f32.u32 %f20, %r62; | |
mul.rn.f32 %f21, %f20, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f21; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p4, %h1, %h2; | |
mul.wide.u32 %rd598, %r2, 2048; | |
add.s64 %rd599, %rd10, %rd598; | |
mul.wide.u32 %rd600, %r3, 2; | |
add.s64 %rd45, %rd599, %rd600; | |
ld.global.nc.b16 %h3, [%rd45]; | |
mul.wide.u32 %rd601, %r3, 4; | |
add.s64 %rd46, %rd1, %rd601; | |
ld.global.nc.f32 %f22, [%rd46]; | |
cvt.rn.f16.f32 %h4, %f22; | |
add.rn.f16 %h5, %h3, %h4; | |
mov.b16 %h6, 0x3C72; | |
mul.rn.f16 %h7, %h5, %h6; | |
selp.b16 %h8, %h7, 0x0000, %p4; | |
cvt.f32.f16 %f23, %h8; | |
add.s64 %rd602, %rd9, %rd598; | |
add.s64 %rd47, %rd602, %rd600; | |
ld.global.nc.b16 %h9, [%rd47]; | |
cvt.f32.f16 %f24, %h9; | |
mul.wide.u32 %rd603, %r2, 4; | |
add.s64 %rd604, %rd5, %rd603; | |
ld.global.nc.f32 %f25, [%rd604]; | |
mul.rn.f32 %f26, %f25, 0f3A800000; | |
add.rn.f32 %f27, %f26, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f1, %f27; | |
add.s64 %rd48, %rd2, %rd601; | |
ld.global.nc.f32 %f28, [%rd48]; | |
mul.rn.f32 %f29, %f1, %f28; | |
mul.rn.f32 %f30, %f29, %f24; | |
add.s64 %rd49, %rd3, %rd601; | |
ld.global.nc.f32 %f31, [%rd49]; | |
add.s64 %rd605, %rd4, %rd603; | |
ld.global.nc.f32 %f32, [%rd605]; | |
mul.rn.f32 %f2, %f32, 0f3A800000; | |
mul.rn.f32 %f33, %f29, %f2; | |
sub.rn.f32 %f34, %f31, %f33; | |
add.rn.f32 %f35, %f30, %f34; | |
add.rn.f32 %f36, %f35, %f23; | |
add.s64 %rd606, %rd6, %rd603; | |
ld.global.nc.f32 %f37, [%rd606]; | |
mul.rn.f32 %f3, %f37, 0f3A800000; | |
sub.rn.f32 %f38, %f36, %f3; | |
mul.rn.f32 %f39, %f38, %f38; | |
add.rn.f32 %f4, %f39, 0f00000000; | |
or.b32 %r63, %r3, 1; | |
and.b32 %r64, %r63, 3; | |
setp.ne.s32 %p5, %r64, 1; | |
@%p5 bra LBB55_7; | |
mul.lo.s64 %rd2469, %rd2387, 3528531795; | |
selp.u64 %rd647, 1, 0, %p69; | |
add.s64 %rd648, %rd2464, %rd647; | |
xor.b64 %rd649, %rd648, %rd2469; | |
shr.u64 %rd650, %rd649, 32; | |
mul.lo.s64 %rd2472, %rd650, 3449720151; | |
shr.u64 %rd651, %rd2472, 32; | |
and.b64 %rd652, %rd648, 4294967295; | |
mul.lo.s64 %rd653, %rd652, 3449720151; | |
and.b64 %rd654, %rd653, 4294967295; | |
xor.b64 %rd655, %rd654, %rd651; | |
xor.b64 %rd656, %rd655, 2654435769; | |
mul.lo.s64 %rd2475, %rd656, 3528531795; | |
xor.b64 %rd2465, %rd653, %rd13; | |
mov.u32 %r314, -845247145; | |
mov.u32 %r313, -616729560; | |
mov.u64 %rd2482, 3041712726; | |
mov.u64 %rd2481, 1401181199; | |
mov.u64 %rd2480, 2835769497; | |
mov.u64 %rd2479, 1684936478; | |
mov.u64 %rd2478, 2027808484; | |
mov.u64 %rd2477, 387276957; | |
mov.u64 %rd2476, 842468239; | |
mov.u64 %rd2474, 3986602516; | |
mov.u64 %rd2473, 1013904242; | |
mov.u64 %rd2471, 3668340011; | |
mov.u64 %rd2470, 3144134277; | |
mov.u64 %rd2468, 3449720151; | |
mov.u64 %rd2467, 1993301258; | |
mov.u64 %rd2466, 3528531795; | |
bra.uni LBB55_8; | |
LBB55_7: | |
mov.u32 %r313, -239350328; | |
selp.u64 %rd621, 1, 0, %p69; | |
add.s64 %rd622, %rd2464, %rd621; | |
and.b64 %rd623, %rd622, 4294967295; | |
mul.lo.s64 %rd2469, %rd623, 3449720151; | |
xor.b64 %rd624, %rd2469, %rd13; | |
shr.u64 %rd625, %rd624, 32; | |
mul.lo.s64 %rd2472, %rd625, 3528531795; | |
shr.u64 %rd626, %rd2472, 32; | |
mul.lo.s64 %rd628, %rd2387, 3528531795; | |
and.b64 %rd629, %rd628, 4294967295; | |
xor.b64 %rd630, %rd629, %rd626; | |
xor.b64 %rd631, %rd630, 3144134277; | |
mul.lo.s64 %rd2475, %rd631, 3449720151; | |
xor.b64 %rd2465, %rd622, %rd628; | |
mov.u32 %r314, -766435501; | |
mov.u64 %rd2482, 1684936478; | |
mov.u64 %rd2481, 534103459; | |
mov.u64 %rd2480, 387276957; | |
mov.u64 %rd2479, 3041712726; | |
mov.u64 %rd2478, 3986602516; | |
mov.u64 %rd2477, 2835769497; | |
mov.u64 %rd2476, 3668340011; | |
mov.u64 %rd2474, 2027808484; | |
mov.u64 %rd2473, 1993301258; | |
mov.u64 %rd2471, 842468239; | |
mov.u64 %rd2470, 2654435769; | |
mov.u64 %rd2468, 3528531795; | |
mov.u64 %rd2467, 1013904242; | |
mov.u64 %rd2466, 3449720151; | |
LBB55_8: | |
setp.ne.s32 %p8, %r5, 0; | |
shr.u64 %rd657, %rd2475, 32; | |
shr.u64 %rd658, %rd2465, 32; | |
mul.lo.s64 %rd659, %rd658, %rd2466; | |
and.b64 %rd660, %rd659, 4294967295; | |
xor.b64 %rd661, %rd660, %rd657; | |
xor.b64 %rd662, %rd661, %rd2467; | |
mul.lo.s64 %rd663, %rd662, %rd2468; | |
shr.u64 %rd664, %rd663, 32; | |
shr.u64 %rd665, %rd659, 32; | |
and.b64 %rd666, %rd2469, 4294967295; | |
xor.b64 %rd667, %rd666, %rd665; | |
xor.b64 %rd668, %rd667, %rd2470; | |
mul.lo.s64 %rd669, %rd668, %rd2468; | |
and.b64 %rd670, %rd669, 4294967295; | |
xor.b64 %rd671, %rd670, %rd664; | |
xor.b64 %rd672, %rd671, %rd2471; | |
mul.lo.s64 %rd673, %rd672, %rd2466; | |
shr.u64 %rd674, %rd673, 32; | |
shr.u64 %rd675, %rd669, 32; | |
and.b64 %rd676, %rd2472, 4294967295; | |
xor.b64 %rd677, %rd676, %rd675; | |
xor.b64 %rd678, %rd677, %rd2473; | |
mul.lo.s64 %rd679, %rd678, %rd2466; | |
and.b64 %rd680, %rd679, 4294967295; | |
xor.b64 %rd681, %rd680, %rd674; | |
xor.b64 %rd682, %rd681, %rd2474; | |
mul.lo.s64 %rd683, %rd682, %rd2468; | |
shr.u64 %rd684, %rd683, 32; | |
shr.u64 %rd685, %rd679, 32; | |
and.b64 %rd686, %rd2475, 4294967295; | |
xor.b64 %rd687, %rd686, %rd685; | |
xor.b64 %rd688, %rd687, %rd2476; | |
mul.lo.s64 %rd689, %rd688, %rd2468; | |
and.b64 %rd690, %rd689, 4294967295; | |
xor.b64 %rd691, %rd690, %rd684; | |
xor.b64 %rd692, %rd691, %rd2477; | |
mul.lo.s64 %rd693, %rd692, %rd2466; | |
shr.u64 %rd694, %rd693, 32; | |
shr.u64 %rd695, %rd689, 32; | |
and.b64 %rd696, %rd663, 4294967295; | |
xor.b64 %rd697, %rd696, %rd695; | |
xor.b64 %rd698, %rd697, %rd2478; | |
mul.lo.s64 %rd699, %rd698, %rd2466; | |
and.b64 %rd700, %rd699, 4294967295; | |
xor.b64 %rd701, %rd700, %rd694; | |
xor.b64 %rd702, %rd701, %rd2479; | |
mul.lo.s64 %rd703, %rd702, %rd2468; | |
shr.u64 %rd704, %rd703, 32; | |
shr.u64 %rd705, %rd699, 32; | |
and.b64 %rd706, %rd673, 4294967295; | |
xor.b64 %rd707, %rd706, %rd705; | |
xor.b64 %rd708, %rd707, %rd2480; | |
mul.lo.s64 %rd709, %rd708, %rd2468; | |
and.b64 %rd710, %rd709, 4294967295; | |
xor.b64 %rd711, %rd710, %rd704; | |
xor.b64 %rd712, %rd711, %rd2481; | |
mul.lo.s64 %rd713, %rd712, %rd2466; | |
shr.u64 %rd714, %rd713, 32; | |
shr.u64 %rd715, %rd709, 32; | |
xor.b64 %rd716, %rd683, %rd715; | |
xor.b64 %rd717, %rd716, %rd2482; | |
mul.lo.s64 %rd718, %rd717, %rd2466; | |
xor.b64 %rd719, %rd714, %rd718; | |
cvt.u32.u64 %r69, %rd719; | |
xor.b32 %r70, %r313, %r69; | |
mul.lo.s32 %r71, %r70, %r314; | |
shr.u32 %r72, %r71, 9; | |
cvt.rn.f32.u32 %f40, %r72; | |
mul.rn.f32 %f41, %f40, 0f34000000; | |
cvt.rn.f16.f32 %h10, %f41; | |
mov.b16 %h11, 0x2E66; | |
setp.ge.f16 %p9, %h10, %h11; | |
ld.global.nc.b16 %h12, [%rd45+2]; | |
ld.global.nc.f32 %f42, [%rd46+4]; | |
cvt.rn.f16.f32 %h13, %f42; | |
add.rn.f16 %h14, %h12, %h13; | |
mov.b16 %h15, 0x3C72; | |
mul.rn.f16 %h16, %h14, %h15; | |
selp.b16 %h17, %h16, 0x0000, %p9; | |
cvt.f32.f16 %f43, %h17; | |
ld.global.nc.b16 %h18, [%rd47+2]; | |
cvt.f32.f16 %f44, %h18; | |
ld.global.nc.f32 %f45, [%rd48+4]; | |
mul.rn.f32 %f46, %f1, %f45; | |
mul.rn.f32 %f47, %f46, %f44; | |
ld.global.nc.f32 %f48, [%rd49+4]; | |
mul.rn.f32 %f49, %f2, %f46; | |
sub.rn.f32 %f50, %f48, %f49; | |
add.rn.f32 %f51, %f47, %f50; | |
add.rn.f32 %f52, %f51, %f43; | |
sub.rn.f32 %f53, %f52, %f3; | |
mul.rn.f32 %f54, %f53, %f53; | |
add.rn.f32 %f5, %f4, %f54; | |
or.b32 %r73, %r3, %r4; | |
or.b32 %r74, %r73, 128; | |
shr.u32 %r75, %r74, 2; | |
cvt.u64.u32 %rd720, %r75; | |
add.s64 %rd76, %rd12, %rd720; | |
and.b64 %rd2436, %rd76, 4294967295; | |
setp.lt.u64 %p74, %rd76, %rd12; | |
@%p8 bra LBB55_10; | |
// Two alternative setups, selected by %p8 (tested by the branch just above),
// that feed the round chain at LBB55_11. Both arms write the same register
// set: %rd2483..%rd2501 and %r315..%r317, with different values.
// Inputs computed before this point: %rd76 = %rd12 + element offset,
// %rd2436 = %rd76 & 0xFFFFFFFF, %p74 = (%rd76 < %rd12, unsigned), i.e. the
// 64-bit add wrapped; %p74 is added below as a carry into %rd2464.
// 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) equal the published
// Philox4x32 multipliers; 2654435769 (0x9E3779B9) and 3144134277 (0xBB67AE85)
// equal its key increments.
// NOTE(review): presumably an inlined Philox generator -- confirm with producer.
//
// Arm executed when %p8 is false.
mul.lo.s64 %rd2487, %rd2436, 3528531795; | |
selp.u64 %rd763, 1, 0, %p74; | |
add.s64 %rd764, %rd2464, %rd763; | |
xor.b64 %rd765, %rd764, %rd2487; | |
shr.u64 %rd766, %rd765, 32; | |
mul.lo.s64 %rd2490, %rd766, 3449720151; | |
shr.u64 %rd767, %rd2490, 32; | |
and.b64 %rd768, %rd764, 4294967295; | |
mul.lo.s64 %rd769, %rd768, 3449720151; | |
and.b64 %rd770, %rd769, 4294967295; | |
xor.b64 %rd771, %rd770, %rd767; | |
xor.b64 %rd772, %rd771, 2654435769; | |
mul.lo.s64 %rd2493, %rd772, 3528531795; | |
xor.b64 %rd2483, %rd769, %rd76; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r317, -1879881855; | |
mov.u32 %r316, -845247145; | |
mov.u32 %r315, 534103459; | |
mov.u64 %rd2501, 3678237736; | |
mov.u64 %rd2500, 3041712726; | |
mov.u64 %rd2499, 1401181199; | |
mov.u64 %rd2498, 2835769497; | |
mov.u64 %rd2497, 1684936478; | |
mov.u64 %rd2496, 2027808484; | |
mov.u64 %rd2495, 387276957; | |
mov.u64 %rd2494, 842468239; | |
mov.u64 %rd2492, 3986602516; | |
mov.u64 %rd2491, 1013904242; | |
mov.u64 %rd2489, 3668340011; | |
mov.u64 %rd2488, 3144134277; | |
mov.u64 %rd2486, 3449720151; | |
mov.u64 %rd2485, 1993301258; | |
mov.u64 %rd2484, 3528531795; | |
bra.uni LBB55_11; | |
// Arm executed when %p8 is true. The roles of the two multipliers are swapped
// relative to the arm above; control falls through into LBB55_11.
LBB55_10: | |
selp.u64 %rd736, 1, 0, %p74; | |
add.s64 %rd737, %rd2464, %rd736; | |
and.b64 %rd738, %rd737, 4294967295; | |
mul.lo.s64 %rd2487, %rd738, 3449720151; | |
xor.b64 %rd739, %rd2487, %rd76; | |
shr.u64 %rd740, %rd739, 32; | |
mul.lo.s64 %rd2490, %rd740, 3528531795; | |
shr.u64 %rd741, %rd2490, 32; | |
mul.lo.s64 %rd743, %rd2436, 3528531795; | |
and.b64 %rd744, %rd743, 4294967295; | |
xor.b64 %rd745, %rd744, %rd741; | |
xor.b64 %rd746, %rd745, 3144134277; | |
mul.lo.s64 %rd2493, %rd746, 3449720151; | |
xor.b64 %rd2483, %rd737, %rd743; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r317, -1767562579; | |
mov.u32 %r316, -766435501; | |
mov.u32 %r315, 1401181199; | |
mov.u64 %rd2501, 4055616968; | |
mov.u64 %rd2500, 1684936478; | |
mov.u64 %rd2499, 534103459; | |
mov.u64 %rd2498, 387276957; | |
mov.u64 %rd2497, 3041712726; | |
mov.u64 %rd2496, 3986602516; | |
mov.u64 %rd2495, 2835769497; | |
mov.u64 %rd2494, 3668340011; | |
mov.u64 %rd2492, 2027808484; | |
mov.u64 %rd2491, 1993301258; | |
mov.u64 %rd2489, 842468239; | |
mov.u64 %rd2488, 2654435769; | |
mov.u64 %rd2486, 3528531795; | |
mov.u64 %rd2485, 1013904242; | |
mov.u64 %rd2484, 3449720151; | |
// Join block for the two preceding setup arms. Reads %rd2483..%rd2501 and
// %r315..%r317, derives one 32-bit word, converts it to a float in [0,1),
// applies a threshold-masked scale to loaded data, accumulates a squared
// difference into %f6, then prepares the index for the next element.
LBB55_11: | |
// Round chain: each step takes the low or high 32 bits of earlier products,
// XORs them together with a per-step constant register, and multiplies by
// %rd2484 or %rd2486 (the two multipliers chosen by the setup arm).
shr.u64 %rd773, %rd2493, 32; | |
shr.u64 %rd774, %rd2483, 32; | |
mul.lo.s64 %rd775, %rd774, %rd2484; | |
and.b64 %rd776, %rd775, 4294967295; | |
xor.b64 %rd777, %rd776, %rd773; | |
xor.b64 %rd778, %rd777, %rd2485; | |
mul.lo.s64 %rd779, %rd778, %rd2486; | |
shr.u64 %rd780, %rd779, 32; | |
shr.u64 %rd781, %rd775, 32; | |
and.b64 %rd782, %rd2487, 4294967295; | |
xor.b64 %rd783, %rd782, %rd781; | |
xor.b64 %rd784, %rd783, %rd2488; | |
mul.lo.s64 %rd785, %rd784, %rd2486; | |
and.b64 %rd786, %rd785, 4294967295; | |
xor.b64 %rd787, %rd786, %rd780; | |
xor.b64 %rd788, %rd787, %rd2489; | |
mul.lo.s64 %rd789, %rd788, %rd2484; | |
shr.u64 %rd790, %rd789, 32; | |
shr.u64 %rd791, %rd785, 32; | |
and.b64 %rd792, %rd2490, 4294967295; | |
xor.b64 %rd793, %rd792, %rd791; | |
xor.b64 %rd794, %rd793, %rd2491; | |
mul.lo.s64 %rd795, %rd794, %rd2484; | |
and.b64 %rd796, %rd795, 4294967295; | |
xor.b64 %rd797, %rd796, %rd790; | |
xor.b64 %rd798, %rd797, %rd2492; | |
mul.lo.s64 %rd799, %rd798, %rd2486; | |
shr.u64 %rd800, %rd799, 32; | |
shr.u64 %rd801, %rd795, 32; | |
and.b64 %rd802, %rd2493, 4294967295; | |
xor.b64 %rd803, %rd802, %rd801; | |
xor.b64 %rd804, %rd803, %rd2494; | |
mul.lo.s64 %rd805, %rd804, %rd2486; | |
and.b64 %rd806, %rd805, 4294967295; | |
xor.b64 %rd807, %rd806, %rd800; | |
xor.b64 %rd808, %rd807, %rd2495; | |
mul.lo.s64 %rd809, %rd808, %rd2484; | |
shr.u64 %rd810, %rd809, 32; | |
shr.u64 %rd811, %rd805, 32; | |
and.b64 %rd812, %rd779, 4294967295; | |
xor.b64 %rd813, %rd812, %rd811; | |
xor.b64 %rd814, %rd813, %rd2496; | |
mul.lo.s64 %rd815, %rd814, %rd2484; | |
and.b64 %rd816, %rd815, 4294967295; | |
xor.b64 %rd817, %rd816, %rd810; | |
xor.b64 %rd818, %rd817, %rd2497; | |
mul.lo.s64 %rd819, %rd818, %rd2486; | |
shr.u64 %rd820, %rd819, 32; | |
shr.u64 %rd821, %rd815, 32; | |
and.b64 %rd822, %rd789, 4294967295; | |
xor.b64 %rd823, %rd822, %rd821; | |
xor.b64 %rd824, %rd823, %rd2498; | |
mul.lo.s64 %rd825, %rd824, %rd2486; | |
and.b64 %rd826, %rd825, 4294967295; | |
xor.b64 %rd827, %rd826, %rd820; | |
xor.b64 %rd828, %rd827, %rd2499; | |
mul.lo.s64 %rd829, %rd828, %rd2484; | |
shr.u64 %rd830, %rd829, 32; | |
shr.u64 %rd831, %rd825, 32; | |
and.b64 %rd832, %rd799, 4294967295; | |
xor.b64 %rd833, %rd832, %rd831; | |
xor.b64 %rd834, %rd833, %rd2500; | |
mul.lo.s64 %rd835, %rd834, %rd2484; | |
and.b64 %rd836, %rd835, 4294967295; | |
xor.b64 %rd837, %rd836, %rd830; | |
xor.b64 %rd838, %rd837, %rd2501; | |
mul.lo.s64 %rd839, %rd838, %rd2486; | |
// Fold the last products into a single 32-bit word (%r87).
shr.u64 %rd840, %rd839, 32; | |
cvt.u32.u64 %r82, %rd840; | |
shr.u64 %rd841, %rd835, 32; | |
xor.b64 %rd842, %rd841, %rd809; | |
cvt.u32.u64 %r83, %rd842; | |
xor.b32 %r84, %r315, %r83; | |
mul.lo.s32 %r85, %r84, %r316; | |
xor.b32 %r86, %r85, %r82; | |
xor.b32 %r87, %r86, %r317; | |
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f56 is in [0,1).
shr.u32 %r88, %r87, 9; | |
cvt.rn.f32.u32 %f55, %r88; | |
mul.rn.f32 %f56, %f55, 0f34000000; | |
cvt.rn.f16.f32 %h19, %f56; | |
// 0x2E66 is the f16 bit pattern for ~0.09998; %p12 = (%h19 >= that value).
mov.b16 %h20, 0x2E66; | |
setp.ge.f16 %p12, %h19, %h20; | |
// Element 128 of each input: byte offset 256 for 16-bit data, 512 for f32.
ld.global.nc.b16 %h21, [%rd45+256]; | |
ld.global.nc.f32 %f57, [%rd46+512]; | |
cvt.rn.f16.f32 %h22, %f57; | |
add.rn.f16 %h23, %h21, %h22; | |
// 0x3C72 is the f16 bit pattern for ~1.1113. The scaled sum is kept when %p12
// holds and replaced by zero otherwise.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm.
mov.b16 %h24, 0x3C72; | |
mul.rn.f16 %h25, %h23, %h24; | |
selp.b16 %h26, %h25, 0x0000, %p12; | |
cvt.f32.f16 %f58, %h26; | |
ld.global.nc.b16 %h27, [%rd47+256]; | |
cvt.f32.f16 %f59, %h27; | |
ld.global.nc.f32 %f60, [%rd48+512]; | |
// %f67 = (%f1*%f60)*%f59 + (%f63 - %f2*(%f1*%f60)) + %f58, evaluated in the
// order written; %f1, %f2, %f3 are set outside this block.
mul.rn.f32 %f61, %f1, %f60; | |
mul.rn.f32 %f62, %f61, %f59; | |
ld.global.nc.f32 %f63, [%rd49+512]; | |
mul.rn.f32 %f64, %f2, %f61; | |
sub.rn.f32 %f65, %f63, %f64; | |
add.rn.f32 %f66, %f62, %f65; | |
add.rn.f32 %f67, %f66, %f58; | |
// Running sum of squared differences: %f6 = %f5 + (%f67 - %f3)^2.
sub.rn.f32 %f68, %f67, %f3; | |
mul.rn.f32 %f69, %f68, %f68; | |
add.rn.f32 %f6, %f5, %f69; | |
// Next element (index bits | 129): %rd104 = %rd12 + ((%r3|129|%r4) >> 2),
// %rd2433 = its low 32 bits, %p73 = unsigned wrap of that add.
// %p13 = (((%r3|129) & 3) != 1) selects the setup arm at LBB55_13.
or.b32 %r89, %r3, 129; | |
or.b32 %r90, %r89, %r4; | |
and.b32 %r91, %r89, 3; | |
shr.u32 %r92, %r90, 2; | |
setp.ne.s32 %p13, %r91, 1; | |
cvt.u64.u32 %rd843, %r92; | |
add.s64 %rd104, %rd12, %rd843; | |
and.b64 %rd2433, %rd104, 4294967295; | |
setp.lt.u64 %p73, %rd104, %rd12; | |
@%p13 bra LBB55_13; | |
// Two alternative setups, selected by %p13 (tested by the branch just above),
// that feed the round chain at LBB55_14. Both arms write the same register
// set: %rd2502..%rd2519 and %r318..%r319, with different values.
// Inputs computed before this point: %rd104 = %rd12 + element offset,
// %rd2433 = %rd104 & 0xFFFFFFFF, %p73 = (%rd104 < %rd12, unsigned), i.e. the
// 64-bit add wrapped; %p73 is added below as a carry into %rd2464.
// 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) equal the published
// Philox4x32 multipliers; 2654435769 and 3144134277 equal its key increments.
// NOTE(review): presumably an inlined Philox generator -- confirm with producer.
//
// Arm executed when %p13 is false.
mul.lo.s64 %rd2506, %rd2433, 3528531795; | |
selp.u64 %rd884, 1, 0, %p73; | |
add.s64 %rd885, %rd2464, %rd884; | |
xor.b64 %rd886, %rd885, %rd2506; | |
shr.u64 %rd887, %rd886, 32; | |
mul.lo.s64 %rd2509, %rd887, 3449720151; | |
shr.u64 %rd888, %rd2509, 32; | |
and.b64 %rd889, %rd885, 4294967295; | |
mul.lo.s64 %rd890, %rd889, 3449720151; | |
and.b64 %rd891, %rd890, 4294967295; | |
xor.b64 %rd892, %rd891, %rd888; | |
xor.b64 %rd893, %rd892, 2654435769; | |
mul.lo.s64 %rd2512, %rd893, 3528531795; | |
xor.b64 %rd2502, %rd890, %rd104; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r319, -845247145; | |
mov.u32 %r318, -616729560; | |
mov.u64 %rd2519, 3041712726; | |
mov.u64 %rd2518, 1401181199; | |
mov.u64 %rd2517, 2835769497; | |
mov.u64 %rd2516, 1684936478; | |
mov.u64 %rd2515, 2027808484; | |
mov.u64 %rd2514, 387276957; | |
mov.u64 %rd2513, 842468239; | |
mov.u64 %rd2511, 3986602516; | |
mov.u64 %rd2510, 1013904242; | |
mov.u64 %rd2508, 3668340011; | |
mov.u64 %rd2507, 3144134277; | |
mov.u64 %rd2505, 3449720151; | |
mov.u64 %rd2504, 1993301258; | |
mov.u64 %rd2503, 3528531795; | |
bra.uni LBB55_14; | |
// Arm executed when %p13 is true. The roles of the two multipliers are
// swapped relative to the arm above; control falls through into LBB55_14.
LBB55_13: | |
selp.u64 %rd858, 1, 0, %p73; | |
add.s64 %rd859, %rd2464, %rd858; | |
and.b64 %rd860, %rd859, 4294967295; | |
mul.lo.s64 %rd2506, %rd860, 3449720151; | |
xor.b64 %rd861, %rd2506, %rd104; | |
shr.u64 %rd862, %rd861, 32; | |
mul.lo.s64 %rd2509, %rd862, 3528531795; | |
shr.u64 %rd863, %rd2509, 32; | |
mul.lo.s64 %rd865, %rd2433, 3528531795; | |
and.b64 %rd866, %rd865, 4294967295; | |
xor.b64 %rd867, %rd866, %rd863; | |
xor.b64 %rd868, %rd867, 3144134277; | |
mul.lo.s64 %rd2512, %rd868, 3449720151; | |
xor.b64 %rd2502, %rd859, %rd865; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r319, -766435501; | |
mov.u32 %r318, -239350328; | |
mov.u64 %rd2519, 1684936478; | |
mov.u64 %rd2518, 534103459; | |
mov.u64 %rd2517, 387276957; | |
mov.u64 %rd2516, 3041712726; | |
mov.u64 %rd2515, 3986602516; | |
mov.u64 %rd2514, 2835769497; | |
mov.u64 %rd2513, 3668340011; | |
mov.u64 %rd2511, 2027808484; | |
mov.u64 %rd2510, 1993301258; | |
mov.u64 %rd2508, 842468239; | |
mov.u64 %rd2507, 2654435769; | |
mov.u64 %rd2505, 3528531795; | |
mov.u64 %rd2504, 1013904242; | |
mov.u64 %rd2503, 3449720151; | |
// Join block for the two preceding setup arms. Reads %rd2502..%rd2519 and
// %r318..%r319, derives one 32-bit word, converts it to a float in [0,1),
// applies a threshold-masked scale to loaded data, accumulates a squared
// difference into %f7, then prepares the index for the next element.
LBB55_14: | |
// Round chain: each step takes the low or high 32 bits of earlier products,
// XORs them together with a per-step constant register, and multiplies by
// %rd2503 or %rd2505 (the two multipliers chosen by the setup arm).
shr.u64 %rd894, %rd2512, 32; | |
shr.u64 %rd895, %rd2502, 32; | |
mul.lo.s64 %rd896, %rd895, %rd2503; | |
and.b64 %rd897, %rd896, 4294967295; | |
xor.b64 %rd898, %rd897, %rd894; | |
xor.b64 %rd899, %rd898, %rd2504; | |
mul.lo.s64 %rd900, %rd899, %rd2505; | |
shr.u64 %rd901, %rd900, 32; | |
shr.u64 %rd902, %rd896, 32; | |
and.b64 %rd903, %rd2506, 4294967295; | |
xor.b64 %rd904, %rd903, %rd902; | |
xor.b64 %rd905, %rd904, %rd2507; | |
mul.lo.s64 %rd906, %rd905, %rd2505; | |
and.b64 %rd907, %rd906, 4294967295; | |
xor.b64 %rd908, %rd907, %rd901; | |
xor.b64 %rd909, %rd908, %rd2508; | |
mul.lo.s64 %rd910, %rd909, %rd2503; | |
shr.u64 %rd911, %rd910, 32; | |
shr.u64 %rd912, %rd906, 32; | |
and.b64 %rd913, %rd2509, 4294967295; | |
xor.b64 %rd914, %rd913, %rd912; | |
xor.b64 %rd915, %rd914, %rd2510; | |
mul.lo.s64 %rd916, %rd915, %rd2503; | |
and.b64 %rd917, %rd916, 4294967295; | |
xor.b64 %rd918, %rd917, %rd911; | |
xor.b64 %rd919, %rd918, %rd2511; | |
mul.lo.s64 %rd920, %rd919, %rd2505; | |
shr.u64 %rd921, %rd920, 32; | |
shr.u64 %rd922, %rd916, 32; | |
and.b64 %rd923, %rd2512, 4294967295; | |
xor.b64 %rd924, %rd923, %rd922; | |
xor.b64 %rd925, %rd924, %rd2513; | |
mul.lo.s64 %rd926, %rd925, %rd2505; | |
and.b64 %rd927, %rd926, 4294967295; | |
xor.b64 %rd928, %rd927, %rd921; | |
xor.b64 %rd929, %rd928, %rd2514; | |
mul.lo.s64 %rd930, %rd929, %rd2503; | |
shr.u64 %rd931, %rd930, 32; | |
shr.u64 %rd932, %rd926, 32; | |
and.b64 %rd933, %rd900, 4294967295; | |
xor.b64 %rd934, %rd933, %rd932; | |
xor.b64 %rd935, %rd934, %rd2515; | |
mul.lo.s64 %rd936, %rd935, %rd2503; | |
and.b64 %rd937, %rd936, 4294967295; | |
xor.b64 %rd938, %rd937, %rd931; | |
xor.b64 %rd939, %rd938, %rd2516; | |
mul.lo.s64 %rd940, %rd939, %rd2505; | |
shr.u64 %rd941, %rd940, 32; | |
shr.u64 %rd942, %rd936, 32; | |
and.b64 %rd943, %rd910, 4294967295; | |
xor.b64 %rd944, %rd943, %rd942; | |
xor.b64 %rd945, %rd944, %rd2517; | |
mul.lo.s64 %rd946, %rd945, %rd2505; | |
and.b64 %rd947, %rd946, 4294967295; | |
xor.b64 %rd948, %rd947, %rd941; | |
xor.b64 %rd949, %rd948, %rd2518; | |
mul.lo.s64 %rd950, %rd949, %rd2503; | |
// Fold the last products into a single 32-bit word (%r99).
shr.u64 %rd951, %rd950, 32; | |
shr.u64 %rd952, %rd946, 32; | |
xor.b64 %rd953, %rd920, %rd952; | |
xor.b64 %rd954, %rd953, %rd2519; | |
mul.lo.s64 %rd955, %rd954, %rd2503; | |
xor.b64 %rd956, %rd951, %rd955; | |
cvt.u32.u64 %r97, %rd956; | |
xor.b32 %r98, %r318, %r97; | |
mul.lo.s32 %r99, %r98, %r319; | |
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f71 is in [0,1).
shr.u32 %r100, %r99, 9; | |
cvt.rn.f32.u32 %f70, %r100; | |
mul.rn.f32 %f71, %f70, 0f34000000; | |
cvt.rn.f16.f32 %h28, %f71; | |
// 0x2E66 is the f16 bit pattern for ~0.09998; %p17 = (%h28 >= that value).
mov.b16 %h29, 0x2E66; | |
setp.ge.f16 %p17, %h28, %h29; | |
// Element 129 of each input: byte offset 258 for 16-bit data, 516 for f32.
ld.global.nc.b16 %h30, [%rd45+258]; | |
ld.global.nc.f32 %f72, [%rd46+516]; | |
cvt.rn.f16.f32 %h31, %f72; | |
add.rn.f16 %h32, %h30, %h31; | |
// 0x3C72 is the f16 bit pattern for ~1.1113. The scaled sum is kept when %p17
// holds and replaced by zero otherwise.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm.
mov.b16 %h33, 0x3C72; | |
mul.rn.f16 %h34, %h32, %h33; | |
selp.b16 %h35, %h34, 0x0000, %p17; | |
cvt.f32.f16 %f73, %h35; | |
ld.global.nc.b16 %h36, [%rd47+258]; | |
cvt.f32.f16 %f74, %h36; | |
ld.global.nc.f32 %f75, [%rd48+516]; | |
// %f82 = (%f1*%f75)*%f74 + (%f78 - %f2*(%f1*%f75)) + %f73, evaluated in the
// order written; %f1, %f2, %f3 are set outside this block.
mul.rn.f32 %f76, %f1, %f75; | |
mul.rn.f32 %f77, %f76, %f74; | |
ld.global.nc.f32 %f78, [%rd49+516]; | |
mul.rn.f32 %f79, %f2, %f76; | |
sub.rn.f32 %f80, %f78, %f79; | |
add.rn.f32 %f81, %f77, %f80; | |
add.rn.f32 %f82, %f81, %f73; | |
// Running sum of squared differences: %f7 = %f6 + (%f82 - %f3)^2.
sub.rn.f32 %f83, %f82, %f3; | |
mul.rn.f32 %f84, %f83, %f83; | |
add.rn.f32 %f7, %f6, %f84; | |
// Next element (index bits | 256): %rd131 = %rd12 + ((%r73|256) >> 2), where
// %r73 was set earlier; %rd2429 = its low 32 bits, %p72 = unsigned wrap of
// that add. %p8 (set outside this block) selects the setup arm at LBB55_16.
or.b32 %r102, %r73, 256; | |
shr.u32 %r103, %r102, 2; | |
cvt.u64.u32 %rd957, %r103; | |
add.s64 %rd131, %rd12, %rd957; | |
and.b64 %rd2429, %rd131, 4294967295; | |
setp.lt.u64 %p72, %rd131, %rd12; | |
@%p8 bra LBB55_16; | |
// Two alternative setups, selected by %p8 (tested by the branch just above),
// that feed the round chain at LBB55_17. Both arms write the same register
// set: %rd2520..%rd2538 and %r320..%r322, with different values.
// Inputs computed before this point: %rd131 = %rd12 + element offset,
// %rd2429 = %rd131 & 0xFFFFFFFF, %p72 = (%rd131 < %rd12, unsigned), i.e. the
// 64-bit add wrapped; %p72 is added below as a carry into %rd2464.
// 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) equal the published
// Philox4x32 multipliers; 2654435769 and 3144134277 equal its key increments.
// NOTE(review): presumably an inlined Philox generator -- confirm with producer.
//
// Arm executed when %p8 is false.
mul.lo.s64 %rd2524, %rd2429, 3528531795; | |
selp.u64 %rd1000, 1, 0, %p72; | |
add.s64 %rd1001, %rd2464, %rd1000; | |
xor.b64 %rd1002, %rd1001, %rd2524; | |
shr.u64 %rd1003, %rd1002, 32; | |
mul.lo.s64 %rd2527, %rd1003, 3449720151; | |
shr.u64 %rd1004, %rd2527, 32; | |
and.b64 %rd1005, %rd1001, 4294967295; | |
mul.lo.s64 %rd1006, %rd1005, 3449720151; | |
and.b64 %rd1007, %rd1006, 4294967295; | |
xor.b64 %rd1008, %rd1007, %rd1004; | |
xor.b64 %rd1009, %rd1008, 2654435769; | |
mul.lo.s64 %rd2530, %rd1009, 3528531795; | |
xor.b64 %rd2520, %rd1006, %rd131; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r322, -1879881855; | |
mov.u32 %r321, -845247145; | |
mov.u32 %r320, 534103459; | |
mov.u64 %rd2538, 3678237736; | |
mov.u64 %rd2537, 3041712726; | |
mov.u64 %rd2536, 1401181199; | |
mov.u64 %rd2535, 2835769497; | |
mov.u64 %rd2534, 1684936478; | |
mov.u64 %rd2533, 2027808484; | |
mov.u64 %rd2532, 387276957; | |
mov.u64 %rd2531, 842468239; | |
mov.u64 %rd2529, 3986602516; | |
mov.u64 %rd2528, 1013904242; | |
mov.u64 %rd2526, 3668340011; | |
mov.u64 %rd2525, 3144134277; | |
mov.u64 %rd2523, 3449720151; | |
mov.u64 %rd2522, 1993301258; | |
mov.u64 %rd2521, 3528531795; | |
bra.uni LBB55_17; | |
// Arm executed when %p8 is true. The roles of the two multipliers are swapped
// relative to the arm above; control falls through into LBB55_17.
LBB55_16: | |
selp.u64 %rd973, 1, 0, %p72; | |
add.s64 %rd974, %rd2464, %rd973; | |
and.b64 %rd975, %rd974, 4294967295; | |
mul.lo.s64 %rd2524, %rd975, 3449720151; | |
xor.b64 %rd976, %rd2524, %rd131; | |
shr.u64 %rd977, %rd976, 32; | |
mul.lo.s64 %rd2527, %rd977, 3528531795; | |
shr.u64 %rd978, %rd2527, 32; | |
mul.lo.s64 %rd980, %rd2429, 3528531795; | |
and.b64 %rd981, %rd980, 4294967295; | |
xor.b64 %rd982, %rd981, %rd978; | |
xor.b64 %rd983, %rd982, 3144134277; | |
mul.lo.s64 %rd2530, %rd983, 3449720151; | |
xor.b64 %rd2520, %rd974, %rd980; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r322, -1767562579; | |
mov.u32 %r321, -766435501; | |
mov.u32 %r320, 1401181199; | |
mov.u64 %rd2538, 4055616968; | |
mov.u64 %rd2537, 1684936478; | |
mov.u64 %rd2536, 534103459; | |
mov.u64 %rd2535, 387276957; | |
mov.u64 %rd2534, 3041712726; | |
mov.u64 %rd2533, 3986602516; | |
mov.u64 %rd2532, 2835769497; | |
mov.u64 %rd2531, 3668340011; | |
mov.u64 %rd2529, 2027808484; | |
mov.u64 %rd2528, 1993301258; | |
mov.u64 %rd2526, 842468239; | |
mov.u64 %rd2525, 2654435769; | |
mov.u64 %rd2523, 3528531795; | |
mov.u64 %rd2522, 1013904242; | |
mov.u64 %rd2521, 3449720151; | |
// Join block for the two preceding setup arms. Reads %rd2520..%rd2538 and
// %r320..%r322, derives one 32-bit word, converts it to a float in [0,1),
// applies a threshold-masked scale to loaded data, accumulates a squared
// difference into %f8, then prepares the index for the next element.
LBB55_17: | |
// Round chain: each step takes the low or high 32 bits of earlier products,
// XORs them together with a per-step constant register, and multiplies by
// %rd2521 or %rd2523 (the two multipliers chosen by the setup arm).
shr.u64 %rd1010, %rd2530, 32; | |
shr.u64 %rd1011, %rd2520, 32; | |
mul.lo.s64 %rd1012, %rd1011, %rd2521; | |
and.b64 %rd1013, %rd1012, 4294967295; | |
xor.b64 %rd1014, %rd1013, %rd1010; | |
xor.b64 %rd1015, %rd1014, %rd2522; | |
mul.lo.s64 %rd1016, %rd1015, %rd2523; | |
shr.u64 %rd1017, %rd1016, 32; | |
shr.u64 %rd1018, %rd1012, 32; | |
and.b64 %rd1019, %rd2524, 4294967295; | |
xor.b64 %rd1020, %rd1019, %rd1018; | |
xor.b64 %rd1021, %rd1020, %rd2525; | |
mul.lo.s64 %rd1022, %rd1021, %rd2523; | |
and.b64 %rd1023, %rd1022, 4294967295; | |
xor.b64 %rd1024, %rd1023, %rd1017; | |
xor.b64 %rd1025, %rd1024, %rd2526; | |
mul.lo.s64 %rd1026, %rd1025, %rd2521; | |
shr.u64 %rd1027, %rd1026, 32; | |
shr.u64 %rd1028, %rd1022, 32; | |
and.b64 %rd1029, %rd2527, 4294967295; | |
xor.b64 %rd1030, %rd1029, %rd1028; | |
xor.b64 %rd1031, %rd1030, %rd2528; | |
mul.lo.s64 %rd1032, %rd1031, %rd2521; | |
and.b64 %rd1033, %rd1032, 4294967295; | |
xor.b64 %rd1034, %rd1033, %rd1027; | |
xor.b64 %rd1035, %rd1034, %rd2529; | |
mul.lo.s64 %rd1036, %rd1035, %rd2523; | |
shr.u64 %rd1037, %rd1036, 32; | |
shr.u64 %rd1038, %rd1032, 32; | |
and.b64 %rd1039, %rd2530, 4294967295; | |
xor.b64 %rd1040, %rd1039, %rd1038; | |
xor.b64 %rd1041, %rd1040, %rd2531; | |
mul.lo.s64 %rd1042, %rd1041, %rd2523; | |
and.b64 %rd1043, %rd1042, 4294967295; | |
xor.b64 %rd1044, %rd1043, %rd1037; | |
xor.b64 %rd1045, %rd1044, %rd2532; | |
mul.lo.s64 %rd1046, %rd1045, %rd2521; | |
shr.u64 %rd1047, %rd1046, 32; | |
shr.u64 %rd1048, %rd1042, 32; | |
and.b64 %rd1049, %rd1016, 4294967295; | |
xor.b64 %rd1050, %rd1049, %rd1048; | |
xor.b64 %rd1051, %rd1050, %rd2533; | |
mul.lo.s64 %rd1052, %rd1051, %rd2521; | |
and.b64 %rd1053, %rd1052, 4294967295; | |
xor.b64 %rd1054, %rd1053, %rd1047; | |
xor.b64 %rd1055, %rd1054, %rd2534; | |
mul.lo.s64 %rd1056, %rd1055, %rd2523; | |
shr.u64 %rd1057, %rd1056, 32; | |
shr.u64 %rd1058, %rd1052, 32; | |
and.b64 %rd1059, %rd1026, 4294967295; | |
xor.b64 %rd1060, %rd1059, %rd1058; | |
xor.b64 %rd1061, %rd1060, %rd2535; | |
mul.lo.s64 %rd1062, %rd1061, %rd2523; | |
and.b64 %rd1063, %rd1062, 4294967295; | |
xor.b64 %rd1064, %rd1063, %rd1057; | |
xor.b64 %rd1065, %rd1064, %rd2536; | |
mul.lo.s64 %rd1066, %rd1065, %rd2521; | |
shr.u64 %rd1067, %rd1066, 32; | |
shr.u64 %rd1068, %rd1062, 32; | |
and.b64 %rd1069, %rd1036, 4294967295; | |
xor.b64 %rd1070, %rd1069, %rd1068; | |
xor.b64 %rd1071, %rd1070, %rd2537; | |
mul.lo.s64 %rd1072, %rd1071, %rd2521; | |
and.b64 %rd1073, %rd1072, 4294967295; | |
xor.b64 %rd1074, %rd1073, %rd1067; | |
xor.b64 %rd1075, %rd1074, %rd2538; | |
mul.lo.s64 %rd1076, %rd1075, %rd2523; | |
// Fold the last products into a single 32-bit word (%r115).
shr.u64 %rd1077, %rd1076, 32; | |
cvt.u32.u64 %r110, %rd1077; | |
shr.u64 %rd1078, %rd1072, 32; | |
xor.b64 %rd1079, %rd1078, %rd1046; | |
cvt.u32.u64 %r111, %rd1079; | |
xor.b32 %r112, %r320, %r111; | |
mul.lo.s32 %r113, %r112, %r321; | |
xor.b32 %r114, %r113, %r110; | |
xor.b32 %r115, %r114, %r322; | |
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f86 is in [0,1).
shr.u32 %r116, %r115, 9; | |
cvt.rn.f32.u32 %f85, %r116; | |
mul.rn.f32 %f86, %f85, 0f34000000; | |
cvt.rn.f16.f32 %h37, %f86; | |
// 0x2E66 is the f16 bit pattern for ~0.09998; %p20 = (%h37 >= that value).
mov.b16 %h38, 0x2E66; | |
setp.ge.f16 %p20, %h37, %h38; | |
// Element 256 of each input: byte offset 512 for 16-bit data, 1024 for f32.
ld.global.nc.b16 %h39, [%rd45+512]; | |
ld.global.nc.f32 %f87, [%rd46+1024]; | |
cvt.rn.f16.f32 %h40, %f87; | |
add.rn.f16 %h41, %h39, %h40; | |
// 0x3C72 is the f16 bit pattern for ~1.1113. The scaled sum is kept when %p20
// holds and replaced by zero otherwise.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm.
mov.b16 %h42, 0x3C72; | |
mul.rn.f16 %h43, %h41, %h42; | |
selp.b16 %h44, %h43, 0x0000, %p20; | |
cvt.f32.f16 %f88, %h44; | |
ld.global.nc.b16 %h45, [%rd47+512]; | |
cvt.f32.f16 %f89, %h45; | |
ld.global.nc.f32 %f90, [%rd48+1024]; | |
// %f97 = (%f1*%f90)*%f89 + (%f93 - %f2*(%f1*%f90)) + %f88, evaluated in the
// order written; %f1, %f2, %f3 are set outside this block.
mul.rn.f32 %f91, %f1, %f90; | |
mul.rn.f32 %f92, %f91, %f89; | |
ld.global.nc.f32 %f93, [%rd49+1024]; | |
mul.rn.f32 %f94, %f2, %f91; | |
sub.rn.f32 %f95, %f93, %f94; | |
add.rn.f32 %f96, %f92, %f95; | |
add.rn.f32 %f97, %f96, %f88; | |
// Running sum of squared differences: %f8 = %f7 + (%f97 - %f3)^2.
sub.rn.f32 %f98, %f97, %f3; | |
mul.rn.f32 %f99, %f98, %f98; | |
add.rn.f32 %f8, %f7, %f99; | |
// Next element (index bits | 257): %rd159 = %rd12 + ((%r3|257|%r4) >> 2),
// %rd2426 = its low 32 bits, %p71 = unsigned wrap of that add.
// %p21 = (((%r3|257) & 3) != 1) selects the setup arm at LBB55_19.
or.b32 %r117, %r3, 257; | |
or.b32 %r118, %r117, %r4; | |
and.b32 %r119, %r117, 3; | |
shr.u32 %r120, %r118, 2; | |
setp.ne.s32 %p21, %r119, 1; | |
cvt.u64.u32 %rd1080, %r120; | |
add.s64 %rd159, %rd12, %rd1080; | |
and.b64 %rd2426, %rd159, 4294967295; | |
setp.lt.u64 %p71, %rd159, %rd12; | |
@%p21 bra LBB55_19; | |
// Two alternative setups, selected by %p21 (tested by the branch just above),
// that feed the round chain at LBB55_20. Both arms write the same register
// set: %rd2539..%rd2556 and %r323..%r324, with different values.
// Inputs computed before this point: %rd159 = %rd12 + element offset,
// %rd2426 = %rd159 & 0xFFFFFFFF, %p71 = (%rd159 < %rd12, unsigned), i.e. the
// 64-bit add wrapped; %p71 is added below as a carry into %rd2464.
// 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) equal the published
// Philox4x32 multipliers; 2654435769 and 3144134277 equal its key increments.
// NOTE(review): presumably an inlined Philox generator -- confirm with producer.
//
// Arm executed when %p21 is false.
mul.lo.s64 %rd2543, %rd2426, 3528531795; | |
selp.u64 %rd1121, 1, 0, %p71; | |
add.s64 %rd1122, %rd2464, %rd1121; | |
xor.b64 %rd1123, %rd1122, %rd2543; | |
shr.u64 %rd1124, %rd1123, 32; | |
mul.lo.s64 %rd2546, %rd1124, 3449720151; | |
shr.u64 %rd1125, %rd2546, 32; | |
and.b64 %rd1126, %rd1122, 4294967295; | |
mul.lo.s64 %rd1127, %rd1126, 3449720151; | |
and.b64 %rd1128, %rd1127, 4294967295; | |
xor.b64 %rd1129, %rd1128, %rd1125; | |
xor.b64 %rd1130, %rd1129, 2654435769; | |
mul.lo.s64 %rd2549, %rd1130, 3528531795; | |
xor.b64 %rd2539, %rd1127, %rd159; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r324, -845247145; | |
mov.u32 %r323, -616729560; | |
mov.u64 %rd2556, 3041712726; | |
mov.u64 %rd2555, 1401181199; | |
mov.u64 %rd2554, 2835769497; | |
mov.u64 %rd2553, 1684936478; | |
mov.u64 %rd2552, 2027808484; | |
mov.u64 %rd2551, 387276957; | |
mov.u64 %rd2550, 842468239; | |
mov.u64 %rd2548, 3986602516; | |
mov.u64 %rd2547, 1013904242; | |
mov.u64 %rd2545, 3668340011; | |
mov.u64 %rd2544, 3144134277; | |
mov.u64 %rd2542, 3449720151; | |
mov.u64 %rd2541, 1993301258; | |
mov.u64 %rd2540, 3528531795; | |
bra.uni LBB55_20; | |
// Arm executed when %p21 is true. The roles of the two multipliers are
// swapped relative to the arm above; control falls through into LBB55_20.
LBB55_19: | |
selp.u64 %rd1095, 1, 0, %p71; | |
add.s64 %rd1096, %rd2464, %rd1095; | |
and.b64 %rd1097, %rd1096, 4294967295; | |
mul.lo.s64 %rd2543, %rd1097, 3449720151; | |
xor.b64 %rd1098, %rd2543, %rd159; | |
shr.u64 %rd1099, %rd1098, 32; | |
mul.lo.s64 %rd2546, %rd1099, 3528531795; | |
shr.u64 %rd1100, %rd2546, 32; | |
mul.lo.s64 %rd1102, %rd2426, 3528531795; | |
and.b64 %rd1103, %rd1102, 4294967295; | |
xor.b64 %rd1104, %rd1103, %rd1100; | |
xor.b64 %rd1105, %rd1104, 3144134277; | |
mul.lo.s64 %rd2549, %rd1105, 3449720151; | |
xor.b64 %rd2539, %rd1096, %rd1102; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r324, -766435501; | |
mov.u32 %r323, -239350328; | |
mov.u64 %rd2556, 1684936478; | |
mov.u64 %rd2555, 534103459; | |
mov.u64 %rd2554, 387276957; | |
mov.u64 %rd2553, 3041712726; | |
mov.u64 %rd2552, 3986602516; | |
mov.u64 %rd2551, 2835769497; | |
mov.u64 %rd2550, 3668340011; | |
mov.u64 %rd2548, 2027808484; | |
mov.u64 %rd2547, 1993301258; | |
mov.u64 %rd2545, 842468239; | |
mov.u64 %rd2544, 2654435769; | |
mov.u64 %rd2542, 3528531795; | |
mov.u64 %rd2541, 1013904242; | |
mov.u64 %rd2540, 3449720151; | |
// Join block for the two preceding setup arms. Reads %rd2539..%rd2556 and
// %r323..%r324, derives one 32-bit word, converts it to a float in [0,1),
// applies a threshold-masked scale to loaded data, accumulates a squared
// difference into %f9, then prepares the index for the next element.
LBB55_20: | |
// Round chain: each step takes the low or high 32 bits of earlier products,
// XORs them together with a per-step constant register, and multiplies by
// %rd2540 or %rd2542 (the two multipliers chosen by the setup arm).
shr.u64 %rd1131, %rd2549, 32; | |
shr.u64 %rd1132, %rd2539, 32; | |
mul.lo.s64 %rd1133, %rd1132, %rd2540; | |
and.b64 %rd1134, %rd1133, 4294967295; | |
xor.b64 %rd1135, %rd1134, %rd1131; | |
xor.b64 %rd1136, %rd1135, %rd2541; | |
mul.lo.s64 %rd1137, %rd1136, %rd2542; | |
shr.u64 %rd1138, %rd1137, 32; | |
shr.u64 %rd1139, %rd1133, 32; | |
and.b64 %rd1140, %rd2543, 4294967295; | |
xor.b64 %rd1141, %rd1140, %rd1139; | |
xor.b64 %rd1142, %rd1141, %rd2544; | |
mul.lo.s64 %rd1143, %rd1142, %rd2542; | |
and.b64 %rd1144, %rd1143, 4294967295; | |
xor.b64 %rd1145, %rd1144, %rd1138; | |
xor.b64 %rd1146, %rd1145, %rd2545; | |
mul.lo.s64 %rd1147, %rd1146, %rd2540; | |
shr.u64 %rd1148, %rd1147, 32; | |
shr.u64 %rd1149, %rd1143, 32; | |
and.b64 %rd1150, %rd2546, 4294967295; | |
xor.b64 %rd1151, %rd1150, %rd1149; | |
xor.b64 %rd1152, %rd1151, %rd2547; | |
mul.lo.s64 %rd1153, %rd1152, %rd2540; | |
and.b64 %rd1154, %rd1153, 4294967295; | |
xor.b64 %rd1155, %rd1154, %rd1148; | |
xor.b64 %rd1156, %rd1155, %rd2548; | |
mul.lo.s64 %rd1157, %rd1156, %rd2542; | |
shr.u64 %rd1158, %rd1157, 32; | |
shr.u64 %rd1159, %rd1153, 32; | |
and.b64 %rd1160, %rd2549, 4294967295; | |
xor.b64 %rd1161, %rd1160, %rd1159; | |
xor.b64 %rd1162, %rd1161, %rd2550; | |
mul.lo.s64 %rd1163, %rd1162, %rd2542; | |
and.b64 %rd1164, %rd1163, 4294967295; | |
xor.b64 %rd1165, %rd1164, %rd1158; | |
xor.b64 %rd1166, %rd1165, %rd2551; | |
mul.lo.s64 %rd1167, %rd1166, %rd2540; | |
shr.u64 %rd1168, %rd1167, 32; | |
shr.u64 %rd1169, %rd1163, 32; | |
and.b64 %rd1170, %rd1137, 4294967295; | |
xor.b64 %rd1171, %rd1170, %rd1169; | |
xor.b64 %rd1172, %rd1171, %rd2552; | |
mul.lo.s64 %rd1173, %rd1172, %rd2540; | |
and.b64 %rd1174, %rd1173, 4294967295; | |
xor.b64 %rd1175, %rd1174, %rd1168; | |
xor.b64 %rd1176, %rd1175, %rd2553; | |
mul.lo.s64 %rd1177, %rd1176, %rd2542; | |
shr.u64 %rd1178, %rd1177, 32; | |
shr.u64 %rd1179, %rd1173, 32; | |
and.b64 %rd1180, %rd1147, 4294967295; | |
xor.b64 %rd1181, %rd1180, %rd1179; | |
xor.b64 %rd1182, %rd1181, %rd2554; | |
mul.lo.s64 %rd1183, %rd1182, %rd2542; | |
and.b64 %rd1184, %rd1183, 4294967295; | |
xor.b64 %rd1185, %rd1184, %rd1178; | |
xor.b64 %rd1186, %rd1185, %rd2555; | |
mul.lo.s64 %rd1187, %rd1186, %rd2540; | |
// Fold the last products into a single 32-bit word (%r127).
shr.u64 %rd1188, %rd1187, 32; | |
shr.u64 %rd1189, %rd1183, 32; | |
xor.b64 %rd1190, %rd1157, %rd1189; | |
xor.b64 %rd1191, %rd1190, %rd2556; | |
mul.lo.s64 %rd1192, %rd1191, %rd2540; | |
xor.b64 %rd1193, %rd1188, %rd1192; | |
cvt.u32.u64 %r125, %rd1193; | |
xor.b32 %r126, %r323, %r125; | |
mul.lo.s32 %r127, %r126, %r324; | |
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f101 is in [0,1).
shr.u32 %r128, %r127, 9; | |
cvt.rn.f32.u32 %f100, %r128; | |
mul.rn.f32 %f101, %f100, 0f34000000; | |
cvt.rn.f16.f32 %h46, %f101; | |
// 0x2E66 is the f16 bit pattern for ~0.09998; %p25 = (%h46 >= that value).
mov.b16 %h47, 0x2E66; | |
setp.ge.f16 %p25, %h46, %h47; | |
// Element 257 of each input: byte offset 514 for 16-bit data, 1028 for f32.
ld.global.nc.b16 %h48, [%rd45+514]; | |
ld.global.nc.f32 %f102, [%rd46+1028]; | |
cvt.rn.f16.f32 %h49, %f102; | |
add.rn.f16 %h50, %h48, %h49; | |
// 0x3C72 is the f16 bit pattern for ~1.1113. The scaled sum is kept when %p25
// holds and replaced by zero otherwise.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm.
mov.b16 %h51, 0x3C72; | |
mul.rn.f16 %h52, %h50, %h51; | |
selp.b16 %h53, %h52, 0x0000, %p25; | |
cvt.f32.f16 %f103, %h53; | |
ld.global.nc.b16 %h54, [%rd47+514]; | |
cvt.f32.f16 %f104, %h54; | |
ld.global.nc.f32 %f105, [%rd48+1028]; | |
// %f112 = (%f1*%f105)*%f104 + (%f108 - %f2*(%f1*%f105)) + %f103, evaluated in
// the order written; %f1, %f2, %f3 are set outside this block.
mul.rn.f32 %f106, %f1, %f105; | |
mul.rn.f32 %f107, %f106, %f104; | |
ld.global.nc.f32 %f108, [%rd49+1028]; | |
mul.rn.f32 %f109, %f2, %f106; | |
sub.rn.f32 %f110, %f108, %f109; | |
add.rn.f32 %f111, %f107, %f110; | |
add.rn.f32 %f112, %f111, %f103; | |
// Running sum of squared differences: %f9 = %f8 + (%f112 - %f3)^2.
sub.rn.f32 %f113, %f112, %f3; | |
mul.rn.f32 %f114, %f113, %f113; | |
add.rn.f32 %f9, %f8, %f114; | |
// Next element (index bits | 384): %rd186 = %rd12 + ((%r73|384) >> 2), where
// %r73 was set earlier; %rd2422 = its low 32 bits, %p70 = unsigned wrap of
// that add. %p8 (set outside this block) selects the setup arm at LBB55_22.
or.b32 %r130, %r73, 384; | |
shr.u32 %r131, %r130, 2; | |
cvt.u64.u32 %rd1194, %r131; | |
add.s64 %rd186, %rd12, %rd1194; | |
and.b64 %rd2422, %rd186, 4294967295; | |
setp.lt.u64 %p70, %rd186, %rd12; | |
@%p8 bra LBB55_22; | |
// Two alternative setups, selected by %p8 (tested by the branch just above),
// that feed the round chain at LBB55_23. Both arms write the same register
// set: %rd2557..%rd2575 and %r325..%r327, with different values.
// Inputs computed before this point: %rd186 = %rd12 + element offset,
// %rd2422 = %rd186 & 0xFFFFFFFF, %p70 = (%rd186 < %rd12, unsigned), i.e. the
// 64-bit add wrapped; %p70 is added below as a carry into %rd2464.
// 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) equal the published
// Philox4x32 multipliers; 2654435769 and 3144134277 equal its key increments.
// NOTE(review): presumably an inlined Philox generator -- confirm with producer.
//
// Arm executed when %p8 is false.
mul.lo.s64 %rd2561, %rd2422, 3528531795; | |
selp.u64 %rd1237, 1, 0, %p70; | |
add.s64 %rd1238, %rd2464, %rd1237; | |
xor.b64 %rd1239, %rd1238, %rd2561; | |
shr.u64 %rd1240, %rd1239, 32; | |
mul.lo.s64 %rd2564, %rd1240, 3449720151; | |
shr.u64 %rd1241, %rd2564, 32; | |
and.b64 %rd1242, %rd1238, 4294967295; | |
mul.lo.s64 %rd1243, %rd1242, 3449720151; | |
and.b64 %rd1244, %rd1243, 4294967295; | |
xor.b64 %rd1245, %rd1244, %rd1241; | |
xor.b64 %rd1246, %rd1245, 2654435769; | |
mul.lo.s64 %rd2567, %rd1246, 3528531795; | |
xor.b64 %rd2557, %rd1243, %rd186; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r327, -1879881855; | |
mov.u32 %r326, -845247145; | |
mov.u32 %r325, 534103459; | |
mov.u64 %rd2575, 3678237736; | |
mov.u64 %rd2574, 3041712726; | |
mov.u64 %rd2573, 1401181199; | |
mov.u64 %rd2572, 2835769497; | |
mov.u64 %rd2571, 1684936478; | |
mov.u64 %rd2570, 2027808484; | |
mov.u64 %rd2569, 387276957; | |
mov.u64 %rd2568, 842468239; | |
mov.u64 %rd2566, 3986602516; | |
mov.u64 %rd2565, 1013904242; | |
mov.u64 %rd2563, 3668340011; | |
mov.u64 %rd2562, 3144134277; | |
mov.u64 %rd2560, 3449720151; | |
mov.u64 %rd2559, 1993301258; | |
mov.u64 %rd2558, 3528531795; | |
bra.uni LBB55_23; | |
// Arm executed when %p8 is true. The roles of the two multipliers are swapped
// relative to the arm above; control falls through into LBB55_23.
LBB55_22: | |
selp.u64 %rd1210, 1, 0, %p70; | |
add.s64 %rd1211, %rd2464, %rd1210; | |
and.b64 %rd1212, %rd1211, 4294967295; | |
mul.lo.s64 %rd2561, %rd1212, 3449720151; | |
xor.b64 %rd1213, %rd2561, %rd186; | |
shr.u64 %rd1214, %rd1213, 32; | |
mul.lo.s64 %rd2564, %rd1214, 3528531795; | |
shr.u64 %rd1215, %rd2564, 32; | |
mul.lo.s64 %rd1217, %rd2422, 3528531795; | |
and.b64 %rd1218, %rd1217, 4294967295; | |
xor.b64 %rd1219, %rd1218, %rd1215; | |
xor.b64 %rd1220, %rd1219, 3144134277; | |
mul.lo.s64 %rd2567, %rd1220, 3449720151; | |
xor.b64 %rd2557, %rd1211, %rd1217; | |
// Per-step constants and multipliers consumed by the join block.
mov.u32 %r327, -1767562579; | |
mov.u32 %r326, -766435501; | |
mov.u32 %r325, 1401181199; | |
mov.u64 %rd2575, 4055616968; | |
mov.u64 %rd2574, 1684936478; | |
mov.u64 %rd2573, 534103459; | |
mov.u64 %rd2572, 387276957; | |
mov.u64 %rd2571, 3041712726; | |
mov.u64 %rd2570, 3986602516; | |
mov.u64 %rd2569, 2835769497; | |
mov.u64 %rd2568, 3668340011; | |
mov.u64 %rd2566, 2027808484; | |
mov.u64 %rd2565, 1993301258; | |
mov.u64 %rd2563, 842468239; | |
mov.u64 %rd2562, 2654435769; | |
mov.u64 %rd2560, 3528531795; | |
mov.u64 %rd2559, 1013904242; | |
mov.u64 %rd2558, 3449720151; | |
// Join block for the two preceding setup arms. Reads %rd2557..%rd2575 and
// %r325..%r327, derives one 32-bit word, converts it to a float in [0,1),
// applies a threshold-masked scale to loaded data, accumulates a squared
// difference into %f10, then prepares the index for the next element.
LBB55_23: | |
// Round chain: each step takes the low or high 32 bits of earlier products,
// XORs them together with a per-step constant register, and multiplies by
// %rd2558 or %rd2560 (the two multipliers chosen by the setup arm).
shr.u64 %rd1247, %rd2567, 32; | |
shr.u64 %rd1248, %rd2557, 32; | |
mul.lo.s64 %rd1249, %rd1248, %rd2558; | |
and.b64 %rd1250, %rd1249, 4294967295; | |
xor.b64 %rd1251, %rd1250, %rd1247; | |
xor.b64 %rd1252, %rd1251, %rd2559; | |
mul.lo.s64 %rd1253, %rd1252, %rd2560; | |
shr.u64 %rd1254, %rd1253, 32; | |
shr.u64 %rd1255, %rd1249, 32; | |
and.b64 %rd1256, %rd2561, 4294967295; | |
xor.b64 %rd1257, %rd1256, %rd1255; | |
xor.b64 %rd1258, %rd1257, %rd2562; | |
mul.lo.s64 %rd1259, %rd1258, %rd2560; | |
and.b64 %rd1260, %rd1259, 4294967295; | |
xor.b64 %rd1261, %rd1260, %rd1254; | |
xor.b64 %rd1262, %rd1261, %rd2563; | |
mul.lo.s64 %rd1263, %rd1262, %rd2558; | |
shr.u64 %rd1264, %rd1263, 32; | |
shr.u64 %rd1265, %rd1259, 32; | |
and.b64 %rd1266, %rd2564, 4294967295; | |
xor.b64 %rd1267, %rd1266, %rd1265; | |
xor.b64 %rd1268, %rd1267, %rd2565; | |
mul.lo.s64 %rd1269, %rd1268, %rd2558; | |
and.b64 %rd1270, %rd1269, 4294967295; | |
xor.b64 %rd1271, %rd1270, %rd1264; | |
xor.b64 %rd1272, %rd1271, %rd2566; | |
mul.lo.s64 %rd1273, %rd1272, %rd2560; | |
shr.u64 %rd1274, %rd1273, 32; | |
shr.u64 %rd1275, %rd1269, 32; | |
and.b64 %rd1276, %rd2567, 4294967295; | |
xor.b64 %rd1277, %rd1276, %rd1275; | |
xor.b64 %rd1278, %rd1277, %rd2568; | |
mul.lo.s64 %rd1279, %rd1278, %rd2560; | |
and.b64 %rd1280, %rd1279, 4294967295; | |
xor.b64 %rd1281, %rd1280, %rd1274; | |
xor.b64 %rd1282, %rd1281, %rd2569; | |
mul.lo.s64 %rd1283, %rd1282, %rd2558; | |
shr.u64 %rd1284, %rd1283, 32; | |
shr.u64 %rd1285, %rd1279, 32; | |
and.b64 %rd1286, %rd1253, 4294967295; | |
xor.b64 %rd1287, %rd1286, %rd1285; | |
xor.b64 %rd1288, %rd1287, %rd2570; | |
mul.lo.s64 %rd1289, %rd1288, %rd2558; | |
and.b64 %rd1290, %rd1289, 4294967295; | |
xor.b64 %rd1291, %rd1290, %rd1284; | |
xor.b64 %rd1292, %rd1291, %rd2571; | |
mul.lo.s64 %rd1293, %rd1292, %rd2560; | |
shr.u64 %rd1294, %rd1293, 32; | |
shr.u64 %rd1295, %rd1289, 32; | |
and.b64 %rd1296, %rd1263, 4294967295; | |
xor.b64 %rd1297, %rd1296, %rd1295; | |
xor.b64 %rd1298, %rd1297, %rd2572; | |
mul.lo.s64 %rd1299, %rd1298, %rd2560; | |
and.b64 %rd1300, %rd1299, 4294967295; | |
xor.b64 %rd1301, %rd1300, %rd1294; | |
xor.b64 %rd1302, %rd1301, %rd2573; | |
mul.lo.s64 %rd1303, %rd1302, %rd2558; | |
shr.u64 %rd1304, %rd1303, 32; | |
shr.u64 %rd1305, %rd1299, 32; | |
and.b64 %rd1306, %rd1273, 4294967295; | |
xor.b64 %rd1307, %rd1306, %rd1305; | |
xor.b64 %rd1308, %rd1307, %rd2574; | |
mul.lo.s64 %rd1309, %rd1308, %rd2558; | |
and.b64 %rd1310, %rd1309, 4294967295; | |
xor.b64 %rd1311, %rd1310, %rd1304; | |
xor.b64 %rd1312, %rd1311, %rd2575; | |
mul.lo.s64 %rd1313, %rd1312, %rd2560; | |
// Fold the last products into a single 32-bit word (%r143).
shr.u64 %rd1314, %rd1313, 32; | |
cvt.u32.u64 %r138, %rd1314; | |
shr.u64 %rd1315, %rd1309, 32; | |
xor.b64 %rd1316, %rd1315, %rd1283; | |
cvt.u32.u64 %r139, %rd1316; | |
xor.b32 %r140, %r325, %r139; | |
mul.lo.s32 %r141, %r140, %r326; | |
xor.b32 %r142, %r141, %r138; | |
xor.b32 %r143, %r142, %r327; | |
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f116 is in [0,1).
shr.u32 %r144, %r143, 9; | |
cvt.rn.f32.u32 %f115, %r144; | |
mul.rn.f32 %f116, %f115, 0f34000000; | |
cvt.rn.f16.f32 %h55, %f116; | |
// 0x2E66 is the f16 bit pattern for ~0.09998; %p28 = (%h55 >= that value).
mov.b16 %h56, 0x2E66; | |
setp.ge.f16 %p28, %h55, %h56; | |
// Element 384 of each input: byte offset 768 for 16-bit data, 1536 for f32.
ld.global.nc.b16 %h57, [%rd45+768]; | |
ld.global.nc.f32 %f117, [%rd46+1536]; | |
cvt.rn.f16.f32 %h58, %f117; | |
add.rn.f16 %h59, %h57, %h58; | |
// 0x3C72 is the f16 bit pattern for ~1.1113. The scaled sum is kept when %p28
// holds and replaced by zero otherwise.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescale -- confirm.
mov.b16 %h60, 0x3C72; | |
mul.rn.f16 %h61, %h59, %h60; | |
selp.b16 %h62, %h61, 0x0000, %p28; | |
cvt.f32.f16 %f118, %h62; | |
ld.global.nc.b16 %h63, [%rd47+768]; | |
cvt.f32.f16 %f119, %h63; | |
ld.global.nc.f32 %f120, [%rd48+1536]; | |
// %f127 = (%f1*%f120)*%f119 + (%f123 - %f2*(%f1*%f120)) + %f118, evaluated in
// the order written; %f1, %f2, %f3 are set outside this block.
mul.rn.f32 %f121, %f1, %f120; | |
mul.rn.f32 %f122, %f121, %f119; | |
ld.global.nc.f32 %f123, [%rd49+1536]; | |
mul.rn.f32 %f124, %f2, %f121; | |
sub.rn.f32 %f125, %f123, %f124; | |
add.rn.f32 %f126, %f122, %f125; | |
add.rn.f32 %f127, %f126, %f118; | |
// Running sum of squared differences: %f10 = %f9 + (%f127 - %f3)^2.
sub.rn.f32 %f128, %f127, %f3; | |
mul.rn.f32 %f129, %f128, %f128; | |
add.rn.f32 %f10, %f9, %f129; | |
// Next element (index bits | 385): %rd214 = %rd12 + ((%r3|385|%r4) >> 2).
// %p29 = (((%r3|385) & 3) != 1) selects the setup arm at LBB55_25.
or.b32 %r145, %r3, 385; | |
or.b32 %r146, %r145, %r4; | |
and.b32 %r147, %r145, 3; | |
shr.u32 %r148, %r146, 2; | |
setp.ne.s32 %p29, %r147, 1; | |
cvt.u64.u32 %rd1317, %r148; | |
add.s64 %rd214, %rd12, %rd1317; | |
@%p29 bra LBB55_25; | |
and.b64 %rd1357, %rd214, 4294967295; | |
mul.lo.s64 %rd2580, %rd1357, 3528531795; | |
setp.lt.u64 %p31, %rd214, %rd12; | |
selp.u64 %rd1358, 1, 0, %p31; | |
add.s64 %rd1359, %rd2464, %rd1358; | |
xor.b64 %rd1360, %rd1359, %rd2580; | |
shr.u64 %rd1361, %rd1360, 32; | |
mul.lo.s64 %rd2583, %rd1361, 3449720151; | |
shr.u64 %rd1362, %rd2583, 32; | |
and.b64 %rd1363, %rd1359, 4294967295; | |
mul.lo.s64 %rd1364, %rd1363, 3449720151; | |
and.b64 %rd1365, %rd1364, 4294967295; | |
xor.b64 %rd1366, %rd1365, %rd1362; | |
xor.b64 %rd1367, %rd1366, 2654435769; | |
mul.lo.s64 %rd2586, %rd1367, 3528531795; | |
xor.b64 %rd2576, %rd1364, %rd214; | |
mov.u32 %r329, -845247145; | |
mov.u32 %r328, -616729560; | |
mov.u64 %rd2593, 3041712726; | |
mov.u64 %rd2592, 1401181199; | |
mov.u64 %rd2591, 2835769497; | |
mov.u64 %rd2590, 1684936478; | |
mov.u64 %rd2589, 2027808484; | |
mov.u64 %rd2588, 387276957; | |
mov.u64 %rd2587, 842468239; | |
mov.u64 %rd2585, 3986602516; | |
mov.u64 %rd2584, 1013904242; | |
mov.u64 %rd2582, 3668340011; | |
mov.u64 %rd2581, 3144134277; | |
mov.u64 %rd2579, 3449720151; | |
mov.u64 %rd2578, 1993301258; | |
mov.u64 %rd2577, 3528531795; | |
bra.uni LBB55_26; | |
LBB55_25: | |
setp.lt.u64 %p30, %rd214, %rd12; | |
selp.u64 %rd1332, 1, 0, %p30; | |
add.s64 %rd1333, %rd2464, %rd1332; | |
and.b64 %rd1334, %rd1333, 4294967295; | |
mul.lo.s64 %rd2580, %rd1334, 3449720151; | |
xor.b64 %rd1335, %rd2580, %rd214; | |
shr.u64 %rd1336, %rd1335, 32; | |
mul.lo.s64 %rd2583, %rd1336, 3528531795; | |
shr.u64 %rd1337, %rd2583, 32; | |
and.b64 %rd1338, %rd214, 4294967295; | |
mul.lo.s64 %rd1339, %rd1338, 3528531795; | |
and.b64 %rd1340, %rd1339, 4294967295; | |
xor.b64 %rd1341, %rd1340, %rd1337; | |
xor.b64 %rd1342, %rd1341, 3144134277; | |
mul.lo.s64 %rd2586, %rd1342, 3449720151; | |
xor.b64 %rd2576, %rd1333, %rd1339; | |
mov.u32 %r329, -766435501; | |
mov.u32 %r328, -239350328; | |
mov.u64 %rd2593, 1684936478; | |
mov.u64 %rd2592, 534103459; | |
mov.u64 %rd2591, 387276957; | |
mov.u64 %rd2590, 3041712726; | |
mov.u64 %rd2589, 3986602516; | |
mov.u64 %rd2588, 2835769497; | |
mov.u64 %rd2587, 3668340011; | |
mov.u64 %rd2585, 2027808484; | |
mov.u64 %rd2584, 1993301258; | |
mov.u64 %rd2582, 842468239; | |
mov.u64 %rd2581, 2654435769; | |
mov.u64 %rd2579, 3528531795; | |
mov.u64 %rd2578, 1013904242; | |
mov.u64 %rd2577, 3449720151; | |
LBB55_26: | |
shr.u64 %rd1368, %rd2586, 32; | |
shr.u64 %rd1369, %rd2576, 32; | |
mul.lo.s64 %rd1370, %rd1369, %rd2577; | |
and.b64 %rd1371, %rd1370, 4294967295; | |
xor.b64 %rd1372, %rd1371, %rd1368; | |
xor.b64 %rd1373, %rd1372, %rd2578; | |
mul.lo.s64 %rd1374, %rd1373, %rd2579; | |
shr.u64 %rd1375, %rd1374, 32; | |
shr.u64 %rd1376, %rd1370, 32; | |
and.b64 %rd1377, %rd2580, 4294967295; | |
xor.b64 %rd1378, %rd1377, %rd1376; | |
xor.b64 %rd1379, %rd1378, %rd2581; | |
mul.lo.s64 %rd1380, %rd1379, %rd2579; | |
and.b64 %rd1381, %rd1380, 4294967295; | |
xor.b64 %rd1382, %rd1381, %rd1375; | |
xor.b64 %rd1383, %rd1382, %rd2582; | |
mul.lo.s64 %rd1384, %rd1383, %rd2577; | |
shr.u64 %rd1385, %rd1384, 32; | |
shr.u64 %rd1386, %rd1380, 32; | |
and.b64 %rd1387, %rd2583, 4294967295; | |
xor.b64 %rd1388, %rd1387, %rd1386; | |
xor.b64 %rd1389, %rd1388, %rd2584; | |
mul.lo.s64 %rd1390, %rd1389, %rd2577; | |
and.b64 %rd1391, %rd1390, 4294967295; | |
xor.b64 %rd1392, %rd1391, %rd1385; | |
xor.b64 %rd1393, %rd1392, %rd2585; | |
mul.lo.s64 %rd1394, %rd1393, %rd2579; | |
shr.u64 %rd1395, %rd1394, 32; | |
shr.u64 %rd1396, %rd1390, 32; | |
and.b64 %rd1397, %rd2586, 4294967295; | |
xor.b64 %rd1398, %rd1397, %rd1396; | |
xor.b64 %rd1399, %rd1398, %rd2587; | |
mul.lo.s64 %rd1400, %rd1399, %rd2579; | |
and.b64 %rd1401, %rd1400, 4294967295; | |
xor.b64 %rd1402, %rd1401, %rd1395; | |
xor.b64 %rd1403, %rd1402, %rd2588; | |
mul.lo.s64 %rd1404, %rd1403, %rd2577; | |
shr.u64 %rd1405, %rd1404, 32; | |
shr.u64 %rd1406, %rd1400, 32; | |
and.b64 %rd1407, %rd1374, 4294967295; | |
xor.b64 %rd1408, %rd1407, %rd1406; | |
xor.b64 %rd1409, %rd1408, %rd2589; | |
mul.lo.s64 %rd1410, %rd1409, %rd2577; | |
and.b64 %rd1411, %rd1410, 4294967295; | |
xor.b64 %rd1412, %rd1411, %rd1405; | |
xor.b64 %rd1413, %rd1412, %rd2590; | |
mul.lo.s64 %rd1414, %rd1413, %rd2579; | |
shr.u64 %rd1415, %rd1414, 32; | |
shr.u64 %rd1416, %rd1410, 32; | |
and.b64 %rd1417, %rd1384, 4294967295; | |
xor.b64 %rd1418, %rd1417, %rd1416; | |
xor.b64 %rd1419, %rd1418, %rd2591; | |
mul.lo.s64 %rd1420, %rd1419, %rd2579; | |
and.b64 %rd1421, %rd1420, 4294967295; | |
xor.b64 %rd1422, %rd1421, %rd1415; | |
xor.b64 %rd1423, %rd1422, %rd2592; | |
mul.lo.s64 %rd1424, %rd1423, %rd2577; | |
shr.u64 %rd1425, %rd1424, 32; | |
shr.u64 %rd1426, %rd1420, 32; | |
xor.b64 %rd1427, %rd1394, %rd1426; | |
xor.b64 %rd1428, %rd1427, %rd2593; | |
mul.lo.s64 %rd1429, %rd1428, %rd2577; | |
xor.b64 %rd1430, %rd1425, %rd1429; | |
cvt.u32.u64 %r153, %rd1430; | |
xor.b32 %r154, %r328, %r153; | |
mul.lo.s32 %r155, %r154, %r329; | |
shr.u32 %r156, %r155, 9; | |
cvt.rn.f32.u32 %f130, %r156; | |
mul.rn.f32 %f131, %f130, 0f34000000; | |
cvt.rn.f16.f32 %h64, %f131; | |
mov.b16 %h65, 0x2E66; | |
setp.ge.f16 %p33, %h64, %h65; | |
ld.global.nc.b16 %h66, [%rd45+770]; | |
ld.global.nc.f32 %f132, [%rd46+1540]; | |
cvt.rn.f16.f32 %h67, %f132; | |
add.rn.f16 %h68, %h66, %h67; | |
mov.b16 %h69, 0x3C72; | |
mul.rn.f16 %h70, %h68, %h69; | |
selp.b16 %h71, %h70, 0x0000, %p33; | |
cvt.f32.f16 %f133, %h71; | |
ld.global.nc.b16 %h72, [%rd47+770]; | |
cvt.f32.f16 %f134, %h72; | |
ld.global.nc.f32 %f135, [%rd48+1540]; | |
mul.rn.f32 %f136, %f1, %f135; | |
mul.rn.f32 %f137, %f136, %f134; | |
ld.global.nc.f32 %f138, [%rd49+1540]; | |
mul.rn.f32 %f139, %f2, %f136; | |
sub.rn.f32 %f140, %f138, %f139; | |
add.rn.f32 %f141, %f137, %f140; | |
add.rn.f32 %f142, %f141, %f133; | |
sub.rn.f32 %f143, %f142, %f3; | |
mul.rn.f32 %f144, %f143, %f143; | |
add.rn.f32 %f11, %f10, %f144; | |
or.b32 %r158, %r73, 512; | |
shr.u32 %r159, %r158, 2; | |
cvt.u64.u32 %rd1431, %r159; | |
add.s64 %rd241, %rd12, %rd1431; | |
@%p8 bra LBB55_28; | |
and.b64 %rd1473, %rd241, 4294967295; | |
mul.lo.s64 %rd2598, %rd1473, 3528531795; | |
setp.lt.u64 %p35, %rd241, %rd12; | |
selp.u64 %rd1474, 1, 0, %p35; | |
add.s64 %rd1475, %rd2464, %rd1474; | |
xor.b64 %rd1476, %rd1475, %rd2598; | |
shr.u64 %rd1477, %rd1476, 32; | |
mul.lo.s64 %rd2601, %rd1477, 3449720151; | |
shr.u64 %rd1478, %rd2601, 32; | |
and.b64 %rd1479, %rd1475, 4294967295; | |
mul.lo.s64 %rd1480, %rd1479, 3449720151; | |
and.b64 %rd1481, %rd1480, 4294967295; | |
xor.b64 %rd1482, %rd1481, %rd1478; | |
xor.b64 %rd1483, %rd1482, 2654435769; | |
mul.lo.s64 %rd2604, %rd1483, 3528531795; | |
xor.b64 %rd2594, %rd1480, %rd241; | |
mov.u32 %r332, -1879881855; | |
mov.u32 %r331, -845247145; | |
mov.u32 %r330, 534103459; | |
mov.u64 %rd2612, 3678237736; | |
mov.u64 %rd2611, 3041712726; | |
mov.u64 %rd2610, 1401181199; | |
mov.u64 %rd2609, 2835769497; | |
mov.u64 %rd2608, 1684936478; | |
mov.u64 %rd2607, 2027808484; | |
mov.u64 %rd2606, 387276957; | |
mov.u64 %rd2605, 842468239; | |
mov.u64 %rd2603, 3986602516; | |
mov.u64 %rd2602, 1013904242; | |
mov.u64 %rd2600, 3668340011; | |
mov.u64 %rd2599, 3144134277; | |
mov.u64 %rd2597, 3449720151; | |
mov.u64 %rd2596, 1993301258; | |
mov.u64 %rd2595, 3528531795; | |
bra.uni LBB55_29; | |
LBB55_28: | |
setp.lt.u64 %p34, %rd241, %rd12; | |
selp.u64 %rd1447, 1, 0, %p34; | |
add.s64 %rd1448, %rd2464, %rd1447; | |
and.b64 %rd1449, %rd1448, 4294967295; | |
mul.lo.s64 %rd2598, %rd1449, 3449720151; | |
xor.b64 %rd1450, %rd2598, %rd241; | |
shr.u64 %rd1451, %rd1450, 32; | |
mul.lo.s64 %rd2601, %rd1451, 3528531795; | |
shr.u64 %rd1452, %rd2601, 32; | |
and.b64 %rd1453, %rd241, 4294967295; | |
mul.lo.s64 %rd1454, %rd1453, 3528531795; | |
and.b64 %rd1455, %rd1454, 4294967295; | |
xor.b64 %rd1456, %rd1455, %rd1452; | |
xor.b64 %rd1457, %rd1456, 3144134277; | |
mul.lo.s64 %rd2604, %rd1457, 3449720151; | |
xor.b64 %rd2594, %rd1448, %rd1454; | |
mov.u32 %r332, -1767562579; | |
mov.u32 %r331, -766435501; | |
mov.u32 %r330, 1401181199; | |
mov.u64 %rd2612, 4055616968; | |
mov.u64 %rd2611, 1684936478; | |
mov.u64 %rd2610, 534103459; | |
mov.u64 %rd2609, 387276957; | |
mov.u64 %rd2608, 3041712726; | |
mov.u64 %rd2607, 3986602516; | |
mov.u64 %rd2606, 2835769497; | |
mov.u64 %rd2605, 3668340011; | |
mov.u64 %rd2603, 2027808484; | |
mov.u64 %rd2602, 1993301258; | |
mov.u64 %rd2600, 842468239; | |
mov.u64 %rd2599, 2654435769; | |
mov.u64 %rd2597, 3528531795; | |
mov.u64 %rd2596, 1013904242; | |
mov.u64 %rd2595, 3449720151; | |
LBB55_29: | |
shr.u64 %rd1484, %rd2604, 32; | |
shr.u64 %rd1485, %rd2594, 32; | |
mul.lo.s64 %rd1486, %rd1485, %rd2595; | |
and.b64 %rd1487, %rd1486, 4294967295; | |
xor.b64 %rd1488, %rd1487, %rd1484; | |
xor.b64 %rd1489, %rd1488, %rd2596; | |
mul.lo.s64 %rd1490, %rd1489, %rd2597; | |
shr.u64 %rd1491, %rd1490, 32; | |
shr.u64 %rd1492, %rd1486, 32; | |
and.b64 %rd1493, %rd2598, 4294967295; | |
xor.b64 %rd1494, %rd1493, %rd1492; | |
xor.b64 %rd1495, %rd1494, %rd2599; | |
mul.lo.s64 %rd1496, %rd1495, %rd2597; | |
and.b64 %rd1497, %rd1496, 4294967295; | |
xor.b64 %rd1498, %rd1497, %rd1491; | |
xor.b64 %rd1499, %rd1498, %rd2600; | |
mul.lo.s64 %rd1500, %rd1499, %rd2595; | |
shr.u64 %rd1501, %rd1500, 32; | |
shr.u64 %rd1502, %rd1496, 32; | |
and.b64 %rd1503, %rd2601, 4294967295; | |
xor.b64 %rd1504, %rd1503, %rd1502; | |
xor.b64 %rd1505, %rd1504, %rd2602; | |
mul.lo.s64 %rd1506, %rd1505, %rd2595; | |
and.b64 %rd1507, %rd1506, 4294967295; | |
xor.b64 %rd1508, %rd1507, %rd1501; | |
xor.b64 %rd1509, %rd1508, %rd2603; | |
mul.lo.s64 %rd1510, %rd1509, %rd2597; | |
shr.u64 %rd1511, %rd1510, 32; | |
shr.u64 %rd1512, %rd1506, 32; | |
and.b64 %rd1513, %rd2604, 4294967295; | |
xor.b64 %rd1514, %rd1513, %rd1512; | |
xor.b64 %rd1515, %rd1514, %rd2605; | |
mul.lo.s64 %rd1516, %rd1515, %rd2597; | |
and.b64 %rd1517, %rd1516, 4294967295; | |
xor.b64 %rd1518, %rd1517, %rd1511; | |
xor.b64 %rd1519, %rd1518, %rd2606; | |
mul.lo.s64 %rd1520, %rd1519, %rd2595; | |
shr.u64 %rd1521, %rd1520, 32; | |
shr.u64 %rd1522, %rd1516, 32; | |
and.b64 %rd1523, %rd1490, 4294967295; | |
xor.b64 %rd1524, %rd1523, %rd1522; | |
xor.b64 %rd1525, %rd1524, %rd2607; | |
mul.lo.s64 %rd1526, %rd1525, %rd2595; | |
and.b64 %rd1527, %rd1526, 4294967295; | |
xor.b64 %rd1528, %rd1527, %rd1521; | |
xor.b64 %rd1529, %rd1528, %rd2608; | |
mul.lo.s64 %rd1530, %rd1529, %rd2597; | |
shr.u64 %rd1531, %rd1530, 32; | |
shr.u64 %rd1532, %rd1526, 32; | |
and.b64 %rd1533, %rd1500, 4294967295; | |
xor.b64 %rd1534, %rd1533, %rd1532; | |
xor.b64 %rd1535, %rd1534, %rd2609; | |
mul.lo.s64 %rd1536, %rd1535, %rd2597; | |
and.b64 %rd1537, %rd1536, 4294967295; | |
xor.b64 %rd1538, %rd1537, %rd1531; | |
xor.b64 %rd1539, %rd1538, %rd2610; | |
mul.lo.s64 %rd1540, %rd1539, %rd2595; | |
shr.u64 %rd1541, %rd1540, 32; | |
shr.u64 %rd1542, %rd1536, 32; | |
and.b64 %rd1543, %rd1510, 4294967295; | |
xor.b64 %rd1544, %rd1543, %rd1542; | |
xor.b64 %rd1545, %rd1544, %rd2611; | |
mul.lo.s64 %rd1546, %rd1545, %rd2595; | |
and.b64 %rd1547, %rd1546, 4294967295; | |
xor.b64 %rd1548, %rd1547, %rd1541; | |
xor.b64 %rd1549, %rd1548, %rd2612; | |
mul.lo.s64 %rd1550, %rd1549, %rd2597; | |
shr.u64 %rd1551, %rd1550, 32; | |
cvt.u32.u64 %r166, %rd1551; | |
shr.u64 %rd1552, %rd1546, 32; | |
xor.b64 %rd1553, %rd1552, %rd1520; | |
cvt.u32.u64 %r167, %rd1553; | |
xor.b32 %r168, %r330, %r167; | |
mul.lo.s32 %r169, %r168, %r331; | |
xor.b32 %r170, %r169, %r166; | |
xor.b32 %r171, %r170, %r332; | |
shr.u32 %r172, %r171, 9; | |
cvt.rn.f32.u32 %f145, %r172; | |
mul.rn.f32 %f146, %f145, 0f34000000; | |
cvt.rn.f16.f32 %h73, %f146; | |
mov.b16 %h74, 0x2E66; | |
setp.ge.f16 %p36, %h73, %h74; | |
ld.global.nc.b16 %h75, [%rd45+1024]; | |
ld.global.nc.f32 %f147, [%rd46+2048]; | |
cvt.rn.f16.f32 %h76, %f147; | |
add.rn.f16 %h77, %h75, %h76; | |
mov.b16 %h78, 0x3C72; | |
mul.rn.f16 %h79, %h77, %h78; | |
selp.b16 %h80, %h79, 0x0000, %p36; | |
cvt.f32.f16 %f148, %h80; | |
ld.global.nc.b16 %h81, [%rd47+1024]; | |
cvt.f32.f16 %f149, %h81; | |
ld.global.nc.f32 %f150, [%rd48+2048]; | |
mul.rn.f32 %f151, %f1, %f150; | |
mul.rn.f32 %f152, %f151, %f149; | |
ld.global.nc.f32 %f153, [%rd49+2048]; | |
mul.rn.f32 %f154, %f2, %f151; | |
sub.rn.f32 %f155, %f153, %f154; | |
add.rn.f32 %f156, %f152, %f155; | |
add.rn.f32 %f157, %f156, %f148; | |
sub.rn.f32 %f158, %f157, %f3; | |
mul.rn.f32 %f159, %f158, %f158; | |
add.rn.f32 %f12, %f11, %f159; | |
or.b32 %r173, %r3, 513; | |
or.b32 %r174, %r173, %r4; | |
and.b32 %r175, %r173, 3; | |
shr.u32 %r176, %r174, 2; | |
setp.ne.s32 %p37, %r175, 1; | |
cvt.u64.u32 %rd1554, %r176; | |
add.s64 %rd269, %rd12, %rd1554; | |
@%p37 bra LBB55_31; | |
and.b64 %rd1594, %rd269, 4294967295; | |
mul.lo.s64 %rd2617, %rd1594, 3528531795; | |
setp.lt.u64 %p39, %rd269, %rd12; | |
selp.u64 %rd1595, 1, 0, %p39; | |
add.s64 %rd1596, %rd2464, %rd1595; | |
xor.b64 %rd1597, %rd1596, %rd2617; | |
shr.u64 %rd1598, %rd1597, 32; | |
mul.lo.s64 %rd2620, %rd1598, 3449720151; | |
shr.u64 %rd1599, %rd2620, 32; | |
and.b64 %rd1600, %rd1596, 4294967295; | |
mul.lo.s64 %rd1601, %rd1600, 3449720151; | |
and.b64 %rd1602, %rd1601, 4294967295; | |
xor.b64 %rd1603, %rd1602, %rd1599; | |
xor.b64 %rd1604, %rd1603, 2654435769; | |
mul.lo.s64 %rd2623, %rd1604, 3528531795; | |
xor.b64 %rd2613, %rd1601, %rd269; | |
mov.u32 %r334, -845247145; | |
mov.u32 %r333, -616729560; | |
mov.u64 %rd2630, 3041712726; | |
mov.u64 %rd2629, 1401181199; | |
mov.u64 %rd2628, 2835769497; | |
mov.u64 %rd2627, 1684936478; | |
mov.u64 %rd2626, 2027808484; | |
mov.u64 %rd2625, 387276957; | |
mov.u64 %rd2624, 842468239; | |
mov.u64 %rd2622, 3986602516; | |
mov.u64 %rd2621, 1013904242; | |
mov.u64 %rd2619, 3668340011; | |
mov.u64 %rd2618, 3144134277; | |
mov.u64 %rd2616, 3449720151; | |
mov.u64 %rd2615, 1993301258; | |
mov.u64 %rd2614, 3528531795; | |
bra.uni LBB55_32; | |
LBB55_31: | |
setp.lt.u64 %p38, %rd269, %rd12; | |
selp.u64 %rd1569, 1, 0, %p38; | |
add.s64 %rd1570, %rd2464, %rd1569; | |
and.b64 %rd1571, %rd1570, 4294967295; | |
mul.lo.s64 %rd2617, %rd1571, 3449720151; | |
xor.b64 %rd1572, %rd2617, %rd269; | |
shr.u64 %rd1573, %rd1572, 32; | |
mul.lo.s64 %rd2620, %rd1573, 3528531795; | |
shr.u64 %rd1574, %rd2620, 32; | |
and.b64 %rd1575, %rd269, 4294967295; | |
mul.lo.s64 %rd1576, %rd1575, 3528531795; | |
and.b64 %rd1577, %rd1576, 4294967295; | |
xor.b64 %rd1578, %rd1577, %rd1574; | |
xor.b64 %rd1579, %rd1578, 3144134277; | |
mul.lo.s64 %rd2623, %rd1579, 3449720151; | |
xor.b64 %rd2613, %rd1570, %rd1576; | |
mov.u32 %r334, -766435501; | |
mov.u32 %r333, -239350328; | |
mov.u64 %rd2630, 1684936478; | |
mov.u64 %rd2629, 534103459; | |
mov.u64 %rd2628, 387276957; | |
mov.u64 %rd2627, 3041712726; | |
mov.u64 %rd2626, 3986602516; | |
mov.u64 %rd2625, 2835769497; | |
mov.u64 %rd2624, 3668340011; | |
mov.u64 %rd2622, 2027808484; | |
mov.u64 %rd2621, 1993301258; | |
mov.u64 %rd2619, 842468239; | |
mov.u64 %rd2618, 2654435769; | |
mov.u64 %rd2616, 3528531795; | |
mov.u64 %rd2615, 1013904242; | |
mov.u64 %rd2614, 3449720151; | |
LBB55_32: | |
shr.u64 %rd1605, %rd2623, 32; | |
shr.u64 %rd1606, %rd2613, 32; | |
mul.lo.s64 %rd1607, %rd1606, %rd2614; | |
and.b64 %rd1608, %rd1607, 4294967295; | |
xor.b64 %rd1609, %rd1608, %rd1605; | |
xor.b64 %rd1610, %rd1609, %rd2615; | |
mul.lo.s64 %rd1611, %rd1610, %rd2616; | |
shr.u64 %rd1612, %rd1611, 32; | |
shr.u64 %rd1613, %rd1607, 32; | |
and.b64 %rd1614, %rd2617, 4294967295; | |
xor.b64 %rd1615, %rd1614, %rd1613; | |
xor.b64 %rd1616, %rd1615, %rd2618; | |
mul.lo.s64 %rd1617, %rd1616, %rd2616; | |
and.b64 %rd1618, %rd1617, 4294967295; | |
xor.b64 %rd1619, %rd1618, %rd1612; | |
xor.b64 %rd1620, %rd1619, %rd2619; | |
mul.lo.s64 %rd1621, %rd1620, %rd2614; | |
shr.u64 %rd1622, %rd1621, 32; | |
shr.u64 %rd1623, %rd1617, 32; | |
and.b64 %rd1624, %rd2620, 4294967295; | |
xor.b64 %rd1625, %rd1624, %rd1623; | |
xor.b64 %rd1626, %rd1625, %rd2621; | |
mul.lo.s64 %rd1627, %rd1626, %rd2614; | |
and.b64 %rd1628, %rd1627, 4294967295; | |
xor.b64 %rd1629, %rd1628, %rd1622; | |
xor.b64 %rd1630, %rd1629, %rd2622; | |
mul.lo.s64 %rd1631, %rd1630, %rd2616; | |
shr.u64 %rd1632, %rd1631, 32; | |
shr.u64 %rd1633, %rd1627, 32; | |
and.b64 %rd1634, %rd2623, 4294967295; | |
xor.b64 %rd1635, %rd1634, %rd1633; | |
xor.b64 %rd1636, %rd1635, %rd2624; | |
mul.lo.s64 %rd1637, %rd1636, %rd2616; | |
and.b64 %rd1638, %rd1637, 4294967295; | |
xor.b64 %rd1639, %rd1638, %rd1632; | |
xor.b64 %rd1640, %rd1639, %rd2625; | |
mul.lo.s64 %rd1641, %rd1640, %rd2614; | |
shr.u64 %rd1642, %rd1641, 32; | |
shr.u64 %rd1643, %rd1637, 32; | |
and.b64 %rd1644, %rd1611, 4294967295; | |
xor.b64 %rd1645, %rd1644, %rd1643; | |
xor.b64 %rd1646, %rd1645, %rd2626; | |
mul.lo.s64 %rd1647, %rd1646, %rd2614; | |
and.b64 %rd1648, %rd1647, 4294967295; | |
xor.b64 %rd1649, %rd1648, %rd1642; | |
xor.b64 %rd1650, %rd1649, %rd2627; | |
mul.lo.s64 %rd1651, %rd1650, %rd2616; | |
shr.u64 %rd1652, %rd1651, 32; | |
shr.u64 %rd1653, %rd1647, 32; | |
and.b64 %rd1654, %rd1621, 4294967295; | |
xor.b64 %rd1655, %rd1654, %rd1653; | |
xor.b64 %rd1656, %rd1655, %rd2628; | |
mul.lo.s64 %rd1657, %rd1656, %rd2616; | |
and.b64 %rd1658, %rd1657, 4294967295; | |
xor.b64 %rd1659, %rd1658, %rd1652; | |
xor.b64 %rd1660, %rd1659, %rd2629; | |
mul.lo.s64 %rd1661, %rd1660, %rd2614; | |
shr.u64 %rd1662, %rd1661, 32; | |
shr.u64 %rd1663, %rd1657, 32; | |
xor.b64 %rd1664, %rd1631, %rd1663; | |
xor.b64 %rd1665, %rd1664, %rd2630; | |
mul.lo.s64 %rd1666, %rd1665, %rd2614; | |
xor.b64 %rd1667, %rd1662, %rd1666; | |
cvt.u32.u64 %r181, %rd1667; | |
xor.b32 %r182, %r333, %r181; | |
mul.lo.s32 %r183, %r182, %r334; | |
shr.u32 %r184, %r183, 9; | |
cvt.rn.f32.u32 %f160, %r184; | |
mul.rn.f32 %f161, %f160, 0f34000000; | |
cvt.rn.f16.f32 %h82, %f161; | |
mov.b16 %h83, 0x2E66; | |
setp.ge.f16 %p41, %h82, %h83; | |
ld.global.nc.b16 %h84, [%rd45+1026]; | |
ld.global.nc.f32 %f162, [%rd46+2052]; | |
cvt.rn.f16.f32 %h85, %f162; | |
add.rn.f16 %h86, %h84, %h85; | |
mov.b16 %h87, 0x3C72; | |
mul.rn.f16 %h88, %h86, %h87; | |
selp.b16 %h89, %h88, 0x0000, %p41; | |
cvt.f32.f16 %f163, %h89; | |
ld.global.nc.b16 %h90, [%rd47+1026]; | |
cvt.f32.f16 %f164, %h90; | |
ld.global.nc.f32 %f165, [%rd48+2052]; | |
mul.rn.f32 %f166, %f1, %f165; | |
mul.rn.f32 %f167, %f166, %f164; | |
ld.global.nc.f32 %f168, [%rd49+2052]; | |
mul.rn.f32 %f169, %f2, %f166; | |
sub.rn.f32 %f170, %f168, %f169; | |
add.rn.f32 %f171, %f167, %f170; | |
add.rn.f32 %f172, %f171, %f163; | |
sub.rn.f32 %f173, %f172, %f3; | |
mul.rn.f32 %f174, %f173, %f173; | |
add.rn.f32 %f13, %f12, %f174; | |
or.b32 %r186, %r73, 640; | |
shr.u32 %r187, %r186, 2; | |
cvt.u64.u32 %rd1668, %r187; | |
add.s64 %rd296, %rd12, %rd1668; | |
@%p8 bra LBB55_34; | |
and.b64 %rd1710, %rd296, 4294967295; | |
mul.lo.s64 %rd2635, %rd1710, 3528531795; | |
setp.lt.u64 %p43, %rd296, %rd12; | |
selp.u64 %rd1711, 1, 0, %p43; | |
add.s64 %rd1712, %rd2464, %rd1711; | |
xor.b64 %rd1713, %rd1712, %rd2635; | |
shr.u64 %rd1714, %rd1713, 32; | |
mul.lo.s64 %rd2638, %rd1714, 3449720151; | |
shr.u64 %rd1715, %rd2638, 32; | |
and.b64 %rd1716, %rd1712, 4294967295; | |
mul.lo.s64 %rd1717, %rd1716, 3449720151; | |
and.b64 %rd1718, %rd1717, 4294967295; | |
xor.b64 %rd1719, %rd1718, %rd1715; | |
xor.b64 %rd1720, %rd1719, 2654435769; | |
mul.lo.s64 %rd2641, %rd1720, 3528531795; | |
xor.b64 %rd2631, %rd1717, %rd296; | |
mov.u32 %r337, -1879881855; | |
mov.u32 %r336, -845247145; | |
mov.u32 %r335, 534103459; | |
mov.u64 %rd2649, 3678237736; | |
mov.u64 %rd2648, 3041712726; | |
mov.u64 %rd2647, 1401181199; | |
mov.u64 %rd2646, 2835769497; | |
mov.u64 %rd2645, 1684936478; | |
mov.u64 %rd2644, 2027808484; | |
mov.u64 %rd2643, 387276957; | |
mov.u64 %rd2642, 842468239; | |
mov.u64 %rd2640, 3986602516; | |
mov.u64 %rd2639, 1013904242; | |
mov.u64 %rd2637, 3668340011; | |
mov.u64 %rd2636, 3144134277; | |
mov.u64 %rd2634, 3449720151; | |
mov.u64 %rd2633, 1993301258; | |
mov.u64 %rd2632, 3528531795; | |
bra.uni LBB55_35; | |
LBB55_34: | |
setp.lt.u64 %p42, %rd296, %rd12; | |
selp.u64 %rd1684, 1, 0, %p42; | |
add.s64 %rd1685, %rd2464, %rd1684; | |
and.b64 %rd1686, %rd1685, 4294967295; | |
mul.lo.s64 %rd2635, %rd1686, 3449720151; | |
xor.b64 %rd1687, %rd2635, %rd296; | |
shr.u64 %rd1688, %rd1687, 32; | |
mul.lo.s64 %rd2638, %rd1688, 3528531795; | |
shr.u64 %rd1689, %rd2638, 32; | |
and.b64 %rd1690, %rd296, 4294967295; | |
mul.lo.s64 %rd1691, %rd1690, 3528531795; | |
and.b64 %rd1692, %rd1691, 4294967295; | |
xor.b64 %rd1693, %rd1692, %rd1689; | |
xor.b64 %rd1694, %rd1693, 3144134277; | |
mul.lo.s64 %rd2641, %rd1694, 3449720151; | |
xor.b64 %rd2631, %rd1685, %rd1691; | |
mov.u32 %r337, -1767562579; | |
mov.u32 %r336, -766435501; | |
mov.u32 %r335, 1401181199; | |
mov.u64 %rd2649, 4055616968; | |
mov.u64 %rd2648, 1684936478; | |
mov.u64 %rd2647, 534103459; | |
mov.u64 %rd2646, 387276957; | |
mov.u64 %rd2645, 3041712726; | |
mov.u64 %rd2644, 3986602516; | |
mov.u64 %rd2643, 2835769497; | |
mov.u64 %rd2642, 3668340011; | |
mov.u64 %rd2640, 2027808484; | |
mov.u64 %rd2639, 1993301258; | |
mov.u64 %rd2637, 842468239; | |
mov.u64 %rd2636, 2654435769; | |
mov.u64 %rd2634, 3528531795; | |
mov.u64 %rd2633, 1013904242; | |
mov.u64 %rd2632, 3449720151; | |
LBB55_35: | |
shr.u64 %rd1721, %rd2641, 32; | |
shr.u64 %rd1722, %rd2631, 32; | |
mul.lo.s64 %rd1723, %rd1722, %rd2632; | |
and.b64 %rd1724, %rd1723, 4294967295; | |
xor.b64 %rd1725, %rd1724, %rd1721; | |
xor.b64 %rd1726, %rd1725, %rd2633; | |
mul.lo.s64 %rd1727, %rd1726, %rd2634; | |
shr.u64 %rd1728, %rd1727, 32; | |
shr.u64 %rd1729, %rd1723, 32; | |
and.b64 %rd1730, %rd2635, 4294967295; | |
xor.b64 %rd1731, %rd1730, %rd1729; | |
xor.b64 %rd1732, %rd1731, %rd2636; | |
mul.lo.s64 %rd1733, %rd1732, %rd2634; | |
and.b64 %rd1734, %rd1733, 4294967295; | |
xor.b64 %rd1735, %rd1734, %rd1728; | |
xor.b64 %rd1736, %rd1735, %rd2637; | |
mul.lo.s64 %rd1737, %rd1736, %rd2632; | |
shr.u64 %rd1738, %rd1737, 32; | |
shr.u64 %rd1739, %rd1733, 32; | |
and.b64 %rd1740, %rd2638, 4294967295; | |
xor.b64 %rd1741, %rd1740, %rd1739; | |
xor.b64 %rd1742, %rd1741, %rd2639; | |
mul.lo.s64 %rd1743, %rd1742, %rd2632; | |
and.b64 %rd1744, %rd1743, 4294967295; | |
xor.b64 %rd1745, %rd1744, %rd1738; | |
xor.b64 %rd1746, %rd1745, %rd2640; | |
mul.lo.s64 %rd1747, %rd1746, %rd2634; | |
shr.u64 %rd1748, %rd1747, 32; | |
shr.u64 %rd1749, %rd1743, 32; | |
and.b64 %rd1750, %rd2641, 4294967295; | |
xor.b64 %rd1751, %rd1750, %rd1749; | |
xor.b64 %rd1752, %rd1751, %rd2642; | |
mul.lo.s64 %rd1753, %rd1752, %rd2634; | |
and.b64 %rd1754, %rd1753, 4294967295; | |
xor.b64 %rd1755, %rd1754, %rd1748; | |
xor.b64 %rd1756, %rd1755, %rd2643; | |
mul.lo.s64 %rd1757, %rd1756, %rd2632; | |
shr.u64 %rd1758, %rd1757, 32; | |
shr.u64 %rd1759, %rd1753, 32; | |
and.b64 %rd1760, %rd1727, 4294967295; | |
xor.b64 %rd1761, %rd1760, %rd1759; | |
xor.b64 %rd1762, %rd1761, %rd2644; | |
mul.lo.s64 %rd1763, %rd1762, %rd2632; | |
and.b64 %rd1764, %rd1763, 4294967295; | |
xor.b64 %rd1765, %rd1764, %rd1758; | |
xor.b64 %rd1766, %rd1765, %rd2645; | |
mul.lo.s64 %rd1767, %rd1766, %rd2634; | |
shr.u64 %rd1768, %rd1767, 32; | |
shr.u64 %rd1769, %rd1763, 32; | |
and.b64 %rd1770, %rd1737, 4294967295; | |
xor.b64 %rd1771, %rd1770, %rd1769; | |
xor.b64 %rd1772, %rd1771, %rd2646; | |
mul.lo.s64 %rd1773, %rd1772, %rd2634; | |
and.b64 %rd1774, %rd1773, 4294967295; | |
xor.b64 %rd1775, %rd1774, %rd1768; | |
xor.b64 %rd1776, %rd1775, %rd2647; | |
mul.lo.s64 %rd1777, %rd1776, %rd2632; | |
shr.u64 %rd1778, %rd1777, 32; | |
shr.u64 %rd1779, %rd1773, 32; | |
and.b64 %rd1780, %rd1747, 4294967295; | |
xor.b64 %rd1781, %rd1780, %rd1779; | |
xor.b64 %rd1782, %rd1781, %rd2648; | |
mul.lo.s64 %rd1783, %rd1782, %rd2632; | |
and.b64 %rd1784, %rd1783, 4294967295; | |
xor.b64 %rd1785, %rd1784, %rd1778; | |
xor.b64 %rd1786, %rd1785, %rd2649; | |
mul.lo.s64 %rd1787, %rd1786, %rd2634; | |
shr.u64 %rd1788, %rd1787, 32; | |
cvt.u32.u64 %r194, %rd1788; | |
shr.u64 %rd1789, %rd1783, 32; | |
xor.b64 %rd1790, %rd1789, %rd1757; | |
cvt.u32.u64 %r195, %rd1790; | |
xor.b32 %r196, %r335, %r195; | |
mul.lo.s32 %r197, %r196, %r336; | |
xor.b32 %r198, %r197, %r194; | |
xor.b32 %r199, %r198, %r337; | |
shr.u32 %r200, %r199, 9; | |
cvt.rn.f32.u32 %f175, %r200; | |
mul.rn.f32 %f176, %f175, 0f34000000; | |
cvt.rn.f16.f32 %h91, %f176; | |
mov.b16 %h92, 0x2E66; | |
setp.ge.f16 %p44, %h91, %h92; | |
ld.global.nc.b16 %h93, [%rd45+1280]; | |
ld.global.nc.f32 %f177, [%rd46+2560]; | |
cvt.rn.f16.f32 %h94, %f177; | |
add.rn.f16 %h95, %h93, %h94; | |
mov.b16 %h96, 0x3C72; | |
mul.rn.f16 %h97, %h95, %h96; | |
selp.b16 %h98, %h97, 0x0000, %p44; | |
cvt.f32.f16 %f178, %h98; | |
ld.global.nc.b16 %h99, [%rd47+1280]; | |
cvt.f32.f16 %f179, %h99; | |
ld.global.nc.f32 %f180, [%rd48+2560]; | |
mul.rn.f32 %f181, %f1, %f180; | |
mul.rn.f32 %f182, %f181, %f179; | |
ld.global.nc.f32 %f183, [%rd49+2560]; | |
mul.rn.f32 %f184, %f2, %f181; | |
sub.rn.f32 %f185, %f183, %f184; | |
add.rn.f32 %f186, %f182, %f185; | |
add.rn.f32 %f187, %f186, %f178; | |
sub.rn.f32 %f188, %f187, %f3; | |
mul.rn.f32 %f189, %f188, %f188; | |
add.rn.f32 %f14, %f13, %f189; | |
or.b32 %r201, %r3, 641; | |
or.b32 %r202, %r201, %r4; | |
and.b32 %r203, %r201, 3; | |
shr.u32 %r204, %r202, 2; | |
setp.ne.s32 %p45, %r203, 1; | |
cvt.u64.u32 %rd1791, %r204; | |
add.s64 %rd324, %rd12, %rd1791; | |
@%p45 bra LBB55_37; | |
and.b64 %rd1831, %rd324, 4294967295; | |
mul.lo.s64 %rd2654, %rd1831, 3528531795; | |
setp.lt.u64 %p47, %rd324, %rd12; | |
selp.u64 %rd1832, 1, 0, %p47; | |
add.s64 %rd1833, %rd2464, %rd1832; | |
xor.b64 %rd1834, %rd1833, %rd2654; | |
shr.u64 %rd1835, %rd1834, 32; | |
mul.lo.s64 %rd2657, %rd1835, 3449720151; | |
shr.u64 %rd1836, %rd2657, 32; | |
and.b64 %rd1837, %rd1833, 4294967295; | |
mul.lo.s64 %rd1838, %rd1837, 3449720151; | |
and.b64 %rd1839, %rd1838, 4294967295; | |
xor.b64 %rd1840, %rd1839, %rd1836; | |
xor.b64 %rd1841, %rd1840, 2654435769; | |
mul.lo.s64 %rd2660, %rd1841, 3528531795; | |
xor.b64 %rd2650, %rd1838, %rd324; | |
mov.u32 %r339, -845247145; | |
mov.u32 %r338, -616729560; | |
mov.u64 %rd2667, 3041712726; | |
mov.u64 %rd2666, 1401181199; | |
mov.u64 %rd2665, 2835769497; | |
mov.u64 %rd2664, 1684936478; | |
mov.u64 %rd2663, 2027808484; | |
mov.u64 %rd2662, 387276957; | |
mov.u64 %rd2661, 842468239; | |
mov.u64 %rd2659, 3986602516; | |
mov.u64 %rd2658, 1013904242; | |
mov.u64 %rd2656, 3668340011; | |
mov.u64 %rd2655, 3144134277; | |
mov.u64 %rd2653, 3449720151; | |
mov.u64 %rd2652, 1993301258; | |
mov.u64 %rd2651, 3528531795; | |
bra.uni LBB55_38; | |
LBB55_37: | |
setp.lt.u64 %p46, %rd324, %rd12; | |
selp.u64 %rd1806, 1, 0, %p46; | |
add.s64 %rd1807, %rd2464, %rd1806; | |
and.b64 %rd1808, %rd1807, 4294967295; | |
mul.lo.s64 %rd2654, %rd1808, 3449720151; | |
xor.b64 %rd1809, %rd2654, %rd324; | |
shr.u64 %rd1810, %rd1809, 32; | |
mul.lo.s64 %rd2657, %rd1810, 3528531795; | |
shr.u64 %rd1811, %rd2657, 32; | |
and.b64 %rd1812, %rd324, 4294967295; | |
mul.lo.s64 %rd1813, %rd1812, 3528531795; | |
and.b64 %rd1814, %rd1813, 4294967295; | |
xor.b64 %rd1815, %rd1814, %rd1811; | |
xor.b64 %rd1816, %rd1815, 3144134277; | |
mul.lo.s64 %rd2660, %rd1816, 3449720151; | |
xor.b64 %rd2650, %rd1807, %rd1813; | |
mov.u32 %r339, -766435501; | |
mov.u32 %r338, -239350328; | |
mov.u64 %rd2667, 1684936478; | |
mov.u64 %rd2666, 534103459; | |
mov.u64 %rd2665, 387276957; | |
mov.u64 %rd2664, 3041712726; | |
mov.u64 %rd2663, 3986602516; | |
mov.u64 %rd2662, 2835769497; | |
mov.u64 %rd2661, 3668340011; | |
mov.u64 %rd2659, 2027808484; | |
mov.u64 %rd2658, 1993301258; | |
mov.u64 %rd2656, 842468239; | |
mov.u64 %rd2655, 2654435769; | |
mov.u64 %rd2653, 3528531795; | |
mov.u64 %rd2652, 1013904242; | |
mov.u64 %rd2651, 3449720151; | |
LBB55_38: | |
shr.u64 %rd1842, %rd2660, 32; | |
shr.u64 %rd1843, %rd2650, 32; | |
mul.lo.s64 %rd1844, %rd1843, %rd2651; | |
and.b64 %rd1845, %rd1844, 4294967295; | |
xor.b64 %rd1846, %rd1845, %rd1842; | |
xor.b64 %rd1847, %rd1846, %rd2652; | |
mul.lo.s64 %rd1848, %rd1847, %rd2653; | |
shr.u64 %rd1849, %rd1848, 32; | |
shr.u64 %rd1850, %rd1844, 32; | |
and.b64 %rd1851, %rd2654, 4294967295; | |
xor.b64 %rd1852, %rd1851, %rd1850; | |
xor.b64 %rd1853, %rd1852, %rd2655; | |
mul.lo.s64 %rd1854, %rd1853, %rd2653; | |
and.b64 %rd1855, %rd1854, 4294967295; | |
xor.b64 %rd1856, %rd1855, %rd1849; | |
xor.b64 %rd1857, %rd1856, %rd2656; | |
mul.lo.s64 %rd1858, %rd1857, %rd2651; | |
shr.u64 %rd1859, %rd1858, 32; | |
shr.u64 %rd1860, %rd1854, 32; | |
and.b64 %rd1861, %rd2657, 4294967295; | |
xor.b64 %rd1862, %rd1861, %rd1860; | |
xor.b64 %rd1863, %rd1862, %rd2658; | |
mul.lo.s64 %rd1864, %rd1863, %rd2651; | |
and.b64 %rd1865, %rd1864, 4294967295; | |
xor.b64 %rd1866, %rd1865, %rd1859; | |
xor.b64 %rd1867, %rd1866, %rd2659; | |
mul.lo.s64 %rd1868, %rd1867, %rd2653; | |
shr.u64 %rd1869, %rd1868, 32; | |
shr.u64 %rd1870, %rd1864, 32; | |
and.b64 %rd1871, %rd2660, 4294967295; | |
xor.b64 %rd1872, %rd1871, %rd1870; | |
xor.b64 %rd1873, %rd1872, %rd2661; | |
mul.lo.s64 %rd1874, %rd1873, %rd2653; | |
and.b64 %rd1875, %rd1874, 4294967295; | |
xor.b64 %rd1876, %rd1875, %rd1869; | |
xor.b64 %rd1877, %rd1876, %rd2662; | |
mul.lo.s64 %rd1878, %rd1877, %rd2651; | |
shr.u64 %rd1879, %rd1878, 32; | |
shr.u64 %rd1880, %rd1874, 32; | |
and.b64 %rd1881, %rd1848, 4294967295; | |
xor.b64 %rd1882, %rd1881, %rd1880; | |
xor.b64 %rd1883, %rd1882, %rd2663; | |
mul.lo.s64 %rd1884, %rd1883, %rd2651; | |
and.b64 %rd1885, %rd1884, 4294967295; | |
xor.b64 %rd1886, %rd1885, %rd1879; | |
xor.b64 %rd1887, %rd1886, %rd2664; | |
mul.lo.s64 %rd1888, %rd1887, %rd2653; | |
shr.u64 %rd1889, %rd1888, 32; | |
shr.u64 %rd1890, %rd1884, 32; | |
and.b64 %rd1891, %rd1858, 4294967295; | |
xor.b64 %rd1892, %rd1891, %rd1890; | |
xor.b64 %rd1893, %rd1892, %rd2665; | |
mul.lo.s64 %rd1894, %rd1893, %rd2653; | |
and.b64 %rd1895, %rd1894, 4294967295; | |
xor.b64 %rd1896, %rd1895, %rd1889; | |
xor.b64 %rd1897, %rd1896, %rd2666; | |
mul.lo.s64 %rd1898, %rd1897, %rd2651; | |
shr.u64 %rd1899, %rd1898, 32; | |
shr.u64 %rd1900, %rd1894, 32; | |
xor.b64 %rd1901, %rd1868, %rd1900; | |
xor.b64 %rd1902, %rd1901, %rd2667; | |
mul.lo.s64 %rd1903, %rd1902, %rd2651; | |
xor.b64 %rd1904, %rd1899, %rd1903; | |
cvt.u32.u64 %r209, %rd1904; | |
xor.b32 %r210, %r338, %r209; | |
mul.lo.s32 %r211, %r210, %r339; | |
shr.u32 %r212, %r211, 9; | |
cvt.rn.f32.u32 %f190, %r212; | |
mul.rn.f32 %f191, %f190, 0f34000000; | |
cvt.rn.f16.f32 %h100, %f191; | |
mov.b16 %h101, 0x2E66; | |
setp.ge.f16 %p49, %h100, %h101; | |
ld.global.nc.b16 %h102, [%rd45+1282]; | |
ld.global.nc.f32 %f192, [%rd46+2564]; | |
cvt.rn.f16.f32 %h103, %f192; | |
add.rn.f16 %h104, %h102, %h103; | |
mov.b16 %h105, 0x3C72; | |
mul.rn.f16 %h106, %h104, %h105; | |
selp.b16 %h107, %h106, 0x0000, %p49; | |
cvt.f32.f16 %f193, %h107; | |
ld.global.nc.b16 %h108, [%rd47+1282]; | |
cvt.f32.f16 %f194, %h108; | |
ld.global.nc.f32 %f195, [%rd48+2564]; | |
mul.rn.f32 %f196, %f1, %f195; | |
mul.rn.f32 %f197, %f196, %f194; | |
ld.global.nc.f32 %f198, [%rd49+2564]; | |
mul.rn.f32 %f199, %f2, %f196; | |
sub.rn.f32 %f200, %f198, %f199; | |
add.rn.f32 %f201, %f197, %f200; | |
add.rn.f32 %f202, %f201, %f193; | |
sub.rn.f32 %f203, %f202, %f3; | |
mul.rn.f32 %f204, %f203, %f203; | |
add.rn.f32 %f15, %f14, %f204; | |
or.b32 %r214, %r73, 768; | |
shr.u32 %r215, %r214, 2; | |
cvt.u64.u32 %rd1905, %r215; | |
add.s64 %rd351, %rd12, %rd1905; | |
@%p8 bra LBB55_40; | |
and.b64 %rd1947, %rd351, 4294967295; | |
mul.lo.s64 %rd2672, %rd1947, 3528531795; | |
setp.lt.u64 %p51, %rd351, %rd12; | |
selp.u64 %rd1948, 1, 0, %p51; | |
add.s64 %rd1949, %rd2464, %rd1948; | |
xor.b64 %rd1950, %rd1949, %rd2672; | |
shr.u64 %rd1951, %rd1950, 32; | |
mul.lo.s64 %rd2675, %rd1951, 3449720151; | |
shr.u64 %rd1952, %rd2675, 32; | |
and.b64 %rd1953, %rd1949, 4294967295; | |
mul.lo.s64 %rd1954, %rd1953, 3449720151; | |
and.b64 %rd1955, %rd1954, 4294967295; | |
xor.b64 %rd1956, %rd1955, %rd1952; | |
xor.b64 %rd1957, %rd1956, 2654435769; | |
mul.lo.s64 %rd2678, %rd1957, 3528531795; | |
xor.b64 %rd2668, %rd1954, %rd351; | |
mov.u32 %r342, -1879881855; | |
mov.u32 %r341, -845247145; | |
mov.u32 %r340, 534103459; | |
mov.u64 %rd2686, 3678237736; | |
mov.u64 %rd2685, 3041712726; | |
mov.u64 %rd2684, 1401181199; | |
mov.u64 %rd2683, 2835769497; | |
mov.u64 %rd2682, 1684936478; | |
mov.u64 %rd2681, 2027808484; | |
mov.u64 %rd2680, 387276957; | |
mov.u64 %rd2679, 842468239; | |
mov.u64 %rd2677, 3986602516; | |
mov.u64 %rd2676, 1013904242; | |
mov.u64 %rd2674, 3668340011; | |
mov.u64 %rd2673, 3144134277; | |
mov.u64 %rd2671, 3449720151; | |
mov.u64 %rd2670, 1993301258; | |
mov.u64 %rd2669, 3528531795; | |
bra.uni LBB55_41; | |
LBB55_40: | |
setp.lt.u64 %p50, %rd351, %rd12; | |
selp.u64 %rd1921, 1, 0, %p50; | |
add.s64 %rd1922, %rd2464, %rd1921; | |
and.b64 %rd1923, %rd1922, 4294967295; | |
mul.lo.s64 %rd2672, %rd1923, 3449720151; | |
xor.b64 %rd1924, %rd2672, %rd351; | |
shr.u64 %rd1925, %rd1924, 32; | |
mul.lo.s64 %rd2675, %rd1925, 3528531795; | |
shr.u64 %rd1926, %rd2675, 32; | |
and.b64 %rd1927, %rd351, 4294967295; | |
mul.lo.s64 %rd1928, %rd1927, 3528531795; | |
and.b64 %rd1929, %rd1928, 4294967295; | |
xor.b64 %rd1930, %rd1929, %rd1926; | |
xor.b64 %rd1931, %rd1930, 3144134277; | |
mul.lo.s64 %rd2678, %rd1931, 3449720151; | |
xor.b64 %rd2668, %rd1922, %rd1928; | |
mov.u32 %r342, -1767562579; | |
mov.u32 %r341, -766435501; | |
mov.u32 %r340, 1401181199; | |
mov.u64 %rd2686, 4055616968; | |
mov.u64 %rd2685, 1684936478; | |
mov.u64 %rd2684, 534103459; | |
mov.u64 %rd2683, 387276957; | |
mov.u64 %rd2682, 3041712726; | |
mov.u64 %rd2681, 3986602516; | |
mov.u64 %rd2680, 2835769497; | |
mov.u64 %rd2679, 3668340011; | |
mov.u64 %rd2677, 2027808484; | |
mov.u64 %rd2676, 1993301258; | |
mov.u64 %rd2674, 842468239; | |
mov.u64 %rd2673, 2654435769; | |
mov.u64 %rd2671, 3528531795; | |
mov.u64 %rd2670, 1013904242; | |
mov.u64 %rd2669, 3449720151; | |
LBB55_41: | |
shr.u64 %rd1958, %rd2678, 32; | |
shr.u64 %rd1959, %rd2668, 32; | |
mul.lo.s64 %rd1960, %rd1959, %rd2669; | |
and.b64 %rd1961, %rd1960, 4294967295; | |
xor.b64 %rd1962, %rd1961, %rd1958; | |
xor.b64 %rd1963, %rd1962, %rd2670; | |
mul.lo.s64 %rd1964, %rd1963, %rd2671; | |
shr.u64 %rd1965, %rd1964, 32; | |
shr.u64 %rd1966, %rd1960, 32; | |
and.b64 %rd1967, %rd2672, 4294967295; | |
xor.b64 %rd1968, %rd1967, %rd1966; | |
xor.b64 %rd1969, %rd1968, %rd2673; | |
mul.lo.s64 %rd1970, %rd1969, %rd2671; | |
and.b64 %rd1971, %rd1970, 4294967295; | |
xor.b64 %rd1972, %rd1971, %rd1965; | |
xor.b64 %rd1973, %rd1972, %rd2674; | |
mul.lo.s64 %rd1974, %rd1973, %rd2669; | |
shr.u64 %rd1975, %rd1974, 32; | |
shr.u64 %rd1976, %rd1970, 32; | |
and.b64 %rd1977, %rd2675, 4294967295; | |
xor.b64 %rd1978, %rd1977, %rd1976; | |
xor.b64 %rd1979, %rd1978, %rd2676; | |
mul.lo.s64 %rd1980, %rd1979, %rd2669; | |
and.b64 %rd1981, %rd1980, 4294967295; | |
xor.b64 %rd1982, %rd1981, %rd1975; | |
xor.b64 %rd1983, %rd1982, %rd2677; | |
mul.lo.s64 %rd1984, %rd1983, %rd2671; | |
shr.u64 %rd1985, %rd1984, 32; | |
shr.u64 %rd1986, %rd1980, 32; | |
and.b64 %rd1987, %rd2678, 4294967295; | |
xor.b64 %rd1988, %rd1987, %rd1986; | |
xor.b64 %rd1989, %rd1988, %rd2679; | |
mul.lo.s64 %rd1990, %rd1989, %rd2671; | |
and.b64 %rd1991, %rd1990, 4294967295; | |
xor.b64 %rd1992, %rd1991, %rd1985; | |
xor.b64 %rd1993, %rd1992, %rd2680; | |
mul.lo.s64 %rd1994, %rd1993, %rd2669; | |
shr.u64 %rd1995, %rd1994, 32; | |
shr.u64 %rd1996, %rd1990, 32; | |
and.b64 %rd1997, %rd1964, 4294967295; | |
xor.b64 %rd1998, %rd1997, %rd1996; | |
xor.b64 %rd1999, %rd1998, %rd2681; | |
mul.lo.s64 %rd2000, %rd1999, %rd2669; | |
and.b64 %rd2001, %rd2000, 4294967295; | |
xor.b64 %rd2002, %rd2001, %rd1995; | |
xor.b64 %rd2003, %rd2002, %rd2682; | |
mul.lo.s64 %rd2004, %rd2003, %rd2671; | |
shr.u64 %rd2005, %rd2004, 32; | |
shr.u64 %rd2006, %rd2000, 32; | |
and.b64 %rd2007, %rd1974, 4294967295; | |
xor.b64 %rd2008, %rd2007, %rd2006; | |
xor.b64 %rd2009, %rd2008, %rd2683; | |
mul.lo.s64 %rd2010, %rd2009, %rd2671; | |
and.b64 %rd2011, %rd2010, 4294967295; | |
xor.b64 %rd2012, %rd2011, %rd2005; | |
xor.b64 %rd2013, %rd2012, %rd2684; | |
mul.lo.s64 %rd2014, %rd2013, %rd2669; | |
shr.u64 %rd2015, %rd2014, 32; | |
shr.u64 %rd2016, %rd2010, 32; | |
and.b64 %rd2017, %rd1984, 4294967295; | |
xor.b64 %rd2018, %rd2017, %rd2016; | |
xor.b64 %rd2019, %rd2018, %rd2685; | |
mul.lo.s64 %rd2020, %rd2019, %rd2669; | |
and.b64 %rd2021, %rd2020, 4294967295; | |
xor.b64 %rd2022, %rd2021, %rd2015; | |
xor.b64 %rd2023, %rd2022, %rd2686; | |
mul.lo.s64 %rd2024, %rd2023, %rd2671; | |
shr.u64 %rd2025, %rd2024, 32; | |
cvt.u32.u64 %r222, %rd2025; | |
shr.u64 %rd2026, %rd2020, 32; | |
xor.b64 %rd2027, %rd2026, %rd1994; | |
cvt.u32.u64 %r223, %rd2027; | |
xor.b32 %r224, %r340, %r223; | |
mul.lo.s32 %r225, %r224, %r341; | |
xor.b32 %r226, %r225, %r222; | |
xor.b32 %r227, %r226, %r342; | |
shr.u32 %r228, %r227, 9; | |
cvt.rn.f32.u32 %f205, %r228; | |
mul.rn.f32 %f206, %f205, 0f34000000; | |
cvt.rn.f16.f32 %h109, %f206; | |
mov.b16 %h110, 0x2E66; | |
setp.ge.f16 %p52, %h109, %h110; | |
ld.global.nc.b16 %h111, [%rd45+1536]; | |
ld.global.nc.f32 %f207, [%rd46+3072]; | |
cvt.rn.f16.f32 %h112, %f207; | |
add.rn.f16 %h113, %h111, %h112; | |
mov.b16 %h114, 0x3C72; | |
mul.rn.f16 %h115, %h113, %h114; | |
selp.b16 %h116, %h115, 0x0000, %p52; | |
cvt.f32.f16 %f208, %h116; | |
ld.global.nc.b16 %h117, [%rd47+1536]; | |
cvt.f32.f16 %f209, %h117; | |
ld.global.nc.f32 %f210, [%rd48+3072]; | |
mul.rn.f32 %f211, %f1, %f210; | |
mul.rn.f32 %f212, %f211, %f209; | |
ld.global.nc.f32 %f213, [%rd49+3072]; | |
mul.rn.f32 %f214, %f2, %f211; | |
sub.rn.f32 %f215, %f213, %f214; | |
add.rn.f32 %f216, %f212, %f215; | |
add.rn.f32 %f217, %f216, %f208; | |
sub.rn.f32 %f218, %f217, %f3; | |
mul.rn.f32 %f219, %f218, %f218; | |
add.rn.f32 %f16, %f15, %f219; | |
or.b32 %r229, %r3, 769; | |
or.b32 %r230, %r229, %r4; | |
and.b32 %r231, %r229, 3; | |
shr.u32 %r232, %r230, 2; | |
setp.ne.s32 %p53, %r231, 1; | |
cvt.u64.u32 %rd2028, %r232; | |
add.s64 %rd379, %rd12, %rd2028; | |
@%p53 bra LBB55_43; | |
and.b64 %rd2068, %rd379, 4294967295; | |
mul.lo.s64 %rd2691, %rd2068, 3528531795; | |
setp.lt.u64 %p55, %rd379, %rd12; | |
selp.u64 %rd2069, 1, 0, %p55; | |
add.s64 %rd2070, %rd2464, %rd2069; | |
xor.b64 %rd2071, %rd2070, %rd2691; | |
shr.u64 %rd2072, %rd2071, 32; | |
mul.lo.s64 %rd2694, %rd2072, 3449720151; | |
shr.u64 %rd2073, %rd2694, 32; | |
and.b64 %rd2074, %rd2070, 4294967295; | |
mul.lo.s64 %rd2075, %rd2074, 3449720151; | |
and.b64 %rd2076, %rd2075, 4294967295; | |
xor.b64 %rd2077, %rd2076, %rd2073; | |
xor.b64 %rd2078, %rd2077, 2654435769; | |
mul.lo.s64 %rd2697, %rd2078, 3528531795; | |
xor.b64 %rd2687, %rd2075, %rd379; | |
mov.u32 %r344, -845247145; | |
mov.u32 %r343, -616729560; | |
mov.u64 %rd2704, 3041712726; | |
mov.u64 %rd2703, 1401181199; | |
mov.u64 %rd2702, 2835769497; | |
mov.u64 %rd2701, 1684936478; | |
mov.u64 %rd2700, 2027808484; | |
mov.u64 %rd2699, 387276957; | |
mov.u64 %rd2698, 842468239; | |
mov.u64 %rd2696, 3986602516; | |
mov.u64 %rd2695, 1013904242; | |
mov.u64 %rd2693, 3668340011; | |
mov.u64 %rd2692, 3144134277; | |
mov.u64 %rd2690, 3449720151; | |
mov.u64 %rd2689, 1993301258; | |
mov.u64 %rd2688, 3528531795; | |
bra.uni LBB55_44; | |
LBB55_43: | |
setp.lt.u64 %p54, %rd379, %rd12; | |
selp.u64 %rd2043, 1, 0, %p54; | |
add.s64 %rd2044, %rd2464, %rd2043; | |
and.b64 %rd2045, %rd2044, 4294967295; | |
mul.lo.s64 %rd2691, %rd2045, 3449720151; | |
xor.b64 %rd2046, %rd2691, %rd379; | |
shr.u64 %rd2047, %rd2046, 32; | |
mul.lo.s64 %rd2694, %rd2047, 3528531795; | |
shr.u64 %rd2048, %rd2694, 32; | |
and.b64 %rd2049, %rd379, 4294967295; | |
mul.lo.s64 %rd2050, %rd2049, 3528531795; | |
and.b64 %rd2051, %rd2050, 4294967295; | |
xor.b64 %rd2052, %rd2051, %rd2048; | |
xor.b64 %rd2053, %rd2052, 3144134277; | |
mul.lo.s64 %rd2697, %rd2053, 3449720151; | |
xor.b64 %rd2687, %rd2044, %rd2050; | |
mov.u32 %r344, -766435501; | |
mov.u32 %r343, -239350328; | |
mov.u64 %rd2704, 1684936478; | |
mov.u64 %rd2703, 534103459; | |
mov.u64 %rd2702, 387276957; | |
mov.u64 %rd2701, 3041712726; | |
mov.u64 %rd2700, 3986602516; | |
mov.u64 %rd2699, 2835769497; | |
mov.u64 %rd2698, 3668340011; | |
mov.u64 %rd2696, 2027808484; | |
mov.u64 %rd2695, 1993301258; | |
mov.u64 %rd2693, 842468239; | |
mov.u64 %rd2692, 2654435769; | |
mov.u64 %rd2690, 3528531795; | |
mov.u64 %rd2689, 1013904242; | |
mov.u64 %rd2688, 3449720151; | |
LBB55_44: | |
shr.u64 %rd2079, %rd2697, 32; | |
shr.u64 %rd2080, %rd2687, 32; | |
mul.lo.s64 %rd2081, %rd2080, %rd2688; | |
and.b64 %rd2082, %rd2081, 4294967295; | |
xor.b64 %rd2083, %rd2082, %rd2079; | |
xor.b64 %rd2084, %rd2083, %rd2689; | |
mul.lo.s64 %rd2085, %rd2084, %rd2690; | |
shr.u64 %rd2086, %rd2085, 32; | |
shr.u64 %rd2087, %rd2081, 32; | |
and.b64 %rd2088, %rd2691, 4294967295; | |
xor.b64 %rd2089, %rd2088, %rd2087; | |
xor.b64 %rd2090, %rd2089, %rd2692; | |
mul.lo.s64 %rd2091, %rd2090, %rd2690; | |
and.b64 %rd2092, %rd2091, 4294967295; | |
xor.b64 %rd2093, %rd2092, %rd2086; | |
xor.b64 %rd2094, %rd2093, %rd2693; | |
mul.lo.s64 %rd2095, %rd2094, %rd2688; | |
shr.u64 %rd2096, %rd2095, 32; | |
shr.u64 %rd2097, %rd2091, 32; | |
and.b64 %rd2098, %rd2694, 4294967295; | |
xor.b64 %rd2099, %rd2098, %rd2097; | |
xor.b64 %rd2100, %rd2099, %rd2695; | |
mul.lo.s64 %rd2101, %rd2100, %rd2688; | |
and.b64 %rd2102, %rd2101, 4294967295; | |
xor.b64 %rd2103, %rd2102, %rd2096; | |
xor.b64 %rd2104, %rd2103, %rd2696; | |
mul.lo.s64 %rd2105, %rd2104, %rd2690; | |
shr.u64 %rd2106, %rd2105, 32; | |
shr.u64 %rd2107, %rd2101, 32; | |
and.b64 %rd2108, %rd2697, 4294967295; | |
xor.b64 %rd2109, %rd2108, %rd2107; | |
xor.b64 %rd2110, %rd2109, %rd2698; | |
mul.lo.s64 %rd2111, %rd2110, %rd2690; | |
and.b64 %rd2112, %rd2111, 4294967295; | |
xor.b64 %rd2113, %rd2112, %rd2106; | |
xor.b64 %rd2114, %rd2113, %rd2699; | |
mul.lo.s64 %rd2115, %rd2114, %rd2688; | |
shr.u64 %rd2116, %rd2115, 32; | |
shr.u64 %rd2117, %rd2111, 32; | |
and.b64 %rd2118, %rd2085, 4294967295; | |
xor.b64 %rd2119, %rd2118, %rd2117; | |
xor.b64 %rd2120, %rd2119, %rd2700; | |
mul.lo.s64 %rd2121, %rd2120, %rd2688; | |
and.b64 %rd2122, %rd2121, 4294967295; | |
xor.b64 %rd2123, %rd2122, %rd2116; | |
xor.b64 %rd2124, %rd2123, %rd2701; | |
mul.lo.s64 %rd2125, %rd2124, %rd2690; | |
shr.u64 %rd2126, %rd2125, 32; | |
shr.u64 %rd2127, %rd2121, 32; | |
and.b64 %rd2128, %rd2095, 4294967295; | |
xor.b64 %rd2129, %rd2128, %rd2127; | |
xor.b64 %rd2130, %rd2129, %rd2702; | |
mul.lo.s64 %rd2131, %rd2130, %rd2690; | |
and.b64 %rd2132, %rd2131, 4294967295; | |
xor.b64 %rd2133, %rd2132, %rd2126; | |
xor.b64 %rd2134, %rd2133, %rd2703; | |
mul.lo.s64 %rd2135, %rd2134, %rd2688; | |
shr.u64 %rd2136, %rd2135, 32; | |
shr.u64 %rd2137, %rd2131, 32; | |
xor.b64 %rd2138, %rd2105, %rd2137; | |
xor.b64 %rd2139, %rd2138, %rd2704; | |
mul.lo.s64 %rd2140, %rd2139, %rd2688; | |
xor.b64 %rd2141, %rd2136, %rd2140; | |
cvt.u32.u64 %r237, %rd2141; | |
xor.b32 %r238, %r343, %r237; | |
mul.lo.s32 %r239, %r238, %r344; | |
shr.u32 %r240, %r239, 9; | |
cvt.rn.f32.u32 %f220, %r240; | |
mul.rn.f32 %f221, %f220, 0f34000000; | |
cvt.rn.f16.f32 %h118, %f221; | |
mov.b16 %h119, 0x2E66; | |
setp.ge.f16 %p57, %h118, %h119; | |
ld.global.nc.b16 %h120, [%rd45+1538]; | |
ld.global.nc.f32 %f222, [%rd46+3076]; | |
cvt.rn.f16.f32 %h121, %f222; | |
add.rn.f16 %h122, %h120, %h121; | |
mov.b16 %h123, 0x3C72; | |
mul.rn.f16 %h124, %h122, %h123; | |
selp.b16 %h125, %h124, 0x0000, %p57; | |
cvt.f32.f16 %f223, %h125; | |
ld.global.nc.b16 %h126, [%rd47+1538]; | |
cvt.f32.f16 %f224, %h126; | |
ld.global.nc.f32 %f225, [%rd48+3076]; | |
mul.rn.f32 %f226, %f1, %f225; | |
mul.rn.f32 %f227, %f226, %f224; | |
ld.global.nc.f32 %f228, [%rd49+3076]; | |
mul.rn.f32 %f229, %f2, %f226; | |
sub.rn.f32 %f230, %f228, %f229; | |
add.rn.f32 %f231, %f227, %f230; | |
add.rn.f32 %f232, %f231, %f223; | |
sub.rn.f32 %f233, %f232, %f3; | |
mul.rn.f32 %f234, %f233, %f233; | |
add.rn.f32 %f17, %f16, %f234; | |
or.b32 %r242, %r73, 896; | |
shr.u32 %r243, %r242, 2; | |
cvt.u64.u32 %rd2142, %r243; | |
add.s64 %rd406, %rd12, %rd2142; | |
@%p8 bra LBB55_46; | |
mov.u32 %r347, -1879881855; | |
mov.u32 %r345, 534103459; | |
mov.u64 %rd2723, 3678237736; | |
and.b64 %rd2184, %rd406, 4294967295; | |
mul.lo.s64 %rd2709, %rd2184, 3528531795; | |
setp.lt.u64 %p59, %rd406, %rd12; | |
selp.u64 %rd2185, 1, 0, %p59; | |
add.s64 %rd2186, %rd2464, %rd2185; | |
xor.b64 %rd2187, %rd2186, %rd2709; | |
shr.u64 %rd2188, %rd2187, 32; | |
mul.lo.s64 %rd2712, %rd2188, 3449720151; | |
shr.u64 %rd2189, %rd2712, 32; | |
and.b64 %rd2190, %rd2186, 4294967295; | |
mul.lo.s64 %rd2191, %rd2190, 3449720151; | |
and.b64 %rd2192, %rd2191, 4294967295; | |
xor.b64 %rd2193, %rd2192, %rd2189; | |
xor.b64 %rd2194, %rd2193, 2654435769; | |
mul.lo.s64 %rd2715, %rd2194, 3528531795; | |
xor.b64 %rd2705, %rd2191, %rd406; | |
mov.u32 %r346, -845247145; | |
mov.u64 %rd2722, 3041712726; | |
mov.u64 %rd2721, 1401181199; | |
mov.u64 %rd2720, 2835769497; | |
mov.u64 %rd2719, 1684936478; | |
mov.u64 %rd2718, 2027808484; | |
mov.u64 %rd2717, 387276957; | |
mov.u64 %rd2716, 842468239; | |
mov.u64 %rd2714, 3986602516; | |
mov.u64 %rd2713, 1013904242; | |
mov.u64 %rd2711, 3668340011; | |
mov.u64 %rd2710, 3144134277; | |
mov.u64 %rd2708, 3449720151; | |
mov.u64 %rd2707, 1993301258; | |
mov.u64 %rd2706, 3528531795; | |
bra.uni LBB55_47; | |
LBB55_46: | |
setp.lt.u64 %p58, %rd406, %rd12; | |
selp.u64 %rd2158, 1, 0, %p58; | |
add.s64 %rd2159, %rd2464, %rd2158; | |
and.b64 %rd2160, %rd2159, 4294967295; | |
mul.lo.s64 %rd2709, %rd2160, 3449720151; | |
xor.b64 %rd2161, %rd2709, %rd406; | |
shr.u64 %rd2162, %rd2161, 32; | |
mul.lo.s64 %rd2712, %rd2162, 3528531795; | |
shr.u64 %rd2163, %rd2712, 32; | |
and.b64 %rd2164, %rd406, 4294967295; | |
mul.lo.s64 %rd2165, %rd2164, 3528531795; | |
and.b64 %rd2166, %rd2165, 4294967295; | |
xor.b64 %rd2167, %rd2166, %rd2163; | |
xor.b64 %rd2168, %rd2167, 3144134277; | |
mul.lo.s64 %rd2715, %rd2168, 3449720151; | |
xor.b64 %rd2705, %rd2159, %rd2165; | |
mov.u32 %r347, -1767562579; | |
mov.u32 %r346, -766435501; | |
mov.u32 %r345, 1401181199; | |
mov.u64 %rd2723, 4055616968; | |
mov.u64 %rd2722, 1684936478; | |
mov.u64 %rd2721, 534103459; | |
mov.u64 %rd2720, 387276957; | |
mov.u64 %rd2719, 3041712726; | |
mov.u64 %rd2718, 3986602516; | |
mov.u64 %rd2717, 2835769497; | |
mov.u64 %rd2716, 3668340011; | |
mov.u64 %rd2714, 2027808484; | |
mov.u64 %rd2713, 1993301258; | |
mov.u64 %rd2711, 842468239; | |
mov.u64 %rd2710, 2654435769; | |
mov.u64 %rd2708, 3528531795; | |
mov.u64 %rd2707, 1013904242; | |
mov.u64 %rd2706, 3449720151; | |
LBB55_47: | |
shr.u64 %rd2195, %rd2715, 32; | |
shr.u64 %rd2196, %rd2705, 32; | |
mul.lo.s64 %rd2197, %rd2196, %rd2706; | |
and.b64 %rd2198, %rd2197, 4294967295; | |
xor.b64 %rd2199, %rd2198, %rd2195; | |
xor.b64 %rd2200, %rd2199, %rd2707; | |
mul.lo.s64 %rd2201, %rd2200, %rd2708; | |
shr.u64 %rd2202, %rd2201, 32; | |
shr.u64 %rd2203, %rd2197, 32; | |
and.b64 %rd2204, %rd2709, 4294967295; | |
xor.b64 %rd2205, %rd2204, %rd2203; | |
xor.b64 %rd2206, %rd2205, %rd2710; | |
mul.lo.s64 %rd2207, %rd2206, %rd2708; | |
and.b64 %rd2208, %rd2207, 4294967295; | |
xor.b64 %rd2209, %rd2208, %rd2202; | |
xor.b64 %rd2210, %rd2209, %rd2711; | |
mul.lo.s64 %rd2211, %rd2210, %rd2706; | |
shr.u64 %rd2212, %rd2211, 32; | |
shr.u64 %rd2213, %rd2207, 32; | |
and.b64 %rd2214, %rd2712, 4294967295; | |
xor.b64 %rd2215, %rd2214, %rd2213; | |
xor.b64 %rd2216, %rd2215, %rd2713; | |
mul.lo.s64 %rd2217, %rd2216, %rd2706; | |
and.b64 %rd2218, %rd2217, 4294967295; | |
xor.b64 %rd2219, %rd2218, %rd2212; | |
xor.b64 %rd2220, %rd2219, %rd2714; | |
mul.lo.s64 %rd2221, %rd2220, %rd2708; | |
shr.u64 %rd2222, %rd2221, 32; | |
shr.u64 %rd2223, %rd2217, 32; | |
and.b64 %rd2224, %rd2715, 4294967295; | |
xor.b64 %rd2225, %rd2224, %rd2223; | |
xor.b64 %rd2226, %rd2225, %rd2716; | |
mul.lo.s64 %rd2227, %rd2226, %rd2708; | |
and.b64 %rd2228, %rd2227, 4294967295; | |
xor.b64 %rd2229, %rd2228, %rd2222; | |
xor.b64 %rd2230, %rd2229, %rd2717; | |
mul.lo.s64 %rd2231, %rd2230, %rd2706; | |
shr.u64 %rd2232, %rd2231, 32; | |
shr.u64 %rd2233, %rd2227, 32; | |
and.b64 %rd2234, %rd2201, 4294967295; | |
xor.b64 %rd2235, %rd2234, %rd2233; | |
xor.b64 %rd2236, %rd2235, %rd2718; | |
mul.lo.s64 %rd2237, %rd2236, %rd2706; | |
and.b64 %rd2238, %rd2237, 4294967295; | |
xor.b64 %rd2239, %rd2238, %rd2232; | |
xor.b64 %rd2240, %rd2239, %rd2719; | |
mul.lo.s64 %rd2241, %rd2240, %rd2708; | |
shr.u64 %rd2242, %rd2241, 32; | |
shr.u64 %rd2243, %rd2237, 32; | |
and.b64 %rd2244, %rd2211, 4294967295; | |
xor.b64 %rd2245, %rd2244, %rd2243; | |
xor.b64 %rd2246, %rd2245, %rd2720; | |
mul.lo.s64 %rd2247, %rd2246, %rd2708; | |
and.b64 %rd2248, %rd2247, 4294967295; | |
xor.b64 %rd2249, %rd2248, %rd2242; | |
xor.b64 %rd2250, %rd2249, %rd2721; | |
mul.lo.s64 %rd2251, %rd2250, %rd2706; | |
shr.u64 %rd2252, %rd2251, 32; | |
shr.u64 %rd2253, %rd2247, 32; | |
and.b64 %rd2254, %rd2221, 4294967295; | |
xor.b64 %rd2255, %rd2254, %rd2253; | |
xor.b64 %rd2256, %rd2255, %rd2722; | |
mul.lo.s64 %rd2257, %rd2256, %rd2706; | |
and.b64 %rd2258, %rd2257, 4294967295; | |
xor.b64 %rd2259, %rd2258, %rd2252; | |
xor.b64 %rd2260, %rd2259, %rd2723; | |
mul.lo.s64 %rd2261, %rd2260, %rd2708; | |
shr.u64 %rd2262, %rd2261, 32; | |
cvt.u32.u64 %r250, %rd2262; | |
shr.u64 %rd2263, %rd2257, 32; | |
xor.b64 %rd2264, %rd2263, %rd2231; | |
cvt.u32.u64 %r251, %rd2264; | |
xor.b32 %r252, %r345, %r251; | |
mul.lo.s32 %r253, %r252, %r346; | |
xor.b32 %r254, %r253, %r250; | |
xor.b32 %r255, %r254, %r347; | |
shr.u32 %r256, %r255, 9; | |
cvt.rn.f32.u32 %f235, %r256; | |
mul.rn.f32 %f236, %f235, 0f34000000; | |
cvt.rn.f16.f32 %h127, %f236; | |
mov.b16 %h128, 0x2E66; | |
setp.ge.f16 %p60, %h127, %h128; | |
ld.global.nc.b16 %h129, [%rd45+1792]; | |
ld.global.nc.f32 %f237, [%rd46+3584]; | |
cvt.rn.f16.f32 %h130, %f237; | |
add.rn.f16 %h131, %h129, %h130; | |
mov.b16 %h132, 0x3C72; | |
mul.rn.f16 %h133, %h131, %h132; | |
selp.b16 %h134, %h133, 0x0000, %p60; | |
cvt.f32.f16 %f238, %h134; | |
ld.global.nc.b16 %h135, [%rd47+1792]; | |
cvt.f32.f16 %f239, %h135; | |
ld.global.nc.f32 %f240, [%rd48+3584]; | |
mul.rn.f32 %f241, %f1, %f240; | |
mul.rn.f32 %f242, %f241, %f239; | |
ld.global.nc.f32 %f243, [%rd49+3584]; | |
mul.rn.f32 %f244, %f2, %f241; | |
sub.rn.f32 %f245, %f243, %f244; | |
add.rn.f32 %f246, %f242, %f245; | |
add.rn.f32 %f247, %f246, %f238; | |
sub.rn.f32 %f248, %f247, %f3; | |
mul.rn.f32 %f249, %f248, %f248; | |
add.rn.f32 %f18, %f17, %f249; | |
or.b32 %r257, %r3, 897; | |
or.b32 %r258, %r257, %r4; | |
and.b32 %r259, %r257, 3; | |
shr.u32 %r260, %r258, 2; | |
setp.ne.s32 %p61, %r259, 1; | |
cvt.u64.u32 %rd2265, %r260; | |
add.s64 %rd434, %rd12, %rd2265; | |
@%p61 bra LBB55_49; | |
mov.u32 %r349, -845247145; | |
mov.u64 %rd2740, 1401181199; | |
mov.u64 %rd2729, 3144134277; | |
mov.u32 %r348, -616729560; | |
and.b64 %rd2305, %rd434, 4294967295; | |
mul.lo.s64 %rd2728, %rd2305, 3528531795; | |
setp.lt.u64 %p63, %rd434, %rd12; | |
selp.u64 %rd2306, 1, 0, %p63; | |
add.s64 %rd2307, %rd2464, %rd2306; | |
xor.b64 %rd2308, %rd2307, %rd2728; | |
shr.u64 %rd2309, %rd2308, 32; | |
mul.lo.s64 %rd2731, %rd2309, 3449720151; | |
shr.u64 %rd2310, %rd2731, 32; | |
and.b64 %rd2311, %rd2307, 4294967295; | |
mul.lo.s64 %rd2312, %rd2311, 3449720151; | |
and.b64 %rd2313, %rd2312, 4294967295; | |
xor.b64 %rd2314, %rd2313, %rd2310; | |
xor.b64 %rd2315, %rd2314, 2654435769; | |
mul.lo.s64 %rd2734, %rd2315, 3528531795; | |
xor.b64 %rd2724, %rd2312, %rd434; | |
mov.u64 %rd2741, 3041712726; | |
mov.u64 %rd2739, 2835769497; | |
mov.u64 %rd2738, 1684936478; | |
mov.u64 %rd2737, 2027808484; | |
mov.u64 %rd2736, 387276957; | |
mov.u64 %rd2735, 842468239; | |
mov.u64 %rd2733, 3986602516; | |
mov.u64 %rd2732, 1013904242; | |
mov.u64 %rd2730, 3668340011; | |
mov.u64 %rd2727, 3449720151; | |
mov.u64 %rd2726, 1993301258; | |
mov.u64 %rd2725, 3528531795; | |
bra.uni LBB55_50; | |
LBB55_49: | |
setp.lt.u64 %p62, %rd434, %rd12; | |
selp.u64 %rd2280, 1, 0, %p62; | |
add.s64 %rd2281, %rd2464, %rd2280; | |
and.b64 %rd2282, %rd2281, 4294967295; | |
mul.lo.s64 %rd2728, %rd2282, 3449720151; | |
xor.b64 %rd2283, %rd2728, %rd434; | |
shr.u64 %rd2284, %rd2283, 32; | |
mul.lo.s64 %rd2731, %rd2284, 3528531795; | |
shr.u64 %rd2285, %rd2731, 32; | |
and.b64 %rd2286, %rd434, 4294967295; | |
mul.lo.s64 %rd2287, %rd2286, 3528531795; | |
and.b64 %rd2288, %rd2287, 4294967295; | |
xor.b64 %rd2289, %rd2288, %rd2285; | |
xor.b64 %rd2290, %rd2289, 3144134277; | |
mul.lo.s64 %rd2734, %rd2290, 3449720151; | |
xor.b64 %rd2724, %rd2281, %rd2287; | |
mov.u32 %r349, -766435501; | |
mov.u32 %r348, -239350328; | |
mov.u64 %rd2741, 1684936478; | |
mov.u64 %rd2740, 534103459; | |
mov.u64 %rd2739, 387276957; | |
mov.u64 %rd2738, 3041712726; | |
mov.u64 %rd2737, 3986602516; | |
mov.u64 %rd2736, 2835769497; | |
mov.u64 %rd2735, 3668340011; | |
mov.u64 %rd2733, 2027808484; | |
mov.u64 %rd2732, 1993301258; | |
mov.u64 %rd2730, 842468239; | |
mov.u64 %rd2729, 2654435769; | |
mov.u64 %rd2727, 3528531795; | |
mov.u64 %rd2726, 1013904242; | |
mov.u64 %rd2725, 3449720151; | |
LBB55_50: | |
shr.u64 %rd2316, %rd2734, 32; | |
shr.u64 %rd2317, %rd2724, 32; | |
mul.lo.s64 %rd2318, %rd2317, %rd2725; | |
and.b64 %rd2319, %rd2318, 4294967295; | |
xor.b64 %rd2320, %rd2319, %rd2316; | |
xor.b64 %rd2321, %rd2320, %rd2726; | |
mul.lo.s64 %rd2322, %rd2321, %rd2727; | |
shr.u64 %rd2323, %rd2322, 32; | |
shr.u64 %rd2324, %rd2318, 32; | |
and.b64 %rd2325, %rd2728, 4294967295; | |
xor.b64 %rd2326, %rd2325, %rd2324; | |
xor.b64 %rd2327, %rd2326, %rd2729; | |
mul.lo.s64 %rd2328, %rd2327, %rd2727; | |
and.b64 %rd2329, %rd2328, 4294967295; | |
xor.b64 %rd2330, %rd2329, %rd2323; | |
xor.b64 %rd2331, %rd2330, %rd2730; | |
mul.lo.s64 %rd2332, %rd2331, %rd2725; | |
shr.u64 %rd2333, %rd2332, 32; | |
shr.u64 %rd2334, %rd2328, 32; | |
and.b64 %rd2335, %rd2731, 4294967295; | |
xor.b64 %rd2336, %rd2335, %rd2334; | |
xor.b64 %rd2337, %rd2336, %rd2732; | |
mul.lo.s64 %rd2338, %rd2337, %rd2725; | |
and.b64 %rd2339, %rd2338, 4294967295; | |
xor.b64 %rd2340, %rd2339, %rd2333; | |
xor.b64 %rd2341, %rd2340, %rd2733; | |
mul.lo.s64 %rd2342, %rd2341, %rd2727; | |
shr.u64 %rd2343, %rd2342, 32; | |
shr.u64 %rd2344, %rd2338, 32; | |
and.b64 %rd2345, %rd2734, 4294967295; | |
xor.b64 %rd2346, %rd2345, %rd2344; | |
xor.b64 %rd2347, %rd2346, %rd2735; | |
mul.lo.s64 %rd2348, %rd2347, %rd2727; | |
and.b64 %rd2349, %rd2348, 4294967295; | |
xor.b64 %rd2350, %rd2349, %rd2343; | |
xor.b64 %rd2351, %rd2350, %rd2736; | |
mul.lo.s64 %rd2352, %rd2351, %rd2725; | |
shr.u64 %rd2353, %rd2352, 32; | |
shr.u64 %rd2354, %rd2348, 32; | |
and.b64 %rd2355, %rd2322, 4294967295; | |
xor.b64 %rd2356, %rd2355, %rd2354; | |
xor.b64 %rd2357, %rd2356, %rd2737; | |
mul.lo.s64 %rd2358, %rd2357, %rd2725; | |
and.b64 %rd2359, %rd2358, 4294967295; | |
xor.b64 %rd2360, %rd2359, %rd2353; | |
xor.b64 %rd2361, %rd2360, %rd2738; | |
mul.lo.s64 %rd2362, %rd2361, %rd2727; | |
shr.u64 %rd2363, %rd2362, 32; | |
shr.u64 %rd2364, %rd2358, 32; | |
and.b64 %rd2365, %rd2332, 4294967295; | |
xor.b64 %rd2366, %rd2365, %rd2364; | |
xor.b64 %rd2367, %rd2366, %rd2739; | |
mul.lo.s64 %rd2368, %rd2367, %rd2727; | |
and.b64 %rd2369, %rd2368, 4294967295; | |
xor.b64 %rd2370, %rd2369, %rd2363; | |
xor.b64 %rd2371, %rd2370, %rd2740; | |
mul.lo.s64 %rd2372, %rd2371, %rd2725; | |
shr.u64 %rd2373, %rd2372, 32; | |
shr.u64 %rd2374, %rd2368, 32; | |
xor.b64 %rd2375, %rd2342, %rd2374; | |
xor.b64 %rd2376, %rd2375, %rd2741; | |
mul.lo.s64 %rd2377, %rd2376, %rd2725; | |
xor.b64 %rd2378, %rd2373, %rd2377; | |
cvt.u32.u64 %r265, %rd2378; | |
xor.b32 %r266, %r348, %r265; | |
mul.lo.s32 %r267, %r266, %r349; | |
shr.u32 %r268, %r267, 9; | |
cvt.rn.f32.u32 %f250, %r268; | |
mul.rn.f32 %f251, %f250, 0f34000000; | |
cvt.rn.f16.f32 %h136, %f251; | |
mov.b16 %h137, 0x2E66; | |
setp.ge.f16 %p64, %h136, %h137; | |
ld.global.nc.b16 %h138, [%rd45+1794]; | |
ld.global.nc.f32 %f252, [%rd46+3588]; | |
cvt.rn.f16.f32 %h139, %f252; | |
add.rn.f16 %h140, %h138, %h139; | |
mov.b16 %h141, 0x3C72; | |
mul.rn.f16 %h142, %h140, %h141; | |
selp.b16 %h143, %h142, 0x0000, %p64; | |
cvt.f32.f16 %f253, %h143; | |
ld.global.nc.b16 %h144, [%rd47+1794]; | |
cvt.f32.f16 %f254, %h144; | |
ld.global.nc.f32 %f255, [%rd48+3588]; | |
mul.rn.f32 %f256, %f1, %f255; | |
mul.rn.f32 %f257, %f256, %f254; | |
ld.global.nc.f32 %f258, [%rd49+3588]; | |
mul.rn.f32 %f259, %f2, %f256; | |
sub.rn.f32 %f260, %f258, %f259; | |
add.rn.f32 %f261, %f257, %f260; | |
add.rn.f32 %f262, %f261, %f253; | |
sub.rn.f32 %f263, %f262, %f3; | |
mul.rn.f32 %f264, %f263, %f263; | |
add.rn.f32 %f265, %f18, %f264; | |
and.b32 %r46, %r1, 31; | |
shfl.sync.down.b32 %f266, %f265, 16, 31, -1; | |
add.rn.f32 %f267, %f266, %f265; | |
shfl.sync.down.b32 %f268, %f267, 8, 31, -1; | |
add.rn.f32 %f269, %f268, %f267; | |
shfl.sync.down.b32 %f270, %f269, 4, 31, -1; | |
add.rn.f32 %f271, %f270, %f269; | |
shfl.sync.down.b32 %f272, %f271, 2, 31, -1; | |
add.rn.f32 %f273, %f272, %f271; | |
shfl.sync.down.b32 %f274, %f273, 1, 31, -1; | |
shr.u32 %r47, %r1, 5; | |
setp.ne.s32 %p65, %r46, 0; | |
mov.u64 %rd2381, shared_cache_013; | |
@%p65 bra LBB55_2; | |
mul.wide.u32 %rd2380, %r47, 4; | |
add.s64 %rd462, %rd2381, %rd2380; | |
add.rn.f32 %f19, %f274, %f273; | |
st.shared.f32 [%rd462], %f19; | |
LBB55_2: | |
bar.sync 0; | |
setp.eq.s32 %p66, %r47, 0; | |
@%p66 bra LBB55_52; | |
bra.uni LBB55_3; | |
LBB55_52: | |
add.u64 %rd474, %SP, 0; | |
add.u64 %rd11, %SPL, 0; | |
mul.wide.u32 %rd2382, %r46, 4; | |
add.s64 %rd463, %rd2381, %rd2382; | |
cvta.shared.u64 %rd2384, %rd463; | |
mov.u32 %r269, 0; | |
st.local.u32 [%rd11], %r269; | |
setp.lt.u32 %p67, %r1, 2; | |
selp.b64 %rd2386, %rd2384, %rd474, %p67; | |
ld.f32 %f275, [%rd2386]; | |
shfl.sync.down.b32 %f276, %f275, 16, 31, -1; | |
add.rn.f32 %f277, %f275, %f276; | |
shfl.sync.down.b32 %f278, %f277, 8, 31, -1; | |
add.rn.f32 %f279, %f277, %f278; | |
shfl.sync.down.b32 %f280, %f279, 4, 31, -1; | |
add.rn.f32 %f281, %f279, %f280; | |
shfl.sync.down.b32 %f282, %f281, 2, 31, -1; | |
add.rn.f32 %f283, %f281, %f282; | |
shfl.sync.down.b32 %f284, %f283, 1, 31, -1; | |
add.rn.f32 %f285, %f283, %f284; | |
st.f32 [%rd2386], %f285; | |
setp.ne.s32 %p68, %r1, 0; | |
@%p68 bra LBB55_3; | |
ld.param.u64 %rd470, [fusion_2212_param_3]; | |
cvt.u64.u32 %rd44, %r2; | |
cvta.to.global.u64 %rd7, %rd470; | |
shl.b64 %rd2379, %rd44, 2; | |
add.s64 %rd461, %rd7, %rd2379; | |
ld.shared.f32 %f286, [%rd463]; | |
atom.global.add.f32 %f287, [%rd461], %f286; | |
LBB55_3: | |
ret; | |
} | |
// .globl fusion_2209 | |
// fusion_2209 -- element-wise kernel, 256 threads per CTA (.reqntid), four
// consecutive elements per thread (1024 elements per CTA).
//
// With cta = ctaid.x, c = 4*tid.x + j (j = 0..3) and i = 1024*cta + c, and
// writing pN[...] for "the array reached through param_N", each element is:
//
//   u    = (pseudo-random 32-bit word >> 9) * 2^-23, rounded to f16
//   d    = (u >= 0x2E66) ? f32((p1[i] + f16(p11[c])) * 0x3C72) : 0.0
//   n1   = rsqrt(p6[cta] * 2^-10 + 0f2B8CBCCC) * p10[c]
//   t    = n1 * f32(p2[i]) + (p9[c] - n1 * (p7[cta] * 2^-10)) + d
//   n2   = rsqrt(p4[cta] * 2^-10 + 0f2B8CBCCC) * p8[c]
//   p0[i] = f16((p12[c] - n2 * (p5[cta] * 2^-10)) + n2 * t)
//
// p0, p1, p2 are accessed as f16; p4..p12 as f32; p3 as two u64 words.
// Constants: 0x2E66 is f16 ~0.1, 0x3C72 is f16 ~1.111, 0f3A800000 is 2^-10,
// 0f2B8CBCCC is ~1e-12, 0f34000000 is 2^-23.
// NOTE(review): the shape looks like dropout (rate ~0.1, scale ~1/0.9) fused
// with two normalisation steps over 1024-wide rows -- confirm against the
// producing HLO; only the arithmetic above is established by this code.
// param_13 is declared but never read here.
.visible .entry fusion_2209( | |
.param .u64 fusion_2209_param_0, | |
.param .u64 fusion_2209_param_1, | |
.param .u64 fusion_2209_param_2, | |
.param .u64 fusion_2209_param_3, | |
.param .u64 fusion_2209_param_4, | |
.param .u64 fusion_2209_param_5, | |
.param .u64 fusion_2209_param_6, | |
.param .u64 fusion_2209_param_7, | |
.param .u64 fusion_2209_param_8, | |
.param .u64 fusion_2209_param_9, | |
.param .u64 fusion_2209_param_10, | |
.param .u64 fusion_2209_param_11, | |
.param .u64 fusion_2209_param_12, | |
.param .u64 fusion_2209_param_13 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<6>; | |
.reg .b16 %h<39>; | |
.reg .b32 %hh<5>; | |
.reg .f32 %f<97>; | |
.reg .b32 %r<31>; | |
.reg .b64 %rd<162>; | |
// Fetch params 0..12 and convert each to a global-space address.
// Resulting bases: rd26=p0, rd25=p1, rd24=p2, rd23=p3, rd22=p4, rd21=p5,
// rd20=p6, rd18=p7, rd15=p8, rd12=p9, rd9=p10, rd6=p11, rd3=p12.
ld.param.u64 %rd1, [fusion_2209_param_0]; | |
ld.param.u64 %rd2, [fusion_2209_param_12]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2209_param_1]; | |
ld.param.u64 %rd5, [fusion_2209_param_11]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2209_param_2]; | |
ld.param.u64 %rd8, [fusion_2209_param_10]; | |
cvta.to.global.u64 %rd9, %rd8; | |
ld.param.u64 %rd10, [fusion_2209_param_3]; | |
ld.param.u64 %rd11, [fusion_2209_param_9]; | |
cvta.to.global.u64 %rd12, %rd11; | |
ld.param.u64 %rd13, [fusion_2209_param_4]; | |
ld.param.u64 %rd14, [fusion_2209_param_8]; | |
cvta.to.global.u64 %rd15, %rd14; | |
ld.param.u64 %rd16, [fusion_2209_param_5]; | |
ld.param.u64 %rd17, [fusion_2209_param_7]; | |
cvta.to.global.u64 %rd18, %rd17; | |
ld.param.u64 %rd19, [fusion_2209_param_6]; | |
cvta.to.global.u64 %rd20, %rd19; | |
cvta.to.global.u64 %rd21, %rd16; | |
cvta.to.global.u64 %rd22, %rd13; | |
cvta.to.global.u64 %rd23, %rd10; | |
cvta.to.global.u64 %rd24, %rd7; | |
cvta.to.global.u64 %rd25, %rd4; | |
cvta.to.global.u64 %rd26, %rd1; | |
// Indexing: r4 = 4*tid.x (position c of element 0 inside the CTA's 1024),
// r5 = (ctaid.x << 10) | r4 (linear index i of element 0),
// r6 / r7 = r4 | 1, r4 | 2 (positions of elements 1 and 2),
// r8 = r5 >> 2 (linear thread index across the grid).
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
shr.u32 %r8, %r5, 2; | |
// Random bits. The two u64 words read through param_3 are treated as one
// 128-bit value: r8 is added to the low word and the carry (p1) is propagated
// into the high word. The result is then mixed by repeated 32x32->64
// multiplies by 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57), xor-ing
// the high and low product halves with per-round constants.
// NOTE(review): the multipliers and the 0x9E3779B9 / 0xBB67AE85 round
// constants match Philox-4x32 -- presumably that generator; confirm.
ld.global.nc.v2.u64 {%rd27, %rd28}, [%rd23]; | |
cvt.u64.u32 %rd29, %r8; | |
add.s64 %rd30, %rd27, %rd29; | |
setp.lt.u64 %p1, %rd30, %rd27; | |
and.b64 %rd31, %rd30, 4294967295; | |
mul.lo.s64 %rd32, %rd31, 3528531795; | |
selp.u64 %rd33, 1, 0, %p1; | |
add.s64 %rd34, %rd28, %rd33; | |
xor.b64 %rd35, %rd34, %rd32; | |
shr.u64 %rd36, %rd35, 32; | |
mul.lo.s64 %rd37, %rd36, 3449720151; | |
shr.u64 %rd38, %rd37, 32; | |
and.b64 %rd39, %rd34, 4294967295; | |
mul.lo.s64 %rd40, %rd39, 3449720151; | |
and.b64 %rd41, %rd40, 4294967295; | |
xor.b64 %rd42, %rd41, %rd38; | |
xor.b64 %rd43, %rd42, 2654435769; | |
mul.lo.s64 %rd44, %rd43, 3528531795; | |
shr.u64 %rd45, %rd44, 32; | |
xor.b64 %rd46, %rd40, %rd30; | |
shr.u64 %rd47, %rd46, 32; | |
mul.lo.s64 %rd48, %rd47, 3528531795; | |
and.b64 %rd49, %rd48, 4294967295; | |
xor.b64 %rd50, %rd49, %rd45; | |
xor.b64 %rd51, %rd50, 1993301258; | |
mul.lo.s64 %rd52, %rd51, 3449720151; | |
shr.u64 %rd53, %rd52, 32; | |
shr.u64 %rd54, %rd48, 32; | |
and.b64 %rd55, %rd32, 4294967295; | |
xor.b64 %rd56, %rd55, %rd54; | |
xor.b64 %rd57, %rd56, 3144134277; | |
mul.lo.s64 %rd58, %rd57, 3449720151; | |
and.b64 %rd59, %rd58, 4294967295; | |
xor.b64 %rd60, %rd59, %rd53; | |
xor.b64 %rd61, %rd60, 3668340011; | |
mul.lo.s64 %rd62, %rd61, 3528531795; | |
shr.u64 %rd63, %rd62, 32; | |
shr.u64 %rd64, %rd58, 32; | |
and.b64 %rd65, %rd37, 4294967295; | |
xor.b64 %rd66, %rd65, %rd64; | |
xor.b64 %rd67, %rd66, 1013904242; | |
mul.lo.s64 %rd68, %rd67, 3528531795; | |
and.b64 %rd69, %rd68, 4294967295; | |
xor.b64 %rd70, %rd69, %rd63; | |
xor.b64 %rd71, %rd70, 3986602516; | |
mul.lo.s64 %rd72, %rd71, 3449720151; | |
shr.u64 %rd73, %rd72, 32; | |
shr.u64 %rd74, %rd68, 32; | |
and.b64 %rd75, %rd44, 4294967295; | |
xor.b64 %rd76, %rd75, %rd74; | |
xor.b64 %rd77, %rd76, 842468239; | |
mul.lo.s64 %rd78, %rd77, 3449720151; | |
and.b64 %rd79, %rd78, 4294967295; | |
xor.b64 %rd80, %rd79, %rd73; | |
xor.b64 %rd81, %rd80, 387276957; | |
mul.lo.s64 %rd82, %rd81, 3528531795; | |
shr.u64 %rd83, %rd82, 32; | |
shr.u64 %rd84, %rd78, 32; | |
and.b64 %rd85, %rd52, 4294967295; | |
xor.b64 %rd86, %rd85, %rd84; | |
xor.b64 %rd87, %rd86, 2027808484; | |
mul.lo.s64 %rd88, %rd87, 3528531795; | |
and.b64 %rd89, %rd88, 4294967295; | |
shr.u64 %rd90, %rd88, 32; | |
and.b64 %rd91, %rd62, 4294967295; | |
xor.b64 %rd92, %rd91, %rd90; | |
xor.b64 %rd93, %rd92, 2835769497; | |
mul.lo.s64 %rd94, %rd93, 3449720151; | |
and.b64 %rd95, %rd94, 4294967295; | |
shr.u64 %rd96, %rd94, 32; | |
and.b64 %rd97, %rd72, 4294967295; | |
xor.b64 %rd98, %rd97, %rd96; | |
xor.b64 %rd99, %rd98, 3041712726; | |
mul.lo.s64 %rd100, %rd99, 3528531795; | |
and.b64 %rd101, %rd100, 4294967295; | |
xor.b64 %rd102, %rd89, %rd83; | |
xor.b64 %rd103, %rd102, 1684936478; | |
mul.lo.s64 %rd104, %rd103, 3449720151; | |
shr.u64 %rd105, %rd104, 32; | |
xor.b64 %rd106, %rd95, %rd105; | |
xor.b64 %rd107, %rd106, 1401181199; | |
mul.lo.s64 %rd108, %rd107, 3528531795; | |
shr.u64 %rd109, %rd108, 32; | |
xor.b64 %rd110, %rd101, %rd109; | |
xor.b64 %rd111, %rd110, 3678237736; | |
mul.lo.s64 %rd112, %rd111, 3449720151; | |
shr.u64 %rd113, %rd112, 32; | |
cvt.u32.u64 %r9, %rd113; | |
shr.u64 %rd114, %rd100, 32; | |
xor.b64 %rd115, %rd114, %rd82; | |
cvt.u32.u64 %r10, %rd115; | |
xor.b32 %r11, %r10, 534103459; | |
mul.lo.s32 %r12, %r11, -845247145; | |
xor.b32 %r13, %r12, %r9; | |
// Element 0 selector: keep the upper 23 bits of the mixed word, scale by 2^-23,
// round to f16 and compare against 0x2E66. p2 is true when u >= ~0.1.
shr.u32 %r14, %r13, 9; | |
xor.b32 %r15, %r14, 4716963; | |
cvt.rn.f32.u32 %f1, %r15; | |
mul.rn.f32 %f2, %f1, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f2; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p2, %h1, %h2; | |
// Load the four f16 values p1[i..i+3] in one 8-byte access; h7..h10 hold them.
mul.wide.u32 %rd116, %r5, 2; | |
add.s64 %rd117, %rd25, %rd116; | |
ld.global.nc.v4.b16 {%h3, %h4, %h5, %h6}, [%rd117]; | |
mov.b32 %hh1, {%h5, %h6}; | |
mov.b32 %hh2, {%h3, %h4}; | |
mov.b32 {%h7, %h8}, %hh2; | |
mov.b32 {%h9, %h10}, %hh1; | |
// Element 0: d = p2 ? f32((p1[i] + f16(p11[c])) * 0x3C72) : 0  (result in f5).
mul.wide.u32 %rd118, %r4, 4; | |
add.s64 %rd119, %rd6, %rd118; | |
ld.global.nc.f32 %f3, [%rd119]; | |
cvt.rn.f16.f32 %h11, %f3; | |
add.rn.f16 %h12, %h7, %h11; | |
mov.b16 %h13, 0x3C72; | |
mul.rn.f16 %h14, %h12, %h13; | |
cvt.f32.f16 %f4, %h14; | |
selp.f32 %f5, %f4, 0f00000000, %p2; | |
// Load the four f16 values p2[i..i+3]; h19..h22 hold them.
add.s64 %rd120, %rd24, %rd116; | |
ld.global.nc.v4.b16 {%h15, %h16, %h17, %h18}, [%rd120]; | |
mov.b32 %hh3, {%h17, %h18}; | |
mov.b32 %hh4, {%h15, %h16}; | |
mov.b32 {%h19, %h20}, %hh4; | |
mov.b32 {%h21, %h22}, %hh3; | |
cvt.f32.f16 %f6, %h19; | |
// Per-CTA scalars shared by all four elements:
//   f10 = rsqrt(p6[cta] * 2^-10 + ~1e-12),  f16 = p7[cta] * 2^-10.
// Element 0 first stage: f20 = n1 * x + (p9[c] - n1 * f16) + d, n1 = f10 * p10[c].
mul.wide.u32 %rd121, %r1, 4; | |
add.s64 %rd122, %rd20, %rd121; | |
ld.global.nc.f32 %f7, [%rd122]; | |
mul.rn.f32 %f8, %f7, 0f3A800000; | |
add.rn.f32 %f9, %f8, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f10, %f9; | |
add.s64 %rd123, %rd9, %rd118; | |
ld.global.nc.f32 %f11, [%rd123]; | |
mul.rn.f32 %f12, %f10, %f11; | |
mul.rn.f32 %f13, %f12, %f6; | |
add.s64 %rd124, %rd12, %rd118; | |
ld.global.nc.f32 %f14, [%rd124]; | |
add.s64 %rd125, %rd18, %rd121; | |
ld.global.nc.f32 %f15, [%rd125]; | |
mul.rn.f32 %f16, %f15, 0f3A800000; | |
mul.rn.f32 %f17, %f12, %f16; | |
sub.rn.f32 %f18, %f14, %f17; | |
add.rn.f32 %f19, %f13, %f18; | |
add.rn.f32 %f20, %f19, %f5; | |
// Second per-CTA scalars:
//   f24 = rsqrt(p4[cta] * 2^-10 + ~1e-12),  f30 = p5[cta] * 2^-10.
// Element 0 second stage: h23 = f16((p12[c] - n2 * f30) + n2 * f20), n2 = f24 * p8[c].
add.s64 %rd126, %rd22, %rd121; | |
ld.global.nc.f32 %f21, [%rd126]; | |
mul.rn.f32 %f22, %f21, 0f3A800000; | |
add.rn.f32 %f23, %f22, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f24, %f23; | |
add.s64 %rd127, %rd15, %rd118; | |
ld.global.nc.f32 %f25, [%rd127]; | |
mul.rn.f32 %f26, %f24, %f25; | |
mul.rn.f32 %f27, %f26, %f20; | |
add.s64 %rd128, %rd3, %rd118; | |
ld.global.nc.f32 %f28, [%rd128]; | |
add.s64 %rd129, %rd21, %rd121; | |
ld.global.nc.f32 %f29, [%rd129]; | |
mul.rn.f32 %f30, %f29, 0f3A800000; | |
mul.rn.f32 %f31, %f26, %f30; | |
sub.rn.f32 %f32, %f28, %f31; | |
add.rn.f32 %f33, %f32, %f27; | |
cvt.rn.f16.f32 %h23, %f33; | |
// Output address for the 4-element store at the end of the kernel.
add.s64 %rd130, %rd26, %rd116; | |
// Element 1 selector (p3): a second 32-bit word derived from the same mixing
// state, reduced to 23 bits and compared against 0x2E66 as above.
xor.b64 %rd131, %rd72, %rd96; | |
xor.b64 %rd132, %rd131, 3041712726; | |
mul.lo.s64 %rd133, %rd132, 3528531795; | |
xor.b64 %rd134, %rd109, %rd133; | |
cvt.u32.u64 %r16, %rd134; | |
xor.b32 %r17, %r16, -616729560; | |
mul.lo.s32 %r18, %r17, -845247145; | |
shr.u32 %r19, %r18, 9; | |
cvt.rn.f32.u32 %f34, %r19; | |
mul.rn.f32 %f35, %f34, 0f34000000; | |
cvt.rn.f16.f32 %h24, %f35; | |
setp.ge.f16 %p3, %h24, %h2; | |
// Element 1 (c = r6 = 4*tid.x | 1): same two stages as element 0, reusing the
// per-CTA scalars f10, f16, f24, f30. Result in h28.
mul.wide.u32 %rd135, %r6, 4; | |
add.s64 %rd136, %rd6, %rd135; | |
ld.global.nc.f32 %f36, [%rd136]; | |
cvt.rn.f16.f32 %h25, %f36; | |
add.rn.f16 %h26, %h8, %h25; | |
mul.rn.f16 %h27, %h26, %h13; | |
cvt.f32.f16 %f37, %h27; | |
selp.f32 %f38, %f37, 0f00000000, %p3; | |
cvt.f32.f16 %f39, %h20; | |
add.s64 %rd137, %rd9, %rd135; | |
ld.global.nc.f32 %f40, [%rd137]; | |
mul.rn.f32 %f41, %f10, %f40; | |
mul.rn.f32 %f42, %f41, %f39; | |
add.s64 %rd138, %rd12, %rd135; | |
ld.global.nc.f32 %f43, [%rd138]; | |
mul.rn.f32 %f44, %f16, %f41; | |
sub.rn.f32 %f45, %f43, %f44; | |
add.rn.f32 %f46, %f42, %f45; | |
add.rn.f32 %f47, %f46, %f38; | |
add.s64 %rd139, %rd15, %rd135; | |
ld.global.nc.f32 %f48, [%rd139]; | |
mul.rn.f32 %f49, %f24, %f48; | |
mul.rn.f32 %f50, %f49, %f47; | |
add.s64 %rd140, %rd3, %rd135; | |
ld.global.nc.f32 %f51, [%rd140]; | |
mul.rn.f32 %f52, %f30, %f49; | |
sub.rn.f32 %f53, %f51, %f52; | |
add.rn.f32 %f54, %f53, %f50; | |
cvt.rn.f16.f32 %h28, %f54; | |
// Element 2 selector (p4): third 32-bit word from the mixing state.
and.b64 %rd141, %rd104, 4294967295; | |
and.b64 %rd142, %rd82, 4294967295; | |
xor.b64 %rd143, %rd142, %rd114; | |
xor.b64 %rd144, %rd143, 534103459; | |
mul.lo.s64 %rd145, %rd144, 3449720151; | |
shr.u64 %rd146, %rd145, 32; | |
xor.b64 %rd147, %rd141, %rd146; | |
xor.b64 %rd148, %rd147, 4055616968; | |
mul.lo.s64 %rd149, %rd148, 3528531795; | |
shr.u64 %rd150, %rd149, 32; | |
cvt.u32.u64 %r20, %rd150; | |
xor.b64 %rd151, %rd105, %rd94; | |
cvt.u32.u64 %r21, %rd151; | |
xor.b32 %r22, %r21, 1401181199; | |
mul.lo.s32 %r23, %r22, -766435501; | |
xor.b32 %r24, %r23, %r20; | |
shr.u32 %r25, %r24, 9; | |
xor.b32 %r26, %r25, 4936337; | |
cvt.rn.f32.u32 %f55, %r26; | |
mul.rn.f32 %f56, %f55, 0f34000000; | |
cvt.rn.f16.f32 %h29, %f56; | |
setp.ge.f16 %p4, %h29, %h2; | |
// Element 2 (c = r7 = 4*tid.x | 2). Result in h33.
mul.wide.u32 %rd152, %r7, 4; | |
add.s64 %rd153, %rd6, %rd152; | |
ld.global.nc.f32 %f57, [%rd153]; | |
cvt.rn.f16.f32 %h30, %f57; | |
add.rn.f16 %h31, %h9, %h30; | |
mul.rn.f16 %h32, %h31, %h13; | |
cvt.f32.f16 %f58, %h32; | |
selp.f32 %f59, %f58, 0f00000000, %p4; | |
cvt.f32.f16 %f60, %h21; | |
add.s64 %rd154, %rd9, %rd152; | |
ld.global.nc.f32 %f61, [%rd154]; | |
mul.rn.f32 %f62, %f10, %f61; | |
mul.rn.f32 %f63, %f62, %f60; | |
add.s64 %rd155, %rd12, %rd152; | |
ld.global.nc.f32 %f64, [%rd155]; | |
mul.rn.f32 %f65, %f16, %f62; | |
sub.rn.f32 %f66, %f64, %f65; | |
add.rn.f32 %f67, %f63, %f66; | |
add.rn.f32 %f68, %f67, %f59; | |
add.s64 %rd156, %rd15, %rd152; | |
ld.global.nc.f32 %f69, [%rd156]; | |
mul.rn.f32 %f70, %f24, %f69; | |
mul.rn.f32 %f71, %f70, %f68; | |
add.s64 %rd157, %rd3, %rd152; | |
ld.global.nc.f32 %f72, [%rd157]; | |
mul.rn.f32 %f73, %f30, %f70; | |
sub.rn.f32 %f74, %f72, %f73; | |
add.rn.f32 %f75, %f74, %f71; | |
cvt.rn.f16.f32 %h33, %f75; | |
// Element 3 selector (p5): fourth 32-bit word from the mixing state.
xor.b64 %rd158, %rd83, %rd88; | |
xor.b64 %rd159, %rd158, 1684936478; | |
mul.lo.s64 %rd160, %rd159, 3449720151; | |
xor.b64 %rd161, %rd146, %rd160; | |
cvt.u32.u64 %r27, %rd161; | |
xor.b32 %r28, %r27, -239350328; | |
mul.lo.s32 %r29, %r28, -766435501; | |
shr.u32 %r30, %r29, 9; | |
cvt.rn.f32.u32 %f76, %r30; | |
mul.rn.f32 %f77, %f76, 0f34000000; | |
cvt.rn.f16.f32 %h34, %f77; | |
setp.ge.f16 %p5, %h34, %h2; | |
// Element 3: addressed as element 0's f32 addresses + 12 bytes (c = 4*tid.x + 3).
// Result in h38.
ld.global.nc.f32 %f78, [%rd119+12]; | |
cvt.rn.f16.f32 %h35, %f78; | |
add.rn.f16 %h36, %h10, %h35; | |
mul.rn.f16 %h37, %h36, %h13; | |
cvt.f32.f16 %f79, %h37; | |
selp.f32 %f80, %f79, 0f00000000, %p5; | |
cvt.f32.f16 %f81, %h22; | |
ld.global.nc.f32 %f82, [%rd123+12]; | |
mul.rn.f32 %f83, %f10, %f82; | |
mul.rn.f32 %f84, %f83, %f81; | |
ld.global.nc.f32 %f85, [%rd124+12]; | |
mul.rn.f32 %f86, %f16, %f83; | |
sub.rn.f32 %f87, %f85, %f86; | |
add.rn.f32 %f88, %f84, %f87; | |
add.rn.f32 %f89, %f88, %f80; | |
ld.global.nc.f32 %f90, [%rd127+12]; | |
mul.rn.f32 %f91, %f24, %f90; | |
mul.rn.f32 %f92, %f91, %f89; | |
ld.global.nc.f32 %f93, [%rd128+12]; | |
mul.rn.f32 %f94, %f30, %f91; | |
sub.rn.f32 %f95, %f93, %f94; | |
add.rn.f32 %f96, %f95, %f92; | |
cvt.rn.f16.f32 %h38, %f96; | |
// Write the four f16 results to p0[i..i+3] with a single 8-byte store.
st.global.v4.b16 [%rd130], {%h23, %h28, %h33, %h38}; | |
ret; | |
} | |
// .globl fusion_2702 | |
// fusion_2702 -- narrows f32 data to f16, four consecutive elements per thread.
//   param_0 : source, read as f32 with one 16-byte vector load per thread
//   param_1 : destination, written as f16 with one 8-byte vector store
//   param_2 : declared but never read by this kernel
// Launch shape: 256 threads per CTA (.reqntid). The index of the first of a
// thread's four elements is (ctaid.x << 10) | (tid.x << 2).
.visible .entry fusion_2702(
	.param .u64 fusion_2702_param_0,
	.param .u64 fusion_2702_param_1,
	.param .u64 fusion_2702_param_2
)
.reqntid 256, 1, 1
{
	.reg .f32 %wide<4>;
	.reg .b16 %narrow<4>;
	.reg .b32 %blk, %thr, %blk_base, %thr_base, %elem;
	.reg .b64 %src_arg, %dst_arg, %src_base, %dst_base;
	.reg .b64 %src_off, %dst_off, %src_ptr, %dst_ptr;

	// Kernel arguments -> global-space base addresses.
	ld.param.u64 %src_arg, [fusion_2702_param_0];
	ld.param.u64 %dst_arg, [fusion_2702_param_1];
	cvta.to.global.u64 %src_base, %src_arg;
	cvta.to.global.u64 %dst_base, %dst_arg;

	// elem = index of this thread's first element.
	mov.u32 %blk, %ctaid.x;
	mov.u32 %thr, %tid.x;
	shl.b32 %blk_base, %blk, 10;
	shl.b32 %thr_base, %thr, 2;
	or.b32 %elem, %thr_base, %blk_base;

	// Byte offsets: 4 bytes per f32 on the way in, 2 bytes per f16 on the way out.
	mul.wide.u32 %src_off, %elem, 4;
	add.s64 %src_ptr, %src_base, %src_off;
	mul.wide.u32 %dst_off, %elem, 2;
	add.s64 %dst_ptr, %dst_base, %dst_off;

	// Load four floats, round each to nearest-even f16, store the quartet.
	ld.global.nc.v4.f32 {%wide0, %wide1, %wide2, %wide3}, [%src_ptr];
	cvt.rn.f16.f32 %narrow0, %wide0;
	cvt.rn.f16.f32 %narrow1, %wide1;
	cvt.rn.f16.f32 %narrow2, %wide2;
	cvt.rn.f16.f32 %narrow3, %wide3;
	st.global.v4.b16 [%dst_ptr], {%narrow0, %narrow1, %narrow2, %narrow3};
	ret;
}
// .globl fusion_2205 | |
// fusion_2205 -- gathers four f16 values, adds an f32 term rounded to f16 to
// each, and writes the four f16 sums contiguously.
//
// With lin = (ctaid.x << 10) | (tid.x << 2), grp = ctaid.x >> 5,
// row = bits [6, 15) of lin and col_j = ((tid.x << 2) | j) & 63 for j = 0..3:
//   a_j   = f16 at param_1 + row * 2048 + ((grp << 6) | col_j) * 2
//   w_j   = f32 at param_2 + grp * 256 + col_j * 4
//   out_j = a_j + f16(w_j)          (f16 add, round-to-nearest-even)
//   {out_0..out_3} -> param_0 + lin * 2 (single 8-byte store)
// param_3 is declared but never read. 256 threads per CTA (.reqntid).
// NOTE(review): the addressing looks like a transposing read of param_1 plus a
// broadcast add of param_2 -- confirm shapes against the producing HLO.
.visible .entry fusion_2205(
	.param .u64 fusion_2205_param_0,
	.param .u64 fusion_2205_param_1,
	.param .u64 fusion_2205_param_2,
	.param .u64 fusion_2205_param_3
)
.reqntid 256, 1, 1
{
	.reg .b16 %a<4>;
	.reg .b16 %b<4>;
	.reg .b16 %s<4>;
	.reg .f32 %w<4>;
	.reg .b32 %blk, %thr, %blk_base, %thr_base, %lin, %grp, %grp_base, %row;
	.reg .b32 %raw<4>;
	.reg .b32 %col<4>;
	.reg .b32 %idx<4>;
	.reg .b64 %out_arg, %in_arg, %add_arg, %out_base, %in_base, %add_base;
	.reg .b64 %row_off, %row_ptr, %grp_off, %grp_ptr, %out_off, %out_ptr;
	.reg .b64 %a_off<4>;
	.reg .b64 %a_ptr<4>;
	.reg .b64 %w_off<4>;
	.reg .b64 %w_ptr<4>;

	// Kernel arguments -> global-space base addresses.
	ld.param.u64 %out_arg, [fusion_2205_param_0];
	ld.param.u64 %in_arg, [fusion_2205_param_1];
	ld.param.u64 %add_arg, [fusion_2205_param_2];
	cvta.to.global.u64 %out_base, %out_arg;
	cvta.to.global.u64 %in_base, %in_arg;
	cvta.to.global.u64 %add_base, %add_arg;

	// Index pieces shared by all four elements.
	mov.u32 %blk, %ctaid.x;
	mov.u32 %thr, %tid.x;
	shl.b32 %blk_base, %blk, 10;
	shl.b32 %thr_base, %thr, 2;
	or.b32 %lin, %thr_base, %blk_base;
	shr.u32 %grp, %blk, 5;
	shl.b32 %grp_base, %grp, 6;
	bfe.u32 %row, %lin, 6, 9;

	// Row base inside param_1 (2048-byte stride) and group base inside
	// param_2 (256-byte stride).
	mul.wide.u32 %row_off, %row, 2048;
	add.s64 %row_ptr, %in_base, %row_off;
	mul.wide.u32 %grp_off, %grp, 256;
	add.s64 %grp_ptr, %add_base, %grp_off;

	// Destination of the 4-element store.
	mul.wide.u32 %out_off, %lin, 2;
	add.s64 %out_ptr, %out_base, %out_off;

	// ---- element 0 ----
	and.b32 %col0, %thr_base, 60;
	or.b32 %idx0, %col0, %grp_base;
	mul.wide.u32 %a_off0, %idx0, 2;
	add.s64 %a_ptr0, %row_ptr, %a_off0;
	ld.global.nc.b16 %a0, [%a_ptr0];
	mul.wide.u32 %w_off0, %col0, 4;
	add.s64 %w_ptr0, %grp_ptr, %w_off0;
	ld.global.nc.f32 %w0, [%w_ptr0];
	cvt.rn.f16.f32 %b0, %w0;
	add.rn.f16 %s0, %a0, %b0;

	// ---- element 1 ----
	or.b32 %raw1, %thr_base, 1;
	and.b32 %col1, %raw1, 61;
	or.b32 %idx1, %col1, %grp_base;
	mul.wide.u32 %a_off1, %idx1, 2;
	add.s64 %a_ptr1, %row_ptr, %a_off1;
	ld.global.nc.b16 %a1, [%a_ptr1];
	mul.wide.u32 %w_off1, %col1, 4;
	add.s64 %w_ptr1, %grp_ptr, %w_off1;
	ld.global.nc.f32 %w1, [%w_ptr1];
	cvt.rn.f16.f32 %b1, %w1;
	add.rn.f16 %s1, %a1, %b1;

	// ---- element 2 ----
	or.b32 %raw2, %thr_base, 2;
	and.b32 %col2, %raw2, 62;
	or.b32 %idx2, %col2, %grp_base;
	mul.wide.u32 %a_off2, %idx2, 2;
	add.s64 %a_ptr2, %row_ptr, %a_off2;
	ld.global.nc.b16 %a2, [%a_ptr2];
	mul.wide.u32 %w_off2, %col2, 4;
	add.s64 %w_ptr2, %grp_ptr, %w_off2;
	ld.global.nc.f32 %w2, [%w_ptr2];
	cvt.rn.f16.f32 %b2, %w2;
	add.rn.f16 %s2, %a2, %b2;

	// ---- element 3 ----
	or.b32 %raw3, %thr_base, 3;
	and.b32 %col3, %raw3, 63;
	or.b32 %idx3, %col3, %grp_base;
	mul.wide.u32 %a_off3, %idx3, 2;
	add.s64 %a_ptr3, %row_ptr, %a_off3;
	ld.global.nc.b16 %a3, [%a_ptr3];
	mul.wide.u32 %w_off3, %col3, 4;
	add.s64 %w_ptr3, %grp_ptr, %w_off3;
	ld.global.nc.f32 %w3, [%w_ptr3];
	cvt.rn.f16.f32 %b3, %w3;
	add.rn.f16 %s3, %a3, %b3;

	st.global.v4.b16 [%out_ptr], {%s0, %s1, %s2, %s3};
	ret;
}
// .globl fusion_2703 | |
// fusion_2703 -- narrows f32 data to f16, four consecutive elements per thread.
//   param_0 : source, read as f32 with one 16-byte vector load per thread
//   param_1 : destination, written as f16 with one 8-byte vector store
//   param_2 : declared but never read by this kernel
// Launch shape: 256 threads per CTA (.reqntid). The index of the first of a
// thread's four elements is (ctaid.x << 10) | (tid.x << 2).
.visible .entry fusion_2703(
	.param .u64 fusion_2703_param_0,
	.param .u64 fusion_2703_param_1,
	.param .u64 fusion_2703_param_2
)
.reqntid 256, 1, 1
{
	.reg .f32 %wide<4>;
	.reg .b16 %narrow<4>;
	.reg .b32 %blk, %thr, %blk_base, %thr_base, %elem;
	.reg .b64 %src_arg, %dst_arg, %src_base, %dst_base;
	.reg .b64 %src_off, %dst_off, %src_ptr, %dst_ptr;

	// Kernel arguments -> global-space base addresses.
	ld.param.u64 %src_arg, [fusion_2703_param_0];
	ld.param.u64 %dst_arg, [fusion_2703_param_1];
	cvta.to.global.u64 %src_base, %src_arg;
	cvta.to.global.u64 %dst_base, %dst_arg;

	// elem = index of this thread's first element.
	mov.u32 %blk, %ctaid.x;
	mov.u32 %thr, %tid.x;
	shl.b32 %blk_base, %blk, 10;
	shl.b32 %thr_base, %thr, 2;
	or.b32 %elem, %thr_base, %blk_base;

	// Byte offsets: 4 bytes per f32 on the way in, 2 bytes per f16 on the way out.
	mul.wide.u32 %src_off, %elem, 4;
	add.s64 %src_ptr, %src_base, %src_off;
	mul.wide.u32 %dst_off, %elem, 2;
	add.s64 %dst_ptr, %dst_base, %dst_off;

	// Load four floats, round each to nearest-even f16, store the quartet.
	ld.global.nc.v4.f32 {%wide0, %wide1, %wide2, %wide3}, [%src_ptr];
	cvt.rn.f16.f32 %narrow0, %wide0;
	cvt.rn.f16.f32 %narrow1, %wide1;
	cvt.rn.f16.f32 %narrow2, %wide2;
	cvt.rn.f16.f32 %narrow3, %wide3;
	st.global.v4.b16 [%dst_ptr], {%narrow0, %narrow1, %narrow2, %narrow3};
	ret;
}
// .globl fusion_2206 | |
// fusion_2206 -- gathers four f16 values, adds an f32 term rounded to f16 to
// each, and writes the four f16 sums contiguously.
//
// With lin = (ctaid.x << 10) | (tid.x << 2), grp = ctaid.x >> 5,
// row = bits [6, 15) of lin and col_j = ((tid.x << 2) | j) & 63 for j = 0..3:
//   a_j   = f16 at param_1 + row * 2048 + ((grp << 6) | col_j) * 2
//   w_j   = f32 at param_2 + grp * 256 + col_j * 4
//   out_j = a_j + f16(w_j)          (f16 add, round-to-nearest-even)
//   {out_0..out_3} -> param_0 + lin * 2 (single 8-byte store)
// param_3 is declared but never read. 256 threads per CTA (.reqntid).
// NOTE(review): the addressing looks like a transposing read of param_1 plus a
// broadcast add of param_2 -- confirm shapes against the producing HLO.
.visible .entry fusion_2206(
	.param .u64 fusion_2206_param_0,
	.param .u64 fusion_2206_param_1,
	.param .u64 fusion_2206_param_2,
	.param .u64 fusion_2206_param_3
)
.reqntid 256, 1, 1
{
	.reg .b16 %a<4>;
	.reg .b16 %b<4>;
	.reg .b16 %s<4>;
	.reg .f32 %w<4>;
	.reg .b32 %blk, %thr, %blk_base, %thr_base, %lin, %grp, %grp_base, %row;
	.reg .b32 %raw<4>;
	.reg .b32 %col<4>;
	.reg .b32 %idx<4>;
	.reg .b64 %out_arg, %in_arg, %add_arg, %out_base, %in_base, %add_base;
	.reg .b64 %row_off, %row_ptr, %grp_off, %grp_ptr, %out_off, %out_ptr;
	.reg .b64 %a_off<4>;
	.reg .b64 %a_ptr<4>;
	.reg .b64 %w_off<4>;
	.reg .b64 %w_ptr<4>;

	// Kernel arguments -> global-space base addresses.
	ld.param.u64 %out_arg, [fusion_2206_param_0];
	ld.param.u64 %in_arg, [fusion_2206_param_1];
	ld.param.u64 %add_arg, [fusion_2206_param_2];
	cvta.to.global.u64 %out_base, %out_arg;
	cvta.to.global.u64 %in_base, %in_arg;
	cvta.to.global.u64 %add_base, %add_arg;

	// Index pieces shared by all four elements.
	mov.u32 %blk, %ctaid.x;
	mov.u32 %thr, %tid.x;
	shl.b32 %blk_base, %blk, 10;
	shl.b32 %thr_base, %thr, 2;
	or.b32 %lin, %thr_base, %blk_base;
	shr.u32 %grp, %blk, 5;
	shl.b32 %grp_base, %grp, 6;
	bfe.u32 %row, %lin, 6, 9;

	// Row base inside param_1 (2048-byte stride) and group base inside
	// param_2 (256-byte stride).
	mul.wide.u32 %row_off, %row, 2048;
	add.s64 %row_ptr, %in_base, %row_off;
	mul.wide.u32 %grp_off, %grp, 256;
	add.s64 %grp_ptr, %add_base, %grp_off;

	// Destination of the 4-element store.
	mul.wide.u32 %out_off, %lin, 2;
	add.s64 %out_ptr, %out_base, %out_off;

	// ---- element 0 ----
	and.b32 %col0, %thr_base, 60;
	or.b32 %idx0, %col0, %grp_base;
	mul.wide.u32 %a_off0, %idx0, 2;
	add.s64 %a_ptr0, %row_ptr, %a_off0;
	ld.global.nc.b16 %a0, [%a_ptr0];
	mul.wide.u32 %w_off0, %col0, 4;
	add.s64 %w_ptr0, %grp_ptr, %w_off0;
	ld.global.nc.f32 %w0, [%w_ptr0];
	cvt.rn.f16.f32 %b0, %w0;
	add.rn.f16 %s0, %a0, %b0;

	// ---- element 1 ----
	or.b32 %raw1, %thr_base, 1;
	and.b32 %col1, %raw1, 61;
	or.b32 %idx1, %col1, %grp_base;
	mul.wide.u32 %a_off1, %idx1, 2;
	add.s64 %a_ptr1, %row_ptr, %a_off1;
	ld.global.nc.b16 %a1, [%a_ptr1];
	mul.wide.u32 %w_off1, %col1, 4;
	add.s64 %w_ptr1, %grp_ptr, %w_off1;
	ld.global.nc.f32 %w1, [%w_ptr1];
	cvt.rn.f16.f32 %b1, %w1;
	add.rn.f16 %s1, %a1, %b1;

	// ---- element 2 ----
	or.b32 %raw2, %thr_base, 2;
	and.b32 %col2, %raw2, 62;
	or.b32 %idx2, %col2, %grp_base;
	mul.wide.u32 %a_off2, %idx2, 2;
	add.s64 %a_ptr2, %row_ptr, %a_off2;
	ld.global.nc.b16 %a2, [%a_ptr2];
	mul.wide.u32 %w_off2, %col2, 4;
	add.s64 %w_ptr2, %grp_ptr, %w_off2;
	ld.global.nc.f32 %w2, [%w_ptr2];
	cvt.rn.f16.f32 %b2, %w2;
	add.rn.f16 %s2, %a2, %b2;

	// ---- element 3 ----
	or.b32 %raw3, %thr_base, 3;
	and.b32 %col3, %raw3, 63;
	or.b32 %idx3, %col3, %grp_base;
	mul.wide.u32 %a_off3, %idx3, 2;
	add.s64 %a_ptr3, %row_ptr, %a_off3;
	ld.global.nc.b16 %a3, [%a_ptr3];
	mul.wide.u32 %w_off3, %col3, 4;
	add.s64 %w_ptr3, %grp_ptr, %w_off3;
	ld.global.nc.f32 %w3, [%w_ptr3];
	cvt.rn.f16.f32 %b3, %w3;
	add.rn.f16 %s3, %a3, %b3;

	st.global.v4.b16 [%out_ptr], {%s0, %s1, %s2, %s3};
	ret;
}
// .globl fusion_2203 | |
// fusion_2203 -- per-CTA maximum reduction; one 32-thread CTA (.reqntid)
// covers 512 consecutive f16 values read through param_0.
//
// Each thread visits 8 tiles of 64 elements, two adjacent elements per tile.
// Before taking the maximum, every value x is adjusted in f16 to
//     x - (1 - f16(m)) * 0x70E2          (0x70E2 is f16 10000, 0x3C00 is f16 1)
// where m is the s32 read through param_2 at the element's position within
// the 512-element span (param_2 is indexed without ctaid.x).
// The 32 partial maxima are combined with warp shuffles, and lane 0 merges the
// result into the f32 at
//     param_1 + (ctaid.x >> 9) * 2048 + (ctaid.x & 511) * 4
// using a compare-and-swap loop.
// param_3 is declared but never read here.
// NOTE(review): the (1 - m) * 10000 term looks like an additive attention mask
// feeding a softmax row-max -- confirm against the producing HLO.
.visible .entry fusion_2203( | |
.param .u64 fusion_2203_param_0, | |
.param .u64 fusion_2203_param_1, | |
.param .u64 fusion_2203_param_2, | |
.param .u64 fusion_2203_param_3 | |
) | |
.reqntid 32, 1, 1 | |
{ | |
.local .align 4 .b8 __local_depot61[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<4>; | |
.reg .b16 %h<83>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<57>; | |
.reg .b32 %r<37>; | |
.reg .b64 %rd<37>; | |
// 4-byte local slot: rd1 is its local-space address, rd10 its generic address.
mov.u64 %SPL, __local_depot61; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd4, [fusion_2203_param_0]; | |
ld.param.u64 %rd5, [fusion_2203_param_2]; | |
cvta.to.global.u64 %rd6, %rd5; | |
cvta.to.global.u64 %rd9, %rd4; | |
add.u64 %rd10, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
// r6 = 2*tid.x (position of the thread's first element inside a 64-wide tile),
// r8 = (ctaid.x << 9) | r6 (index into param_0 for tile 0).
mov.u32 %r1, %tid.x; | |
mov.u32 %r5, %ctaid.x; | |
shl.b32 %r6, %r1, 1; | |
shl.b32 %r7, %r5, 9; | |
or.b32 %r8, %r7, %r6; | |
// Tile 0: load two f16 values (one 32-bit access) and the two matching s32
// values, apply the adjustment, and start the running maximum from -inf
// (0fFF800000).
mul.wide.u32 %rd11, %r8, 2; | |
add.s64 %rd12, %rd9, %rd11; | |
ld.global.nc.b32 %hh1, [%rd12]; | |
mov.b32 {%h1, %h2}, %hh1; | |
mul.wide.u32 %rd13, %r6, 4; | |
add.s64 %rd14, %rd6, %rd13; | |
ld.global.nc.v2.u32 {%r9, %r10}, [%rd14]; | |
cvt.rn.f16.s32 %h3, %r9; | |
mov.b16 %h4, 0x3C00; | |
sub.rn.f16 %h5, %h4, %h3; | |
mov.b16 %h6, 0x70E2; | |
mul.rn.f16 %h7, %h5, %h6; | |
sub.rn.f16 %h8, %h1, %h7; | |
cvt.f32.f16 %f2, %h8; | |
max.f32 %f3, %f2, 0fFF800000; | |
cvt.rn.f16.s32 %h9, %r10; | |
sub.rn.f16 %h10, %h4, %h9; | |
mul.rn.f16 %h11, %h10, %h6; | |
sub.rn.f16 %h12, %h2, %h11; | |
cvt.f32.f16 %f4, %h12; | |
max.f32 %f5, %f3, %f4; | |
// Tile 1 (+64 elements: +128 bytes in param_0; s32 indices r6|64 and r6+65).
or.b32 %r11, %r6, 64; | |
ld.global.nc.b32 %hh2, [%rd12+128]; | |
mov.b32 {%h13, %h14}, %hh2; | |
mul.wide.u32 %rd15, %r11, 4; | |
add.s64 %rd16, %rd6, %rd15; | |
ld.global.nc.u32 %r12, [%rd16]; | |
cvt.rn.f16.s32 %h15, %r12; | |
sub.rn.f16 %h16, %h4, %h15; | |
mul.rn.f16 %h17, %h16, %h6; | |
sub.rn.f16 %h18, %h13, %h17; | |
cvt.f32.f16 %f6, %h18; | |
max.f32 %f7, %f5, %f6; | |
ld.global.nc.u32 %r13, [%rd14+260]; | |
cvt.rn.f16.s32 %h19, %r13; | |
sub.rn.f16 %h20, %h4, %h19; | |
mul.rn.f16 %h21, %h20, %h6; | |
sub.rn.f16 %h22, %h14, %h21; | |
cvt.f32.f16 %f8, %h22; | |
max.f32 %f9, %f7, %f8; | |
// Tile 2 (+128 elements).
or.b32 %r14, %r6, 128; | |
ld.global.nc.b32 %hh3, [%rd12+256]; | |
mov.b32 {%h23, %h24}, %hh3; | |
mul.wide.u32 %rd17, %r14, 4; | |
add.s64 %rd18, %rd6, %rd17; | |
ld.global.nc.u32 %r15, [%rd18]; | |
cvt.rn.f16.s32 %h25, %r15; | |
sub.rn.f16 %h26, %h4, %h25; | |
mul.rn.f16 %h27, %h26, %h6; | |
sub.rn.f16 %h28, %h23, %h27; | |
cvt.f32.f16 %f10, %h28; | |
max.f32 %f11, %f9, %f10; | |
ld.global.nc.u32 %r16, [%rd14+516]; | |
cvt.rn.f16.s32 %h29, %r16; | |
sub.rn.f16 %h30, %h4, %h29; | |
mul.rn.f16 %h31, %h30, %h6; | |
sub.rn.f16 %h32, %h24, %h31; | |
cvt.f32.f16 %f12, %h32; | |
max.f32 %f13, %f11, %f12; | |
// Tile 3 (+192 elements).
or.b32 %r17, %r6, 192; | |
ld.global.nc.b32 %hh4, [%rd12+384]; | |
mov.b32 {%h33, %h34}, %hh4; | |
mul.wide.u32 %rd19, %r17, 4; | |
add.s64 %rd20, %rd6, %rd19; | |
ld.global.nc.u32 %r18, [%rd20]; | |
cvt.rn.f16.s32 %h35, %r18; | |
sub.rn.f16 %h36, %h4, %h35; | |
mul.rn.f16 %h37, %h36, %h6; | |
sub.rn.f16 %h38, %h33, %h37; | |
cvt.f32.f16 %f14, %h38; | |
max.f32 %f15, %f13, %f14; | |
ld.global.nc.u32 %r19, [%rd14+772]; | |
cvt.rn.f16.s32 %h39, %r19; | |
sub.rn.f16 %h40, %h4, %h39; | |
mul.rn.f16 %h41, %h40, %h6; | |
sub.rn.f16 %h42, %h34, %h41; | |
cvt.f32.f16 %f16, %h42; | |
max.f32 %f17, %f15, %f16; | |
// Tile 4 (+256 elements).
or.b32 %r20, %r6, 256; | |
ld.global.nc.b32 %hh5, [%rd12+512]; | |
mov.b32 {%h43, %h44}, %hh5; | |
mul.wide.u32 %rd21, %r20, 4; | |
add.s64 %rd22, %rd6, %rd21; | |
ld.global.nc.u32 %r21, [%rd22]; | |
cvt.rn.f16.s32 %h45, %r21; | |
sub.rn.f16 %h46, %h4, %h45; | |
mul.rn.f16 %h47, %h46, %h6; | |
sub.rn.f16 %h48, %h43, %h47; | |
cvt.f32.f16 %f18, %h48; | |
max.f32 %f19, %f17, %f18; | |
ld.global.nc.u32 %r22, [%rd14+1028]; | |
cvt.rn.f16.s32 %h49, %r22; | |
sub.rn.f16 %h50, %h4, %h49; | |
mul.rn.f16 %h51, %h50, %h6; | |
sub.rn.f16 %h52, %h44, %h51; | |
cvt.f32.f16 %f20, %h52; | |
max.f32 %f21, %f19, %f20; | |
// Tile 5 (+320 elements).
or.b32 %r23, %r6, 320; | |
ld.global.nc.b32 %hh6, [%rd12+640]; | |
mov.b32 {%h53, %h54}, %hh6; | |
mul.wide.u32 %rd23, %r23, 4; | |
add.s64 %rd24, %rd6, %rd23; | |
ld.global.nc.u32 %r24, [%rd24]; | |
cvt.rn.f16.s32 %h55, %r24; | |
sub.rn.f16 %h56, %h4, %h55; | |
mul.rn.f16 %h57, %h56, %h6; | |
sub.rn.f16 %h58, %h53, %h57; | |
cvt.f32.f16 %f22, %h58; | |
max.f32 %f23, %f21, %f22; | |
ld.global.nc.u32 %r25, [%rd14+1284]; | |
cvt.rn.f16.s32 %h59, %r25; | |
sub.rn.f16 %h60, %h4, %h59; | |
mul.rn.f16 %h61, %h60, %h6; | |
sub.rn.f16 %h62, %h54, %h61; | |
cvt.f32.f16 %f24, %h62; | |
max.f32 %f25, %f23, %f24; | |
// Tile 6 (+384 elements).
or.b32 %r26, %r6, 384; | |
ld.global.nc.b32 %hh7, [%rd12+768]; | |
mov.b32 {%h63, %h64}, %hh7; | |
mul.wide.u32 %rd25, %r26, 4; | |
add.s64 %rd26, %rd6, %rd25; | |
ld.global.nc.u32 %r27, [%rd26]; | |
cvt.rn.f16.s32 %h65, %r27; | |
sub.rn.f16 %h66, %h4, %h65; | |
mul.rn.f16 %h67, %h66, %h6; | |
sub.rn.f16 %h68, %h63, %h67; | |
cvt.f32.f16 %f26, %h68; | |
max.f32 %f27, %f25, %f26; | |
ld.global.nc.u32 %r28, [%rd14+1540]; | |
cvt.rn.f16.s32 %h69, %r28; | |
sub.rn.f16 %h70, %h4, %h69; | |
mul.rn.f16 %h71, %h70, %h6; | |
sub.rn.f16 %h72, %h64, %h71; | |
cvt.f32.f16 %f28, %h72; | |
max.f32 %f29, %f27, %f28; | |
// Tile 7 (+448 elements).
or.b32 %r29, %r6, 448; | |
ld.global.nc.b32 %hh8, [%rd12+896]; | |
mov.b32 {%h73, %h74}, %hh8; | |
mul.wide.u32 %rd27, %r29, 4; | |
add.s64 %rd28, %rd6, %rd27; | |
ld.global.nc.u32 %r30, [%rd28]; | |
cvt.rn.f16.s32 %h75, %r30; | |
sub.rn.f16 %h76, %h4, %h75; | |
mul.rn.f16 %h77, %h76, %h6; | |
sub.rn.f16 %h78, %h73, %h77; | |
cvt.f32.f16 %f30, %h78; | |
max.f32 %f31, %f29, %f30; | |
ld.global.nc.u32 %r31, [%rd14+1796]; | |
cvt.rn.f16.s32 %h79, %r31; | |
sub.rn.f16 %h80, %h4, %h79; | |
mul.rn.f16 %h81, %h80, %h6; | |
sub.rn.f16 %h82, %h74, %h81; | |
cvt.f32.f16 %f32, %h82; | |
max.f32 %f33, %f31, %f32; | |
// Warp reduction of the per-thread maxima: shuffle-down by 16, 8, 4, 2, 1 with
// a full member mask (-1). After the last step lane 0 holds the warp maximum.
shfl.sync.down.b32 %f34, %f33, 16, 31, -1; | |
max.f32 %f35, %f33, %f34; | |
shfl.sync.down.b32 %f36, %f35, 8, 31, -1; | |
max.f32 %f37, %f35, %f36; | |
shfl.sync.down.b32 %f38, %f37, 4, 31, -1; | |
max.f32 %f39, %f37, %f38; | |
shfl.sync.down.b32 %f40, %f39, 2, 31, -1; | |
max.f32 %f41, %f39, %f40; | |
shfl.sync.down.b32 %f42, %f41, 1, 31, -1; | |
// p1 = (tid.x == 0). Only lane 0 performs the final max and writes it to
// shared_cache_014[0].
setp.eq.s32 %p1, %r1, 0; | |
@%p1 bra LBB61_3; | |
bra.uni LBB61_1; | |
LBB61_3: | |
max.f32 %f1, %f41, %f42; | |
st.shared.f32 [shared_cache_014], %f1; | |
LBB61_1: | |
bar.sync 0; | |
// Second reduction stage. Lane 0 reads back shared_cache_014[tid.x]; every
// other lane reads the local slot, which has just been set to the bit pattern
// of -inf (-8388608 == 0xFF800000). All 32 lanes then run the same shuffle/max
// sequence and store the result back to the address they loaded from.
mul.wide.u32 %rd32, %r1, 4; | |
mov.u64 %rd33, shared_cache_014; | |
add.s64 %rd3, %rd33, %rd32; | |
cvta.shared.u64 %rd34, %rd3; | |
mov.u32 %r34, -8388608; | |
st.local.u32 [%rd1], %r34; | |
selp.b64 %rd36, %rd34, %rd10, %p1; | |
ld.f32 %f43, [%rd36]; | |
shfl.sync.down.b32 %f44, %f43, 16, 31, -1; | |
max.f32 %f45, %f43, %f44; | |
shfl.sync.down.b32 %f46, %f45, 8, 31, -1; | |
max.f32 %f47, %f45, %f46; | |
shfl.sync.down.b32 %f48, %f47, 4, 31, -1; | |
max.f32 %f49, %f47, %f48; | |
shfl.sync.down.b32 %f50, %f49, 2, 31, -1; | |
max.f32 %f51, %f49, %f50; | |
shfl.sync.down.b32 %f52, %f51, 1, 31, -1; | |
max.f32 %f53, %f51, %f52; | |
st.f32 [%rd36], %f53; | |
// Only lane 0 continues to the global update; the rest return.
@%p1 bra LBB61_4; | |
bra.uni LBB61_2; | |
LBB61_4: | |
// rd2 = param_1 + (ctaid.x >> 9) * 2048 + (ctaid.x & 511) * 4; r36 holds the
// value currently stored there.
ld.param.u64 %rd7, [fusion_2203_param_1]; | |
shr.u32 %r33, %r5, 9; | |
cvta.to.global.u64 %rd8, %rd7; | |
and.b32 %r32, %r5, 511; | |
mul.wide.u32 %rd29, %r33, 2048; | |
add.s64 %rd30, %rd8, %rd29; | |
mul.wide.u32 %rd31, %r32, 4; | |
add.s64 %rd2, %rd30, %rd31; | |
ld.global.u32 %r36, [%rd2]; | |
LBB61_5: | |
// CAS loop implementing a float max on global memory: compute
// max(observed, shared result), try to swap it in, and retry with the value
// returned by the CAS whenever it differs (bitwise) from what was expected.
mov.b32 %f54, %r36; | |
ld.shared.f32 %f55, [%rd3]; | |
max.f32 %f56, %f54, %f55; | |
mov.b32 %r35, %f56; | |
atom.global.cas.b32 %r4, [%rd2], %r36, %r35; | |
setp.eq.s32 %p3, %r4, %r36; | |
mov.u32 %r36, %r4; | |
@%p3 bra LBB61_2; | |
bra.uni LBB61_5; | |
LBB61_2: | |
ret; | |
} | |
// .globl fusion_2201 | |
// fusion_2201: per-CTA sum reduction, 32 threads per CTA (see .reqntid). | |
// Each thread reads 16 f16 values from param_0 (8 packed pairs, 64 elements apart) inside the | |
// 512-element slice selected by %ctaid.x. Every value x is combined with an s32 value m from | |
// param_3 (indexed by position inside the slice only) as x - (1.0 - f16(m)) * 10000.0, the f32 | |
// at param_2[%ctaid.x] is subtracted, and an approximate exp() of the result is accumulated. | |
// The 32 per-thread sums are reduced with warp shuffles and thread 0 atomically adds the total | |
// to the f32 at param_1[%ctaid.x]. param_4 is declared but never read in this body. | |
// NOTE(review): looks like the denominator pass of a masked softmax -- confirm with the producer. | |
.visible .entry fusion_2201( | |
.param .u64 fusion_2201_param_0, | |
.param .u64 fusion_2201_param_1, | |
.param .u64 fusion_2201_param_2, | |
.param .u64 fusion_2201_param_3, | |
.param .u64 fusion_2201_param_4 | |
) | |
.reqntid 32, 1, 1 | |
{ | |
// 4-byte local slot; used below as a zero-valued stand-in for threads other than thread 0. | |
.local .align 4 .b8 __local_depot62[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<35>; | |
.reg .b16 %h<83>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<249>; | |
.reg .b32 %r<32>; | |
.reg .b64 %rd<41>; | |
mov.u64 %SPL, __local_depot62; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd4, [fusion_2201_param_0]; | |
ld.param.u64 %rd5, [fusion_2201_param_3]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd8, [fusion_2201_param_2]; | |
cvta.to.global.u64 %rd9, %rd8; | |
cvta.to.global.u64 %rd11, %rd4; | |
// %rd12 = generic address of the local slot, %rd1 = its local-space address. | |
add.u64 %rd12, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
// %r3 = 2 * tid (first element of this thread's pair), %r5 = (ctaid << 9) | %r3. | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r2, %ctaid.x; | |
shl.b32 %r3, %r1, 1; | |
shl.b32 %r4, %r2, 9; | |
or.b32 %r5, %r4, %r3; | |
// %rd14 -> packed f16 pair in param_0 (2 bytes per element). | |
mul.wide.u32 %rd13, %r5, 2; | |
add.s64 %rd14, %rd11, %rd13; | |
ld.global.nc.b32 %hh1, [%rd14]; | |
mov.b32 {%h1, %h2}, %hh1; | |
// %rd16 -> matching s32 pair in param_3 (4 bytes per element, no ctaid term). | |
mul.wide.u32 %rd15, %r3, 4; | |
add.s64 %rd16, %rd6, %rd15; | |
ld.global.nc.v2.u32 {%r6, %r7}, [%rd16]; | |
// Element 0: masked = x - (1.0 - f16(m)) * 10000.0  (0x3C00 is f16 1.0, 0x70E2 is f16 10000.0). | |
cvt.rn.f16.s32 %h3, %r6; | |
mov.b16 %h4, 0x3C00; | |
sub.rn.f16 %h5, %h4, %h3; | |
mov.b16 %h6, 0x70E2; | |
mul.rn.f16 %h7, %h5, %h6; | |
sub.rn.f16 %h8, %h1, %h7; | |
cvt.f32.f16 %f2, %h8; | |
// %f3 = param_2[ctaid] (f32), shared by all 16 elements; %f4 = masked - %f3. | |
mul.wide.u32 %rd17, %r2, 4; | |
add.s64 %rd18, %rd9, %rd17; | |
ld.global.nc.f32 %f3, [%rd18]; | |
sub.rn.f32 %f4, %f2, %f3; | |
// exp(%f4): n = trunc(%f4 * 0f3FB8AA3B (~log2 e)); the remainder %f4 - n*ln2 is formed with two | |
// fma steps (0fBF317200 ~ -0.693146, 0fB5BFBE8E ~ -1.4286e-6); result = 2^n * 2^(rem * log2 e). | |
mul.rn.f32 %f5, %f4, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f6, %f5; | |
add.rn.f32 %f7, %f6, 0f00000000; | |
ex2.approx.f32 %f8, %f7; | |
fma.rn.f32 %f9, %f6, 0fBF317200, %f4; | |
fma.rn.f32 %f10, %f6, 0fB5BFBE8E, %f9; | |
mul.rn.f32 %f11, %f10, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f12, %f11; | |
mul.rn.f32 %f13, %f8, %f12; | |
// Clamp: argument < -105.0 gives 0, argument > 105.0 gives +inf (0f7F800000). | |
setp.lt.f32 %p1, %f4, 0fC2D20000; | |
setp.gt.f32 %p2, %f4, 0f42D20000; | |
add.rn.f32 %f14, %f13, 0f00000000; | |
selp.f32 %f15, 0f00000000, %f14, %p1; | |
selp.f32 %f16, 0f7F800000, %f15, %p2; | |
// Element 1 (second half of the first pair): same computation; %f30 = running sum. | |
cvt.rn.f16.s32 %h9, %r7; | |
sub.rn.f16 %h10, %h4, %h9; | |
mul.rn.f16 %h11, %h10, %h6; | |
sub.rn.f16 %h12, %h2, %h11; | |
cvt.f32.f16 %f17, %h12; | |
sub.rn.f32 %f18, %f17, %f3; | |
mul.rn.f32 %f19, %f18, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f20, %f19; | |
add.rn.f32 %f21, %f20, 0f00000000; | |
ex2.approx.f32 %f22, %f21; | |
fma.rn.f32 %f23, %f20, 0fBF317200, %f18; | |
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23; | |
mul.rn.f32 %f25, %f24, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f26, %f25; | |
mul.rn.f32 %f27, %f22, %f26; | |
setp.lt.f32 %p3, %f18, 0fC2D20000; | |
selp.f32 %f28, 0f00000000, %f27, %p3; | |
setp.gt.f32 %p4, %f18, 0f42D20000; | |
selp.f32 %f29, 0f7F800000, %f28, %p4; | |
add.rn.f32 %f30, %f16, %f29; | |
// Pair at element offset 64 (+128 bytes in param_0; param_3 indices %r3|64 and %r3+65). | |
or.b32 %r8, %r3, 64; | |
ld.global.nc.b32 %hh2, [%rd14+128]; | |
mov.b32 {%h13, %h14}, %hh2; | |
mul.wide.u32 %rd19, %r8, 4; | |
add.s64 %rd20, %rd6, %rd19; | |
ld.global.nc.u32 %r9, [%rd20]; | |
cvt.rn.f16.s32 %h15, %r9; | |
sub.rn.f16 %h16, %h4, %h15; | |
mul.rn.f16 %h17, %h16, %h6; | |
sub.rn.f16 %h18, %h13, %h17; | |
cvt.f32.f16 %f31, %h18; | |
sub.rn.f32 %f32, %f31, %f3; | |
mul.rn.f32 %f33, %f32, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f34, %f33; | |
add.rn.f32 %f35, %f34, 0f00000000; | |
ex2.approx.f32 %f36, %f35; | |
fma.rn.f32 %f37, %f34, 0fBF317200, %f32; | |
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37; | |
mul.rn.f32 %f39, %f38, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f40, %f39; | |
mul.rn.f32 %f41, %f36, %f40; | |
setp.lt.f32 %p5, %f32, 0fC2D20000; | |
selp.f32 %f42, 0f00000000, %f41, %p5; | |
setp.gt.f32 %p6, %f32, 0f42D20000; | |
selp.f32 %f43, 0f7F800000, %f42, %p6; | |
add.rn.f32 %f44, %f30, %f43; | |
ld.global.nc.u32 %r10, [%rd16+260]; | |
cvt.rn.f16.s32 %h19, %r10; | |
sub.rn.f16 %h20, %h4, %h19; | |
mul.rn.f16 %h21, %h20, %h6; | |
sub.rn.f16 %h22, %h14, %h21; | |
cvt.f32.f16 %f45, %h22; | |
sub.rn.f32 %f46, %f45, %f3; | |
mul.rn.f32 %f47, %f46, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f48, %f47; | |
add.rn.f32 %f49, %f48, 0f00000000; | |
ex2.approx.f32 %f50, %f49; | |
fma.rn.f32 %f51, %f48, 0fBF317200, %f46; | |
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51; | |
mul.rn.f32 %f53, %f52, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f54, %f53; | |
mul.rn.f32 %f55, %f50, %f54; | |
setp.lt.f32 %p7, %f46, 0fC2D20000; | |
selp.f32 %f56, 0f00000000, %f55, %p7; | |
setp.gt.f32 %p8, %f46, 0f42D20000; | |
selp.f32 %f57, 0f7F800000, %f56, %p8; | |
add.rn.f32 %f58, %f44, %f57; | |
// Pair at element offset 128 (+256 bytes in param_0). | |
or.b32 %r11, %r3, 128; | |
ld.global.nc.b32 %hh3, [%rd14+256]; | |
mov.b32 {%h23, %h24}, %hh3; | |
mul.wide.u32 %rd21, %r11, 4; | |
add.s64 %rd22, %rd6, %rd21; | |
ld.global.nc.u32 %r12, [%rd22]; | |
cvt.rn.f16.s32 %h25, %r12; | |
sub.rn.f16 %h26, %h4, %h25; | |
mul.rn.f16 %h27, %h26, %h6; | |
sub.rn.f16 %h28, %h23, %h27; | |
cvt.f32.f16 %f59, %h28; | |
sub.rn.f32 %f60, %f59, %f3; | |
mul.rn.f32 %f61, %f60, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f62, %f61; | |
add.rn.f32 %f63, %f62, 0f00000000; | |
ex2.approx.f32 %f64, %f63; | |
fma.rn.f32 %f65, %f62, 0fBF317200, %f60; | |
fma.rn.f32 %f66, %f62, 0fB5BFBE8E, %f65; | |
mul.rn.f32 %f67, %f66, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f68, %f67; | |
mul.rn.f32 %f69, %f64, %f68; | |
setp.lt.f32 %p9, %f60, 0fC2D20000; | |
selp.f32 %f70, 0f00000000, %f69, %p9; | |
setp.gt.f32 %p10, %f60, 0f42D20000; | |
selp.f32 %f71, 0f7F800000, %f70, %p10; | |
add.rn.f32 %f72, %f58, %f71; | |
ld.global.nc.u32 %r13, [%rd16+516]; | |
cvt.rn.f16.s32 %h29, %r13; | |
sub.rn.f16 %h30, %h4, %h29; | |
mul.rn.f16 %h31, %h30, %h6; | |
sub.rn.f16 %h32, %h24, %h31; | |
cvt.f32.f16 %f73, %h32; | |
sub.rn.f32 %f74, %f73, %f3; | |
mul.rn.f32 %f75, %f74, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f76, %f75; | |
add.rn.f32 %f77, %f76, 0f00000000; | |
ex2.approx.f32 %f78, %f77; | |
fma.rn.f32 %f79, %f76, 0fBF317200, %f74; | |
fma.rn.f32 %f80, %f76, 0fB5BFBE8E, %f79; | |
mul.rn.f32 %f81, %f80, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f82, %f81; | |
mul.rn.f32 %f83, %f78, %f82; | |
setp.lt.f32 %p11, %f74, 0fC2D20000; | |
selp.f32 %f84, 0f00000000, %f83, %p11; | |
setp.gt.f32 %p12, %f74, 0f42D20000; | |
selp.f32 %f85, 0f7F800000, %f84, %p12; | |
add.rn.f32 %f86, %f72, %f85; | |
// Pair at element offset 192 (+384 bytes in param_0). | |
or.b32 %r14, %r3, 192; | |
ld.global.nc.b32 %hh4, [%rd14+384]; | |
mov.b32 {%h33, %h34}, %hh4; | |
mul.wide.u32 %rd23, %r14, 4; | |
add.s64 %rd24, %rd6, %rd23; | |
ld.global.nc.u32 %r15, [%rd24]; | |
cvt.rn.f16.s32 %h35, %r15; | |
sub.rn.f16 %h36, %h4, %h35; | |
mul.rn.f16 %h37, %h36, %h6; | |
sub.rn.f16 %h38, %h33, %h37; | |
cvt.f32.f16 %f87, %h38; | |
sub.rn.f32 %f88, %f87, %f3; | |
mul.rn.f32 %f89, %f88, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f90, %f89; | |
add.rn.f32 %f91, %f90, 0f00000000; | |
ex2.approx.f32 %f92, %f91; | |
fma.rn.f32 %f93, %f90, 0fBF317200, %f88; | |
fma.rn.f32 %f94, %f90, 0fB5BFBE8E, %f93; | |
mul.rn.f32 %f95, %f94, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f96, %f95; | |
mul.rn.f32 %f97, %f92, %f96; | |
setp.lt.f32 %p13, %f88, 0fC2D20000; | |
selp.f32 %f98, 0f00000000, %f97, %p13; | |
setp.gt.f32 %p14, %f88, 0f42D20000; | |
selp.f32 %f99, 0f7F800000, %f98, %p14; | |
add.rn.f32 %f100, %f86, %f99; | |
ld.global.nc.u32 %r16, [%rd16+772]; | |
cvt.rn.f16.s32 %h39, %r16; | |
sub.rn.f16 %h40, %h4, %h39; | |
mul.rn.f16 %h41, %h40, %h6; | |
sub.rn.f16 %h42, %h34, %h41; | |
cvt.f32.f16 %f101, %h42; | |
sub.rn.f32 %f102, %f101, %f3; | |
mul.rn.f32 %f103, %f102, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f104, %f103; | |
add.rn.f32 %f105, %f104, 0f00000000; | |
ex2.approx.f32 %f106, %f105; | |
fma.rn.f32 %f107, %f104, 0fBF317200, %f102; | |
fma.rn.f32 %f108, %f104, 0fB5BFBE8E, %f107; | |
mul.rn.f32 %f109, %f108, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f110, %f109; | |
mul.rn.f32 %f111, %f106, %f110; | |
setp.lt.f32 %p15, %f102, 0fC2D20000; | |
selp.f32 %f112, 0f00000000, %f111, %p15; | |
setp.gt.f32 %p16, %f102, 0f42D20000; | |
selp.f32 %f113, 0f7F800000, %f112, %p16; | |
add.rn.f32 %f114, %f100, %f113; | |
// Pair at element offset 256 (+512 bytes in param_0). | |
or.b32 %r17, %r3, 256; | |
ld.global.nc.b32 %hh5, [%rd14+512]; | |
mov.b32 {%h43, %h44}, %hh5; | |
mul.wide.u32 %rd25, %r17, 4; | |
add.s64 %rd26, %rd6, %rd25; | |
ld.global.nc.u32 %r18, [%rd26]; | |
cvt.rn.f16.s32 %h45, %r18; | |
sub.rn.f16 %h46, %h4, %h45; | |
mul.rn.f16 %h47, %h46, %h6; | |
sub.rn.f16 %h48, %h43, %h47; | |
cvt.f32.f16 %f115, %h48; | |
sub.rn.f32 %f116, %f115, %f3; | |
mul.rn.f32 %f117, %f116, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f118, %f117; | |
add.rn.f32 %f119, %f118, 0f00000000; | |
ex2.approx.f32 %f120, %f119; | |
fma.rn.f32 %f121, %f118, 0fBF317200, %f116; | |
fma.rn.f32 %f122, %f118, 0fB5BFBE8E, %f121; | |
mul.rn.f32 %f123, %f122, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f124, %f123; | |
mul.rn.f32 %f125, %f120, %f124; | |
setp.lt.f32 %p17, %f116, 0fC2D20000; | |
selp.f32 %f126, 0f00000000, %f125, %p17; | |
setp.gt.f32 %p18, %f116, 0f42D20000; | |
selp.f32 %f127, 0f7F800000, %f126, %p18; | |
add.rn.f32 %f128, %f114, %f127; | |
ld.global.nc.u32 %r19, [%rd16+1028]; | |
cvt.rn.f16.s32 %h49, %r19; | |
sub.rn.f16 %h50, %h4, %h49; | |
mul.rn.f16 %h51, %h50, %h6; | |
sub.rn.f16 %h52, %h44, %h51; | |
cvt.f32.f16 %f129, %h52; | |
sub.rn.f32 %f130, %f129, %f3; | |
mul.rn.f32 %f131, %f130, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f132, %f131; | |
add.rn.f32 %f133, %f132, 0f00000000; | |
ex2.approx.f32 %f134, %f133; | |
fma.rn.f32 %f135, %f132, 0fBF317200, %f130; | |
fma.rn.f32 %f136, %f132, 0fB5BFBE8E, %f135; | |
mul.rn.f32 %f137, %f136, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f138, %f137; | |
mul.rn.f32 %f139, %f134, %f138; | |
setp.lt.f32 %p19, %f130, 0fC2D20000; | |
selp.f32 %f140, 0f00000000, %f139, %p19; | |
setp.gt.f32 %p20, %f130, 0f42D20000; | |
selp.f32 %f141, 0f7F800000, %f140, %p20; | |
add.rn.f32 %f142, %f128, %f141; | |
// Pair at element offset 320 (+640 bytes in param_0). | |
or.b32 %r20, %r3, 320; | |
ld.global.nc.b32 %hh6, [%rd14+640]; | |
mov.b32 {%h53, %h54}, %hh6; | |
mul.wide.u32 %rd27, %r20, 4; | |
add.s64 %rd28, %rd6, %rd27; | |
ld.global.nc.u32 %r21, [%rd28]; | |
cvt.rn.f16.s32 %h55, %r21; | |
sub.rn.f16 %h56, %h4, %h55; | |
mul.rn.f16 %h57, %h56, %h6; | |
sub.rn.f16 %h58, %h53, %h57; | |
cvt.f32.f16 %f143, %h58; | |
sub.rn.f32 %f144, %f143, %f3; | |
mul.rn.f32 %f145, %f144, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f146, %f145; | |
add.rn.f32 %f147, %f146, 0f00000000; | |
ex2.approx.f32 %f148, %f147; | |
fma.rn.f32 %f149, %f146, 0fBF317200, %f144; | |
fma.rn.f32 %f150, %f146, 0fB5BFBE8E, %f149; | |
mul.rn.f32 %f151, %f150, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f152, %f151; | |
mul.rn.f32 %f153, %f148, %f152; | |
setp.lt.f32 %p21, %f144, 0fC2D20000; | |
selp.f32 %f154, 0f00000000, %f153, %p21; | |
setp.gt.f32 %p22, %f144, 0f42D20000; | |
selp.f32 %f155, 0f7F800000, %f154, %p22; | |
add.rn.f32 %f156, %f142, %f155; | |
ld.global.nc.u32 %r22, [%rd16+1284]; | |
cvt.rn.f16.s32 %h59, %r22; | |
sub.rn.f16 %h60, %h4, %h59; | |
mul.rn.f16 %h61, %h60, %h6; | |
sub.rn.f16 %h62, %h54, %h61; | |
cvt.f32.f16 %f157, %h62; | |
sub.rn.f32 %f158, %f157, %f3; | |
mul.rn.f32 %f159, %f158, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f160, %f159; | |
add.rn.f32 %f161, %f160, 0f00000000; | |
ex2.approx.f32 %f162, %f161; | |
fma.rn.f32 %f163, %f160, 0fBF317200, %f158; | |
fma.rn.f32 %f164, %f160, 0fB5BFBE8E, %f163; | |
mul.rn.f32 %f165, %f164, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f166, %f165; | |
mul.rn.f32 %f167, %f162, %f166; | |
setp.lt.f32 %p23, %f158, 0fC2D20000; | |
selp.f32 %f168, 0f00000000, %f167, %p23; | |
setp.gt.f32 %p24, %f158, 0f42D20000; | |
selp.f32 %f169, 0f7F800000, %f168, %p24; | |
add.rn.f32 %f170, %f156, %f169; | |
// Pair at element offset 384 (+768 bytes in param_0). | |
or.b32 %r23, %r3, 384; | |
ld.global.nc.b32 %hh7, [%rd14+768]; | |
mov.b32 {%h63, %h64}, %hh7; | |
mul.wide.u32 %rd29, %r23, 4; | |
add.s64 %rd30, %rd6, %rd29; | |
ld.global.nc.u32 %r24, [%rd30]; | |
cvt.rn.f16.s32 %h65, %r24; | |
sub.rn.f16 %h66, %h4, %h65; | |
mul.rn.f16 %h67, %h66, %h6; | |
sub.rn.f16 %h68, %h63, %h67; | |
cvt.f32.f16 %f171, %h68; | |
sub.rn.f32 %f172, %f171, %f3; | |
mul.rn.f32 %f173, %f172, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f174, %f173; | |
add.rn.f32 %f175, %f174, 0f00000000; | |
ex2.approx.f32 %f176, %f175; | |
fma.rn.f32 %f177, %f174, 0fBF317200, %f172; | |
fma.rn.f32 %f178, %f174, 0fB5BFBE8E, %f177; | |
mul.rn.f32 %f179, %f178, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f180, %f179; | |
mul.rn.f32 %f181, %f176, %f180; | |
setp.lt.f32 %p25, %f172, 0fC2D20000; | |
selp.f32 %f182, 0f00000000, %f181, %p25; | |
setp.gt.f32 %p26, %f172, 0f42D20000; | |
selp.f32 %f183, 0f7F800000, %f182, %p26; | |
add.rn.f32 %f184, %f170, %f183; | |
ld.global.nc.u32 %r25, [%rd16+1540]; | |
cvt.rn.f16.s32 %h69, %r25; | |
sub.rn.f16 %h70, %h4, %h69; | |
mul.rn.f16 %h71, %h70, %h6; | |
sub.rn.f16 %h72, %h64, %h71; | |
cvt.f32.f16 %f185, %h72; | |
sub.rn.f32 %f186, %f185, %f3; | |
mul.rn.f32 %f187, %f186, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f188, %f187; | |
add.rn.f32 %f189, %f188, 0f00000000; | |
ex2.approx.f32 %f190, %f189; | |
fma.rn.f32 %f191, %f188, 0fBF317200, %f186; | |
fma.rn.f32 %f192, %f188, 0fB5BFBE8E, %f191; | |
mul.rn.f32 %f193, %f192, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f194, %f193; | |
mul.rn.f32 %f195, %f190, %f194; | |
setp.lt.f32 %p27, %f186, 0fC2D20000; | |
selp.f32 %f196, 0f00000000, %f195, %p27; | |
setp.gt.f32 %p28, %f186, 0f42D20000; | |
selp.f32 %f197, 0f7F800000, %f196, %p28; | |
add.rn.f32 %f198, %f184, %f197; | |
// Pair at element offset 448 (+896 bytes in param_0); last of the 8 pairs. | |
or.b32 %r26, %r3, 448; | |
ld.global.nc.b32 %hh8, [%rd14+896]; | |
mov.b32 {%h73, %h74}, %hh8; | |
mul.wide.u32 %rd31, %r26, 4; | |
add.s64 %rd32, %rd6, %rd31; | |
ld.global.nc.u32 %r27, [%rd32]; | |
cvt.rn.f16.s32 %h75, %r27; | |
sub.rn.f16 %h76, %h4, %h75; | |
mul.rn.f16 %h77, %h76, %h6; | |
sub.rn.f16 %h78, %h73, %h77; | |
cvt.f32.f16 %f199, %h78; | |
sub.rn.f32 %f200, %f199, %f3; | |
mul.rn.f32 %f201, %f200, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f202, %f201; | |
add.rn.f32 %f203, %f202, 0f00000000; | |
ex2.approx.f32 %f204, %f203; | |
fma.rn.f32 %f205, %f202, 0fBF317200, %f200; | |
fma.rn.f32 %f206, %f202, 0fB5BFBE8E, %f205; | |
mul.rn.f32 %f207, %f206, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f208, %f207; | |
mul.rn.f32 %f209, %f204, %f208; | |
setp.lt.f32 %p29, %f200, 0fC2D20000; | |
selp.f32 %f210, 0f00000000, %f209, %p29; | |
setp.gt.f32 %p30, %f200, 0f42D20000; | |
selp.f32 %f211, 0f7F800000, %f210, %p30; | |
add.rn.f32 %f212, %f198, %f211; | |
ld.global.nc.u32 %r28, [%rd16+1796]; | |
cvt.rn.f16.s32 %h79, %r28; | |
sub.rn.f16 %h80, %h4, %h79; | |
mul.rn.f16 %h81, %h80, %h6; | |
sub.rn.f16 %h82, %h74, %h81; | |
cvt.f32.f16 %f213, %h82; | |
sub.rn.f32 %f214, %f213, %f3; | |
mul.rn.f32 %f215, %f214, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f216, %f215; | |
add.rn.f32 %f217, %f216, 0f00000000; | |
ex2.approx.f32 %f218, %f217; | |
fma.rn.f32 %f219, %f216, 0fBF317200, %f214; | |
fma.rn.f32 %f220, %f216, 0fB5BFBE8E, %f219; | |
mul.rn.f32 %f221, %f220, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f222, %f221; | |
mul.rn.f32 %f223, %f218, %f222; | |
setp.lt.f32 %p31, %f214, 0fC2D20000; | |
selp.f32 %f224, 0f00000000, %f223, %p31; | |
setp.gt.f32 %p32, %f214, 0f42D20000; | |
selp.f32 %f225, 0f7F800000, %f224, %p32; | |
add.rn.f32 %f226, %f212, %f225; | |
// Warp reduction of the per-thread sum %f226: shuffle-down by 16, 8, 4, 2, 1 with member mask -1. | |
shfl.sync.down.b32 %f227, %f226, 16, 31, -1; | |
add.rn.f32 %f228, %f227, %f226; | |
shfl.sync.down.b32 %f229, %f228, 8, 31, -1; | |
add.rn.f32 %f230, %f229, %f228; | |
shfl.sync.down.b32 %f231, %f230, 4, 31, -1; | |
add.rn.f32 %f232, %f231, %f230; | |
shfl.sync.down.b32 %f233, %f232, 2, 31, -1; | |
add.rn.f32 %f234, %f233, %f232; | |
shfl.sync.down.b32 %f235, %f234, 1, 31, -1; | |
// Only thread 0 performs the final add and writes the total to shared_cache_015[0]. | |
setp.eq.s32 %p33, %r1, 0; | |
@%p33 bra LBB62_3; | |
bra.uni LBB62_1; | |
LBB62_3: | |
add.rn.f32 %f1, %f235, %f234; | |
st.shared.f32 [shared_cache_015], %f1; | |
LBB62_1: | |
bar.sync 0; | |
// Second reduction stage: thread 0 reads shared_cache_015[tid]; all other threads read the | |
// zero just stored in their local slot, so they contribute nothing to the sum. | |
mul.wide.u32 %rd36, %r1, 4; | |
mov.u64 %rd37, shared_cache_015; | |
add.s64 %rd3, %rd37, %rd36; | |
cvta.shared.u64 %rd38, %rd3; | |
mov.u32 %r31, 0; | |
st.local.u32 [%rd1], %r31; | |
selp.b64 %rd40, %rd38, %rd12, %p33; | |
ld.f32 %f236, [%rd40]; | |
shfl.sync.down.b32 %f237, %f236, 16, 31, -1; | |
add.rn.f32 %f238, %f236, %f237; | |
shfl.sync.down.b32 %f239, %f238, 8, 31, -1; | |
add.rn.f32 %f240, %f238, %f239; | |
shfl.sync.down.b32 %f241, %f240, 4, 31, -1; | |
add.rn.f32 %f242, %f240, %f241; | |
shfl.sync.down.b32 %f243, %f242, 2, 31, -1; | |
add.rn.f32 %f244, %f242, %f243; | |
shfl.sync.down.b32 %f245, %f244, 1, 31, -1; | |
add.rn.f32 %f246, %f244, %f245; | |
// Write the reduced value back to the address it was loaded from (shared for thread 0, local otherwise). | |
st.f32 [%rd40], %f246; | |
@%p33 bra LBB62_4; | |
bra.uni LBB62_2; | |
LBB62_4: | |
// Thread 0 only: destination byte offset (ctaid >> 9) * 2048 + (ctaid & 511) * 4 into param_1, | |
// then atomic f32 add of the value held in shared memory. The returned old value %f248 is unused. | |
ld.param.u64 %rd7, [fusion_2201_param_1]; | |
shr.u32 %r30, %r2, 9; | |
cvta.to.global.u64 %rd10, %rd7; | |
and.b32 %r29, %r2, 511; | |
mul.wide.u32 %rd33, %r30, 2048; | |
add.s64 %rd34, %rd10, %rd33; | |
mul.wide.u32 %rd35, %r29, 4; | |
add.s64 %rd2, %rd34, %rd35; | |
ld.shared.f32 %f247, [%rd3]; | |
atom.global.add.f32 %f248, [%rd2], %f247; | |
LBB62_2: | |
ret; | |
} | |
// .globl fusion_2200 | |
// fusion_2200: elementwise kernel, 256 threads per CTA, 4 consecutive elements per thread. | |
// With i = (ctaid << 10) | (tid << 2), row = i >> 9 and col = (tid << 2) & 508, for k = 0..3: | |
//   out[i+k] = exp(x[i+k] - (1.0 - f16(m[col+k])) * 10000.0 - param_3[row]) / param_2[row] | |
// x is f16 at param_0, m is s32 at param_4, param_3 and param_2 are f32, out is f32 at param_1. | |
// exp() is the ex2.approx-based sequence below; param_5 is declared but never read. | |
// NOTE(review): looks like the normalisation pass of a masked softmax -- confirm with the producer. | |
.visible .entry fusion_2200( | |
.param .u64 fusion_2200_param_0, | |
.param .u64 fusion_2200_param_1, | |
.param .u64 fusion_2200_param_2, | |
.param .u64 fusion_2200_param_3, | |
.param .u64 fusion_2200_param_4, | |
.param .u64 fusion_2200_param_5 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<9>; | |
.reg .b16 %h<27>; | |
.reg .b32 %hh<3>; | |
.reg .f32 %f<59>; | |
.reg .b32 %r<18>; | |
.reg .b64 %rd<26>; | |
ld.param.u64 %rd1, [fusion_2200_param_0]; | |
ld.param.u64 %rd2, [fusion_2200_param_4]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2200_param_1]; | |
ld.param.u64 %rd5, [fusion_2200_param_3]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2200_param_2]; | |
cvta.to.global.u64 %rd8, %rd7; | |
cvta.to.global.u64 %rd9, %rd4; | |
cvta.to.global.u64 %rd10, %rd1; | |
// %r5 = linear element index i; %r9 = row; %r13, %r12, %r11, %r10 = col+0..col+3. | |
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
or.b32 %r8, %r4, 3; | |
shr.u32 %r9, %r5, 9; | |
and.b32 %r10, %r8, 511; | |
and.b32 %r11, %r7, 510; | |
and.b32 %r12, %r6, 509; | |
and.b32 %r13, %r4, 508; | |
// Load 4 f16 inputs in one vector load; after the pack/unpack moves %h5..%h8 = x[i..i+3]. | |
mul.wide.u32 %rd11, %r5, 2; | |
add.s64 %rd12, %rd10, %rd11; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12]; | |
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
// Element 0: masked = x - (1.0 - f16(m)) * 10000.0  (0x3C00 is f16 1.0, 0x70E2 is f16 10000.0). | |
mul.wide.u32 %rd13, %r13, 4; | |
add.s64 %rd14, %rd3, %rd13; | |
ld.global.nc.u32 %r14, [%rd14]; | |
cvt.rn.f16.s32 %h9, %r14; | |
mov.b16 %h10, 0x3C00; | |
sub.rn.f16 %h11, %h10, %h9; | |
mov.b16 %h12, 0x70E2; | |
mul.rn.f16 %h13, %h11, %h12; | |
sub.rn.f16 %h14, %h5, %h13; | |
cvt.f32.f16 %f1, %h14; | |
// %f2 = param_3[row]; %f3 = masked - %f2. | |
mul.wide.u32 %rd15, %r9, 4; | |
add.s64 %rd16, %rd6, %rd15; | |
ld.global.nc.f32 %f2, [%rd16]; | |
sub.rn.f32 %f3, %f1, %f2; | |
// exp(%f3): n = trunc(%f3 * 0f3FB8AA3B (~log2 e)); remainder via two fma steps with | |
// 0fBF317200 (~ -0.693146) and 0fB5BFBE8E (~ -1.4286e-6); result = 2^n * 2^(rem * log2 e). | |
mul.rn.f32 %f4, %f3, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f5, %f4; | |
add.rn.f32 %f6, %f5, 0f00000000; | |
ex2.approx.f32 %f7, %f6; | |
fma.rn.f32 %f8, %f5, 0fBF317200, %f3; | |
fma.rn.f32 %f9, %f5, 0fB5BFBE8E, %f8; | |
mul.rn.f32 %f10, %f9, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f11, %f10; | |
mul.rn.f32 %f12, %f7, %f11; | |
// Clamp: argument < -105.0 gives 0, argument > 105.0 gives +inf. | |
setp.lt.f32 %p1, %f3, 0fC2D20000; | |
selp.f32 %f13, 0f00000000, %f12, %p1; | |
setp.gt.f32 %p2, %f3, 0f42D20000; | |
selp.f32 %f14, 0f7F800000, %f13, %p2; | |
// %f15 = param_2[row] (same byte offset as param_3[row]); divide by it. | |
add.s64 %rd17, %rd8, %rd15; | |
ld.global.nc.f32 %f15, [%rd17]; | |
div.full.f32 %f16, %f14, %f15; | |
// %rd19 -> f32 output slot for element i in param_1. | |
mul.wide.u32 %rd18, %r5, 4; | |
add.s64 %rd19, %rd9, %rd18; | |
// Element 1 (col+1): same computation. | |
mul.wide.u32 %rd20, %r12, 4; | |
add.s64 %rd21, %rd3, %rd20; | |
ld.global.nc.u32 %r15, [%rd21]; | |
cvt.rn.f16.s32 %h15, %r15; | |
sub.rn.f16 %h16, %h10, %h15; | |
mul.rn.f16 %h17, %h16, %h12; | |
sub.rn.f16 %h18, %h6, %h17; | |
cvt.f32.f16 %f17, %h18; | |
sub.rn.f32 %f18, %f17, %f2; | |
mul.rn.f32 %f19, %f18, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f20, %f19; | |
add.rn.f32 %f21, %f20, 0f00000000; | |
ex2.approx.f32 %f22, %f21; | |
fma.rn.f32 %f23, %f20, 0fBF317200, %f18; | |
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23; | |
mul.rn.f32 %f25, %f24, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f26, %f25; | |
mul.rn.f32 %f27, %f22, %f26; | |
setp.lt.f32 %p3, %f18, 0fC2D20000; | |
selp.f32 %f28, 0f00000000, %f27, %p3; | |
setp.gt.f32 %p4, %f18, 0f42D20000; | |
selp.f32 %f29, 0f7F800000, %f28, %p4; | |
div.full.f32 %f30, %f29, %f15; | |
// Element 2 (col+2): same computation. | |
mul.wide.u32 %rd22, %r11, 4; | |
add.s64 %rd23, %rd3, %rd22; | |
ld.global.nc.u32 %r16, [%rd23]; | |
cvt.rn.f16.s32 %h19, %r16; | |
sub.rn.f16 %h20, %h10, %h19; | |
mul.rn.f16 %h21, %h20, %h12; | |
sub.rn.f16 %h22, %h7, %h21; | |
cvt.f32.f16 %f31, %h22; | |
sub.rn.f32 %f32, %f31, %f2; | |
mul.rn.f32 %f33, %f32, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f34, %f33; | |
add.rn.f32 %f35, %f34, 0f00000000; | |
ex2.approx.f32 %f36, %f35; | |
fma.rn.f32 %f37, %f34, 0fBF317200, %f32; | |
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37; | |
mul.rn.f32 %f39, %f38, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f40, %f39; | |
mul.rn.f32 %f41, %f36, %f40; | |
setp.lt.f32 %p5, %f32, 0fC2D20000; | |
selp.f32 %f42, 0f00000000, %f41, %p5; | |
setp.gt.f32 %p6, %f32, 0f42D20000; | |
selp.f32 %f43, 0f7F800000, %f42, %p6; | |
div.full.f32 %f44, %f43, %f15; | |
// Element 3 (col+3): same computation. | |
mul.wide.u32 %rd24, %r10, 4; | |
add.s64 %rd25, %rd3, %rd24; | |
ld.global.nc.u32 %r17, [%rd25]; | |
cvt.rn.f16.s32 %h23, %r17; | |
sub.rn.f16 %h24, %h10, %h23; | |
mul.rn.f16 %h25, %h24, %h12; | |
sub.rn.f16 %h26, %h8, %h25; | |
cvt.f32.f16 %f45, %h26; | |
sub.rn.f32 %f46, %f45, %f2; | |
mul.rn.f32 %f47, %f46, 0f3FB8AA3B; | |
cvt.rzi.f32.f32 %f48, %f47; | |
add.rn.f32 %f49, %f48, 0f00000000; | |
ex2.approx.f32 %f50, %f49; | |
fma.rn.f32 %f51, %f48, 0fBF317200, %f46; | |
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51; | |
mul.rn.f32 %f53, %f52, 0f3FB8AA3B; | |
ex2.approx.ftz.f32 %f54, %f53; | |
mul.rn.f32 %f55, %f50, %f54; | |
setp.lt.f32 %p7, %f46, 0fC2D20000; | |
selp.f32 %f56, 0f00000000, %f55, %p7; | |
setp.gt.f32 %p8, %f46, 0f42D20000; | |
selp.f32 %f57, 0f7F800000, %f56, %p8; | |
div.full.f32 %f58, %f57, %f15; | |
// Store the 4 f32 results with a single vector store. | |
st.global.v4.f32 [%rd19], {%f16, %f30, %f44, %f58}; | |
ret; | |
} | |
// .globl rng_get_and_update_state_39 | |
// rng_get_and_update_state_39: copies the current 128-bit value of the global rng_state | |
// (two u64 words) to the buffer at param_0 and advances rng_state by 4194304, propagating | |
// the carry from the low word into the high word. The value written to param_0 is the one | |
// read before the increment. param_1 is declared but never read in this body. | |
// The loads and stores on rng_state are plain (non-atomic) accesses. | |
.visible .entry rng_get_and_update_state_39( | |
.param .u64 rng_get_and_update_state_39_param_0, | |
.param .u64 rng_get_and_update_state_39_param_1 | |
) | |
{ | |
.reg .pred %p<3>; | |
.reg .b64 %rd<9>; | |
ld.param.u64 %rd1, [rng_get_and_update_state_39_param_0]; | |
cvta.to.global.u64 %rd2, %rd1; | |
// %rd3 = high word (rng_state+8), %rd4 = low word (rng_state). | |
ld.global.u64 %rd3, [rng_state+8]; | |
ld.global.u64 %rd4, [rng_state]; | |
add.s64 %rd5, %rd4, 4194304; | |
// Carry detection: the unsigned sum wrapped if it is smaller than either operand. | |
// Both tests are evaluated; %rd7 ends up 1 when a wrap occurred, else 0. | |
setp.lt.u64 %p1, %rd5, %rd4; | |
selp.u64 %rd6, 1, 0, %p1; | |
setp.lt.u64 %p2, %rd5, 4194304; | |
selp.b64 %rd7, 1, %rd6, %p2; | |
add.s64 %rd8, %rd3, %rd7; | |
// Publish the advanced state, then hand the previous state to the caller's buffer. | |
st.global.u64 [rng_state], %rd5; | |
st.global.u64 [rng_state+8], %rd8; | |
st.global.u64 [%rd2+8], %rd3; | |
st.global.u64 [%rd2], %rd4; | |
ret; | |
} | |
// .globl fusion_2199 | |
// fusion_2199: elementwise kernel, 256 threads per CTA, 4 consecutive elements per thread at | |
// linear index i = (ctaid << 10) | (tid << 2). | |
// For each element a pseudo-random value u in [0, 1) is derived from the 128-bit state at | |
// param_2 plus the per-thread counter i >> 2. The f32 input from param_1 is converted to f16, | |
// multiplied by 0x3C72 (~1.111 in f16) and kept when f16(u) >= 0x2E66 (~0.1 in f16); otherwise | |
// the output is 0. The four f16 results are stored to param_0. param_3 is never read. | |
// NOTE(review): multipliers 3528531795 / 3449720151 and the XOR constants match the Philox4x32 | |
// generator, and the keep/scale pattern looks like dropout with rate 0.1 -- confirm. | |
.visible .entry fusion_2199( | |
.param .u64 fusion_2199_param_0, | |
.param .u64 fusion_2199_param_1, | |
.param .u64 fusion_2199_param_2, | |
.param .u64 fusion_2199_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<6>; | |
.reg .b16 %h<19>; | |
.reg .f32 %f<13>; | |
.reg .b32 %r<29>; | |
.reg .b64 %rd<119>; | |
ld.param.u64 %rd1, [fusion_2199_param_0]; | |
ld.param.u64 %rd2, [fusion_2199_param_2]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2199_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
// Load the 128-bit state {%rd7 = low, %rd8 = high} and add the per-thread counter (i >> 2) | |
// to the low word, carrying into the high word: {%rd10, %rd14}. | |
ld.global.nc.v2.u64 {%rd7, %rd8}, [%rd3]; | |
shr.u32 %r6, %r5, 2; | |
cvt.u64.u32 %rd9, %r6; | |
add.s64 %rd10, %rd7, %rd9; | |
setp.lt.u64 %p1, %rd10, %rd7; | |
and.b64 %rd11, %rd10, 4294967295; | |
mul.lo.s64 %rd12, %rd11, 3528531795; | |
selp.u64 %rd13, 1, 0, %p1; | |
add.s64 %rd14, %rd8, %rd13; | |
// Mixing rounds: 32x32->64-bit products (operands masked to 32 bits, high half taken with | |
// shr 32) XOR-combined with the other state words and a per-round constant. | |
xor.b64 %rd15, %rd14, %rd12; | |
shr.u64 %rd16, %rd15, 32; | |
mul.lo.s64 %rd17, %rd16, 3449720151; | |
shr.u64 %rd18, %rd17, 32; | |
and.b64 %rd19, %rd14, 4294967295; | |
mul.lo.s64 %rd20, %rd19, 3449720151; | |
and.b64 %rd21, %rd20, 4294967295; | |
xor.b64 %rd22, %rd21, %rd18; | |
xor.b64 %rd23, %rd22, 2654435769; | |
mul.lo.s64 %rd24, %rd23, 3528531795; | |
shr.u64 %rd25, %rd24, 32; | |
xor.b64 %rd26, %rd20, %rd10; | |
shr.u64 %rd27, %rd26, 32; | |
mul.lo.s64 %rd28, %rd27, 3528531795; | |
and.b64 %rd29, %rd28, 4294967295; | |
xor.b64 %rd30, %rd29, %rd25; | |
xor.b64 %rd31, %rd30, 1993301258; | |
mul.lo.s64 %rd32, %rd31, 3449720151; | |
shr.u64 %rd33, %rd32, 32; | |
shr.u64 %rd34, %rd28, 32; | |
and.b64 %rd35, %rd12, 4294967295; | |
xor.b64 %rd36, %rd35, %rd34; | |
xor.b64 %rd37, %rd36, 3144134277; | |
mul.lo.s64 %rd38, %rd37, 3449720151; | |
and.b64 %rd39, %rd38, 4294967295; | |
xor.b64 %rd40, %rd39, %rd33; | |
xor.b64 %rd41, %rd40, 3668340011; | |
mul.lo.s64 %rd42, %rd41, 3528531795; | |
shr.u64 %rd43, %rd42, 32; | |
shr.u64 %rd44, %rd38, 32; | |
and.b64 %rd45, %rd17, 4294967295; | |
xor.b64 %rd46, %rd45, %rd44; | |
xor.b64 %rd47, %rd46, 1013904242; | |
mul.lo.s64 %rd48, %rd47, 3528531795; | |
and.b64 %rd49, %rd48, 4294967295; | |
xor.b64 %rd50, %rd49, %rd43; | |
xor.b64 %rd51, %rd50, 3986602516; | |
mul.lo.s64 %rd52, %rd51, 3449720151; | |
shr.u64 %rd53, %rd52, 32; | |
shr.u64 %rd54, %rd48, 32; | |
and.b64 %rd55, %rd24, 4294967295; | |
xor.b64 %rd56, %rd55, %rd54; | |
xor.b64 %rd57, %rd56, 842468239; | |
mul.lo.s64 %rd58, %rd57, 3449720151; | |
and.b64 %rd59, %rd58, 4294967295; | |
xor.b64 %rd60, %rd59, %rd53; | |
xor.b64 %rd61, %rd60, 387276957; | |
mul.lo.s64 %rd62, %rd61, 3528531795; | |
shr.u64 %rd63, %rd62, 32; | |
shr.u64 %rd64, %rd58, 32; | |
and.b64 %rd65, %rd32, 4294967295; | |
xor.b64 %rd66, %rd65, %rd64; | |
xor.b64 %rd67, %rd66, 2027808484; | |
mul.lo.s64 %rd68, %rd67, 3528531795; | |
and.b64 %rd69, %rd68, 4294967295; | |
xor.b64 %rd70, %rd69, %rd63; | |
xor.b64 %rd71, %rd70, 1684936478; | |
mul.lo.s64 %rd72, %rd71, 3449720151; | |
shr.u64 %rd73, %rd72, 32; | |
shr.u64 %rd74, %rd68, 32; | |
and.b64 %rd75, %rd42, 4294967295; | |
xor.b64 %rd76, %rd75, %rd74; | |
xor.b64 %rd77, %rd76, 2835769497; | |
mul.lo.s64 %rd78, %rd77, 3449720151; | |
and.b64 %rd79, %rd78, 4294967295; | |
xor.b64 %rd80, %rd79, %rd73; | |
xor.b64 %rd81, %rd80, 1401181199; | |
mul.lo.s64 %rd82, %rd81, 3528531795; | |
shr.u64 %rd83, %rd82, 32; | |
shr.u64 %rd84, %rd78, 32; | |
and.b64 %rd85, %rd52, 4294967295; | |
xor.b64 %rd86, %rd85, %rd84; | |
xor.b64 %rd87, %rd86, 3041712726; | |
mul.lo.s64 %rd88, %rd87, 3528531795; | |
and.b64 %rd89, %rd88, 4294967295; | |
xor.b64 %rd90, %rd89, %rd83; | |
xor.b64 %rd91, %rd90, 3678237736; | |
mul.lo.s64 %rd92, %rd91, 3449720151; | |
shr.u64 %rd93, %rd92, 32; | |
// Element 0: take a 32-bit word, keep its top 23 bits (shr 9), scale by 0f34000000 (2^-23) | |
// to get u in [0, 1), round to f16 and test u >= 0x2E66. | |
cvt.u32.u64 %r7, %rd93; | |
shr.u64 %rd94, %rd88, 32; | |
xor.b64 %rd95, %rd94, %rd62; | |
cvt.u32.u64 %r8, %rd95; | |
xor.b32 %r9, %r8, 534103459; | |
mul.lo.s32 %r10, %r9, -845247145; | |
xor.b32 %r11, %r10, %r7; | |
shr.u32 %r12, %r11, 9; | |
xor.b32 %r13, %r12, 4716963; | |
cvt.rn.f32.u32 %f1, %r13; | |
mul.rn.f32 %f2, %f1, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f2; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p2, %h1, %h2; | |
// Load the 4 f32 inputs; element 0 result = f16(input) * 0x3C72 if kept, else 0. | |
mul.wide.u32 %rd96, %r5, 4; | |
add.s64 %rd97, %rd5, %rd96; | |
ld.global.nc.v4.f32 {%f3, %f4, %f5, %f6}, [%rd97]; | |
cvt.rn.f16.f32 %h3, %f3; | |
mov.b16 %h4, 0x3C72; | |
mul.rn.f16 %h5, %h3, %h4; | |
selp.b16 %h6, %h5, 0x0000, %p2; | |
// %rd99 -> f16 output slot for element i in param_0. | |
mul.wide.u32 %rd98, %r5, 2; | |
add.s64 %rd99, %rd6, %rd98; | |
// Element 1: second random word, same threshold and scale. | |
xor.b64 %rd100, %rd84, %rd52; | |
xor.b64 %rd101, %rd100, 3041712726; | |
mul.lo.s64 %rd102, %rd101, 3528531795; | |
xor.b64 %rd103, %rd83, %rd102; | |
cvt.u32.u64 %r14, %rd103; | |
xor.b32 %r15, %r14, -616729560; | |
mul.lo.s32 %r16, %r15, -845247145; | |
shr.u32 %r17, %r16, 9; | |
cvt.rn.f32.u32 %f7, %r17; | |
mul.rn.f32 %f8, %f7, 0f34000000; | |
cvt.rn.f16.f32 %h7, %f8; | |
setp.ge.f16 %p3, %h7, %h2; | |
cvt.rn.f16.f32 %h8, %f4; | |
mul.rn.f16 %h9, %h8, %h4; | |
selp.b16 %h10, %h9, 0x0000, %p3; | |
// Element 2: third random word, same threshold and scale. | |
and.b64 %rd104, %rd62, 4294967295; | |
xor.b64 %rd105, %rd104, %rd94; | |
xor.b64 %rd106, %rd105, 534103459; | |
mul.lo.s64 %rd107, %rd106, 3449720151; | |
shr.u64 %rd108, %rd107, 32; | |
and.b64 %rd109, %rd72, 4294967295; | |
xor.b64 %rd110, %rd109, %rd108; | |
xor.b64 %rd111, %rd110, 4055616968; | |
mul.lo.s64 %rd112, %rd111, 3528531795; | |
shr.u64 %rd113, %rd112, 32; | |
cvt.u32.u64 %r18, %rd113; | |
xor.b64 %rd114, %rd73, %rd78; | |
cvt.u32.u64 %r19, %rd114; | |
xor.b32 %r20, %r19, 1401181199; | |
mul.lo.s32 %r21, %r20, -766435501; | |
xor.b32 %r22, %r21, %r18; | |
shr.u32 %r23, %r22, 9; | |
xor.b32 %r24, %r23, 4936337; | |
cvt.rn.f32.u32 %f9, %r24; | |
mul.rn.f32 %f10, %f9, 0f34000000; | |
cvt.rn.f16.f32 %h11, %f10; | |
setp.ge.f16 %p4, %h11, %h2; | |
cvt.rn.f16.f32 %h12, %f5; | |
mul.rn.f16 %h13, %h12, %h4; | |
selp.b16 %h14, %h13, 0x0000, %p4; | |
// Element 3: fourth random word, same threshold and scale. | |
xor.b64 %rd115, %rd63, %rd68; | |
xor.b64 %rd116, %rd115, 1684936478; | |
mul.lo.s64 %rd117, %rd116, 3449720151; | |
xor.b64 %rd118, %rd108, %rd117; | |
cvt.u32.u64 %r25, %rd118; | |
xor.b32 %r26, %r25, -239350328; | |
mul.lo.s32 %r27, %r26, -766435501; | |
shr.u32 %r28, %r27, 9; | |
cvt.rn.f32.u32 %f11, %r28; | |
mul.rn.f32 %f12, %f11, 0f34000000; | |
cvt.rn.f16.f32 %h15, %f12; | |
setp.ge.f16 %p5, %h15, %h2; | |
cvt.rn.f16.f32 %h16, %f6; | |
mul.rn.f16 %h17, %h16, %h4; | |
selp.b16 %h18, %h17, 0x0000, %p5; | |
// Store the 4 f16 results with a single vector store. | |
st.global.v4.b16 [%rd99], {%h6, %h10, %h14, %h18}; | |
ret; | |
} | |
// .globl fusion_2701 | |
// fusion_2701: f32 -> f16 conversion, 256 threads per CTA, 4 consecutive elements per thread. | |
// For i = (ctaid << 10) | (tid << 2) it loads 4 f32 values from param_0 at element i, rounds | |
// each to f16 (round-to-nearest, cvt.rn) and stores them to param_1 at element i. | |
// param_2 is declared but never read in this body. | |
.visible .entry fusion_2701( | |
.param .u64 fusion_2701_param_0, | |
.param .u64 fusion_2701_param_1, | |
.param .u64 fusion_2701_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<5>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<6>; | |
.reg .b64 %rd<9>; | |
ld.param.u64 %rd1, [fusion_2701_param_0]; | |
ld.param.u64 %rd2, [fusion_2701_param_1]; | |
cvta.to.global.u64 %rd3, %rd2; | |
cvta.to.global.u64 %rd4, %rd1; | |
// %r5 = linear element index of this thread's first element. | |
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
// Source address: 4 bytes per element; one 16-byte vector load. | |
mul.wide.u32 %rd5, %r5, 4; | |
add.s64 %rd6, %rd4, %rd5; | |
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6]; | |
cvt.rn.f16.f32 %h1, %f1; | |
// Destination address: 2 bytes per element; one 8-byte vector store. | |
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd3, %rd7; | |
cvt.rn.f16.f32 %h2, %f2; | |
cvt.rn.f16.f32 %h3, %f3; | |
cvt.rn.f16.f32 %h4, %f4; | |
st.global.v4.b16 [%rd8], {%h1, %h2, %h3, %h4}; | |
ret; | |
} | |
// .globl fusion_2197 | |
// fusion_2197: gather + add, 256 threads per CTA, 4 consecutive f16 outputs per thread at | |
// linear index i = (ctaid << 10) | (tid << 2). | |
// With col = (tid << 2) & 60, g = ctaid >> 5 and r = bits 6..14 of i, for k = 0..3: | |
//   out[i+k] = in[r * 1024 + (g << 6 | (col+k))] + f16(b[g * 64 + col + k]) | |
// in is f16 at param_0 (2048-byte row stride), b is f32 at param_2 (256-byte row stride), | |
// out is f16 at param_1. param_3 is declared but never read in this body. | |
// NOTE(review): looks like a layout change fused with a bias add -- confirm with the producer. | |
.visible .entry fusion_2197( | |
.param .u64 fusion_2197_param_0, | |
.param .u64 fusion_2197_param_1, | |
.param .u64 fusion_2197_param_2, | |
.param .u64 fusion_2197_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<13>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<20>; | |
.reg .b64 %rd<29>; | |
ld.param.u64 %rd1, [fusion_2197_param_0]; | |
ld.param.u64 %rd2, [fusion_2197_param_2]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2197_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// Index decomposition: %r6/%r9/%r11/%r13 = col+0..3, %r7 = g, %r14 = r, %r15 = g << 6. | |
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
and.b32 %r6, %r4, 60; | |
shr.u32 %r7, %r1, 5; | |
or.b32 %r8, %r4, 1; | |
and.b32 %r9, %r8, 61; | |
or.b32 %r10, %r4, 2; | |
and.b32 %r11, %r10, 62; | |
or.b32 %r12, %r4, 3; | |
and.b32 %r13, %r12, 63; | |
bfe.u32 %r14, %r5, 6, 9; | |
shl.b32 %r15, %r7, 6; | |
or.b32 %r16, %r6, %r15; | |
// %rd8 -> start of source row r in param_0; element 0 is read at column (g << 6 | col). | |
mul.wide.u32 %rd7, %r14, 2048; | |
add.s64 %rd8, %rd6, %rd7; | |
mul.wide.u32 %rd9, %r16, 2; | |
add.s64 %rd10, %rd8, %rd9; | |
ld.global.nc.b16 %h1, [%rd10]; | |
// %rd12 -> start of row g in param_2; load b[g*64 + col], round to f16 and add. | |
mul.wide.u32 %rd11, %r7, 256; | |
add.s64 %rd12, %rd3, %rd11; | |
mul.wide.u32 %rd13, %r6, 4; | |
add.s64 %rd14, %rd12, %rd13; | |
ld.global.nc.f32 %f1, [%rd14]; | |
cvt.rn.f16.f32 %h2, %f1; | |
add.rn.f16 %h3, %h1, %h2; | |
// %rd16 -> f16 output slot for element i in param_1. | |
mul.wide.u32 %rd15, %r5, 2; | |
add.s64 %rd16, %rd5, %rd15; | |
// Element 1 (col+1). | |
or.b32 %r17, %r9, %r15; | |
mul.wide.u32 %rd17, %r17, 2; | |
add.s64 %rd18, %rd8, %rd17; | |
ld.global.nc.b16 %h4, [%rd18]; | |
mul.wide.u32 %rd19, %r9, 4; | |
add.s64 %rd20, %rd12, %rd19; | |
ld.global.nc.f32 %f2, [%rd20]; | |
cvt.rn.f16.f32 %h5, %f2; | |
add.rn.f16 %h6, %h4, %h5; | |
// Element 2 (col+2). | |
or.b32 %r18, %r11, %r15; | |
mul.wide.u32 %rd21, %r18, 2; | |
add.s64 %rd22, %rd8, %rd21; | |
ld.global.nc.b16 %h7, [%rd22]; | |
mul.wide.u32 %rd23, %r11, 4; | |
add.s64 %rd24, %rd12, %rd23; | |
ld.global.nc.f32 %f3, [%rd24]; | |
cvt.rn.f16.f32 %h8, %f3; | |
add.rn.f16 %h9, %h7, %h8; | |
// Element 3 (col+3). | |
or.b32 %r19, %r13, %r15; | |
mul.wide.u32 %rd25, %r19, 2; | |
add.s64 %rd26, %rd8, %rd25; | |
ld.global.nc.b16 %h10, [%rd26]; | |
mul.wide.u32 %rd27, %r13, 4; | |
add.s64 %rd28, %rd12, %rd27; | |
ld.global.nc.f32 %f4, [%rd28]; | |
cvt.rn.f16.f32 %h11, %f4; | |
add.rn.f16 %h12, %h10, %h11; | |
// Store the 4 f16 results with a single vector store. | |
st.global.v4.b16 [%rd16], {%h3, %h6, %h9, %h12}; | |
ret; | |
} | |
// .globl fusion_2196 | |
// fusion_2196: every thread gathers four 16-bit values from the buffer passed
// in param_1 and stores them as a single 8-byte vector into the buffer passed
// in param_0. param_2 is declared but never read by this kernel.
.visible .entry fusion_2196( | |
.param .u64 fusion_2196_param_0, | |
.param .u64 fusion_2196_param_1, | |
.param .u64 fusion_2196_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %e<4>; | |
.reg .b32 %bx, %tx, %bx1k, %tx4, %lin, %grp, %c0, %t1, %c1, %t2, %c2; | |
.reg .b64 %pOut, %pIn, %gOut, %gIn, %offOut, %aOut; | |
.reg .b64 %offGrp, %baseGrp, %offBx, %base, %off0, %a0, %off1, %a1, %off2, %a2; | |
// Kernel arguments, converted to global-space addresses.
ld.param.u64 %pOut, [fusion_2196_param_0]; | |
ld.param.u64 %pIn, [fusion_2196_param_1]; | |
cvta.to.global.u64 %gOut, %pOut; | |
cvta.to.global.u64 %gIn, %pIn; | |
// Linear output element index: ctaid.x * 1024 + tid.x * 4.
mov.u32 %tx, %tid.x; | |
mov.u32 %bx, %ctaid.x; | |
shl.b32 %tx4, %tx, 2; | |
shl.b32 %bx1k, %bx, 10; | |
or.b32 %lin, %tx4, %bx1k; | |
// Destination address (2 bytes per element).
mul.wide.u32 %offOut, %lin, 2; | |
add.s64 %aOut, %gOut, %offOut; | |
// Source base: (tid.x >> 4) * 65536 bytes + ctaid.x * 128 bytes.
shr.u32 %grp, %tx, 4; | |
mul.wide.u32 %offGrp, %grp, 65536; | |
add.s64 %baseGrp, %gIn, %offGrp; | |
mul.wide.u32 %offBx, %bx, 128; | |
add.s64 %base, %baseGrp, %offBx; | |
// Element 0: column (tid.x * 4) & 60.
and.b32 %c0, %tx4, 60; | |
mul.wide.u32 %off0, %c0, 2; | |
add.s64 %a0, %base, %off0; | |
ld.global.nc.b16 %e0, [%a0]; | |
// Element 1: column (tid.x * 4 | 1) & 61.
or.b32 %t1, %tx4, 1; | |
and.b32 %c1, %t1, 61; | |
mul.wide.u32 %off1, %c1, 2; | |
add.s64 %a1, %base, %off1; | |
ld.global.nc.b16 %e1, [%a1]; | |
// Element 2: column (tid.x * 4 | 2) & 62.
or.b32 %t2, %tx4, 2; | |
and.b32 %c2, %t2, 62; | |
mul.wide.u32 %off2, %c2, 2; | |
add.s64 %a2, %base, %off2; | |
ld.global.nc.b16 %e2, [%a2]; | |
// Element 3 sits 6 bytes after element 0.
ld.global.nc.b16 %e3, [%a0+6]; | |
st.global.v4.b16 [%aOut], {%e0, %e1, %e2, %e3}; | |
ret; | |
} | |
// .globl fusion_2700 | |
// fusion_2700: every thread loads four consecutive f32 values from the buffer
// in param_0, rounds each to f16 (round-to-nearest) and stores the four halves
// as one vector into the buffer in param_1 at the same element index.
// param_2 is declared but never read by this kernel.
.visible .entry fusion_2700( | |
.param .u64 fusion_2700_param_0, | |
.param .u64 fusion_2700_param_1, | |
.param .u64 fusion_2700_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %n<4>; | |
.reg .f32 %w<4>; | |
.reg .b32 %bx, %tx, %bx1k, %tx4, %lin; | |
.reg .b64 %pSrc, %pDst, %gSrc, %gDst, %offSrc, %offDst, %aSrc, %aDst; | |
// Kernel arguments, converted to global-space addresses.
ld.param.u64 %pSrc, [fusion_2700_param_0]; | |
ld.param.u64 %pDst, [fusion_2700_param_1]; | |
cvta.to.global.u64 %gSrc, %pSrc; | |
cvta.to.global.u64 %gDst, %pDst; | |
// Element index of the first of this thread's four values:
// ctaid.x * 1024 + tid.x * 4.
mov.u32 %tx, %tid.x; | |
mov.u32 %bx, %ctaid.x; | |
shl.b32 %tx4, %tx, 2; | |
shl.b32 %bx1k, %bx, 10; | |
or.b32 %lin, %tx4, %bx1k; | |
// Byte offsets: 4 bytes per source element, 2 bytes per destination element.
mul.wide.u32 %offSrc, %lin, 4; | |
mul.wide.u32 %offDst, %lin, 2; | |
add.s64 %aSrc, %gSrc, %offSrc; | |
add.s64 %aDst, %gDst, %offDst; | |
ld.global.nc.v4.f32 {%w0, %w1, %w2, %w3}, [%aSrc]; | |
cvt.rn.f16.f32 %n0, %w0; | |
cvt.rn.f16.f32 %n1, %w1; | |
cvt.rn.f16.f32 %n2, %w2; | |
cvt.rn.f16.f32 %n3, %w3; | |
st.global.v4.b16 [%aDst], {%n0, %n1, %n2, %n3}; | |
ret; | |
} | |
// .globl rng_get_and_update_state_37 | |
// rng_get_and_update_state_37: copies the current 16-byte value of the global
// `rng_state` into the buffer in param_0, and advances `rng_state` by 524288,
// treating it as a 128-bit little-endian counter (low word at +0, high word
// at +8). The read-modify-write uses plain loads and stores, not atomics.
// param_1 is declared but never read by this kernel.
.visible .entry rng_get_and_update_state_37( | |
.param .u64 rng_get_and_update_state_37_param_0, | |
.param .u64 rng_get_and_update_state_37_param_1 | |
) | |
{ | |
.reg .pred %wrapped, %belowStep; | |
.reg .b64 %pOut, %gOut, %oldHi, %oldLo, %newLo, %newHi, %carryA, %carry; | |
ld.param.u64 %pOut, [rng_get_and_update_state_37_param_0]; | |
cvta.to.global.u64 %gOut, %pOut; | |
// Snapshot the state before modifying it.
ld.global.u64 %oldHi, [rng_state+8]; | |
ld.global.u64 %oldLo, [rng_state]; | |
// Low word += 524288; carry is 1 when the unsigned sum wrapped around.
add.s64 %newLo, %oldLo, 524288; | |
setp.lt.u64 %belowStep, %newLo, 524288; | |
setp.lt.u64 %wrapped, %newLo, %oldLo; | |
selp.u64 %carryA, 1, 0, %wrapped; | |
selp.b64 %carry, 1, %carryA, %belowStep; | |
add.s64 %newHi, %oldHi, %carry; | |
// Publish the advanced state, then hand the previous value to the caller.
st.global.u64 [rng_state], %newLo; | |
st.global.u64 [rng_state+8], %newHi; | |
st.global.u64 [%gOut+8], %oldHi; | |
st.global.u64 [%gOut], %oldLo; | |
ret; | |
} | |
// .globl fusion_2195 | |
// fusion_2195: per thread, produces four f16 outputs at element index
// ctaid.x * 1024 + tid.x * 4 + k (k = 0..3):
//   out[k] = a[k] + (rand_k >= 0x2E66 ? (b[k] + f16(c[tid.x*4 + k])) * 0x3C72 : 0)
// where a comes from param_1, b from param_2, c (f32) from param_4, the
// 128-bit RNG state from param_3, and out is written to param_0.
// param_5 is declared but never read by this kernel.
// NOTE(review): 0x2E66 is ~0.1 and 0x3C72 is ~1.111 in f16, which looks like
// dropout with rate 0.1 and 1/(1-rate) rescaling — confirm against the HLO.
.visible .entry fusion_2195( | |
.param .u64 fusion_2195_param_0, | |
.param .u64 fusion_2195_param_1, | |
.param .u64 fusion_2195_param_2, | |
.param .u64 fusion_2195_param_3, | |
.param .u64 fusion_2195_param_4, | |
.param .u64 fusion_2195_param_5 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<6>; | |
.reg .b16 %h<43>; | |
.reg .b32 %hh<5>; | |
.reg .f32 %f<13>; | |
.reg .b32 %r<31>; | |
.reg .b64 %rd<129>; | |
// Load kernel arguments and convert them to global-space addresses:
// %rd10 = param_0 (output), %rd9 = param_1, %rd8 = param_2,
// %rd6 = param_3 (RNG state), %rd3 = param_4.
ld.param.u64 %rd1, [fusion_2195_param_0]; | |
ld.param.u64 %rd2, [fusion_2195_param_4]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2195_param_1]; | |
ld.param.u64 %rd5, [fusion_2195_param_3]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2195_param_2]; | |
cvta.to.global.u64 %rd8, %rd7; | |
cvta.to.global.u64 %rd9, %rd4; | |
cvta.to.global.u64 %rd10, %rd1; | |
// %r5 = ctaid.x * 1024 + tid.x * 4 (first element handled by this thread);
// %r4, %r6, %r7 = tid.x*4 + {0, 1, 2} index param_4 further down.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
// Load the four f16 values from param_1 (%rd11 = byte offset, reused below).
mul.wide.u32 %rd11, %r5, 2; | |
add.s64 %rd12, %rd9, %rd11; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12]; | |
// Pack/unpack round trip: %h5..%h8 are copies of %h1..%h4.
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
// Per-thread counter = %r5 / 4; it is added to the low 64 bits of the state
// loaded from param_3, with the carry propagated into the high 64 bits.
shr.u32 %r8, %r5, 2; | |
ld.global.nc.v2.u64 {%rd13, %rd14}, [%rd6]; | |
cvt.u64.u32 %rd15, %r8; | |
add.s64 %rd16, %rd13, %rd15; | |
setp.lt.u64 %p1, %rd16, %rd13; | |
// Bit-mixing rounds: 32x32->64 multiplies by 3528531795 (0xD2511F53) and
// 3449720151 (0xCD9E8D57), with the high/low 32-bit halves XORed against
// each other and against per-round constants.
// NOTE(review): these multipliers and the first round constants
// (0x9E3779B9, 0xBB67AE85) match the Philox-4x32 generator — confirm.
and.b64 %rd17, %rd16, 4294967295; | |
mul.lo.s64 %rd18, %rd17, 3528531795; | |
selp.u64 %rd19, 1, 0, %p1; | |
add.s64 %rd20, %rd14, %rd19; | |
xor.b64 %rd21, %rd20, %rd18; | |
shr.u64 %rd22, %rd21, 32; | |
mul.lo.s64 %rd23, %rd22, 3449720151; | |
shr.u64 %rd24, %rd23, 32; | |
and.b64 %rd25, %rd20, 4294967295; | |
mul.lo.s64 %rd26, %rd25, 3449720151; | |
and.b64 %rd27, %rd26, 4294967295; | |
xor.b64 %rd28, %rd27, %rd24; | |
xor.b64 %rd29, %rd28, 2654435769; | |
mul.lo.s64 %rd30, %rd29, 3528531795; | |
shr.u64 %rd31, %rd30, 32; | |
xor.b64 %rd32, %rd26, %rd16; | |
shr.u64 %rd33, %rd32, 32; | |
mul.lo.s64 %rd34, %rd33, 3528531795; | |
and.b64 %rd35, %rd34, 4294967295; | |
xor.b64 %rd36, %rd35, %rd31; | |
xor.b64 %rd37, %rd36, 1993301258; | |
mul.lo.s64 %rd38, %rd37, 3449720151; | |
shr.u64 %rd39, %rd38, 32; | |
shr.u64 %rd40, %rd34, 32; | |
and.b64 %rd41, %rd18, 4294967295; | |
xor.b64 %rd42, %rd41, %rd40; | |
xor.b64 %rd43, %rd42, 3144134277; | |
mul.lo.s64 %rd44, %rd43, 3449720151; | |
and.b64 %rd45, %rd44, 4294967295; | |
xor.b64 %rd46, %rd45, %rd39; | |
xor.b64 %rd47, %rd46, 3668340011; | |
mul.lo.s64 %rd48, %rd47, 3528531795; | |
shr.u64 %rd49, %rd48, 32; | |
shr.u64 %rd50, %rd44, 32; | |
and.b64 %rd51, %rd23, 4294967295; | |
xor.b64 %rd52, %rd51, %rd50; | |
xor.b64 %rd53, %rd52, 1013904242; | |
mul.lo.s64 %rd54, %rd53, 3528531795; | |
and.b64 %rd55, %rd54, 4294967295; | |
xor.b64 %rd56, %rd55, %rd49; | |
xor.b64 %rd57, %rd56, 3986602516; | |
mul.lo.s64 %rd58, %rd57, 3449720151; | |
shr.u64 %rd59, %rd58, 32; | |
shr.u64 %rd60, %rd54, 32; | |
and.b64 %rd61, %rd30, 4294967295; | |
xor.b64 %rd62, %rd61, %rd60; | |
xor.b64 %rd63, %rd62, 842468239; | |
mul.lo.s64 %rd64, %rd63, 3449720151; | |
and.b64 %rd65, %rd64, 4294967295; | |
xor.b64 %rd66, %rd65, %rd59; | |
xor.b64 %rd67, %rd66, 387276957; | |
mul.lo.s64 %rd68, %rd67, 3528531795; | |
shr.u64 %rd69, %rd68, 32; | |
shr.u64 %rd70, %rd64, 32; | |
and.b64 %rd71, %rd38, 4294967295; | |
xor.b64 %rd72, %rd71, %rd70; | |
xor.b64 %rd73, %rd72, 2027808484; | |
mul.lo.s64 %rd74, %rd73, 3528531795; | |
and.b64 %rd75, %rd74, 4294967295; | |
shr.u64 %rd76, %rd74, 32; | |
and.b64 %rd77, %rd48, 4294967295; | |
xor.b64 %rd78, %rd77, %rd76; | |
xor.b64 %rd79, %rd78, 2835769497; | |
mul.lo.s64 %rd80, %rd79, 3449720151; | |
and.b64 %rd81, %rd80, 4294967295; | |
shr.u64 %rd82, %rd80, 32; | |
and.b64 %rd83, %rd58, 4294967295; | |
xor.b64 %rd84, %rd83, %rd82; | |
xor.b64 %rd85, %rd84, 3041712726; | |
mul.lo.s64 %rd86, %rd85, 3528531795; | |
and.b64 %rd87, %rd86, 4294967295; | |
xor.b64 %rd88, %rd75, %rd69; | |
xor.b64 %rd89, %rd88, 1684936478; | |
mul.lo.s64 %rd90, %rd89, 3449720151; | |
shr.u64 %rd91, %rd90, 32; | |
xor.b64 %rd92, %rd81, %rd91; | |
xor.b64 %rd93, %rd92, 1401181199; | |
mul.lo.s64 %rd94, %rd93, 3528531795; | |
shr.u64 %rd95, %rd94, 32; | |
xor.b64 %rd96, %rd87, %rd95; | |
xor.b64 %rd97, %rd96, 3678237736; | |
mul.lo.s64 %rd98, %rd97, 3449720151; | |
shr.u64 %rd99, %rd98, 32; | |
// Random value 0: take a 32-bit word, keep its top 23 bits (>> 9), convert to
// f32 and scale by 0f34000000 (2^-23), then round to f16.
cvt.u32.u64 %r9, %rd99; | |
shr.u64 %rd100, %rd86, 32; | |
xor.b64 %rd101, %rd100, %rd68; | |
cvt.u32.u64 %r10, %rd101; | |
xor.b32 %r11, %r10, 534103459; | |
mul.lo.s32 %r12, %r11, -845247145; | |
xor.b32 %r13, %r12, %r9; | |
shr.u32 %r14, %r13, 9; | |
xor.b32 %r15, %r14, 4716963; | |
cvt.rn.f32.u32 %f1, %r15; | |
mul.rn.f32 %f2, %f1, 0f34000000; | |
cvt.rn.f16.f32 %h9, %f2; | |
// %h10 = threshold shared by all four lanes; %p2 = keep flag for lane 0.
mov.b16 %h10, 0x2E66; | |
setp.ge.f16 %p2, %h9, %h10; | |
// Load the four f16 values from param_2 at the same element offset.
add.s64 %rd102, %rd8, %rd11; | |
ld.global.nc.v4.b16 {%h11, %h12, %h13, %h14}, [%rd102]; | |
// Pack/unpack round trip: %h15..%h18 are copies of %h11..%h14.
mov.b32 %hh3, {%h13, %h14}; | |
mov.b32 %hh4, {%h11, %h12}; | |
mov.b32 {%h15, %h16}, %hh4; | |
mov.b32 {%h17, %h18}, %hh3; | |
// Lane 0: param_4 is indexed by tid.x*4 only (no ctaid term), 4 bytes each.
mul.wide.u32 %rd103, %r4, 4; | |
add.s64 %rd104, %rd3, %rd103; | |
ld.global.nc.f32 %f3, [%rd104]; | |
cvt.rn.f16.f32 %h19, %f3; | |
add.rn.f16 %h20, %h15, %h19; | |
// %h21 = scale factor shared by all four lanes.
mov.b16 %h21, 0x3C72; | |
mul.rn.f16 %h22, %h20, %h21; | |
// Zero the scaled value when the random draw is below the threshold.
selp.b16 %h23, %h22, 0x0000, %p2; | |
add.rn.f16 %h24, %h5, %h23; | |
// Output address in param_0.
add.s64 %rd105, %rd10, %rd11; | |
// Random value 1 and lane 1 (param_4 index tid.x*4 + 1).
xor.b64 %rd106, %rd58, %rd82; | |
xor.b64 %rd107, %rd106, 3041712726; | |
mul.lo.s64 %rd108, %rd107, 3528531795; | |
xor.b64 %rd109, %rd95, %rd108; | |
cvt.u32.u64 %r16, %rd109; | |
xor.b32 %r17, %r16, -616729560; | |
mul.lo.s32 %r18, %r17, -845247145; | |
shr.u32 %r19, %r18, 9; | |
cvt.rn.f32.u32 %f4, %r19; | |
mul.rn.f32 %f5, %f4, 0f34000000; | |
cvt.rn.f16.f32 %h25, %f5; | |
setp.ge.f16 %p3, %h25, %h10; | |
mul.wide.u32 %rd110, %r6, 4; | |
add.s64 %rd111, %rd3, %rd110; | |
ld.global.nc.f32 %f6, [%rd111]; | |
cvt.rn.f16.f32 %h26, %f6; | |
add.rn.f16 %h27, %h16, %h26; | |
mul.rn.f16 %h28, %h27, %h21; | |
selp.b16 %h29, %h28, 0x0000, %p3; | |
add.rn.f16 %h30, %h6, %h29; | |
// Random value 2 and lane 2 (param_4 index tid.x*4 + 2).
and.b64 %rd112, %rd90, 4294967295; | |
and.b64 %rd113, %rd68, 4294967295; | |
xor.b64 %rd114, %rd113, %rd100; | |
xor.b64 %rd115, %rd114, 534103459; | |
mul.lo.s64 %rd116, %rd115, 3449720151; | |
shr.u64 %rd117, %rd116, 32; | |
xor.b64 %rd118, %rd112, %rd117; | |
xor.b64 %rd119, %rd118, 4055616968; | |
mul.lo.s64 %rd120, %rd119, 3528531795; | |
shr.u64 %rd121, %rd120, 32; | |
cvt.u32.u64 %r20, %rd121; | |
xor.b64 %rd122, %rd91, %rd80; | |
cvt.u32.u64 %r21, %rd122; | |
xor.b32 %r22, %r21, 1401181199; | |
mul.lo.s32 %r23, %r22, -766435501; | |
xor.b32 %r24, %r23, %r20; | |
shr.u32 %r25, %r24, 9; | |
xor.b32 %r26, %r25, 4936337; | |
cvt.rn.f32.u32 %f7, %r26; | |
mul.rn.f32 %f8, %f7, 0f34000000; | |
cvt.rn.f16.f32 %h31, %f8; | |
setp.ge.f16 %p4, %h31, %h10; | |
mul.wide.u32 %rd123, %r7, 4; | |
add.s64 %rd124, %rd3, %rd123; | |
ld.global.nc.f32 %f9, [%rd124]; | |
cvt.rn.f16.f32 %h32, %f9; | |
add.rn.f16 %h33, %h17, %h32; | |
mul.rn.f16 %h34, %h33, %h21; | |
selp.b16 %h35, %h34, 0x0000, %p4; | |
add.rn.f16 %h36, %h7, %h35; | |
// Random value 3 and lane 3 (param_4 element at byte offset +12 from lane 0).
xor.b64 %rd125, %rd69, %rd74; | |
xor.b64 %rd126, %rd125, 1684936478; | |
mul.lo.s64 %rd127, %rd126, 3449720151; | |
xor.b64 %rd128, %rd117, %rd127; | |
cvt.u32.u64 %r27, %rd128; | |
xor.b32 %r28, %r27, -239350328; | |
mul.lo.s32 %r29, %r28, -766435501; | |
shr.u32 %r30, %r29, 9; | |
cvt.rn.f32.u32 %f10, %r30; | |
mul.rn.f32 %f11, %f10, 0f34000000; | |
cvt.rn.f16.f32 %h37, %f11; | |
setp.ge.f16 %p5, %h37, %h10; | |
ld.global.nc.f32 %f12, [%rd104+12]; | |
cvt.rn.f16.f32 %h38, %f12; | |
add.rn.f16 %h39, %h18, %h38; | |
mul.rn.f16 %h40, %h39, %h21; | |
selp.b16 %h41, %h40, 0x0000, %p5; | |
add.rn.f16 %h42, %h8, %h41; | |
// Store all four results with one vector write.
st.global.v4.b16 [%rd105], {%h24, %h30, %h36, %h42}; | |
ret; | |
} | |
// .globl fusion_2194 | |
// fusion_2194: sum reduction. Each 64-thread block reads 1024 f16 values
// (2048 bytes starting at param_0 + ctaid.x * 2048), accumulates them in f32,
// and atomically adds the block total to the f32 at param_1[ctaid.x].
// Partial sums are combined with warp shuffles and the shared array
// shared_cache_016. param_2 is declared but never read by this kernel.
.visible .entry fusion_2194( | |
.param .u64 fusion_2194_param_0, | |
.param .u64 fusion_2194_param_1, | |
.param .u64 fusion_2194_param_2 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
// 4-byte per-thread local slot; holds a zero used as a neutral input in the
// second reduction stage.
.local .align 4 .b8 __local_depot72[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<5>; | |
.reg .b16 %h<17>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<56>; | |
.reg .b32 %r<7>; | |
.reg .b64 %rd<22>; | |
// %SPL = local-space address of the depot, %SP = its generic address.
mov.u64 %SPL, __local_depot72; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd5, [fusion_2194_param_0]; | |
cvta.to.global.u64 %rd8, %rd5; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r4, %ctaid.x; | |
// %rd13 = input + ctaid.x * 2048 bytes + (tid.x * 2) * 2 bytes.
shl.b32 %r5, %r1, 1; | |
mul.wide.u32 %rd10, %r4, 2048; | |
add.s64 %rd11, %rd8, %rd10; | |
mul.wide.u32 %rd12, %r5, 2; | |
add.s64 %rd13, %rd11, %rd12; | |
// Eight 32-bit loads, 256 bytes apart; each yields two f16 values that are
// widened to f32 and added to the running sum (which starts at 0.0).
ld.global.nc.b32 %hh1, [%rd13]; | |
mov.b32 {%h1, %h2}, %hh1; | |
cvt.f32.f16 %f2, %h1; | |
add.rn.f32 %f3, %f2, 0f00000000; | |
cvt.f32.f16 %f4, %h2; | |
add.rn.f32 %f5, %f3, %f4; | |
ld.global.nc.b32 %hh2, [%rd13+256]; | |
mov.b32 {%h3, %h4}, %hh2; | |
cvt.f32.f16 %f6, %h3; | |
add.rn.f32 %f7, %f5, %f6; | |
cvt.f32.f16 %f8, %h4; | |
add.rn.f32 %f9, %f7, %f8; | |
ld.global.nc.b32 %hh3, [%rd13+512]; | |
mov.b32 {%h5, %h6}, %hh3; | |
cvt.f32.f16 %f10, %h5; | |
add.rn.f32 %f11, %f9, %f10; | |
cvt.f32.f16 %f12, %h6; | |
add.rn.f32 %f13, %f11, %f12; | |
ld.global.nc.b32 %hh4, [%rd13+768]; | |
mov.b32 {%h7, %h8}, %hh4; | |
cvt.f32.f16 %f14, %h7; | |
add.rn.f32 %f15, %f13, %f14; | |
cvt.f32.f16 %f16, %h8; | |
add.rn.f32 %f17, %f15, %f16; | |
ld.global.nc.b32 %hh5, [%rd13+1024]; | |
mov.b32 {%h9, %h10}, %hh5; | |
cvt.f32.f16 %f18, %h9; | |
add.rn.f32 %f19, %f17, %f18; | |
cvt.f32.f16 %f20, %h10; | |
add.rn.f32 %f21, %f19, %f20; | |
ld.global.nc.b32 %hh6, [%rd13+1280]; | |
mov.b32 {%h11, %h12}, %hh6; | |
cvt.f32.f16 %f22, %h11; | |
add.rn.f32 %f23, %f21, %f22; | |
cvt.f32.f16 %f24, %h12; | |
add.rn.f32 %f25, %f23, %f24; | |
ld.global.nc.b32 %hh7, [%rd13+1536]; | |
mov.b32 {%h13, %h14}, %hh7; | |
cvt.f32.f16 %f26, %h13; | |
add.rn.f32 %f27, %f25, %f26; | |
cvt.f32.f16 %f28, %h14; | |
add.rn.f32 %f29, %f27, %f28; | |
ld.global.nc.b32 %hh8, [%rd13+1792]; | |
mov.b32 {%h15, %h16}, %hh8; | |
cvt.f32.f16 %f30, %h15; | |
add.rn.f32 %f31, %f29, %f30; | |
cvt.f32.f16 %f32, %h16; | |
add.rn.f32 %f33, %f31, %f32; | |
// Stage 1: intra-warp reduction with down-shuffles at distances 16, 8, 4, 2, 1
// (full member mask). %r2 = lane id, %r3 = warp id within the block.
and.b32 %r2, %r1, 31; | |
shfl.sync.down.b32 %f34, %f33, 16, 31, -1; | |
add.rn.f32 %f35, %f34, %f33; | |
shfl.sync.down.b32 %f36, %f35, 8, 31, -1; | |
add.rn.f32 %f37, %f36, %f35; | |
shfl.sync.down.b32 %f38, %f37, 4, 31, -1; | |
add.rn.f32 %f39, %f38, %f37; | |
shfl.sync.down.b32 %f40, %f39, 2, 31, -1; | |
add.rn.f32 %f41, %f40, %f39; | |
shfl.sync.down.b32 %f42, %f41, 1, 31, -1; | |
shr.u32 %r3, %r1, 5; | |
setp.eq.s32 %p1, %r2, 0; | |
mov.u64 %rd16, shared_cache_016; | |
@%p1 bra LBB72_3; | |
bra.uni LBB72_1; | |
LBB72_3: | |
// Lane 0 only: finish the last add and publish the warp total to
// shared_cache_016[warp id].
mul.wide.u32 %rd15, %r3, 4; | |
add.s64 %rd3, %rd16, %rd15; | |
add.rn.f32 %f1, %f42, %f41; | |
st.shared.f32 [%rd3], %f1; | |
LBB72_1: | |
// Make every warp's total visible before the second stage reads it.
bar.sync 0; | |
// Only warp 0 continues; all other threads return.
setp.eq.s32 %p2, %r3, 0; | |
@%p2 bra LBB72_4; | |
bra.uni LBB72_2; | |
LBB72_4: | |
add.u64 %rd9, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
// %rd4 = shared_cache_016 + lane * 4; %rd19 = the same slot as a generic address.
mul.wide.u32 %rd17, %r2, 4; | |
add.s64 %rd4, %rd16, %rd17; | |
cvta.shared.u64 %rd19, %rd4; | |
// Zero the local slot so threads without a warp total contribute 0.
mov.u32 %r6, 0; | |
st.local.u32 [%rd1], %r6; | |
// Threads 0 and 1 read a warp total from shared memory (64 threads = 2
// warps); every other thread reads the zeroed local slot.
setp.lt.u32 %p3, %r1, 2; | |
selp.b64 %rd21, %rd19, %rd9, %p3; | |
ld.f32 %f43, [%rd21]; | |
// Stage 2: reduce the per-warp totals across warp 0.
shfl.sync.down.b32 %f44, %f43, 16, 31, -1; | |
add.rn.f32 %f45, %f43, %f44; | |
shfl.sync.down.b32 %f46, %f45, 8, 31, -1; | |
add.rn.f32 %f47, %f45, %f46; | |
shfl.sync.down.b32 %f48, %f47, 4, 31, -1; | |
add.rn.f32 %f49, %f47, %f48; | |
shfl.sync.down.b32 %f50, %f49, 2, 31, -1; | |
add.rn.f32 %f51, %f49, %f50; | |
shfl.sync.down.b32 %f52, %f51, 1, 31, -1; | |
add.rn.f32 %f53, %f51, %f52; | |
// Write the result back to the slot it was read from (shared or local).
st.f32 [%rd21], %f53; | |
// Only thread 0 publishes the block total.
setp.ne.s32 %p4, %r1, 0; | |
@%p4 bra LBB72_2; | |
ld.param.u64 %rd6, [fusion_2194_param_1]; | |
cvta.to.global.u64 %rd7, %rd6; | |
mul.wide.u32 %rd14, %r4, 4; | |
add.s64 %rd2, %rd7, %rd14; | |
// Accumulate into param_1[ctaid.x]; the atomic's return value (%f55) is unused.
ld.shared.f32 %f54, [%rd4]; | |
atom.global.add.f32 %f55, [%rd2], %f54; | |
LBB72_2: | |
ret; | |
} | |
// .globl fusion_2191 | |
// fusion_2191: sum of squared deviations. Each 64-thread block reads 1024 f16
// values (2048 bytes starting at param_0 + ctaid.x * 2048), subtracts
// m = param_2[ctaid.x] * 2^-10 from each, squares the difference, accumulates
// in f32, and atomically adds the block total to param_1[ctaid.x].
// Partial sums are combined with warp shuffles and shared_cache_017.
// param_3 is declared but never read by this kernel.
// NOTE(review): param_2[ctaid.x] / 1024 looks like a per-row mean, making the
// result an un-normalized variance — confirm against the producer kernel.
.visible .entry fusion_2191( | |
.param .u64 fusion_2191_param_0, | |
.param .u64 fusion_2191_param_1, | |
.param .u64 fusion_2191_param_2, | |
.param .u64 fusion_2191_param_3 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
// 4-byte per-thread local slot; holds a zero used as a neutral input in the
// second reduction stage.
.local .align 4 .b8 __local_depot73[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<5>; | |
.reg .b16 %h<17>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<90>; | |
.reg .b32 %r<7>; | |
.reg .b64 %rd<25>; | |
// %SPL = local-space address of the depot, %SP = its generic address.
mov.u64 %SPL, __local_depot73; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd5, [fusion_2191_param_0]; | |
ld.param.u64 %rd6, [fusion_2191_param_2]; | |
cvta.to.global.u64 %rd7, %rd6; | |
cvta.to.global.u64 %rd10, %rd5; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r4, %ctaid.x; | |
// %rd15 = input + ctaid.x * 2048 bytes + (tid.x * 2) * 2 bytes.
shl.b32 %r5, %r1, 1; | |
mul.wide.u32 %rd12, %r4, 2048; | |
add.s64 %rd13, %rd10, %rd12; | |
mul.wide.u32 %rd14, %r5, 2; | |
add.s64 %rd15, %rd13, %rd14; | |
ld.global.nc.b32 %hh1, [%rd15]; | |
mov.b32 {%h1, %h2}, %hh1; | |
cvt.f32.f16 %f2, %h1; | |
// %f4 = param_2[ctaid.x] * 0f3A800000 (2^-10); subtracted from every element.
// %rd16 (= ctaid.x * 4) is reused for the output address at the end.
mul.wide.u32 %rd16, %r4, 4; | |
add.s64 %rd17, %rd7, %rd16; | |
ld.global.nc.f32 %f3, [%rd17]; | |
mul.rn.f32 %f4, %f3, 0f3A800000; | |
// Accumulate (x - %f4)^2; the running sum starts at 0.0.
sub.rn.f32 %f5, %f2, %f4; | |
mul.rn.f32 %f6, %f5, %f5; | |
add.rn.f32 %f7, %f6, 0f00000000; | |
cvt.f32.f16 %f8, %h2; | |
sub.rn.f32 %f9, %f8, %f4; | |
mul.rn.f32 %f10, %f9, %f9; | |
add.rn.f32 %f11, %f7, %f10; | |
// Seven more 32-bit loads, 256 bytes apart, two f16 values each.
ld.global.nc.b32 %hh2, [%rd15+256]; | |
mov.b32 {%h3, %h4}, %hh2; | |
cvt.f32.f16 %f12, %h3; | |
sub.rn.f32 %f13, %f12, %f4; | |
mul.rn.f32 %f14, %f13, %f13; | |
add.rn.f32 %f15, %f11, %f14; | |
cvt.f32.f16 %f16, %h4; | |
sub.rn.f32 %f17, %f16, %f4; | |
mul.rn.f32 %f18, %f17, %f17; | |
add.rn.f32 %f19, %f15, %f18; | |
ld.global.nc.b32 %hh3, [%rd15+512]; | |
mov.b32 {%h5, %h6}, %hh3; | |
cvt.f32.f16 %f20, %h5; | |
sub.rn.f32 %f21, %f20, %f4; | |
mul.rn.f32 %f22, %f21, %f21; | |
add.rn.f32 %f23, %f19, %f22; | |
cvt.f32.f16 %f24, %h6; | |
sub.rn.f32 %f25, %f24, %f4; | |
mul.rn.f32 %f26, %f25, %f25; | |
add.rn.f32 %f27, %f23, %f26; | |
ld.global.nc.b32 %hh4, [%rd15+768]; | |
mov.b32 {%h7, %h8}, %hh4; | |
cvt.f32.f16 %f28, %h7; | |
sub.rn.f32 %f29, %f28, %f4; | |
mul.rn.f32 %f30, %f29, %f29; | |
add.rn.f32 %f31, %f27, %f30; | |
cvt.f32.f16 %f32, %h8; | |
sub.rn.f32 %f33, %f32, %f4; | |
mul.rn.f32 %f34, %f33, %f33; | |
add.rn.f32 %f35, %f31, %f34; | |
ld.global.nc.b32 %hh5, [%rd15+1024]; | |
mov.b32 {%h9, %h10}, %hh5; | |
cvt.f32.f16 %f36, %h9; | |
sub.rn.f32 %f37, %f36, %f4; | |
mul.rn.f32 %f38, %f37, %f37; | |
add.rn.f32 %f39, %f35, %f38; | |
cvt.f32.f16 %f40, %h10; | |
sub.rn.f32 %f41, %f40, %f4; | |
mul.rn.f32 %f42, %f41, %f41; | |
add.rn.f32 %f43, %f39, %f42; | |
ld.global.nc.b32 %hh6, [%rd15+1280]; | |
mov.b32 {%h11, %h12}, %hh6; | |
cvt.f32.f16 %f44, %h11; | |
sub.rn.f32 %f45, %f44, %f4; | |
mul.rn.f32 %f46, %f45, %f45; | |
add.rn.f32 %f47, %f43, %f46; | |
cvt.f32.f16 %f48, %h12; | |
sub.rn.f32 %f49, %f48, %f4; | |
mul.rn.f32 %f50, %f49, %f49; | |
add.rn.f32 %f51, %f47, %f50; | |
ld.global.nc.b32 %hh7, [%rd15+1536]; | |
mov.b32 {%h13, %h14}, %hh7; | |
cvt.f32.f16 %f52, %h13; | |
sub.rn.f32 %f53, %f52, %f4; | |
mul.rn.f32 %f54, %f53, %f53; | |
add.rn.f32 %f55, %f51, %f54; | |
cvt.f32.f16 %f56, %h14; | |
sub.rn.f32 %f57, %f56, %f4; | |
mul.rn.f32 %f58, %f57, %f57; | |
add.rn.f32 %f59, %f55, %f58; | |
ld.global.nc.b32 %hh8, [%rd15+1792]; | |
mov.b32 {%h15, %h16}, %hh8; | |
cvt.f32.f16 %f60, %h15; | |
sub.rn.f32 %f61, %f60, %f4; | |
mul.rn.f32 %f62, %f61, %f61; | |
add.rn.f32 %f63, %f59, %f62; | |
cvt.f32.f16 %f64, %h16; | |
sub.rn.f32 %f65, %f64, %f4; | |
mul.rn.f32 %f66, %f65, %f65; | |
add.rn.f32 %f67, %f63, %f66; | |
// Stage 1: intra-warp reduction with down-shuffles at distances 16, 8, 4, 2, 1
// (full member mask). %r2 = lane id, %r3 = warp id within the block.
and.b32 %r2, %r1, 31; | |
shfl.sync.down.b32 %f68, %f67, 16, 31, -1; | |
add.rn.f32 %f69, %f68, %f67; | |
shfl.sync.down.b32 %f70, %f69, 8, 31, -1; | |
add.rn.f32 %f71, %f70, %f69; | |
shfl.sync.down.b32 %f72, %f71, 4, 31, -1; | |
add.rn.f32 %f73, %f72, %f71; | |
shfl.sync.down.b32 %f74, %f73, 2, 31, -1; | |
add.rn.f32 %f75, %f74, %f73; | |
shfl.sync.down.b32 %f76, %f75, 1, 31, -1; | |
shr.u32 %r3, %r1, 5; | |
setp.eq.s32 %p1, %r2, 0; | |
mov.u64 %rd19, shared_cache_017; | |
@%p1 bra LBB73_3; | |
bra.uni LBB73_1; | |
LBB73_3: | |
// Lane 0 only: finish the last add and publish the warp total to
// shared_cache_017[warp id].
mul.wide.u32 %rd18, %r3, 4; | |
add.s64 %rd3, %rd19, %rd18; | |
add.rn.f32 %f1, %f76, %f75; | |
st.shared.f32 [%rd3], %f1; | |
LBB73_1: | |
// Make every warp's total visible before the second stage reads it.
bar.sync 0; | |
// Only warp 0 continues; all other threads return.
setp.eq.s32 %p2, %r3, 0; | |
@%p2 bra LBB73_4; | |
bra.uni LBB73_2; | |
LBB73_4: | |
add.u64 %rd11, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
// %rd4 = shared_cache_017 + lane * 4; %rd22 = the same slot as a generic address.
mul.wide.u32 %rd20, %r2, 4; | |
add.s64 %rd4, %rd19, %rd20; | |
cvta.shared.u64 %rd22, %rd4; | |
// Zero the local slot so threads without a warp total contribute 0.
mov.u32 %r6, 0; | |
st.local.u32 [%rd1], %r6; | |
// Threads 0 and 1 read a warp total from shared memory (64 threads = 2
// warps); every other thread reads the zeroed local slot.
setp.lt.u32 %p3, %r1, 2; | |
selp.b64 %rd24, %rd22, %rd11, %p3; | |
ld.f32 %f77, [%rd24]; | |
// Stage 2: reduce the per-warp totals across warp 0.
shfl.sync.down.b32 %f78, %f77, 16, 31, -1; | |
add.rn.f32 %f79, %f77, %f78; | |
shfl.sync.down.b32 %f80, %f79, 8, 31, -1; | |
add.rn.f32 %f81, %f79, %f80; | |
shfl.sync.down.b32 %f82, %f81, 4, 31, -1; | |
add.rn.f32 %f83, %f81, %f82; | |
shfl.sync.down.b32 %f84, %f83, 2, 31, -1; | |
add.rn.f32 %f85, %f83, %f84; | |
shfl.sync.down.b32 %f86, %f85, 1, 31, -1; | |
add.rn.f32 %f87, %f85, %f86; | |
// Write the result back to the slot it was read from (shared or local).
st.f32 [%rd24], %f87; | |
// Only thread 0 publishes the block total.
setp.ne.s32 %p4, %r1, 0; | |
@%p4 bra LBB73_2; | |
ld.param.u64 %rd8, [fusion_2191_param_1]; | |
cvta.to.global.u64 %rd9, %rd8; | |
add.s64 %rd2, %rd9, %rd16; | |
// Accumulate into param_1[ctaid.x]; the atomic's return value (%f89) is unused.
ld.shared.f32 %f88, [%rd4]; | |
atom.global.add.f32 %f89, [%rd2], %f88; | |
LBB73_2: | |
ret; | |
} | |
// .globl fusion_2187 | |
// fusion_2187: per-element affine normalization. For the four f16 inputs x[k]
// at element index ctaid.x * 1024 + tid.x * 4 + k (from param_1), computes in f32
//   s      = rsqrt(param_2[ctaid.x] * 2^-10 + 0f2B8CBCCC)
//   g      = s * param_5[tid.x*4 + k]
//   out[k] = g * x[k] + (param_4[tid.x*4 + k] - g * (param_3[ctaid.x] * 2^-10))
// and stores the result rounded to f16 into param_0.
// param_6 is declared but never read by this kernel.
// NOTE(review): this has the shape of layer normalization (param_2 = sum of
// squared deviations, param_3 = row sum, param_5 = scale, param_4 = bias,
// epsilon ~1e-12, row length 1024) — confirm against the HLO.
.visible .entry fusion_2187( | |
.param .u64 fusion_2187_param_0, | |
.param .u64 fusion_2187_param_1, | |
.param .u64 fusion_2187_param_2, | |
.param .u64 fusion_2187_param_3, | |
.param .u64 fusion_2187_param_4, | |
.param .u64 fusion_2187_param_5, | |
.param .u64 fusion_2187_param_6 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<13>; | |
.reg .b32 %hh<3>; | |
.reg .f32 %f<39>; | |
.reg .b32 %r<8>; | |
.reg .b64 %rd<28>; | |
// Global addresses: %rd12 = param_0 (output), %rd11 = param_1 (input),
// %rd10 = param_2, %rd9 = param_3, %rd6 = param_4, %rd3 = param_5.
ld.param.u64 %rd1, [fusion_2187_param_0]; | |
ld.param.u64 %rd2, [fusion_2187_param_5]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2187_param_1]; | |
ld.param.u64 %rd5, [fusion_2187_param_4]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2187_param_2]; | |
ld.param.u64 %rd8, [fusion_2187_param_3]; | |
cvta.to.global.u64 %rd9, %rd8; | |
cvta.to.global.u64 %rd10, %rd7; | |
cvta.to.global.u64 %rd11, %rd4; | |
cvta.to.global.u64 %rd12, %rd1; | |
// %r5 = ctaid.x * 1024 + tid.x * 4; %r4, %r6, %r7 = tid.x*4 + {0, 1, 2}.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
// Load four f16 inputs (%rd13 = byte offset, reused for the output address).
mul.wide.u32 %rd13, %r5, 2; | |
add.s64 %rd14, %rd11, %rd13; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd14]; | |
// Pack/unpack round trip: %h5..%h8 are copies of %h1..%h4.
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
cvt.f32.f16 %f1, %h5; | |
// %f5 = rsqrt(param_2[ctaid.x] * 2^-10 + 0f2B8CBCCC); shared by all four lanes.
mul.wide.u32 %rd15, %r1, 4; | |
add.s64 %rd16, %rd10, %rd15; | |
ld.global.nc.f32 %f2, [%rd16]; | |
mul.rn.f32 %f3, %f2, 0f3A800000; | |
add.rn.f32 %f4, %f3, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f5, %f4; | |
// Lane 0: %f7 = %f5 * param_5[tid.x*4].
mul.wide.u32 %rd17, %r4, 4; | |
add.s64 %rd18, %rd3, %rd17; | |
ld.global.nc.f32 %f6, [%rd18]; | |
mul.rn.f32 %f7, %f5, %f6; | |
mul.rn.f32 %f8, %f7, %f1; | |
add.s64 %rd19, %rd6, %rd17; | |
ld.global.nc.f32 %f9, [%rd19]; | |
// %f11 = param_3[ctaid.x] * 2^-10; shared by all four lanes.
add.s64 %rd20, %rd9, %rd15; | |
ld.global.nc.f32 %f10, [%rd20]; | |
mul.rn.f32 %f11, %f10, 0f3A800000; | |
mul.rn.f32 %f12, %f7, %f11; | |
sub.rn.f32 %f13, %f9, %f12; | |
add.rn.f32 %f14, %f8, %f13; | |
cvt.rn.f16.f32 %h9, %f14; | |
// Output address in param_0.
add.s64 %rd21, %rd12, %rd13; | |
// Lane 1 (per-column index tid.x*4 + 1).
cvt.f32.f16 %f15, %h6; | |
mul.wide.u32 %rd22, %r6, 4; | |
add.s64 %rd23, %rd3, %rd22; | |
ld.global.nc.f32 %f16, [%rd23]; | |
mul.rn.f32 %f17, %f5, %f16; | |
mul.rn.f32 %f18, %f17, %f15; | |
add.s64 %rd24, %rd6, %rd22; | |
ld.global.nc.f32 %f19, [%rd24]; | |
mul.rn.f32 %f20, %f11, %f17; | |
sub.rn.f32 %f21, %f19, %f20; | |
add.rn.f32 %f22, %f18, %f21; | |
cvt.rn.f16.f32 %h10, %f22; | |
// Lane 2 (per-column index tid.x*4 + 2).
cvt.f32.f16 %f23, %h7; | |
mul.wide.u32 %rd25, %r7, 4; | |
add.s64 %rd26, %rd3, %rd25; | |
ld.global.nc.f32 %f24, [%rd26]; | |
mul.rn.f32 %f25, %f5, %f24; | |
mul.rn.f32 %f26, %f25, %f23; | |
add.s64 %rd27, %rd6, %rd25; | |
ld.global.nc.f32 %f27, [%rd27]; | |
mul.rn.f32 %f28, %f11, %f25; | |
sub.rn.f32 %f29, %f27, %f28; | |
add.rn.f32 %f30, %f26, %f29; | |
cvt.rn.f16.f32 %h11, %f30; | |
// Lane 3 (per-column values at byte offset +12 from lane 0's addresses).
cvt.f32.f16 %f31, %h8; | |
ld.global.nc.f32 %f32, [%rd18+12]; | |
mul.rn.f32 %f33, %f5, %f32; | |
mul.rn.f32 %f34, %f33, %f31; | |
ld.global.nc.f32 %f35, [%rd19+12]; | |
mul.rn.f32 %f36, %f11, %f33; | |
sub.rn.f32 %f37, %f35, %f36; | |
add.rn.f32 %f38, %f34, %f37; | |
cvt.rn.f16.f32 %h12, %f38; | |
// Store all four results with one vector write.
st.global.v4.b16 [%rd21], {%h9, %h10, %h11, %h12}; | |
ret; | |
} | |
// .globl convert_1585 | |
// convert_1585: f32 -> f16 conversion (round-to-nearest). Each thread converts
// groups of four consecutive elements, reading f32 from param_1 and writing
// f16 to param_0, starting at element index i = ctaid.x * 512 + tid.x * 4 and
// repeating at i + k * 655360 for k = 0..6. The first five groups are
// unconditional; the last two are skipped when their index exceeds 4194303.
// param_2 is declared but never read by this kernel.
.visible .entry convert_1585( | |
.param .u64 convert_1585_param_0, | |
.param .u64 convert_1585_param_1, | |
.param .u64 convert_1585_param_2 | |
) | |
.reqntid 128, 1, 1 | |
{ | |
.reg .pred %p<3>; | |
.reg .b16 %h<29>; | |
.reg .f32 %f<29>; | |
.reg .b32 %r<9>; | |
.reg .b64 %rd<9>; | |
// %rd6 = param_0 (f16 destination), %rd5 = param_1 (f32 source).
ld.param.u64 %rd3, [convert_1585_param_0]; | |
ld.param.u64 %rd4, [convert_1585_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd3; | |
mov.u32 %r3, %ctaid.x; | |
mov.u32 %r4, %tid.x; | |
// %r1 = ctaid.x * 512, %r2 = tid.x * 4, %r5 = base element index.
shl.b32 %r1, %r3, 9; | |
shl.b32 %r2, %r4, 2; | |
or.b32 %r5, %r1, %r2; | |
// %rd1 = source address (4 bytes/element), %rd2 = destination (2 bytes/element).
mul.wide.u32 %rd7, %r5, 4; | |
add.s64 %rd1, %rd5, %rd7; | |
// Group 0.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; | |
cvt.rn.f16.f32 %h1, %f1; | |
mul.wide.u32 %rd8, %r5, 2; | |
add.s64 %rd2, %rd6, %rd8; | |
cvt.rn.f16.f32 %h2, %f2; | |
cvt.rn.f16.f32 %h3, %f3; | |
cvt.rn.f16.f32 %h4, %f4; | |
st.global.v4.b16 [%rd2], {%h1, %h2, %h3, %h4}; | |
// Groups 1-4: source advances 2621440 bytes and destination 1310720 bytes
// per group, i.e. 655360 elements each.
ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+2621440]; | |
cvt.rn.f16.f32 %h5, %f5; | |
cvt.rn.f16.f32 %h6, %f6; | |
cvt.rn.f16.f32 %h7, %f7; | |
cvt.rn.f16.f32 %h8, %f8; | |
st.global.v4.b16 [%rd2+1310720], {%h5, %h6, %h7, %h8}; | |
ld.global.nc.v4.f32 {%f9, %f10, %f11, %f12}, [%rd1+5242880]; | |
cvt.rn.f16.f32 %h9, %f9; | |
cvt.rn.f16.f32 %h10, %f10; | |
cvt.rn.f16.f32 %h11, %f11; | |
cvt.rn.f16.f32 %h12, %f12; | |
st.global.v4.b16 [%rd2+2621440], {%h9, %h10, %h11, %h12}; | |
ld.global.nc.v4.f32 {%f13, %f14, %f15, %f16}, [%rd1+7864320]; | |
cvt.rn.f16.f32 %h13, %f13; | |
cvt.rn.f16.f32 %h14, %f14; | |
cvt.rn.f16.f32 %h15, %f15; | |
cvt.rn.f16.f32 %h16, %f16; | |
st.global.v4.b16 [%rd2+3932160], {%h13, %h14, %h15, %h16}; | |
ld.global.nc.v4.f32 {%f17, %f18, %f19, %f20}, [%rd1+10485760]; | |
cvt.rn.f16.f32 %h17, %f17; | |
cvt.rn.f16.f32 %h18, %f18; | |
cvt.rn.f16.f32 %h19, %f19; | |
cvt.rn.f16.f32 %h20, %f20; | |
st.global.v4.b16 [%rd2+5242880], {%h17, %h18, %h19, %h20}; | |
// Group 5 (element index + 3276800): skipped when the index is past 4194303.
add.s32 %r6, %r5, 3276800; | |
setp.gt.u32 %p1, %r6, 4194303; | |
@%p1 bra LBB75_2; | |
ld.global.nc.v4.f32 {%f21, %f22, %f23, %f24}, [%rd1+13107200]; | |
cvt.rn.f16.f32 %h21, %f21; | |
cvt.rn.f16.f32 %h22, %f22; | |
cvt.rn.f16.f32 %h23, %f23; | |
cvt.rn.f16.f32 %h24, %f24; | |
st.global.v4.b16 [%rd2+6553600], {%h21, %h22, %h23, %h24}; | |
LBB75_2: | |
// Group 6 (element index + 3932160): same bounds guard. The index is rebuilt
// as (ctaid.x * 512 + 3932160) | (tid.x * 4).
add.s32 %r7, %r1, 3932160; | |
or.b32 %r8, %r7, %r2; | |
setp.gt.u32 %p2, %r8, 4194303; | |
@%p2 bra LBB75_4; | |
ld.global.nc.v4.f32 {%f25, %f26, %f27, %f28}, [%rd1+15728640]; | |
cvt.rn.f16.f32 %h25, %f25; | |
cvt.rn.f16.f32 %h26, %f26; | |
cvt.rn.f16.f32 %h27, %f27; | |
cvt.rn.f16.f32 %h28, %f28; | |
st.global.v4.b16 [%rd2+7864320], {%h25, %h26, %h27, %h28}; | |
LBB75_4: | |
ret; | |
} | |
// .globl fusion_2182 | |
// fusion_2182: per thread, processes four f16 elements at index
// i = ctaid.x * 1024 + tid.x * 4 + k (k = 0..3). For each element:
//   x   = f16(param_1[i]) + f16(param_2[i & 4095])        (f16 add)
//   u   = (x^3 * 0f3D372713 + x) * 0f3F4C422A             (f32)
//   t   = u                         if |u| < 0f39D1B717
//       = P(c) / Q(c), c = clamp(u, -9, 9), otherwise      (rational polynomial)
//       = -1.0 or +1.0 by sign of u if NOT(|u| < 20.0)
//   out = f16(((t + 1.0) * 0.5) * x)                       -> param_0[i]
// param_3 is declared but never read by this kernel.
// NOTE(review): 0f3D372713 ~ 0.044715 and 0f3F4C422A ~ sqrt(2/pi), so this
// looks like the tanh-approximation GELU applied to (input + bias) — confirm.
.visible .entry fusion_2182( | |
.param .u64 fusion_2182_param_0, | |
.param .u64 fusion_2182_param_1, | |
.param .u64 fusion_2182_param_2, | |
.param .u64 fusion_2182_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<21>; | |
.reg .b16 %h<21>; | |
.reg .b32 %hh<3>; | |
.reg .f32 %f<150>; | |
.reg .b32 %r<25>; | |
.reg .b64 %rd<18>; | |
// Global addresses: %rd6 = param_0 (output), %rd5 = param_1, %rd3 = param_2.
ld.param.u64 %rd1, [fusion_2182_param_0]; | |
ld.param.u64 %rd2, [fusion_2182_param_2]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2182_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// %r5 = ctaid.x * 1024 + tid.x * 4; %r12, %r11, %r10, %r9 = (%r5 + {0,1,2,3})
// reduced modulo 4096, used to index param_2.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r5, 1; | |
or.b32 %r7, %r5, 2; | |
or.b32 %r8, %r5, 3; | |
and.b32 %r9, %r8, 4095; | |
and.b32 %r10, %r7, 4094; | |
and.b32 %r11, %r6, 4093; | |
and.b32 %r12, %r5, 4092; | |
// Load four f16 inputs (%rd7 = byte offset, reused for the output address).
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd5, %rd7; | |
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd8]; | |
// Pack/unpack round trip: %h5..%h8 are copies of %h1..%h4.
mov.b32 %hh1, {%h3, %h4}; | |
mov.b32 %hh2, {%h1, %h2}; | |
mov.b32 {%h5, %h6}, %hh2; | |
mov.b32 {%h7, %h8}, %hh1; | |
// ---- Lane 0 ----
// x = input + f16(param_2 value), added in f16 and then widened to f32 (%f2).
mul.wide.u32 %rd9, %r12, 4; | |
add.s64 %rd10, %rd3, %rd9; | |
ld.global.nc.f32 %f1, [%rd10]; | |
cvt.rn.f16.f32 %h9, %f1; | |
add.rn.f16 %h10, %h5, %h9; | |
cvt.f32.f16 %f2, %h10; | |
// u (%f7) = (x^3 * 0f3D372713 + x) * 0f3F4C422A; %f8 = |u|.
mul.rn.f32 %f3, %f2, %f2; | |
mul.rn.f32 %f4, %f3, %f2; | |
mul.rn.f32 %f5, %f4, 0f3D372713; | |
add.rn.f32 %f6, %f5, %f2; | |
mul.rn.f32 %f7, %f6, 0f3F4C422A; | |
abs.f32 %f8, %f7; | |
// %p1: |u| is small enough to use u itself as the result.
setp.lt.f32 %p1, %f8, 0f39D1B717; | |
// Clamp u to [-9.0, 9.0] (%f10) before evaluating the polynomials.
setp.lt.f32 %p2, %f7, 0fC1100000; | |
selp.f32 %f9, 0fC1100000, %f7, %p2; | |
setp.gt.f32 %p3, %f9, 0f41100000; | |
selp.f32 %f10, 0f41100000, %f9, %p3; | |
// Numerator: odd polynomial in the clamped value, evaluated by Horner's rule
// in c^2 (%f11) and multiplied by c at the end (%f25).
mul.rn.f32 %f11, %f10, %f10; | |
mul.rn.f32 %f12, %f11, 0f259F25C0; | |
// %f13 holds a polynomial coefficient that lanes 1-3 reuse.
mov.f32 %f13, 0f2A61337E; | |
sub.rn.f32 %f14, %f13, %f12; | |
mul.rn.f32 %f15, %f11, %f14; | |
add.rn.f32 %f16, %f15, 0fAEBD37FF; | |
mul.rn.f32 %f17, %f11, %f16; | |
add.rn.f32 %f18, %f17, 0f335C0041; | |
mul.rn.f32 %f19, %f11, %f18; | |
add.rn.f32 %f20, %f19, 0f3779434A; | |
mul.rn.f32 %f21, %f11, %f20; | |
add.rn.f32 %f22, %f21, 0f3A270DED; | |
mul.rn.f32 %f23, %f11, %f22; | |
add.rn.f32 %f24, %f23, 0f3BA059DC; | |
mul.rn.f32 %f25, %f10, %f24; | |
// Denominator: even polynomial in c (%f31).
mul.rn.f32 %f26, %f11, 0f35A0D3D8; | |
add.rn.f32 %f27, %f26, 0f38F895D6; | |
mul.rn.f32 %f28, %f11, %f27; | |
add.rn.f32 %f29, %f28, 0f3B14AA05; | |
mul.rn.f32 %f30, %f11, %f29; | |
add.rn.f32 %f31, %f30, 0f3BA059DD; | |
div.full.f32 %f32, %f25, %f31; | |
// Small-|u| path returns u unchanged.
selp.f32 %f33, %f7, %f32, %p1; | |
// Sign bit of u selects -1.0 or +1.0 for the saturated case.
mov.b32 %r13, %f7; | |
shr.u32 %r14, %r13, 31; | |
and.b32 %r15, %r14, 1; | |
setp.eq.b32 %p4, %r15, 1; | |
selp.f32 %f34, 0fBF800000, 0f3F800000, %p4; | |
// ltu is true for |u| < 20.0 or NaN; otherwise the saturated value is used.
setp.ltu.f32 %p5, %f8, 0f41A00000; | |
selp.f32 %f35, %f33, %f34, %p5; | |
// out = ((t + 1.0) * 0.5) * x, rounded to f16.
add.rn.f32 %f36, %f35, 0f3F800000; | |
mul.rn.f32 %f37, %f36, 0f3F000000; | |
mul.rn.f32 %f38, %f37, %f2; | |
cvt.rn.f16.f32 %h11, %f38; | |
// Output address in param_0.
add.s64 %rd11, %rd6, %rd7; | |
// ---- Lane 1: same computation on %h6 with param_2 index %r11 ----
mul.wide.u32 %rd12, %r11, 4; | |
add.s64 %rd13, %rd3, %rd12; | |
ld.global.nc.f32 %f39, [%rd13]; | |
cvt.rn.f16.f32 %h12, %f39; | |
add.rn.f16 %h13, %h6, %h12; | |
cvt.f32.f16 %f40, %h13; | |
mul.rn.f32 %f41, %f40, %f40; | |
mul.rn.f32 %f42, %f41, %f40; | |
mul.rn.f32 %f43, %f42, 0f3D372713; | |
add.rn.f32 %f44, %f43, %f40; | |
mul.rn.f32 %f45, %f44, 0f3F4C422A; | |
abs.f32 %f46, %f45; | |
setp.lt.f32 %p6, %f46, 0f39D1B717; | |
setp.lt.f32 %p7, %f45, 0fC1100000; | |
selp.f32 %f47, 0fC1100000, %f45, %p7; | |
setp.gt.f32 %p8, %f47, 0f41100000; | |
selp.f32 %f48, 0f41100000, %f47, %p8; | |
mul.rn.f32 %f49, %f48, %f48; | |
mul.rn.f32 %f50, %f49, 0f259F25C0; | |
sub.rn.f32 %f51, %f13, %f50; | |
mul.rn.f32 %f52, %f49, %f51; | |
add.rn.f32 %f53, %f52, 0fAEBD37FF; | |
mul.rn.f32 %f54, %f49, %f53; | |
add.rn.f32 %f55, %f54, 0f335C0041; | |
mul.rn.f32 %f56, %f49, %f55; | |
add.rn.f32 %f57, %f56, 0f3779434A; | |
mul.rn.f32 %f58, %f49, %f57; | |
add.rn.f32 %f59, %f58, 0f3A270DED; | |
mul.rn.f32 %f60, %f49, %f59; | |
add.rn.f32 %f61, %f60, 0f3BA059DC; | |
mul.rn.f32 %f62, %f48, %f61; | |
mul.rn.f32 %f63, %f49, 0f35A0D3D8; | |
add.rn.f32 %f64, %f63, 0f38F895D6; | |
mul.rn.f32 %f65, %f49, %f64; | |
add.rn.f32 %f66, %f65, 0f3B14AA05; | |
mul.rn.f32 %f67, %f49, %f66; | |
add.rn.f32 %f68, %f67, 0f3BA059DD; | |
div.full.f32 %f69, %f62, %f68; | |
selp.f32 %f70, %f45, %f69, %p6; | |
mov.b32 %r16, %f45; | |
shr.u32 %r17, %r16, 31; | |
and.b32 %r18, %r17, 1; | |
setp.eq.b32 %p9, %r18, 1; | |
selp.f32 %f71, 0fBF800000, 0f3F800000, %p9; | |
setp.ltu.f32 %p10, %f46, 0f41A00000; | |
selp.f32 %f72, %f70, %f71, %p10; | |
add.rn.f32 %f73, %f72, 0f3F800000; | |
mul.rn.f32 %f74, %f73, 0f3F000000; | |
mul.rn.f32 %f75, %f74, %f40; | |
cvt.rn.f16.f32 %h14, %f75; | |
// ---- Lane 2: same computation on %h7 with param_2 index %r10 ----
mul.wide.u32 %rd14, %r10, 4; | |
add.s64 %rd15, %rd3, %rd14; | |
ld.global.nc.f32 %f76, [%rd15]; | |
cvt.rn.f16.f32 %h15, %f76; | |
add.rn.f16 %h16, %h7, %h15; | |
cvt.f32.f16 %f77, %h16; | |
mul.rn.f32 %f78, %f77, %f77; | |
mul.rn.f32 %f79, %f78, %f77; | |
mul.rn.f32 %f80, %f79, 0f3D372713; | |
add.rn.f32 %f81, %f80, %f77; | |
mul.rn.f32 %f82, %f81, 0f3F4C422A; | |
abs.f32 %f83, %f82; | |
setp.lt.f32 %p11, %f83, 0f39D1B717; | |
setp.lt.f32 %p12, %f82, 0fC1100000; | |
selp.f32 %f84, 0fC1100000, %f82, %p12; | |
setp.gt.f32 %p13, %f84, 0f41100000; | |
selp.f32 %f85, 0f41100000, %f84, %p13; | |
mul.rn.f32 %f86, %f85, %f85; | |
mul.rn.f32 %f87, %f86, 0f259F25C0; | |
sub.rn.f32 %f88, %f13, %f87; | |
mul.rn.f32 %f89, %f86, %f88; | |
add.rn.f32 %f90, %f89, 0fAEBD37FF; | |
mul.rn.f32 %f91, %f86, %f90; | |
add.rn.f32 %f92, %f91, 0f335C0041; | |
mul.rn.f32 %f93, %f86, %f92; | |
add.rn.f32 %f94, %f93, 0f3779434A; | |
mul.rn.f32 %f95, %f86, %f94; | |
add.rn.f32 %f96, %f95, 0f3A270DED; | |
mul.rn.f32 %f97, %f86, %f96; | |
add.rn.f32 %f98, %f97, 0f3BA059DC; | |
mul.rn.f32 %f99, %f85, %f98; | |
mul.rn.f32 %f100, %f86, 0f35A0D3D8; | |
add.rn.f32 %f101, %f100, 0f38F895D6; | |
mul.rn.f32 %f102, %f86, %f101; | |
add.rn.f32 %f103, %f102, 0f3B14AA05; | |
mul.rn.f32 %f104, %f86, %f103; | |
add.rn.f32 %f105, %f104, 0f3BA059DD; | |
div.full.f32 %f106, %f99, %f105; | |
selp.f32 %f107, %f82, %f106, %p11; | |
mov.b32 %r19, %f82; | |
shr.u32 %r20, %r19, 31; | |
and.b32 %r21, %r20, 1; | |
setp.eq.b32 %p14, %r21, 1; | |
selp.f32 %f108, 0fBF800000, 0f3F800000, %p14; | |
setp.ltu.f32 %p15, %f83, 0f41A00000; | |
selp.f32 %f109, %f107, %f108, %p15; | |
add.rn.f32 %f110, %f109, 0f3F800000; | |
mul.rn.f32 %f111, %f110, 0f3F000000; | |
mul.rn.f32 %f112, %f111, %f77; | |
cvt.rn.f16.f32 %h17, %f112; | |
// ---- Lane 3: same computation on %h8 with param_2 index %r9 ----
mul.wide.u32 %rd16, %r9, 4; | |
add.s64 %rd17, %rd3, %rd16; | |
ld.global.nc.f32 %f113, [%rd17]; | |
cvt.rn.f16.f32 %h18, %f113; | |
add.rn.f16 %h19, %h8, %h18; | |
cvt.f32.f16 %f114, %h19; | |
mul.rn.f32 %f115, %f114, %f114; | |
mul.rn.f32 %f116, %f115, %f114; | |
mul.rn.f32 %f117, %f116, 0f3D372713; | |
add.rn.f32 %f118, %f117, %f114; | |
mul.rn.f32 %f119, %f118, 0f3F4C422A; | |
abs.f32 %f120, %f119; | |
setp.lt.f32 %p16, %f120, 0f39D1B717; | |
setp.lt.f32 %p17, %f119, 0fC1100000; | |
selp.f32 %f121, 0fC1100000, %f119, %p17; | |
setp.gt.f32 %p18, %f121, 0f41100000; | |
selp.f32 %f122, 0f41100000, %f121, %p18; | |
mul.rn.f32 %f123, %f122, %f122; | |
mul.rn.f32 %f124, %f123, 0f259F25C0; | |
sub.rn.f32 %f125, %f13, %f124; | |
mul.rn.f32 %f126, %f123, %f125; | |
add.rn.f32 %f127, %f126, 0fAEBD37FF; | |
mul.rn.f32 %f128, %f123, %f127; | |
add.rn.f32 %f129, %f128, 0f335C0041; | |
mul.rn.f32 %f130, %f123, %f129; | |
add.rn.f32 %f131, %f130, 0f3779434A; | |
mul.rn.f32 %f132, %f123, %f131; | |
add.rn.f32 %f133, %f132, 0f3A270DED; | |
mul.rn.f32 %f134, %f123, %f133; | |
add.rn.f32 %f135, %f134, 0f3BA059DC; | |
mul.rn.f32 %f136, %f122, %f135; | |
mul.rn.f32 %f137, %f123, 0f35A0D3D8; | |
add.rn.f32 %f138, %f137, 0f38F895D6; | |
mul.rn.f32 %f139, %f123, %f138; | |
add.rn.f32 %f140, %f139, 0f3B14AA05; | |
mul.rn.f32 %f141, %f123, %f140; | |
add.rn.f32 %f142, %f141, 0f3BA059DD; | |
div.full.f32 %f143, %f136, %f142; | |
selp.f32 %f144, %f119, %f143, %p16; | |
mov.b32 %r22, %f119; | |
shr.u32 %r23, %r22, 31; | |
and.b32 %r24, %r23, 1; | |
setp.eq.b32 %p19, %r24, 1; | |
selp.f32 %f145, 0fBF800000, 0f3F800000, %p19; | |
setp.ltu.f32 %p20, %f120, 0f41A00000; | |
selp.f32 %f146, %f144, %f145, %p20; | |
add.rn.f32 %f147, %f146, 0f3F800000; | |
mul.rn.f32 %f148, %f147, 0f3F000000; | |
mul.rn.f32 %f149, %f148, %f114; | |
cvt.rn.f16.f32 %h20, %f149; | |
// Store all four results with one vector write.
st.global.v4.b16 [%rd11], {%h11, %h14, %h17, %h20}; | |
ret; | |
} | |
// .globl convert_1587 | |
.visible .entry convert_1587( | |
.param .u64 convert_1587_param_0, | |
.param .u64 convert_1587_param_1, | |
.param .u64 convert_1587_param_2 | |
) | |
.reqntid 128, 1, 1 | |
{ | |
// convert_1587: reads f32 values through param_1, rounds each to f16 | |
// (round-to-nearest-even) and writes the results through param_0. | |
// param_2 is declared but never read in this kernel. | |
// Each thread converts 4 consecutive elements per tile; with 128 threads | |
// per CTA a CTA covers 512 elements per tile. Up to 7 tiles are processed, | |
// spaced 2621440 bytes apart on the f32 side (1310720 bytes on the f16 side). | |
.reg .pred %past_end<2>; | |
.reg .b16 %hv<28>; | |
.reg .f32 %fv<28>; | |
.reg .b32 %cta, %thr, %cta_base, %thr_off, %elem, %elem5, %cta_base6, %elem6; | |
.reg .b64 %dst_param, %src_param, %dst_base, %src_base, %dst_off, %src_off, %dst, %src; | |
// Kernel arguments, converted to global-space addresses. | |
ld.param.u64 %dst_param, [convert_1587_param_0]; | |
ld.param.u64 %src_param, [convert_1587_param_1]; | |
cvta.to.global.u64 %dst_base, %dst_param; | |
cvta.to.global.u64 %src_base, %src_param; | |
// elem = ctaid.x * 512 + tid.x * 4 (the two terms share no bits, so OR == ADD). | |
mov.u32 %thr, %tid.x; | |
mov.u32 %cta, %ctaid.x; | |
shl.b32 %thr_off, %thr, 2; | |
shl.b32 %cta_base, %cta, 9; | |
or.b32 %elem, %cta_base, %thr_off; | |
// Byte addresses of this thread's first f32 input and first f16 output. | |
mul.wide.u32 %src_off, %elem, 4; | |
mul.wide.u32 %dst_off, %elem, 2; | |
add.s64 %src, %src_base, %src_off; | |
add.s64 %dst, %dst_base, %dst_off; | |
// Tile 0 (no bounds check). | |
ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src]; | |
cvt.rn.f16.f32 %hv0, %fv0; | |
cvt.rn.f16.f32 %hv1, %fv1; | |
cvt.rn.f16.f32 %hv2, %fv2; | |
cvt.rn.f16.f32 %hv3, %fv3; | |
st.global.v4.b16 [%dst], {%hv0, %hv1, %hv2, %hv3}; | |
// Tile 1 (no bounds check). | |
ld.global.nc.v4.f32 {%fv4, %fv5, %fv6, %fv7}, [%src+2621440]; | |
cvt.rn.f16.f32 %hv4, %fv4; | |
cvt.rn.f16.f32 %hv5, %fv5; | |
cvt.rn.f16.f32 %hv6, %fv6; | |
cvt.rn.f16.f32 %hv7, %fv7; | |
st.global.v4.b16 [%dst+1310720], {%hv4, %hv5, %hv6, %hv7}; | |
// Tile 2 (no bounds check). | |
ld.global.nc.v4.f32 {%fv8, %fv9, %fv10, %fv11}, [%src+5242880]; | |
cvt.rn.f16.f32 %hv8, %fv8; | |
cvt.rn.f16.f32 %hv9, %fv9; | |
cvt.rn.f16.f32 %hv10, %fv10; | |
cvt.rn.f16.f32 %hv11, %fv11; | |
st.global.v4.b16 [%dst+2621440], {%hv8, %hv9, %hv10, %hv11}; | |
// Tile 3 (no bounds check). | |
ld.global.nc.v4.f32 {%fv12, %fv13, %fv14, %fv15}, [%src+7864320]; | |
cvt.rn.f16.f32 %hv12, %fv12; | |
cvt.rn.f16.f32 %hv13, %fv13; | |
cvt.rn.f16.f32 %hv14, %fv14; | |
cvt.rn.f16.f32 %hv15, %fv15; | |
st.global.v4.b16 [%dst+3932160], {%hv12, %hv13, %hv14, %hv15}; | |
// Tile 4 (no bounds check). | |
ld.global.nc.v4.f32 {%fv16, %fv17, %fv18, %fv19}, [%src+10485760]; | |
cvt.rn.f16.f32 %hv16, %fv16; | |
cvt.rn.f16.f32 %hv17, %fv17; | |
cvt.rn.f16.f32 %hv18, %fv18; | |
cvt.rn.f16.f32 %hv19, %fv19; | |
st.global.v4.b16 [%dst+5242880], {%hv16, %hv17, %hv18, %hv19}; | |
// Tile 5: skipped when its element index exceeds 4194303. | |
add.s32 %elem5, %elem, 3276800; | |
setp.gt.u32 %past_end0, %elem5, 4194303; | |
@%past_end0 bra CONVERT_1587_TILE6; | |
ld.global.nc.v4.f32 {%fv20, %fv21, %fv22, %fv23}, [%src+13107200]; | |
cvt.rn.f16.f32 %hv20, %fv20; | |
cvt.rn.f16.f32 %hv21, %fv21; | |
cvt.rn.f16.f32 %hv22, %fv22; | |
cvt.rn.f16.f32 %hv23, %fv23; | |
st.global.v4.b16 [%dst+6553600], {%hv20, %hv21, %hv22, %hv23}; | |
CONVERT_1587_TILE6: | |
// Tile 6: same bound, index rebuilt from the CTA base plus 3932160. | |
add.s32 %cta_base6, %cta_base, 3932160; | |
or.b32 %elem6, %cta_base6, %thr_off; | |
setp.gt.u32 %past_end1, %elem6, 4194303; | |
@%past_end1 bra CONVERT_1587_EXIT; | |
ld.global.nc.v4.f32 {%fv24, %fv25, %fv26, %fv27}, [%src+15728640]; | |
cvt.rn.f16.f32 %hv24, %fv24; | |
cvt.rn.f16.f32 %hv25, %fv25; | |
cvt.rn.f16.f32 %hv26, %fv26; | |
cvt.rn.f16.f32 %hv27, %fv27; | |
st.global.v4.b16 [%dst+7864320], {%hv24, %hv25, %hv26, %hv27}; | |
CONVERT_1587_EXIT: | |
ret; | |
} | |
// .globl rng_get_and_update_state_38 | |
.visible .entry rng_get_and_update_state_38( | |
.param .u64 rng_get_and_update_state_38_param_0, | |
.param .u64 rng_get_and_update_state_38_param_1 | |
) | |
{ | |
// rng_get_and_update_state_38: copies the current 16-byte rng_state (two | |
// 64-bit words, low word at offset 0) to the buffer addressed by param_0, | |
// and advances rng_state by 524288 with carry into the high word. | |
// param_1 is declared but never read in this kernel. | |
.reg .pred %wrapped, %below_step; | |
.reg .b64 %out_generic, %out, %old_hi, %old_lo, %new_lo, %wrap_carry, %carry, %new_hi; | |
ld.param.u64 %out_generic, [rng_get_and_update_state_38_param_0]; | |
cvta.to.global.u64 %out, %out_generic; | |
// Snapshot the state before it is modified. | |
ld.global.u64 %old_hi, [rng_state+8]; | |
ld.global.u64 %old_lo, [rng_state]; | |
// Low word += 524288; both unsigned comparisons detect wrap-around of that add. | |
add.s64 %new_lo, %old_lo, 524288; | |
setp.lt.u64 %wrapped, %new_lo, %old_lo; | |
setp.lt.u64 %below_step, %new_lo, 524288; | |
selp.u64 %wrap_carry, 1, 0, %wrapped; | |
selp.b64 %carry, 1, %wrap_carry, %below_step; | |
// Propagate the carry into the high word. | |
add.s64 %new_hi, %old_hi, %carry; | |
// Publish the advanced state, then hand the previous state to the caller's buffer. | |
st.global.u64 [rng_state], %new_lo; | |
st.global.u64 [rng_state+8], %new_hi; | |
st.global.u64 [%out+8], %old_hi; | |
st.global.u64 [%out], %old_lo; | |
ret; | |
} | |
// .globl fusion_2180 | |
.visible .entry fusion_2180( | |
.param .u64 fusion_2180_param_0, | |
.param .u64 fusion_2180_param_1, | |
.param .u64 fusion_2180_param_2, | |
.param .u64 fusion_2180_param_3, | |
.param .u64 fusion_2180_param_4, | |
.param .u64 fusion_2180_param_5, | |
.param .u64 fusion_2180_param_6, | |
.param .u64 fusion_2180_param_7, | |
.param .u64 fusion_2180_param_8, | |
.param .u64 fusion_2180_param_9 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
.local .align 4 .b8 __local_depot79[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<75>; | |
.reg .b16 %h<145>; | |
.reg .f32 %f<254>; | |
.reg .b32 %r<350>; | |
.reg .b64 %rd<2739>; | |
mov.u64 %SPL, __local_depot79; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd463, [fusion_2180_param_0]; | |
ld.param.u64 %rd464, [fusion_2180_param_8]; | |
cvta.to.global.u64 %rd1, %rd464; | |
ld.param.u64 %rd465, [fusion_2180_param_1]; | |
ld.param.u64 %rd466, [fusion_2180_param_7]; | |
cvta.to.global.u64 %rd2, %rd466; | |
ld.param.u64 %rd467, [fusion_2180_param_2]; | |
ld.param.u64 %rd468, [fusion_2180_param_6]; | |
cvta.to.global.u64 %rd3, %rd468; | |
ld.param.u64 %rd470, [fusion_2180_param_5]; | |
cvta.to.global.u64 %rd4, %rd470; | |
ld.param.u64 %rd471, [fusion_2180_param_4]; | |
cvta.to.global.u64 %rd5, %rd471; | |
cvta.to.global.u64 %rd7, %rd467; | |
cvta.to.global.u64 %rd8, %rd465; | |
cvta.to.global.u64 %rd9, %rd463; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r2, %ctaid.x; | |
shl.b32 %r3, %r1, 1; | |
shl.b32 %r4, %r2, 10; | |
or.b32 %r48, %r4, %r3; | |
shr.u32 %r49, %r48, 2; | |
and.b32 %r5, %r1, 1; | |
setp.eq.s32 %p1, %r5, 0; | |
ld.global.nc.u64 %rd11, [%rd7]; | |
cvt.u64.u32 %rd473, %r49; | |
add.s64 %rd12, %rd11, %rd473; | |
setp.lt.u64 %p69, %rd12, %rd11; | |
and.b64 %rd2384, %rd12, 4294967295; | |
@%p1 bra LBB79_1; | |
bra.uni LBB79_4; | |
LBB79_1: | |
mul.lo.s64 %rd2446, %rd2384, 3528531795; | |
ld.global.nc.u64 %rd2461, [%rd7+8]; | |
selp.u64 %rd516, 1, 0, %p69; | |
add.s64 %rd517, %rd2461, %rd516; | |
xor.b64 %rd518, %rd517, %rd2446; | |
shr.u64 %rd519, %rd518, 32; | |
mul.lo.s64 %rd2449, %rd519, 3449720151; | |
shr.u64 %rd520, %rd2449, 32; | |
and.b64 %rd521, %rd517, 4294967295; | |
mul.lo.s64 %rd522, %rd521, 3449720151; | |
and.b64 %rd523, %rd522, 4294967295; | |
xor.b64 %rd524, %rd523, %rd520; | |
xor.b64 %rd525, %rd524, 2654435769; | |
mul.lo.s64 %rd2452, %rd525, 3528531795; | |
xor.b64 %rd2442, %rd522, %rd12; | |
mov.u32 %r312, -1879881855; | |
mov.u32 %r311, -845247145; | |
mov.u32 %r310, 534103459; | |
mov.u64 %rd2460, 3678237736; | |
mov.u64 %rd2459, 3041712726; | |
mov.u64 %rd2458, 1401181199; | |
mov.u64 %rd2457, 2835769497; | |
mov.u64 %rd2456, 1684936478; | |
mov.u64 %rd2455, 2027808484; | |
mov.u64 %rd2454, 387276957; | |
mov.u64 %rd2453, 842468239; | |
mov.u64 %rd2451, 3986602516; | |
mov.u64 %rd2450, 1013904242; | |
mov.u64 %rd2448, 3668340011; | |
mov.u64 %rd2447, 3144134277; | |
mov.u64 %rd2445, 3449720151; | |
mov.u64 %rd2444, 1993301258; | |
mov.u64 %rd2443, 3528531795; | |
bra.uni LBB79_5; | |
LBB79_4: | |
mov.u32 %r311, -766435501; | |
mov.u64 %rd2459, 1684936478; | |
mov.u64 %rd2458, 534103459; | |
mov.u64 %rd2457, 387276957; | |
mov.u64 %rd2456, 3041712726; | |
mov.u64 %rd2455, 3986602516; | |
mov.u64 %rd2454, 2835769497; | |
mov.u64 %rd2453, 3668340011; | |
mov.u64 %rd2451, 2027808484; | |
mov.u64 %rd2450, 1993301258; | |
mov.u64 %rd2448, 842468239; | |
mov.u64 %rd2447, 2654435769; | |
mov.u64 %rd2445, 3528531795; | |
mov.u64 %rd2444, 1013904242; | |
mov.u64 %rd2443, 3449720151; | |
mov.u32 %r312, -1767562579; | |
mov.u32 %r310, 1401181199; | |
mov.u64 %rd2460, 4055616968; | |
ld.global.nc.u64 %rd2461, [%rd7+8]; | |
selp.u64 %rd489, 1, 0, %p69; | |
add.s64 %rd490, %rd2461, %rd489; | |
and.b64 %rd491, %rd490, 4294967295; | |
mul.lo.s64 %rd2446, %rd491, 3449720151; | |
xor.b64 %rd492, %rd2446, %rd12; | |
shr.u64 %rd493, %rd492, 32; | |
mul.lo.s64 %rd2449, %rd493, 3528531795; | |
shr.u64 %rd494, %rd2449, 32; | |
mul.lo.s64 %rd496, %rd2384, 3528531795; | |
and.b64 %rd497, %rd496, 4294967295; | |
xor.b64 %rd498, %rd497, %rd494; | |
xor.b64 %rd499, %rd498, 3144134277; | |
mul.lo.s64 %rd2452, %rd499, 3449720151; | |
xor.b64 %rd2442, %rd490, %rd496; | |
LBB79_5: | |
shr.u64 %rd526, %rd2452, 32; | |
shr.u64 %rd527, %rd2442, 32; | |
mul.lo.s64 %rd528, %rd527, %rd2443; | |
and.b64 %rd529, %rd528, 4294967295; | |
xor.b64 %rd530, %rd529, %rd526; | |
xor.b64 %rd531, %rd530, %rd2444; | |
mul.lo.s64 %rd532, %rd531, %rd2445; | |
shr.u64 %rd533, %rd532, 32; | |
shr.u64 %rd534, %rd528, 32; | |
and.b64 %rd535, %rd2446, 4294967295; | |
xor.b64 %rd536, %rd535, %rd534; | |
xor.b64 %rd537, %rd536, %rd2447; | |
mul.lo.s64 %rd538, %rd537, %rd2445; | |
and.b64 %rd539, %rd538, 4294967295; | |
xor.b64 %rd540, %rd539, %rd533; | |
xor.b64 %rd541, %rd540, %rd2448; | |
mul.lo.s64 %rd542, %rd541, %rd2443; | |
shr.u64 %rd543, %rd542, 32; | |
shr.u64 %rd544, %rd538, 32; | |
and.b64 %rd545, %rd2449, 4294967295; | |
xor.b64 %rd546, %rd545, %rd544; | |
xor.b64 %rd547, %rd546, %rd2450; | |
mul.lo.s64 %rd548, %rd547, %rd2443; | |
and.b64 %rd549, %rd548, 4294967295; | |
xor.b64 %rd550, %rd549, %rd543; | |
xor.b64 %rd551, %rd550, %rd2451; | |
mul.lo.s64 %rd552, %rd551, %rd2445; | |
shr.u64 %rd553, %rd552, 32; | |
shr.u64 %rd554, %rd548, 32; | |
and.b64 %rd555, %rd2452, 4294967295; | |
xor.b64 %rd556, %rd555, %rd554; | |
xor.b64 %rd557, %rd556, %rd2453; | |
mul.lo.s64 %rd558, %rd557, %rd2445; | |
and.b64 %rd559, %rd558, 4294967295; | |
xor.b64 %rd560, %rd559, %rd553; | |
xor.b64 %rd561, %rd560, %rd2454; | |
mul.lo.s64 %rd562, %rd561, %rd2443; | |
shr.u64 %rd563, %rd562, 32; | |
shr.u64 %rd564, %rd558, 32; | |
and.b64 %rd565, %rd532, 4294967295; | |
xor.b64 %rd566, %rd565, %rd564; | |
xor.b64 %rd567, %rd566, %rd2455; | |
mul.lo.s64 %rd568, %rd567, %rd2443; | |
and.b64 %rd569, %rd568, 4294967295; | |
xor.b64 %rd570, %rd569, %rd563; | |
xor.b64 %rd571, %rd570, %rd2456; | |
mul.lo.s64 %rd572, %rd571, %rd2445; | |
shr.u64 %rd573, %rd572, 32; | |
shr.u64 %rd574, %rd568, 32; | |
and.b64 %rd575, %rd542, 4294967295; | |
xor.b64 %rd576, %rd575, %rd574; | |
xor.b64 %rd577, %rd576, %rd2457; | |
mul.lo.s64 %rd578, %rd577, %rd2445; | |
and.b64 %rd579, %rd578, 4294967295; | |
xor.b64 %rd580, %rd579, %rd573; | |
xor.b64 %rd581, %rd580, %rd2458; | |
mul.lo.s64 %rd582, %rd581, %rd2443; | |
shr.u64 %rd583, %rd582, 32; | |
shr.u64 %rd584, %rd578, 32; | |
and.b64 %rd585, %rd552, 4294967295; | |
xor.b64 %rd586, %rd585, %rd584; | |
xor.b64 %rd587, %rd586, %rd2459; | |
mul.lo.s64 %rd588, %rd587, %rd2443; | |
and.b64 %rd589, %rd588, 4294967295; | |
xor.b64 %rd590, %rd589, %rd583; | |
xor.b64 %rd591, %rd590, %rd2460; | |
mul.lo.s64 %rd592, %rd591, %rd2445; | |
shr.u64 %rd593, %rd592, 32; | |
cvt.u32.u64 %r56, %rd593; | |
shr.u64 %rd594, %rd588, 32; | |
xor.b64 %rd595, %rd594, %rd562; | |
cvt.u32.u64 %r57, %rd595; | |
xor.b32 %r58, %r310, %r57; | |
mul.lo.s32 %r59, %r58, %r311; | |
xor.b32 %r60, %r59, %r56; | |
xor.b32 %r61, %r60, %r312; | |
shr.u32 %r62, %r61, 9; | |
cvt.rn.f32.u32 %f19, %r62; | |
mul.rn.f32 %f20, %f19, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f20; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p4, %h1, %h2; | |
mul.wide.u32 %rd596, %r2, 2048; | |
add.s64 %rd597, %rd9, %rd596; | |
mul.wide.u32 %rd598, %r3, 2; | |
add.s64 %rd44, %rd597, %rd598; | |
ld.global.nc.b16 %h3, [%rd44]; | |
mul.wide.u32 %rd599, %r3, 4; | |
add.s64 %rd45, %rd1, %rd599; | |
ld.global.nc.f32 %f21, [%rd45]; | |
cvt.rn.f16.f32 %h4, %f21; | |
add.rn.f16 %h5, %h3, %h4; | |
mov.b16 %h6, 0x3C72; | |
mul.rn.f16 %h7, %h5, %h6; | |
selp.b16 %h8, %h7, 0x0000, %p4; | |
cvt.f32.f16 %f22, %h8; | |
add.s64 %rd600, %rd8, %rd596; | |
add.s64 %rd46, %rd600, %rd598; | |
ld.global.nc.b16 %h9, [%rd46]; | |
cvt.f32.f16 %f23, %h9; | |
mul.wide.u32 %rd601, %r2, 4; | |
add.s64 %rd602, %rd5, %rd601; | |
ld.global.nc.f32 %f24, [%rd602]; | |
mul.rn.f32 %f25, %f24, 0f3A800000; | |
add.rn.f32 %f26, %f25, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f1, %f26; | |
add.s64 %rd47, %rd2, %rd599; | |
ld.global.nc.f32 %f27, [%rd47]; | |
mul.rn.f32 %f28, %f1, %f27; | |
mul.rn.f32 %f29, %f28, %f23; | |
add.s64 %rd48, %rd3, %rd599; | |
ld.global.nc.f32 %f30, [%rd48]; | |
add.s64 %rd603, %rd4, %rd601; | |
ld.global.nc.f32 %f31, [%rd603]; | |
mul.rn.f32 %f2, %f31, 0f3A800000; | |
mul.rn.f32 %f32, %f28, %f2; | |
sub.rn.f32 %f33, %f30, %f32; | |
add.rn.f32 %f34, %f29, %f33; | |
add.rn.f32 %f35, %f34, %f22; | |
add.rn.f32 %f3, %f35, 0f00000000; | |
or.b32 %r63, %r3, 1; | |
and.b32 %r64, %r63, 3; | |
setp.ne.s32 %p5, %r64, 1; | |
@%p5 bra LBB79_7; | |
mul.lo.s64 %rd2466, %rd2384, 3528531795; | |
selp.u64 %rd644, 1, 0, %p69; | |
add.s64 %rd645, %rd2461, %rd644; | |
xor.b64 %rd646, %rd645, %rd2466; | |
shr.u64 %rd647, %rd646, 32; | |
mul.lo.s64 %rd2469, %rd647, 3449720151; | |
shr.u64 %rd648, %rd2469, 32; | |
and.b64 %rd649, %rd645, 4294967295; | |
mul.lo.s64 %rd650, %rd649, 3449720151; | |
and.b64 %rd651, %rd650, 4294967295; | |
xor.b64 %rd652, %rd651, %rd648; | |
xor.b64 %rd653, %rd652, 2654435769; | |
mul.lo.s64 %rd2472, %rd653, 3528531795; | |
xor.b64 %rd2462, %rd650, %rd12; | |
mov.u32 %r314, -845247145; | |
mov.u32 %r313, -616729560; | |
mov.u64 %rd2479, 3041712726; | |
mov.u64 %rd2478, 1401181199; | |
mov.u64 %rd2477, 2835769497; | |
mov.u64 %rd2476, 1684936478; | |
mov.u64 %rd2475, 2027808484; | |
mov.u64 %rd2474, 387276957; | |
mov.u64 %rd2473, 842468239; | |
mov.u64 %rd2471, 3986602516; | |
mov.u64 %rd2470, 1013904242; | |
mov.u64 %rd2468, 3668340011; | |
mov.u64 %rd2467, 3144134277; | |
mov.u64 %rd2465, 3449720151; | |
mov.u64 %rd2464, 1993301258; | |
mov.u64 %rd2463, 3528531795; | |
bra.uni LBB79_8; | |
LBB79_7: | |
mov.u32 %r313, -239350328; | |
selp.u64 %rd618, 1, 0, %p69; | |
add.s64 %rd619, %rd2461, %rd618; | |
and.b64 %rd620, %rd619, 4294967295; | |
mul.lo.s64 %rd2466, %rd620, 3449720151; | |
xor.b64 %rd621, %rd2466, %rd12; | |
shr.u64 %rd622, %rd621, 32; | |
mul.lo.s64 %rd2469, %rd622, 3528531795; | |
shr.u64 %rd623, %rd2469, 32; | |
mul.lo.s64 %rd625, %rd2384, 3528531795; | |
and.b64 %rd626, %rd625, 4294967295; | |
xor.b64 %rd627, %rd626, %rd623; | |
xor.b64 %rd628, %rd627, 3144134277; | |
mul.lo.s64 %rd2472, %rd628, 3449720151; | |
xor.b64 %rd2462, %rd619, %rd625; | |
mov.u32 %r314, -766435501; | |
mov.u64 %rd2479, 1684936478; | |
mov.u64 %rd2478, 534103459; | |
mov.u64 %rd2477, 387276957; | |
mov.u64 %rd2476, 3041712726; | |
mov.u64 %rd2475, 3986602516; | |
mov.u64 %rd2474, 2835769497; | |
mov.u64 %rd2473, 3668340011; | |
mov.u64 %rd2471, 2027808484; | |
mov.u64 %rd2470, 1993301258; | |
mov.u64 %rd2468, 842468239; | |
mov.u64 %rd2467, 2654435769; | |
mov.u64 %rd2465, 3528531795; | |
mov.u64 %rd2464, 1013904242; | |
mov.u64 %rd2463, 3449720151; | |
LBB79_8: | |
setp.ne.s32 %p8, %r5, 0; | |
shr.u64 %rd654, %rd2472, 32; | |
shr.u64 %rd655, %rd2462, 32; | |
mul.lo.s64 %rd656, %rd655, %rd2463; | |
and.b64 %rd657, %rd656, 4294967295; | |
xor.b64 %rd658, %rd657, %rd654; | |
xor.b64 %rd659, %rd658, %rd2464; | |
mul.lo.s64 %rd660, %rd659, %rd2465; | |
shr.u64 %rd661, %rd660, 32; | |
shr.u64 %rd662, %rd656, 32; | |
and.b64 %rd663, %rd2466, 4294967295; | |
xor.b64 %rd664, %rd663, %rd662; | |
xor.b64 %rd665, %rd664, %rd2467; | |
mul.lo.s64 %rd666, %rd665, %rd2465; | |
and.b64 %rd667, %rd666, 4294967295; | |
xor.b64 %rd668, %rd667, %rd661; | |
xor.b64 %rd669, %rd668, %rd2468; | |
mul.lo.s64 %rd670, %rd669, %rd2463; | |
shr.u64 %rd671, %rd670, 32; | |
shr.u64 %rd672, %rd666, 32; | |
and.b64 %rd673, %rd2469, 4294967295; | |
xor.b64 %rd674, %rd673, %rd672; | |
xor.b64 %rd675, %rd674, %rd2470; | |
mul.lo.s64 %rd676, %rd675, %rd2463; | |
and.b64 %rd677, %rd676, 4294967295; | |
xor.b64 %rd678, %rd677, %rd671; | |
xor.b64 %rd679, %rd678, %rd2471; | |
mul.lo.s64 %rd680, %rd679, %rd2465; | |
shr.u64 %rd681, %rd680, 32; | |
shr.u64 %rd682, %rd676, 32; | |
and.b64 %rd683, %rd2472, 4294967295; | |
xor.b64 %rd684, %rd683, %rd682; | |
xor.b64 %rd685, %rd684, %rd2473; | |
mul.lo.s64 %rd686, %rd685, %rd2465; | |
and.b64 %rd687, %rd686, 4294967295; | |
xor.b64 %rd688, %rd687, %rd681; | |
xor.b64 %rd689, %rd688, %rd2474; | |
mul.lo.s64 %rd690, %rd689, %rd2463; | |
shr.u64 %rd691, %rd690, 32; | |
shr.u64 %rd692, %rd686, 32; | |
and.b64 %rd693, %rd660, 4294967295; | |
xor.b64 %rd694, %rd693, %rd692; | |
xor.b64 %rd695, %rd694, %rd2475; | |
mul.lo.s64 %rd696, %rd695, %rd2463; | |
and.b64 %rd697, %rd696, 4294967295; | |
xor.b64 %rd698, %rd697, %rd691; | |
xor.b64 %rd699, %rd698, %rd2476; | |
mul.lo.s64 %rd700, %rd699, %rd2465; | |
shr.u64 %rd701, %rd700, 32; | |
shr.u64 %rd702, %rd696, 32; | |
and.b64 %rd703, %rd670, 4294967295; | |
xor.b64 %rd704, %rd703, %rd702; | |
xor.b64 %rd705, %rd704, %rd2477; | |
mul.lo.s64 %rd706, %rd705, %rd2465; | |
and.b64 %rd707, %rd706, 4294967295; | |
xor.b64 %rd708, %rd707, %rd701; | |
xor.b64 %rd709, %rd708, %rd2478; | |
mul.lo.s64 %rd710, %rd709, %rd2463; | |
shr.u64 %rd711, %rd710, 32; | |
shr.u64 %rd712, %rd706, 32; | |
xor.b64 %rd713, %rd680, %rd712; | |
xor.b64 %rd714, %rd713, %rd2479; | |
mul.lo.s64 %rd715, %rd714, %rd2463; | |
xor.b64 %rd716, %rd711, %rd715; | |
cvt.u32.u64 %r69, %rd716; | |
xor.b32 %r70, %r313, %r69; | |
mul.lo.s32 %r71, %r70, %r314; | |
shr.u32 %r72, %r71, 9; | |
cvt.rn.f32.u32 %f36, %r72; | |
mul.rn.f32 %f37, %f36, 0f34000000; | |
cvt.rn.f16.f32 %h10, %f37; | |
mov.b16 %h11, 0x2E66; | |
setp.ge.f16 %p9, %h10, %h11; | |
ld.global.nc.b16 %h12, [%rd44+2]; | |
ld.global.nc.f32 %f38, [%rd45+4]; | |
cvt.rn.f16.f32 %h13, %f38; | |
add.rn.f16 %h14, %h12, %h13; | |
mov.b16 %h15, 0x3C72; | |
mul.rn.f16 %h16, %h14, %h15; | |
selp.b16 %h17, %h16, 0x0000, %p9; | |
cvt.f32.f16 %f39, %h17; | |
ld.global.nc.b16 %h18, [%rd46+2]; | |
cvt.f32.f16 %f40, %h18; | |
ld.global.nc.f32 %f41, [%rd47+4]; | |
mul.rn.f32 %f42, %f1, %f41; | |
mul.rn.f32 %f43, %f42, %f40; | |
ld.global.nc.f32 %f44, [%rd48+4]; | |
mul.rn.f32 %f45, %f2, %f42; | |
sub.rn.f32 %f46, %f44, %f45; | |
add.rn.f32 %f47, %f43, %f46; | |
add.rn.f32 %f48, %f47, %f39; | |
add.rn.f32 %f4, %f3, %f48; | |
or.b32 %r73, %r3, %r4; | |
or.b32 %r74, %r73, 128; | |
shr.u32 %r75, %r74, 2; | |
cvt.u64.u32 %rd717, %r75; | |
add.s64 %rd75, %rd11, %rd717; | |
and.b64 %rd2433, %rd75, 4294967295; | |
setp.lt.u64 %p74, %rd75, %rd11; | |
@%p8 bra LBB79_10; | |
mul.lo.s64 %rd2484, %rd2433, 3528531795; | |
selp.u64 %rd760, 1, 0, %p74; | |
add.s64 %rd761, %rd2461, %rd760; | |
xor.b64 %rd762, %rd761, %rd2484; | |
shr.u64 %rd763, %rd762, 32; | |
mul.lo.s64 %rd2487, %rd763, 3449720151; | |
shr.u64 %rd764, %rd2487, 32; | |
and.b64 %rd765, %rd761, 4294967295; | |
mul.lo.s64 %rd766, %rd765, 3449720151; | |
and.b64 %rd767, %rd766, 4294967295; | |
xor.b64 %rd768, %rd767, %rd764; | |
xor.b64 %rd769, %rd768, 2654435769; | |
mul.lo.s64 %rd2490, %rd769, 3528531795; | |
xor.b64 %rd2480, %rd766, %rd75; | |
mov.u32 %r317, -1879881855; | |
mov.u32 %r316, -845247145; | |
mov.u32 %r315, 534103459; | |
mov.u64 %rd2498, 3678237736; | |
mov.u64 %rd2497, 3041712726; | |
mov.u64 %rd2496, 1401181199; | |
mov.u64 %rd2495, 2835769497; | |
mov.u64 %rd2494, 1684936478; | |
mov.u64 %rd2493, 2027808484; | |
mov.u64 %rd2492, 387276957; | |
mov.u64 %rd2491, 842468239; | |
mov.u64 %rd2489, 3986602516; | |
mov.u64 %rd2488, 1013904242; | |
mov.u64 %rd2486, 3668340011; | |
mov.u64 %rd2485, 3144134277; | |
mov.u64 %rd2483, 3449720151; | |
mov.u64 %rd2482, 1993301258; | |
mov.u64 %rd2481, 3528531795; | |
bra.uni LBB79_11; | |
LBB79_10: | |
selp.u64 %rd733, 1, 0, %p74; | |
add.s64 %rd734, %rd2461, %rd733; | |
and.b64 %rd735, %rd734, 4294967295; | |
mul.lo.s64 %rd2484, %rd735, 3449720151; | |
xor.b64 %rd736, %rd2484, %rd75; | |
shr.u64 %rd737, %rd736, 32; | |
mul.lo.s64 %rd2487, %rd737, 3528531795; | |
shr.u64 %rd738, %rd2487, 32; | |
mul.lo.s64 %rd740, %rd2433, 3528531795; | |
and.b64 %rd741, %rd740, 4294967295; | |
xor.b64 %rd742, %rd741, %rd738; | |
xor.b64 %rd743, %rd742, 3144134277; | |
mul.lo.s64 %rd2490, %rd743, 3449720151; | |
xor.b64 %rd2480, %rd734, %rd740; | |
mov.u32 %r317, -1767562579; | |
mov.u32 %r316, -766435501; | |
mov.u32 %r315, 1401181199; | |
mov.u64 %rd2498, 4055616968; | |
mov.u64 %rd2497, 1684936478; | |
mov.u64 %rd2496, 534103459; | |
mov.u64 %rd2495, 387276957; | |
mov.u64 %rd2494, 3041712726; | |
mov.u64 %rd2493, 3986602516; | |
mov.u64 %rd2492, 2835769497; | |
mov.u64 %rd2491, 3668340011; | |
mov.u64 %rd2489, 2027808484; | |
mov.u64 %rd2488, 1993301258; | |
mov.u64 %rd2486, 842468239; | |
mov.u64 %rd2485, 2654435769; | |
mov.u64 %rd2483, 3528531795; | |
mov.u64 %rd2482, 1013904242; | |
mov.u64 %rd2481, 3449720151; | |
LBB79_11: | |
shr.u64 %rd770, %rd2490, 32; | |
shr.u64 %rd771, %rd2480, 32; | |
mul.lo.s64 %rd772, %rd771, %rd2481; | |
and.b64 %rd773, %rd772, 4294967295; | |
xor.b64 %rd774, %rd773, %rd770; | |
xor.b64 %rd775, %rd774, %rd2482; | |
mul.lo.s64 %rd776, %rd775, %rd2483; | |
shr.u64 %rd777, %rd776, 32; | |
shr.u64 %rd778, %rd772, 32; | |
and.b64 %rd779, %rd2484, 4294967295; | |
xor.b64 %rd780, %rd779, %rd778; | |
xor.b64 %rd781, %rd780, %rd2485; | |
mul.lo.s64 %rd782, %rd781, %rd2483; | |
and.b64 %rd783, %rd782, 4294967295; | |
xor.b64 %rd784, %rd783, %rd777; | |
xor.b64 %rd785, %rd784, %rd2486; | |
mul.lo.s64 %rd786, %rd785, %rd2481; | |
shr.u64 %rd787, %rd786, 32; | |
shr.u64 %rd788, %rd782, 32; | |
and.b64 %rd789, %rd2487, 4294967295; | |
xor.b64 %rd790, %rd789, %rd788; | |
xor.b64 %rd791, %rd790, %rd2488; | |
mul.lo.s64 %rd792, %rd791, %rd2481; | |
and.b64 %rd793, %rd792, 4294967295; | |
xor.b64 %rd794, %rd793, %rd787; | |
xor.b64 %rd795, %rd794, %rd2489; | |
mul.lo.s64 %rd796, %rd795, %rd2483; | |
shr.u64 %rd797, %rd796, 32; | |
shr.u64 %rd798, %rd792, 32; | |
and.b64 %rd799, %rd2490, 4294967295; | |
xor.b64 %rd800, %rd799, %rd798; | |
xor.b64 %rd801, %rd800, %rd2491; | |
mul.lo.s64 %rd802, %rd801, %rd2483; | |
and.b64 %rd803, %rd802, 4294967295; | |
xor.b64 %rd804, %rd803, %rd797; | |
xor.b64 %rd805, %rd804, %rd2492; | |
mul.lo.s64 %rd806, %rd805, %rd2481; | |
shr.u64 %rd807, %rd806, 32; | |
shr.u64 %rd808, %rd802, 32; | |
and.b64 %rd809, %rd776, 4294967295; | |
xor.b64 %rd810, %rd809, %rd808; | |
xor.b64 %rd811, %rd810, %rd2493; | |
mul.lo.s64 %rd812, %rd811, %rd2481; | |
and.b64 %rd813, %rd812, 4294967295; | |
xor.b64 %rd814, %rd813, %rd807; | |
xor.b64 %rd815, %rd814, %rd2494; | |
mul.lo.s64 %rd816, %rd815, %rd2483; | |
shr.u64 %rd817, %rd816, 32; | |
shr.u64 %rd818, %rd812, 32; | |
and.b64 %rd819, %rd786, 4294967295; | |
xor.b64 %rd820, %rd819, %rd818; | |
xor.b64 %rd821, %rd820, %rd2495; | |
mul.lo.s64 %rd822, %rd821, %rd2483; | |
and.b64 %rd823, %rd822, 4294967295; | |
xor.b64 %rd824, %rd823, %rd817; | |
xor.b64 %rd825, %rd824, %rd2496; | |
mul.lo.s64 %rd826, %rd825, %rd2481; | |
shr.u64 %rd827, %rd826, 32; | |
shr.u64 %rd828, %rd822, 32; | |
and.b64 %rd829, %rd796, 4294967295; | |
xor.b64 %rd830, %rd829, %rd828; | |
xor.b64 %rd831, %rd830, %rd2497; | |
mul.lo.s64 %rd832, %rd831, %rd2481; | |
and.b64 %rd833, %rd832, 4294967295; | |
xor.b64 %rd834, %rd833, %rd827; | |
xor.b64 %rd835, %rd834, %rd2498; | |
mul.lo.s64 %rd836, %rd835, %rd2483; | |
shr.u64 %rd837, %rd836, 32; | |
cvt.u32.u64 %r82, %rd837; | |
shr.u64 %rd838, %rd832, 32; | |
xor.b64 %rd839, %rd838, %rd806; | |
cvt.u32.u64 %r83, %rd839; | |
xor.b32 %r84, %r315, %r83; | |
mul.lo.s32 %r85, %r84, %r316; | |
xor.b32 %r86, %r85, %r82; | |
xor.b32 %r87, %r86, %r317; | |
shr.u32 %r88, %r87, 9; | |
cvt.rn.f32.u32 %f49, %r88; | |
mul.rn.f32 %f50, %f49, 0f34000000; | |
cvt.rn.f16.f32 %h19, %f50; | |
mov.b16 %h20, 0x2E66; | |
setp.ge.f16 %p12, %h19, %h20; | |
ld.global.nc.b16 %h21, [%rd44+256]; | |
ld.global.nc.f32 %f51, [%rd45+512]; | |
cvt.rn.f16.f32 %h22, %f51; | |
add.rn.f16 %h23, %h21, %h22; | |
mov.b16 %h24, 0x3C72; | |
mul.rn.f16 %h25, %h23, %h24; | |
selp.b16 %h26, %h25, 0x0000, %p12; | |
cvt.f32.f16 %f52, %h26; | |
ld.global.nc.b16 %h27, [%rd46+256]; | |
cvt.f32.f16 %f53, %h27; | |
ld.global.nc.f32 %f54, [%rd47+512]; | |
mul.rn.f32 %f55, %f1, %f54; | |
mul.rn.f32 %f56, %f55, %f53; | |
ld.global.nc.f32 %f57, [%rd48+512]; | |
mul.rn.f32 %f58, %f2, %f55; | |
sub.rn.f32 %f59, %f57, %f58; | |
add.rn.f32 %f60, %f56, %f59; | |
add.rn.f32 %f61, %f60, %f52; | |
add.rn.f32 %f5, %f4, %f61; | |
or.b32 %r89, %r3, 129; | |
or.b32 %r90, %r89, %r4; | |
and.b32 %r91, %r89, 3; | |
shr.u32 %r92, %r90, 2; | |
setp.ne.s32 %p13, %r91, 1; | |
cvt.u64.u32 %rd840, %r92; | |
add.s64 %rd103, %rd11, %rd840; | |
and.b64 %rd2430, %rd103, 4294967295; | |
setp.lt.u64 %p73, %rd103, %rd11; | |
@%p13 bra LBB79_13; | |
mul.lo.s64 %rd2503, %rd2430, 3528531795; | |
selp.u64 %rd881, 1, 0, %p73; | |
add.s64 %rd882, %rd2461, %rd881; | |
xor.b64 %rd883, %rd882, %rd2503; | |
shr.u64 %rd884, %rd883, 32; | |
mul.lo.s64 %rd2506, %rd884, 3449720151; | |
shr.u64 %rd885, %rd2506, 32; | |
and.b64 %rd886, %rd882, 4294967295; | |
mul.lo.s64 %rd887, %rd886, 3449720151; | |
and.b64 %rd888, %rd887, 4294967295; | |
xor.b64 %rd889, %rd888, %rd885; | |
xor.b64 %rd890, %rd889, 2654435769; | |
mul.lo.s64 %rd2509, %rd890, 3528531795; | |
xor.b64 %rd2499, %rd887, %rd103; | |
mov.u32 %r319, -845247145; | |
mov.u32 %r318, -616729560; | |
mov.u64 %rd2516, 3041712726; | |
mov.u64 %rd2515, 1401181199; | |
mov.u64 %rd2514, 2835769497; | |
mov.u64 %rd2513, 1684936478; | |
mov.u64 %rd2512, 2027808484; | |
mov.u64 %rd2511, 387276957; | |
mov.u64 %rd2510, 842468239; | |
mov.u64 %rd2508, 3986602516; | |
mov.u64 %rd2507, 1013904242; | |
mov.u64 %rd2505, 3668340011; | |
mov.u64 %rd2504, 3144134277; | |
mov.u64 %rd2502, 3449720151; | |
mov.u64 %rd2501, 1993301258; | |
mov.u64 %rd2500, 3528531795; | |
bra.uni LBB79_14; | |
LBB79_13: | |
selp.u64 %rd855, 1, 0, %p73; | |
add.s64 %rd856, %rd2461, %rd855; | |
and.b64 %rd857, %rd856, 4294967295; | |
mul.lo.s64 %rd2503, %rd857, 3449720151; | |
xor.b64 %rd858, %rd2503, %rd103; | |
shr.u64 %rd859, %rd858, 32; | |
mul.lo.s64 %rd2506, %rd859, 3528531795; | |
shr.u64 %rd860, %rd2506, 32; | |
mul.lo.s64 %rd862, %rd2430, 3528531795; | |
and.b64 %rd863, %rd862, 4294967295; | |
xor.b64 %rd864, %rd863, %rd860; | |
xor.b64 %rd865, %rd864, 3144134277; | |
mul.lo.s64 %rd2509, %rd865, 3449720151; | |
xor.b64 %rd2499, %rd856, %rd862; | |
mov.u32 %r319, -766435501; | |
mov.u32 %r318, -239350328; | |
mov.u64 %rd2516, 1684936478; | |
mov.u64 %rd2515, 534103459; | |
mov.u64 %rd2514, 387276957; | |
mov.u64 %rd2513, 3041712726; | |
mov.u64 %rd2512, 3986602516; | |
mov.u64 %rd2511, 2835769497; | |
mov.u64 %rd2510, 3668340011; | |
mov.u64 %rd2508, 2027808484; | |
mov.u64 %rd2507, 1993301258; | |
mov.u64 %rd2505, 842468239; | |
mov.u64 %rd2504, 2654435769; | |
mov.u64 %rd2502, 3528531795; | |
mov.u64 %rd2501, 1013904242; | |
mov.u64 %rd2500, 3449720151; | |
LBB79_14: | |
shr.u64 %rd891, %rd2509, 32; | |
shr.u64 %rd892, %rd2499, 32; | |
mul.lo.s64 %rd893, %rd892, %rd2500; | |
and.b64 %rd894, %rd893, 4294967295; | |
xor.b64 %rd895, %rd894, %rd891; | |
xor.b64 %rd896, %rd895, %rd2501; | |
mul.lo.s64 %rd897, %rd896, %rd2502; | |
shr.u64 %rd898, %rd897, 32; | |
shr.u64 %rd899, %rd893, 32; | |
and.b64 %rd900, %rd2503, 4294967295; | |
xor.b64 %rd901, %rd900, %rd899; | |
xor.b64 %rd902, %rd901, %rd2504; | |
mul.lo.s64 %rd903, %rd902, %rd2502; | |
and.b64 %rd904, %rd903, 4294967295; | |
xor.b64 %rd905, %rd904, %rd898; | |
xor.b64 %rd906, %rd905, %rd2505; | |
mul.lo.s64 %rd907, %rd906, %rd2500; | |
shr.u64 %rd908, %rd907, 32; | |
shr.u64 %rd909, %rd903, 32; | |
and.b64 %rd910, %rd2506, 4294967295; | |
xor.b64 %rd911, %rd910, %rd909; | |
xor.b64 %rd912, %rd911, %rd2507; | |
mul.lo.s64 %rd913, %rd912, %rd2500; | |
and.b64 %rd914, %rd913, 4294967295; | |
xor.b64 %rd915, %rd914, %rd908; | |
xor.b64 %rd916, %rd915, %rd2508; | |
mul.lo.s64 %rd917, %rd916, %rd2502; | |
shr.u64 %rd918, %rd917, 32; | |
shr.u64 %rd919, %rd913, 32; | |
and.b64 %rd920, %rd2509, 4294967295; | |
xor.b64 %rd921, %rd920, %rd919; | |
xor.b64 %rd922, %rd921, %rd2510; | |
mul.lo.s64 %rd923, %rd922, %rd2502; | |
and.b64 %rd924, %rd923, 4294967295; | |
xor.b64 %rd925, %rd924, %rd918; | |
xor.b64 %rd926, %rd925, %rd2511; | |
mul.lo.s64 %rd927, %rd926, %rd2500; | |
shr.u64 %rd928, %rd927, 32; | |
shr.u64 %rd929, %rd923, 32; | |
and.b64 %rd930, %rd897, 4294967295; | |
xor.b64 %rd931, %rd930, %rd929; | |
xor.b64 %rd932, %rd931, %rd2512; | |
mul.lo.s64 %rd933, %rd932, %rd2500; | |
and.b64 %rd934, %rd933, 4294967295; | |
xor.b64 %rd935, %rd934, %rd928; | |
xor.b64 %rd936, %rd935, %rd2513; | |
mul.lo.s64 %rd937, %rd936, %rd2502; | |
shr.u64 %rd938, %rd937, 32; | |
shr.u64 %rd939, %rd933, 32; | |
and.b64 %rd940, %rd907, 4294967295; | |
xor.b64 %rd941, %rd940, %rd939; | |
xor.b64 %rd942, %rd941, %rd2514; | |
mul.lo.s64 %rd943, %rd942, %rd2502; | |
and.b64 %rd944, %rd943, 4294967295; | |
xor.b64 %rd945, %rd944, %rd938; | |
xor.b64 %rd946, %rd945, %rd2515; | |
mul.lo.s64 %rd947, %rd946, %rd2500; | |
shr.u64 %rd948, %rd947, 32; | |
shr.u64 %rd949, %rd943, 32; | |
xor.b64 %rd950, %rd917, %rd949; | |
xor.b64 %rd951, %rd950, %rd2516; | |
mul.lo.s64 %rd952, %rd951, %rd2500; | |
xor.b64 %rd953, %rd948, %rd952; | |
cvt.u32.u64 %r97, %rd953; | |
xor.b32 %r98, %r318, %r97; | |
mul.lo.s32 %r99, %r98, %r319; | |
shr.u32 %r100, %r99, 9; | |
cvt.rn.f32.u32 %f62, %r100; | |
mul.rn.f32 %f63, %f62, 0f34000000; | |
cvt.rn.f16.f32 %h28, %f63; | |
mov.b16 %h29, 0x2E66; | |
setp.ge.f16 %p17, %h28, %h29; | |
ld.global.nc.b16 %h30, [%rd44+258]; | |
ld.global.nc.f32 %f64, [%rd45+516]; | |
cvt.rn.f16.f32 %h31, %f64; | |
add.rn.f16 %h32, %h30, %h31; | |
mov.b16 %h33, 0x3C72; | |
mul.rn.f16 %h34, %h32, %h33; | |
selp.b16 %h35, %h34, 0x0000, %p17; | |
cvt.f32.f16 %f65, %h35; | |
ld.global.nc.b16 %h36, [%rd46+258]; | |
cvt.f32.f16 %f66, %h36; | |
ld.global.nc.f32 %f67, [%rd47+516]; | |
mul.rn.f32 %f68, %f1, %f67; | |
mul.rn.f32 %f69, %f68, %f66; | |
ld.global.nc.f32 %f70, [%rd48+516]; | |
mul.rn.f32 %f71, %f2, %f68; | |
sub.rn.f32 %f72, %f70, %f71; | |
add.rn.f32 %f73, %f69, %f72; | |
add.rn.f32 %f74, %f73, %f65; | |
add.rn.f32 %f6, %f5, %f74; | |
or.b32 %r102, %r73, 256; | |
shr.u32 %r103, %r102, 2; | |
cvt.u64.u32 %rd954, %r103; | |
add.s64 %rd130, %rd11, %rd954; | |
and.b64 %rd2426, %rd130, 4294967295; | |
setp.lt.u64 %p72, %rd130, %rd11; | |
@%p8 bra LBB79_16; | |
// Two alternative arms selected by the preceding `@%p8 bra LBB79_16`; both continue at LBB79_17.
// Inputs: %rd130 (= %rd11 + offset), %rd2426 (low 32 bits of %rd130), %p72 (%rd130 < %rd11,
// i.e. the add wrapped) and %rd2461 (defined outside this excerpt).
// Both arms write the same registers: the computed values %rd2517, %rd2521, %rd2524, %rd2527
// and the constants %rd2518-%rd2535 / %r320-%r322. The arms differ in which multiplier is
// applied to which input and in which constant lands in which register.
// NOTE(review): 3528531795 (0xD2511F53), 3449720151 (0xCD9E8D57), 2654435769 (0x9E3779B9) and
// 3144134277 (0xBB67AE85) match the published Philox-4x32 multipliers / key increments, so this
// is presumably an unrolled Philox RNG - confirm against the generating HLO.
//
// Fall-through arm (%p8 false):
//   %rd2521 = %rd2426 * 3528531795
//   %rd998  = %rd2461 + (%p72 ? 1 : 0)
//   %rd2524 = ((%rd998 ^ %rd2521) >> 32) * 3449720151
//   %rd2527 = (lo32(lo32(%rd998) * 3449720151) ^ (%rd2524 >> 32) ^ 2654435769) * 3528531795
//   %rd2517 = (lo32(%rd998) * 3449720151) ^ %rd130
mul.lo.s64 %rd2521, %rd2426, 3528531795; | |
selp.u64 %rd997, 1, 0, %p72; | |
add.s64 %rd998, %rd2461, %rd997; | |
xor.b64 %rd999, %rd998, %rd2521; | |
shr.u64 %rd1000, %rd999, 32; | |
mul.lo.s64 %rd2524, %rd1000, 3449720151; | |
shr.u64 %rd1001, %rd2524, 32; | |
and.b64 %rd1002, %rd998, 4294967295; | |
mul.lo.s64 %rd1003, %rd1002, 3449720151; | |
and.b64 %rd1004, %rd1003, 4294967295; | |
xor.b64 %rd1005, %rd1004, %rd1001; | |
xor.b64 %rd1006, %rd1005, 2654435769; | |
mul.lo.s64 %rd2527, %rd1006, 3528531795; | |
xor.b64 %rd2517, %rd1003, %rd130; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r322, -1879881855; | |
mov.u32 %r321, -845247145; | |
mov.u32 %r320, 534103459; | |
mov.u64 %rd2535, 3678237736; | |
mov.u64 %rd2534, 3041712726; | |
mov.u64 %rd2533, 1401181199; | |
mov.u64 %rd2532, 2835769497; | |
mov.u64 %rd2531, 1684936478; | |
mov.u64 %rd2530, 2027808484; | |
mov.u64 %rd2529, 387276957; | |
mov.u64 %rd2528, 842468239; | |
mov.u64 %rd2526, 3986602516; | |
mov.u64 %rd2525, 1013904242; | |
mov.u64 %rd2523, 3668340011; | |
mov.u64 %rd2522, 3144134277; | |
mov.u64 %rd2520, 3449720151; | |
mov.u64 %rd2519, 1993301258; | |
mov.u64 %rd2518, 3528531795; | |
bra.uni LBB79_17; | |
// Branch-taken arm (%p8 true): same outputs, with the two multipliers exchanged:
//   %rd971  = %rd2461 + (%p72 ? 1 : 0)
//   %rd2521 = lo32(%rd971) * 3449720151
//   %rd2524 = ((%rd2521 ^ %rd130) >> 32) * 3528531795
//   %rd2527 = (lo32(%rd2426 * 3528531795) ^ (%rd2524 >> 32) ^ 3144134277) * 3449720151
//   %rd2517 = %rd971 ^ (%rd2426 * 3528531795)
LBB79_16: | |
selp.u64 %rd970, 1, 0, %p72; | |
add.s64 %rd971, %rd2461, %rd970; | |
and.b64 %rd972, %rd971, 4294967295; | |
mul.lo.s64 %rd2521, %rd972, 3449720151; | |
xor.b64 %rd973, %rd2521, %rd130; | |
shr.u64 %rd974, %rd973, 32; | |
mul.lo.s64 %rd2524, %rd974, 3528531795; | |
shr.u64 %rd975, %rd2524, 32; | |
mul.lo.s64 %rd977, %rd2426, 3528531795; | |
and.b64 %rd978, %rd977, 4294967295; | |
xor.b64 %rd979, %rd978, %rd975; | |
xor.b64 %rd980, %rd979, 3144134277; | |
mul.lo.s64 %rd2527, %rd980, 3449720151; | |
xor.b64 %rd2517, %rd971, %rd977; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r322, -1767562579; | |
mov.u32 %r321, -766435501; | |
mov.u32 %r320, 1401181199; | |
mov.u64 %rd2535, 4055616968; | |
mov.u64 %rd2534, 1684936478; | |
mov.u64 %rd2533, 534103459; | |
mov.u64 %rd2532, 387276957; | |
mov.u64 %rd2531, 3041712726; | |
mov.u64 %rd2530, 3986602516; | |
mov.u64 %rd2529, 2835769497; | |
mov.u64 %rd2528, 3668340011; | |
mov.u64 %rd2526, 2027808484; | |
mov.u64 %rd2525, 1993301258; | |
mov.u64 %rd2523, 842468239; | |
mov.u64 %rd2522, 2654435769; | |
mov.u64 %rd2520, 3528531795; | |
mov.u64 %rd2519, 1013904242; | |
mov.u64 %rd2518, 3449720151; | |
// LBB79_17: join of the two arms selected by %p8. Consumes %rd2517-%rd2535 and %r320-%r322
// (whichever arm ran supplied them), derives a value in [0, 1), uses it to gate one
// term, adds that term to the running f32 sum (%f6 -> %f7), then prepares and
// branches for the next step.
LBB79_17: | |
// Chain of 64-bit multiply / shift / xor steps. Every multiply uses %rd2518 or %rd2520;
// each step xors in one of the constants %rd2519-%rd2535.
shr.u64 %rd1007, %rd2527, 32; | |
shr.u64 %rd1008, %rd2517, 32; | |
mul.lo.s64 %rd1009, %rd1008, %rd2518; | |
and.b64 %rd1010, %rd1009, 4294967295; | |
xor.b64 %rd1011, %rd1010, %rd1007; | |
xor.b64 %rd1012, %rd1011, %rd2519; | |
mul.lo.s64 %rd1013, %rd1012, %rd2520; | |
shr.u64 %rd1014, %rd1013, 32; | |
shr.u64 %rd1015, %rd1009, 32; | |
and.b64 %rd1016, %rd2521, 4294967295; | |
xor.b64 %rd1017, %rd1016, %rd1015; | |
xor.b64 %rd1018, %rd1017, %rd2522; | |
mul.lo.s64 %rd1019, %rd1018, %rd2520; | |
and.b64 %rd1020, %rd1019, 4294967295; | |
xor.b64 %rd1021, %rd1020, %rd1014; | |
xor.b64 %rd1022, %rd1021, %rd2523; | |
mul.lo.s64 %rd1023, %rd1022, %rd2518; | |
shr.u64 %rd1024, %rd1023, 32; | |
shr.u64 %rd1025, %rd1019, 32; | |
and.b64 %rd1026, %rd2524, 4294967295; | |
xor.b64 %rd1027, %rd1026, %rd1025; | |
xor.b64 %rd1028, %rd1027, %rd2525; | |
mul.lo.s64 %rd1029, %rd1028, %rd2518; | |
and.b64 %rd1030, %rd1029, 4294967295; | |
xor.b64 %rd1031, %rd1030, %rd1024; | |
xor.b64 %rd1032, %rd1031, %rd2526; | |
mul.lo.s64 %rd1033, %rd1032, %rd2520; | |
shr.u64 %rd1034, %rd1033, 32; | |
shr.u64 %rd1035, %rd1029, 32; | |
and.b64 %rd1036, %rd2527, 4294967295; | |
xor.b64 %rd1037, %rd1036, %rd1035; | |
xor.b64 %rd1038, %rd1037, %rd2528; | |
mul.lo.s64 %rd1039, %rd1038, %rd2520; | |
and.b64 %rd1040, %rd1039, 4294967295; | |
xor.b64 %rd1041, %rd1040, %rd1034; | |
xor.b64 %rd1042, %rd1041, %rd2529; | |
mul.lo.s64 %rd1043, %rd1042, %rd2518; | |
shr.u64 %rd1044, %rd1043, 32; | |
shr.u64 %rd1045, %rd1039, 32; | |
and.b64 %rd1046, %rd1013, 4294967295; | |
xor.b64 %rd1047, %rd1046, %rd1045; | |
xor.b64 %rd1048, %rd1047, %rd2530; | |
mul.lo.s64 %rd1049, %rd1048, %rd2518; | |
and.b64 %rd1050, %rd1049, 4294967295; | |
xor.b64 %rd1051, %rd1050, %rd1044; | |
xor.b64 %rd1052, %rd1051, %rd2531; | |
mul.lo.s64 %rd1053, %rd1052, %rd2520; | |
shr.u64 %rd1054, %rd1053, 32; | |
shr.u64 %rd1055, %rd1049, 32; | |
and.b64 %rd1056, %rd1023, 4294967295; | |
xor.b64 %rd1057, %rd1056, %rd1055; | |
xor.b64 %rd1058, %rd1057, %rd2532; | |
mul.lo.s64 %rd1059, %rd1058, %rd2520; | |
and.b64 %rd1060, %rd1059, 4294967295; | |
xor.b64 %rd1061, %rd1060, %rd1054; | |
xor.b64 %rd1062, %rd1061, %rd2533; | |
mul.lo.s64 %rd1063, %rd1062, %rd2518; | |
shr.u64 %rd1064, %rd1063, 32; | |
shr.u64 %rd1065, %rd1059, 32; | |
and.b64 %rd1066, %rd1033, 4294967295; | |
xor.b64 %rd1067, %rd1066, %rd1065; | |
xor.b64 %rd1068, %rd1067, %rd2534; | |
mul.lo.s64 %rd1069, %rd1068, %rd2518; | |
and.b64 %rd1070, %rd1069, 4294967295; | |
xor.b64 %rd1071, %rd1070, %rd1064; | |
xor.b64 %rd1072, %rd1071, %rd2535; | |
mul.lo.s64 %rd1073, %rd1072, %rd2520; | |
// Fold to 32 bits:
//   %r115 = ((%r320 ^ lo32((%rd1069 >> 32) ^ %rd1043)) * %r321) ^ lo32(%rd1073 >> 32) ^ %r322
shr.u64 %rd1074, %rd1073, 32; | |
cvt.u32.u64 %r110, %rd1074; | |
shr.u64 %rd1075, %rd1069, 32; | |
xor.b64 %rd1076, %rd1075, %rd1043; | |
cvt.u32.u64 %r111, %rd1076; | |
xor.b32 %r112, %r320, %r111; | |
mul.lo.s32 %r113, %r112, %r321; | |
xor.b32 %r114, %r113, %r110; | |
xor.b32 %r115, %r114, %r322; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f76 lies in [0, 1).
shr.u32 %r116, %r115, 9; | |
cvt.rn.f32.u32 %f75, %r116; | |
mul.rn.f32 %f76, %f75, 0f34000000; | |
// Compare in f16 against 0x2E66 (~0.1): %p20 = (value >= threshold).
cvt.rn.f16.f32 %h37, %f76; | |
mov.b16 %h38, 0x2E66; | |
setp.ge.f16 %p20, %h37, %h38; | |
// %h44 = %p20 ? (%h39 + f16(%f77)) * 0x3C72 : 0, where 0x3C72 is ~1.111 in f16.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout(rate=0.1) - confirm.
ld.global.nc.b16 %h39, [%rd44+512]; | |
ld.global.nc.f32 %f77, [%rd45+1024]; | |
cvt.rn.f16.f32 %h40, %f77; | |
add.rn.f16 %h41, %h39, %h40; | |
mov.b16 %h42, 0x3C72; | |
mul.rn.f16 %h43, %h41, %h42; | |
selp.b16 %h44, %h43, 0x0000, %p20; | |
cvt.f32.f16 %f78, %h44; | |
// %f87 = (%f1*%f80)*%f79 + (%f83 - %f2*(%f1*%f80)) + %f78; accumulate: %f7 = %f6 + %f87.
ld.global.nc.b16 %h45, [%rd46+512]; | |
cvt.f32.f16 %f79, %h45; | |
ld.global.nc.f32 %f80, [%rd47+1024]; | |
mul.rn.f32 %f81, %f1, %f80; | |
mul.rn.f32 %f82, %f81, %f79; | |
ld.global.nc.f32 %f83, [%rd48+1024]; | |
mul.rn.f32 %f84, %f2, %f81; | |
sub.rn.f32 %f85, %f83, %f84; | |
add.rn.f32 %f86, %f82, %f85; | |
add.rn.f32 %f87, %f86, %f78; | |
add.rn.f32 %f7, %f6, %f87; | |
// Next step: %rd158 = %rd11 + ((%r3 | 257 | %r4) >> 2), %rd2423 = its low 32 bits,
// %p71 = (%rd158 < %rd11), i.e. the 64-bit add wrapped.
// Branch to LBB79_19 when ((%r3 | 257) & 3) != 1.
or.b32 %r117, %r3, 257; | |
or.b32 %r118, %r117, %r4; | |
and.b32 %r119, %r117, 3; | |
shr.u32 %r120, %r118, 2; | |
setp.ne.s32 %p21, %r119, 1; | |
cvt.u64.u32 %rd1077, %r120; | |
add.s64 %rd158, %rd11, %rd1077; | |
and.b64 %rd2423, %rd158, 4294967295; | |
setp.lt.u64 %p71, %rd158, %rd11; | |
@%p21 bra LBB79_19; | |
// Two alternative arms selected by the preceding `@%p21 bra LBB79_19`; both continue at LBB79_20.
// Inputs: %rd158 (= %rd11 + offset), %rd2423 (low 32 bits of %rd158), %p71 (%rd158 < %rd11,
// i.e. the add wrapped) and %rd2461 (defined outside this excerpt).
// Both arms write the same registers: the computed values %rd2536, %rd2540, %rd2543, %rd2546
// and the constants %rd2537-%rd2553 / %r323-%r324, with the two multipliers exchanged
// between the arms.
// NOTE(review): 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) / 2654435769 (0x9E3779B9) /
// 3144134277 (0xBB67AE85) match the published Philox-4x32 constants - presumably a Philox RNG; confirm.
//
// Fall-through arm (%p21 false):
//   %rd2540 = %rd2423 * 3528531795
//   %rd1119 = %rd2461 + (%p71 ? 1 : 0)
//   %rd2543 = ((%rd1119 ^ %rd2540) >> 32) * 3449720151
//   %rd2546 = (lo32(lo32(%rd1119) * 3449720151) ^ (%rd2543 >> 32) ^ 2654435769) * 3528531795
//   %rd2536 = (lo32(%rd1119) * 3449720151) ^ %rd158
mul.lo.s64 %rd2540, %rd2423, 3528531795; | |
selp.u64 %rd1118, 1, 0, %p71; | |
add.s64 %rd1119, %rd2461, %rd1118; | |
xor.b64 %rd1120, %rd1119, %rd2540; | |
shr.u64 %rd1121, %rd1120, 32; | |
mul.lo.s64 %rd2543, %rd1121, 3449720151; | |
shr.u64 %rd1122, %rd2543, 32; | |
and.b64 %rd1123, %rd1119, 4294967295; | |
mul.lo.s64 %rd1124, %rd1123, 3449720151; | |
and.b64 %rd1125, %rd1124, 4294967295; | |
xor.b64 %rd1126, %rd1125, %rd1122; | |
xor.b64 %rd1127, %rd1126, 2654435769; | |
mul.lo.s64 %rd2546, %rd1127, 3528531795; | |
xor.b64 %rd2536, %rd1124, %rd158; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r324, -845247145; | |
mov.u32 %r323, -616729560; | |
mov.u64 %rd2553, 3041712726; | |
mov.u64 %rd2552, 1401181199; | |
mov.u64 %rd2551, 2835769497; | |
mov.u64 %rd2550, 1684936478; | |
mov.u64 %rd2549, 2027808484; | |
mov.u64 %rd2548, 387276957; | |
mov.u64 %rd2547, 842468239; | |
mov.u64 %rd2545, 3986602516; | |
mov.u64 %rd2544, 1013904242; | |
mov.u64 %rd2542, 3668340011; | |
mov.u64 %rd2541, 3144134277; | |
mov.u64 %rd2539, 3449720151; | |
mov.u64 %rd2538, 1993301258; | |
mov.u64 %rd2537, 3528531795; | |
bra.uni LBB79_20; | |
// Branch-taken arm (%p21 true):
//   %rd1093 = %rd2461 + (%p71 ? 1 : 0)
//   %rd2540 = lo32(%rd1093) * 3449720151
//   %rd2543 = ((%rd2540 ^ %rd158) >> 32) * 3528531795
//   %rd2546 = (lo32(%rd2423 * 3528531795) ^ (%rd2543 >> 32) ^ 3144134277) * 3449720151
//   %rd2536 = %rd1093 ^ (%rd2423 * 3528531795)
LBB79_19: | |
selp.u64 %rd1092, 1, 0, %p71; | |
add.s64 %rd1093, %rd2461, %rd1092; | |
and.b64 %rd1094, %rd1093, 4294967295; | |
mul.lo.s64 %rd2540, %rd1094, 3449720151; | |
xor.b64 %rd1095, %rd2540, %rd158; | |
shr.u64 %rd1096, %rd1095, 32; | |
mul.lo.s64 %rd2543, %rd1096, 3528531795; | |
shr.u64 %rd1097, %rd2543, 32; | |
mul.lo.s64 %rd1099, %rd2423, 3528531795; | |
and.b64 %rd1100, %rd1099, 4294967295; | |
xor.b64 %rd1101, %rd1100, %rd1097; | |
xor.b64 %rd1102, %rd1101, 3144134277; | |
mul.lo.s64 %rd2546, %rd1102, 3449720151; | |
xor.b64 %rd2536, %rd1093, %rd1099; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r324, -766435501; | |
mov.u32 %r323, -239350328; | |
mov.u64 %rd2553, 1684936478; | |
mov.u64 %rd2552, 534103459; | |
mov.u64 %rd2551, 387276957; | |
mov.u64 %rd2550, 3041712726; | |
mov.u64 %rd2549, 3986602516; | |
mov.u64 %rd2548, 2835769497; | |
mov.u64 %rd2547, 3668340011; | |
mov.u64 %rd2545, 2027808484; | |
mov.u64 %rd2544, 1993301258; | |
mov.u64 %rd2542, 842468239; | |
mov.u64 %rd2541, 2654435769; | |
mov.u64 %rd2539, 3528531795; | |
mov.u64 %rd2538, 1013904242; | |
mov.u64 %rd2537, 3449720151; | |
// LBB79_20: join of the two arms selected by %p21. Consumes %rd2536-%rd2553 and %r323-%r324,
// derives a value in [0, 1), uses it to gate one term, adds that term to the running
// f32 sum (%f7 -> %f8), then prepares and branches for the next step.
LBB79_20: | |
// Chain of 64-bit multiply / shift / xor steps. Every multiply uses %rd2537 or %rd2539;
// each step xors in one of the constants %rd2538-%rd2553.
shr.u64 %rd1128, %rd2546, 32; | |
shr.u64 %rd1129, %rd2536, 32; | |
mul.lo.s64 %rd1130, %rd1129, %rd2537; | |
and.b64 %rd1131, %rd1130, 4294967295; | |
xor.b64 %rd1132, %rd1131, %rd1128; | |
xor.b64 %rd1133, %rd1132, %rd2538; | |
mul.lo.s64 %rd1134, %rd1133, %rd2539; | |
shr.u64 %rd1135, %rd1134, 32; | |
shr.u64 %rd1136, %rd1130, 32; | |
and.b64 %rd1137, %rd2540, 4294967295; | |
xor.b64 %rd1138, %rd1137, %rd1136; | |
xor.b64 %rd1139, %rd1138, %rd2541; | |
mul.lo.s64 %rd1140, %rd1139, %rd2539; | |
and.b64 %rd1141, %rd1140, 4294967295; | |
xor.b64 %rd1142, %rd1141, %rd1135; | |
xor.b64 %rd1143, %rd1142, %rd2542; | |
mul.lo.s64 %rd1144, %rd1143, %rd2537; | |
shr.u64 %rd1145, %rd1144, 32; | |
shr.u64 %rd1146, %rd1140, 32; | |
and.b64 %rd1147, %rd2543, 4294967295; | |
xor.b64 %rd1148, %rd1147, %rd1146; | |
xor.b64 %rd1149, %rd1148, %rd2544; | |
mul.lo.s64 %rd1150, %rd1149, %rd2537; | |
and.b64 %rd1151, %rd1150, 4294967295; | |
xor.b64 %rd1152, %rd1151, %rd1145; | |
xor.b64 %rd1153, %rd1152, %rd2545; | |
mul.lo.s64 %rd1154, %rd1153, %rd2539; | |
shr.u64 %rd1155, %rd1154, 32; | |
shr.u64 %rd1156, %rd1150, 32; | |
and.b64 %rd1157, %rd2546, 4294967295; | |
xor.b64 %rd1158, %rd1157, %rd1156; | |
xor.b64 %rd1159, %rd1158, %rd2547; | |
mul.lo.s64 %rd1160, %rd1159, %rd2539; | |
and.b64 %rd1161, %rd1160, 4294967295; | |
xor.b64 %rd1162, %rd1161, %rd1155; | |
xor.b64 %rd1163, %rd1162, %rd2548; | |
mul.lo.s64 %rd1164, %rd1163, %rd2537; | |
shr.u64 %rd1165, %rd1164, 32; | |
shr.u64 %rd1166, %rd1160, 32; | |
and.b64 %rd1167, %rd1134, 4294967295; | |
xor.b64 %rd1168, %rd1167, %rd1166; | |
xor.b64 %rd1169, %rd1168, %rd2549; | |
mul.lo.s64 %rd1170, %rd1169, %rd2537; | |
and.b64 %rd1171, %rd1170, 4294967295; | |
xor.b64 %rd1172, %rd1171, %rd1165; | |
xor.b64 %rd1173, %rd1172, %rd2550; | |
mul.lo.s64 %rd1174, %rd1173, %rd2539; | |
shr.u64 %rd1175, %rd1174, 32; | |
shr.u64 %rd1176, %rd1170, 32; | |
and.b64 %rd1177, %rd1144, 4294967295; | |
xor.b64 %rd1178, %rd1177, %rd1176; | |
xor.b64 %rd1179, %rd1178, %rd2551; | |
mul.lo.s64 %rd1180, %rd1179, %rd2539; | |
and.b64 %rd1181, %rd1180, 4294967295; | |
xor.b64 %rd1182, %rd1181, %rd1175; | |
xor.b64 %rd1183, %rd1182, %rd2552; | |
mul.lo.s64 %rd1184, %rd1183, %rd2537; | |
// Fold to 32 bits:
//   %r127 = (%r323 ^ lo32((%rd1184 >> 32) ^ ((%rd1154 ^ (%rd1180 >> 32) ^ %rd2553) * %rd2537))) * %r324
shr.u64 %rd1185, %rd1184, 32; | |
shr.u64 %rd1186, %rd1180, 32; | |
xor.b64 %rd1187, %rd1154, %rd1186; | |
xor.b64 %rd1188, %rd1187, %rd2553; | |
mul.lo.s64 %rd1189, %rd1188, %rd2537; | |
xor.b64 %rd1190, %rd1185, %rd1189; | |
cvt.u32.u64 %r125, %rd1190; | |
xor.b32 %r126, %r323, %r125; | |
mul.lo.s32 %r127, %r126, %r324; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f89 lies in [0, 1).
shr.u32 %r128, %r127, 9; | |
cvt.rn.f32.u32 %f88, %r128; | |
mul.rn.f32 %f89, %f88, 0f34000000; | |
// Compare in f16 against 0x2E66 (~0.1): %p25 = (value >= threshold).
cvt.rn.f16.f32 %h46, %f89; | |
mov.b16 %h47, 0x2E66; | |
setp.ge.f16 %p25, %h46, %h47; | |
// %h53 = %p25 ? (%h48 + f16(%f90)) * 0x3C72 : 0, where 0x3C72 is ~1.111 in f16.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout(rate=0.1) - confirm.
ld.global.nc.b16 %h48, [%rd44+514]; | |
ld.global.nc.f32 %f90, [%rd45+1028]; | |
cvt.rn.f16.f32 %h49, %f90; | |
add.rn.f16 %h50, %h48, %h49; | |
mov.b16 %h51, 0x3C72; | |
mul.rn.f16 %h52, %h50, %h51; | |
selp.b16 %h53, %h52, 0x0000, %p25; | |
cvt.f32.f16 %f91, %h53; | |
// %f100 = (%f1*%f93)*%f92 + (%f96 - %f2*(%f1*%f93)) + %f91; accumulate: %f8 = %f7 + %f100.
ld.global.nc.b16 %h54, [%rd46+514]; | |
cvt.f32.f16 %f92, %h54; | |
ld.global.nc.f32 %f93, [%rd47+1028]; | |
mul.rn.f32 %f94, %f1, %f93; | |
mul.rn.f32 %f95, %f94, %f92; | |
ld.global.nc.f32 %f96, [%rd48+1028]; | |
mul.rn.f32 %f97, %f2, %f94; | |
sub.rn.f32 %f98, %f96, %f97; | |
add.rn.f32 %f99, %f95, %f98; | |
add.rn.f32 %f100, %f99, %f91; | |
add.rn.f32 %f8, %f7, %f100; | |
// Next step: %rd185 = %rd11 + ((%r73 | 384) >> 2), %rd2419 = its low 32 bits,
// %p70 = (%rd185 < %rd11), i.e. the 64-bit add wrapped. Branch to LBB79_22 when %p8 is set.
or.b32 %r130, %r73, 384; | |
shr.u32 %r131, %r130, 2; | |
cvt.u64.u32 %rd1191, %r131; | |
add.s64 %rd185, %rd11, %rd1191; | |
and.b64 %rd2419, %rd185, 4294967295; | |
setp.lt.u64 %p70, %rd185, %rd11; | |
@%p8 bra LBB79_22; | |
// Two alternative arms selected by the preceding `@%p8 bra LBB79_22`; both continue at LBB79_23.
// Inputs: %rd185 (= %rd11 + offset), %rd2419 (low 32 bits of %rd185), %p70 (%rd185 < %rd11,
// i.e. the add wrapped) and %rd2461 (defined outside this excerpt).
// Both arms write the same registers: the computed values %rd2554, %rd2558, %rd2561, %rd2564
// and the constants %rd2555-%rd2572 / %r325-%r327, with the two multipliers exchanged
// between the arms.
// NOTE(review): 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) / 2654435769 (0x9E3779B9) /
// 3144134277 (0xBB67AE85) match the published Philox-4x32 constants - presumably a Philox RNG; confirm.
//
// Fall-through arm (%p8 false):
//   %rd2558 = %rd2419 * 3528531795
//   %rd1235 = %rd2461 + (%p70 ? 1 : 0)
//   %rd2561 = ((%rd1235 ^ %rd2558) >> 32) * 3449720151
//   %rd2564 = (lo32(lo32(%rd1235) * 3449720151) ^ (%rd2561 >> 32) ^ 2654435769) * 3528531795
//   %rd2554 = (lo32(%rd1235) * 3449720151) ^ %rd185
mul.lo.s64 %rd2558, %rd2419, 3528531795; | |
selp.u64 %rd1234, 1, 0, %p70; | |
add.s64 %rd1235, %rd2461, %rd1234; | |
xor.b64 %rd1236, %rd1235, %rd2558; | |
shr.u64 %rd1237, %rd1236, 32; | |
mul.lo.s64 %rd2561, %rd1237, 3449720151; | |
shr.u64 %rd1238, %rd2561, 32; | |
and.b64 %rd1239, %rd1235, 4294967295; | |
mul.lo.s64 %rd1240, %rd1239, 3449720151; | |
and.b64 %rd1241, %rd1240, 4294967295; | |
xor.b64 %rd1242, %rd1241, %rd1238; | |
xor.b64 %rd1243, %rd1242, 2654435769; | |
mul.lo.s64 %rd2564, %rd1243, 3528531795; | |
xor.b64 %rd2554, %rd1240, %rd185; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r327, -1879881855; | |
mov.u32 %r326, -845247145; | |
mov.u32 %r325, 534103459; | |
mov.u64 %rd2572, 3678237736; | |
mov.u64 %rd2571, 3041712726; | |
mov.u64 %rd2570, 1401181199; | |
mov.u64 %rd2569, 2835769497; | |
mov.u64 %rd2568, 1684936478; | |
mov.u64 %rd2567, 2027808484; | |
mov.u64 %rd2566, 387276957; | |
mov.u64 %rd2565, 842468239; | |
mov.u64 %rd2563, 3986602516; | |
mov.u64 %rd2562, 1013904242; | |
mov.u64 %rd2560, 3668340011; | |
mov.u64 %rd2559, 3144134277; | |
mov.u64 %rd2557, 3449720151; | |
mov.u64 %rd2556, 1993301258; | |
mov.u64 %rd2555, 3528531795; | |
bra.uni LBB79_23; | |
// Branch-taken arm (%p8 true):
//   %rd1208 = %rd2461 + (%p70 ? 1 : 0)
//   %rd2558 = lo32(%rd1208) * 3449720151
//   %rd2561 = ((%rd2558 ^ %rd185) >> 32) * 3528531795
//   %rd2564 = (lo32(%rd2419 * 3528531795) ^ (%rd2561 >> 32) ^ 3144134277) * 3449720151
//   %rd2554 = %rd1208 ^ (%rd2419 * 3528531795)
LBB79_22: | |
selp.u64 %rd1207, 1, 0, %p70; | |
add.s64 %rd1208, %rd2461, %rd1207; | |
and.b64 %rd1209, %rd1208, 4294967295; | |
mul.lo.s64 %rd2558, %rd1209, 3449720151; | |
xor.b64 %rd1210, %rd2558, %rd185; | |
shr.u64 %rd1211, %rd1210, 32; | |
mul.lo.s64 %rd2561, %rd1211, 3528531795; | |
shr.u64 %rd1212, %rd2561, 32; | |
mul.lo.s64 %rd1214, %rd2419, 3528531795; | |
and.b64 %rd1215, %rd1214, 4294967295; | |
xor.b64 %rd1216, %rd1215, %rd1212; | |
xor.b64 %rd1217, %rd1216, 3144134277; | |
mul.lo.s64 %rd2564, %rd1217, 3449720151; | |
xor.b64 %rd2554, %rd1208, %rd1214; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r327, -1767562579; | |
mov.u32 %r326, -766435501; | |
mov.u32 %r325, 1401181199; | |
mov.u64 %rd2572, 4055616968; | |
mov.u64 %rd2571, 1684936478; | |
mov.u64 %rd2570, 534103459; | |
mov.u64 %rd2569, 387276957; | |
mov.u64 %rd2568, 3041712726; | |
mov.u64 %rd2567, 3986602516; | |
mov.u64 %rd2566, 2835769497; | |
mov.u64 %rd2565, 3668340011; | |
mov.u64 %rd2563, 2027808484; | |
mov.u64 %rd2562, 1993301258; | |
mov.u64 %rd2560, 842468239; | |
mov.u64 %rd2559, 2654435769; | |
mov.u64 %rd2557, 3528531795; | |
mov.u64 %rd2556, 1013904242; | |
mov.u64 %rd2555, 3449720151; | |
// LBB79_23: join of the two arms selected by %p8. Consumes %rd2554-%rd2572 and %r325-%r327,
// derives a value in [0, 1), uses it to gate one term, adds that term to the running
// f32 sum (%f8 -> %f9), then prepares and branches for the next step.
LBB79_23: | |
// Chain of 64-bit multiply / shift / xor steps. Every multiply uses %rd2555 or %rd2557;
// each step xors in one of the constants %rd2556-%rd2572.
shr.u64 %rd1244, %rd2564, 32; | |
shr.u64 %rd1245, %rd2554, 32; | |
mul.lo.s64 %rd1246, %rd1245, %rd2555; | |
and.b64 %rd1247, %rd1246, 4294967295; | |
xor.b64 %rd1248, %rd1247, %rd1244; | |
xor.b64 %rd1249, %rd1248, %rd2556; | |
mul.lo.s64 %rd1250, %rd1249, %rd2557; | |
shr.u64 %rd1251, %rd1250, 32; | |
shr.u64 %rd1252, %rd1246, 32; | |
and.b64 %rd1253, %rd2558, 4294967295; | |
xor.b64 %rd1254, %rd1253, %rd1252; | |
xor.b64 %rd1255, %rd1254, %rd2559; | |
mul.lo.s64 %rd1256, %rd1255, %rd2557; | |
and.b64 %rd1257, %rd1256, 4294967295; | |
xor.b64 %rd1258, %rd1257, %rd1251; | |
xor.b64 %rd1259, %rd1258, %rd2560; | |
mul.lo.s64 %rd1260, %rd1259, %rd2555; | |
shr.u64 %rd1261, %rd1260, 32; | |
shr.u64 %rd1262, %rd1256, 32; | |
and.b64 %rd1263, %rd2561, 4294967295; | |
xor.b64 %rd1264, %rd1263, %rd1262; | |
xor.b64 %rd1265, %rd1264, %rd2562; | |
mul.lo.s64 %rd1266, %rd1265, %rd2555; | |
and.b64 %rd1267, %rd1266, 4294967295; | |
xor.b64 %rd1268, %rd1267, %rd1261; | |
xor.b64 %rd1269, %rd1268, %rd2563; | |
mul.lo.s64 %rd1270, %rd1269, %rd2557; | |
shr.u64 %rd1271, %rd1270, 32; | |
shr.u64 %rd1272, %rd1266, 32; | |
and.b64 %rd1273, %rd2564, 4294967295; | |
xor.b64 %rd1274, %rd1273, %rd1272; | |
xor.b64 %rd1275, %rd1274, %rd2565; | |
mul.lo.s64 %rd1276, %rd1275, %rd2557; | |
and.b64 %rd1277, %rd1276, 4294967295; | |
xor.b64 %rd1278, %rd1277, %rd1271; | |
xor.b64 %rd1279, %rd1278, %rd2566; | |
mul.lo.s64 %rd1280, %rd1279, %rd2555; | |
shr.u64 %rd1281, %rd1280, 32; | |
shr.u64 %rd1282, %rd1276, 32; | |
and.b64 %rd1283, %rd1250, 4294967295; | |
xor.b64 %rd1284, %rd1283, %rd1282; | |
xor.b64 %rd1285, %rd1284, %rd2567; | |
mul.lo.s64 %rd1286, %rd1285, %rd2555; | |
and.b64 %rd1287, %rd1286, 4294967295; | |
xor.b64 %rd1288, %rd1287, %rd1281; | |
xor.b64 %rd1289, %rd1288, %rd2568; | |
mul.lo.s64 %rd1290, %rd1289, %rd2557; | |
shr.u64 %rd1291, %rd1290, 32; | |
shr.u64 %rd1292, %rd1286, 32; | |
and.b64 %rd1293, %rd1260, 4294967295; | |
xor.b64 %rd1294, %rd1293, %rd1292; | |
xor.b64 %rd1295, %rd1294, %rd2569; | |
mul.lo.s64 %rd1296, %rd1295, %rd2557; | |
and.b64 %rd1297, %rd1296, 4294967295; | |
xor.b64 %rd1298, %rd1297, %rd1291; | |
xor.b64 %rd1299, %rd1298, %rd2570; | |
mul.lo.s64 %rd1300, %rd1299, %rd2555; | |
shr.u64 %rd1301, %rd1300, 32; | |
shr.u64 %rd1302, %rd1296, 32; | |
and.b64 %rd1303, %rd1270, 4294967295; | |
xor.b64 %rd1304, %rd1303, %rd1302; | |
xor.b64 %rd1305, %rd1304, %rd2571; | |
mul.lo.s64 %rd1306, %rd1305, %rd2555; | |
and.b64 %rd1307, %rd1306, 4294967295; | |
xor.b64 %rd1308, %rd1307, %rd1301; | |
xor.b64 %rd1309, %rd1308, %rd2572; | |
mul.lo.s64 %rd1310, %rd1309, %rd2557; | |
// Fold to 32 bits:
//   %r143 = ((%r325 ^ lo32((%rd1306 >> 32) ^ %rd1280)) * %r326) ^ lo32(%rd1310 >> 32) ^ %r327
shr.u64 %rd1311, %rd1310, 32; | |
cvt.u32.u64 %r138, %rd1311; | |
shr.u64 %rd1312, %rd1306, 32; | |
xor.b64 %rd1313, %rd1312, %rd1280; | |
cvt.u32.u64 %r139, %rd1313; | |
xor.b32 %r140, %r325, %r139; | |
mul.lo.s32 %r141, %r140, %r326; | |
xor.b32 %r142, %r141, %r138; | |
xor.b32 %r143, %r142, %r327; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f102 lies in [0, 1).
shr.u32 %r144, %r143, 9; | |
cvt.rn.f32.u32 %f101, %r144; | |
mul.rn.f32 %f102, %f101, 0f34000000; | |
// Compare in f16 against 0x2E66 (~0.1): %p28 = (value >= threshold).
cvt.rn.f16.f32 %h55, %f102; | |
mov.b16 %h56, 0x2E66; | |
setp.ge.f16 %p28, %h55, %h56; | |
// %h62 = %p28 ? (%h57 + f16(%f103)) * 0x3C72 : 0, where 0x3C72 is ~1.111 in f16.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout(rate=0.1) - confirm.
ld.global.nc.b16 %h57, [%rd44+768]; | |
ld.global.nc.f32 %f103, [%rd45+1536]; | |
cvt.rn.f16.f32 %h58, %f103; | |
add.rn.f16 %h59, %h57, %h58; | |
mov.b16 %h60, 0x3C72; | |
mul.rn.f16 %h61, %h59, %h60; | |
selp.b16 %h62, %h61, 0x0000, %p28; | |
cvt.f32.f16 %f104, %h62; | |
// %f113 = (%f1*%f106)*%f105 + (%f109 - %f2*(%f1*%f106)) + %f104; accumulate: %f9 = %f8 + %f113.
ld.global.nc.b16 %h63, [%rd46+768]; | |
cvt.f32.f16 %f105, %h63; | |
ld.global.nc.f32 %f106, [%rd47+1536]; | |
mul.rn.f32 %f107, %f1, %f106; | |
mul.rn.f32 %f108, %f107, %f105; | |
ld.global.nc.f32 %f109, [%rd48+1536]; | |
mul.rn.f32 %f110, %f2, %f107; | |
sub.rn.f32 %f111, %f109, %f110; | |
add.rn.f32 %f112, %f108, %f111; | |
add.rn.f32 %f113, %f112, %f104; | |
add.rn.f32 %f9, %f8, %f113; | |
// Next step: %rd213 = %rd11 + ((%r3 | 385 | %r4) >> 2).
// Branch to LBB79_25 when ((%r3 | 385) & 3) != 1.
or.b32 %r145, %r3, 385; | |
or.b32 %r146, %r145, %r4; | |
and.b32 %r147, %r145, 3; | |
shr.u32 %r148, %r146, 2; | |
setp.ne.s32 %p29, %r147, 1; | |
cvt.u64.u32 %rd1314, %r148; | |
add.s64 %rd213, %rd11, %rd1314; | |
@%p29 bra LBB79_25; | |
// Two alternative arms selected by the preceding `@%p29 bra LBB79_25`; both continue at LBB79_26.
// Inputs: %rd213 (= %rd11 + offset) and %rd2461 (defined outside this excerpt). Each arm
// recomputes the low 32 bits of %rd213 and the wrap predicate (%rd213 < %rd11) itself.
// Both arms write the same registers: the computed values %rd2573, %rd2577, %rd2580, %rd2583
// and the constants %rd2574-%rd2590 / %r328-%r329, with the two multipliers exchanged
// between the arms.
// NOTE(review): 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) / 2654435769 (0x9E3779B9) /
// 3144134277 (0xBB67AE85) match the published Philox-4x32 constants - presumably a Philox RNG; confirm.
//
// Fall-through arm (%p29 false):
//   %rd2577 = lo32(%rd213) * 3528531795
//   %rd1356 = %rd2461 + (%rd213 < %rd11 ? 1 : 0)
//   %rd2580 = ((%rd1356 ^ %rd2577) >> 32) * 3449720151
//   %rd2583 = (lo32(lo32(%rd1356) * 3449720151) ^ (%rd2580 >> 32) ^ 2654435769) * 3528531795
//   %rd2573 = (lo32(%rd1356) * 3449720151) ^ %rd213
and.b64 %rd1354, %rd213, 4294967295; | |
mul.lo.s64 %rd2577, %rd1354, 3528531795; | |
setp.lt.u64 %p31, %rd213, %rd11; | |
selp.u64 %rd1355, 1, 0, %p31; | |
add.s64 %rd1356, %rd2461, %rd1355; | |
xor.b64 %rd1357, %rd1356, %rd2577; | |
shr.u64 %rd1358, %rd1357, 32; | |
mul.lo.s64 %rd2580, %rd1358, 3449720151; | |
shr.u64 %rd1359, %rd2580, 32; | |
and.b64 %rd1360, %rd1356, 4294967295; | |
mul.lo.s64 %rd1361, %rd1360, 3449720151; | |
and.b64 %rd1362, %rd1361, 4294967295; | |
xor.b64 %rd1363, %rd1362, %rd1359; | |
xor.b64 %rd1364, %rd1363, 2654435769; | |
mul.lo.s64 %rd2583, %rd1364, 3528531795; | |
xor.b64 %rd2573, %rd1361, %rd213; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r329, -845247145; | |
mov.u32 %r328, -616729560; | |
mov.u64 %rd2590, 3041712726; | |
mov.u64 %rd2589, 1401181199; | |
mov.u64 %rd2588, 2835769497; | |
mov.u64 %rd2587, 1684936478; | |
mov.u64 %rd2586, 2027808484; | |
mov.u64 %rd2585, 387276957; | |
mov.u64 %rd2584, 842468239; | |
mov.u64 %rd2582, 3986602516; | |
mov.u64 %rd2581, 1013904242; | |
mov.u64 %rd2579, 3668340011; | |
mov.u64 %rd2578, 3144134277; | |
mov.u64 %rd2576, 3449720151; | |
mov.u64 %rd2575, 1993301258; | |
mov.u64 %rd2574, 3528531795; | |
bra.uni LBB79_26; | |
// Branch-taken arm (%p29 true):
//   %rd1330 = %rd2461 + (%rd213 < %rd11 ? 1 : 0)
//   %rd2577 = lo32(%rd1330) * 3449720151
//   %rd2580 = ((%rd2577 ^ %rd213) >> 32) * 3528531795
//   %rd2583 = (lo32(lo32(%rd213) * 3528531795) ^ (%rd2580 >> 32) ^ 3144134277) * 3449720151
//   %rd2573 = %rd1330 ^ (lo32(%rd213) * 3528531795)
LBB79_25: | |
setp.lt.u64 %p30, %rd213, %rd11; | |
selp.u64 %rd1329, 1, 0, %p30; | |
add.s64 %rd1330, %rd2461, %rd1329; | |
and.b64 %rd1331, %rd1330, 4294967295; | |
mul.lo.s64 %rd2577, %rd1331, 3449720151; | |
xor.b64 %rd1332, %rd2577, %rd213; | |
shr.u64 %rd1333, %rd1332, 32; | |
mul.lo.s64 %rd2580, %rd1333, 3528531795; | |
shr.u64 %rd1334, %rd2580, 32; | |
and.b64 %rd1335, %rd213, 4294967295; | |
mul.lo.s64 %rd1336, %rd1335, 3528531795; | |
and.b64 %rd1337, %rd1336, 4294967295; | |
xor.b64 %rd1338, %rd1337, %rd1334; | |
xor.b64 %rd1339, %rd1338, 3144134277; | |
mul.lo.s64 %rd2583, %rd1339, 3449720151; | |
xor.b64 %rd2573, %rd1330, %rd1336; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r329, -766435501; | |
mov.u32 %r328, -239350328; | |
mov.u64 %rd2590, 1684936478; | |
mov.u64 %rd2589, 534103459; | |
mov.u64 %rd2588, 387276957; | |
mov.u64 %rd2587, 3041712726; | |
mov.u64 %rd2586, 3986602516; | |
mov.u64 %rd2585, 2835769497; | |
mov.u64 %rd2584, 3668340011; | |
mov.u64 %rd2582, 2027808484; | |
mov.u64 %rd2581, 1993301258; | |
mov.u64 %rd2579, 842468239; | |
mov.u64 %rd2578, 2654435769; | |
mov.u64 %rd2576, 3528531795; | |
mov.u64 %rd2575, 1013904242; | |
mov.u64 %rd2574, 3449720151; | |
// LBB79_26: join of the two arms selected by %p29. Consumes %rd2573-%rd2590 and %r328-%r329,
// derives a value in [0, 1), uses it to gate one term, adds that term to the running
// f32 sum (%f9 -> %f10), then prepares and branches for the next step.
LBB79_26: | |
// Chain of 64-bit multiply / shift / xor steps. Every multiply uses %rd2574 or %rd2576;
// each step xors in one of the constants %rd2575-%rd2590.
shr.u64 %rd1365, %rd2583, 32; | |
shr.u64 %rd1366, %rd2573, 32; | |
mul.lo.s64 %rd1367, %rd1366, %rd2574; | |
and.b64 %rd1368, %rd1367, 4294967295; | |
xor.b64 %rd1369, %rd1368, %rd1365; | |
xor.b64 %rd1370, %rd1369, %rd2575; | |
mul.lo.s64 %rd1371, %rd1370, %rd2576; | |
shr.u64 %rd1372, %rd1371, 32; | |
shr.u64 %rd1373, %rd1367, 32; | |
and.b64 %rd1374, %rd2577, 4294967295; | |
xor.b64 %rd1375, %rd1374, %rd1373; | |
xor.b64 %rd1376, %rd1375, %rd2578; | |
mul.lo.s64 %rd1377, %rd1376, %rd2576; | |
and.b64 %rd1378, %rd1377, 4294967295; | |
xor.b64 %rd1379, %rd1378, %rd1372; | |
xor.b64 %rd1380, %rd1379, %rd2579; | |
mul.lo.s64 %rd1381, %rd1380, %rd2574; | |
shr.u64 %rd1382, %rd1381, 32; | |
shr.u64 %rd1383, %rd1377, 32; | |
and.b64 %rd1384, %rd2580, 4294967295; | |
xor.b64 %rd1385, %rd1384, %rd1383; | |
xor.b64 %rd1386, %rd1385, %rd2581; | |
mul.lo.s64 %rd1387, %rd1386, %rd2574; | |
and.b64 %rd1388, %rd1387, 4294967295; | |
xor.b64 %rd1389, %rd1388, %rd1382; | |
xor.b64 %rd1390, %rd1389, %rd2582; | |
mul.lo.s64 %rd1391, %rd1390, %rd2576; | |
shr.u64 %rd1392, %rd1391, 32; | |
shr.u64 %rd1393, %rd1387, 32; | |
and.b64 %rd1394, %rd2583, 4294967295; | |
xor.b64 %rd1395, %rd1394, %rd1393; | |
xor.b64 %rd1396, %rd1395, %rd2584; | |
mul.lo.s64 %rd1397, %rd1396, %rd2576; | |
and.b64 %rd1398, %rd1397, 4294967295; | |
xor.b64 %rd1399, %rd1398, %rd1392; | |
xor.b64 %rd1400, %rd1399, %rd2585; | |
mul.lo.s64 %rd1401, %rd1400, %rd2574; | |
shr.u64 %rd1402, %rd1401, 32; | |
shr.u64 %rd1403, %rd1397, 32; | |
and.b64 %rd1404, %rd1371, 4294967295; | |
xor.b64 %rd1405, %rd1404, %rd1403; | |
xor.b64 %rd1406, %rd1405, %rd2586; | |
mul.lo.s64 %rd1407, %rd1406, %rd2574; | |
and.b64 %rd1408, %rd1407, 4294967295; | |
xor.b64 %rd1409, %rd1408, %rd1402; | |
xor.b64 %rd1410, %rd1409, %rd2587; | |
mul.lo.s64 %rd1411, %rd1410, %rd2576; | |
shr.u64 %rd1412, %rd1411, 32; | |
shr.u64 %rd1413, %rd1407, 32; | |
and.b64 %rd1414, %rd1381, 4294967295; | |
xor.b64 %rd1415, %rd1414, %rd1413; | |
xor.b64 %rd1416, %rd1415, %rd2588; | |
mul.lo.s64 %rd1417, %rd1416, %rd2576; | |
and.b64 %rd1418, %rd1417, 4294967295; | |
xor.b64 %rd1419, %rd1418, %rd1412; | |
xor.b64 %rd1420, %rd1419, %rd2589; | |
mul.lo.s64 %rd1421, %rd1420, %rd2574; | |
// Fold to 32 bits:
//   %r155 = (%r328 ^ lo32((%rd1421 >> 32) ^ ((%rd1391 ^ (%rd1417 >> 32) ^ %rd2590) * %rd2574))) * %r329
shr.u64 %rd1422, %rd1421, 32; | |
shr.u64 %rd1423, %rd1417, 32; | |
xor.b64 %rd1424, %rd1391, %rd1423; | |
xor.b64 %rd1425, %rd1424, %rd2590; | |
mul.lo.s64 %rd1426, %rd1425, %rd2574; | |
xor.b64 %rd1427, %rd1422, %rd1426; | |
cvt.u32.u64 %r153, %rd1427; | |
xor.b32 %r154, %r328, %r153; | |
mul.lo.s32 %r155, %r154, %r329; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f115 lies in [0, 1).
shr.u32 %r156, %r155, 9; | |
cvt.rn.f32.u32 %f114, %r156; | |
mul.rn.f32 %f115, %f114, 0f34000000; | |
// Compare in f16 against 0x2E66 (~0.1): %p33 = (value >= threshold).
cvt.rn.f16.f32 %h64, %f115; | |
mov.b16 %h65, 0x2E66; | |
setp.ge.f16 %p33, %h64, %h65; | |
// %h71 = %p33 ? (%h66 + f16(%f116)) * 0x3C72 : 0, where 0x3C72 is ~1.111 in f16.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout(rate=0.1) - confirm.
ld.global.nc.b16 %h66, [%rd44+770]; | |
ld.global.nc.f32 %f116, [%rd45+1540]; | |
cvt.rn.f16.f32 %h67, %f116; | |
add.rn.f16 %h68, %h66, %h67; | |
mov.b16 %h69, 0x3C72; | |
mul.rn.f16 %h70, %h68, %h69; | |
selp.b16 %h71, %h70, 0x0000, %p33; | |
cvt.f32.f16 %f117, %h71; | |
// %f126 = (%f1*%f119)*%f118 + (%f122 - %f2*(%f1*%f119)) + %f117; accumulate: %f10 = %f9 + %f126.
ld.global.nc.b16 %h72, [%rd46+770]; | |
cvt.f32.f16 %f118, %h72; | |
ld.global.nc.f32 %f119, [%rd47+1540]; | |
mul.rn.f32 %f120, %f1, %f119; | |
mul.rn.f32 %f121, %f120, %f118; | |
ld.global.nc.f32 %f122, [%rd48+1540]; | |
mul.rn.f32 %f123, %f2, %f120; | |
sub.rn.f32 %f124, %f122, %f123; | |
add.rn.f32 %f125, %f121, %f124; | |
add.rn.f32 %f126, %f125, %f117; | |
add.rn.f32 %f10, %f9, %f126; | |
// Next step: %rd240 = %rd11 + ((%r73 | 512) >> 2). Branch to LBB79_28 when %p8 is set.
or.b32 %r158, %r73, 512; | |
shr.u32 %r159, %r158, 2; | |
cvt.u64.u32 %rd1428, %r159; | |
add.s64 %rd240, %rd11, %rd1428; | |
@%p8 bra LBB79_28; | |
// Two alternative arms selected by the preceding `@%p8 bra LBB79_28`; both continue at LBB79_29.
// Inputs: %rd240 (= %rd11 + offset) and %rd2461 (defined outside this excerpt). Each arm
// recomputes the low 32 bits of %rd240 and the wrap predicate (%rd240 < %rd11) itself.
// Both arms write the same registers: the computed values %rd2591, %rd2595, %rd2598, %rd2601
// and the constants %rd2592-%rd2609 / %r330-%r332, with the two multipliers exchanged
// between the arms.
// NOTE(review): 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) / 2654435769 (0x9E3779B9) /
// 3144134277 (0xBB67AE85) match the published Philox-4x32 constants - presumably a Philox RNG; confirm.
//
// Fall-through arm (%p8 false):
//   %rd2595 = lo32(%rd240) * 3528531795
//   %rd1472 = %rd2461 + (%rd240 < %rd11 ? 1 : 0)
//   %rd2598 = ((%rd1472 ^ %rd2595) >> 32) * 3449720151
//   %rd2601 = (lo32(lo32(%rd1472) * 3449720151) ^ (%rd2598 >> 32) ^ 2654435769) * 3528531795
//   %rd2591 = (lo32(%rd1472) * 3449720151) ^ %rd240
and.b64 %rd1470, %rd240, 4294967295; | |
mul.lo.s64 %rd2595, %rd1470, 3528531795; | |
setp.lt.u64 %p35, %rd240, %rd11; | |
selp.u64 %rd1471, 1, 0, %p35; | |
add.s64 %rd1472, %rd2461, %rd1471; | |
xor.b64 %rd1473, %rd1472, %rd2595; | |
shr.u64 %rd1474, %rd1473, 32; | |
mul.lo.s64 %rd2598, %rd1474, 3449720151; | |
shr.u64 %rd1475, %rd2598, 32; | |
and.b64 %rd1476, %rd1472, 4294967295; | |
mul.lo.s64 %rd1477, %rd1476, 3449720151; | |
and.b64 %rd1478, %rd1477, 4294967295; | |
xor.b64 %rd1479, %rd1478, %rd1475; | |
xor.b64 %rd1480, %rd1479, 2654435769; | |
mul.lo.s64 %rd2601, %rd1480, 3528531795; | |
xor.b64 %rd2591, %rd1477, %rd240; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r332, -1879881855; | |
mov.u32 %r331, -845247145; | |
mov.u32 %r330, 534103459; | |
mov.u64 %rd2609, 3678237736; | |
mov.u64 %rd2608, 3041712726; | |
mov.u64 %rd2607, 1401181199; | |
mov.u64 %rd2606, 2835769497; | |
mov.u64 %rd2605, 1684936478; | |
mov.u64 %rd2604, 2027808484; | |
mov.u64 %rd2603, 387276957; | |
mov.u64 %rd2602, 842468239; | |
mov.u64 %rd2600, 3986602516; | |
mov.u64 %rd2599, 1013904242; | |
mov.u64 %rd2597, 3668340011; | |
mov.u64 %rd2596, 3144134277; | |
mov.u64 %rd2594, 3449720151; | |
mov.u64 %rd2593, 1993301258; | |
mov.u64 %rd2592, 3528531795; | |
bra.uni LBB79_29; | |
// Branch-taken arm (%p8 true):
//   %rd1445 = %rd2461 + (%rd240 < %rd11 ? 1 : 0)
//   %rd2595 = lo32(%rd1445) * 3449720151
//   %rd2598 = ((%rd2595 ^ %rd240) >> 32) * 3528531795
//   %rd2601 = (lo32(lo32(%rd240) * 3528531795) ^ (%rd2598 >> 32) ^ 3144134277) * 3449720151
//   %rd2591 = %rd1445 ^ (lo32(%rd240) * 3528531795)
LBB79_28: | |
setp.lt.u64 %p34, %rd240, %rd11; | |
selp.u64 %rd1444, 1, 0, %p34; | |
add.s64 %rd1445, %rd2461, %rd1444; | |
and.b64 %rd1446, %rd1445, 4294967295; | |
mul.lo.s64 %rd2595, %rd1446, 3449720151; | |
xor.b64 %rd1447, %rd2595, %rd240; | |
shr.u64 %rd1448, %rd1447, 32; | |
mul.lo.s64 %rd2598, %rd1448, 3528531795; | |
shr.u64 %rd1449, %rd2598, 32; | |
and.b64 %rd1450, %rd240, 4294967295; | |
mul.lo.s64 %rd1451, %rd1450, 3528531795; | |
and.b64 %rd1452, %rd1451, 4294967295; | |
xor.b64 %rd1453, %rd1452, %rd1449; | |
xor.b64 %rd1454, %rd1453, 3144134277; | |
mul.lo.s64 %rd2601, %rd1454, 3449720151; | |
xor.b64 %rd2591, %rd1445, %rd1451; | |
// Constants consumed after the join (this arm's assignment).
mov.u32 %r332, -1767562579; | |
mov.u32 %r331, -766435501; | |
mov.u32 %r330, 1401181199; | |
mov.u64 %rd2609, 4055616968; | |
mov.u64 %rd2608, 1684936478; | |
mov.u64 %rd2607, 534103459; | |
mov.u64 %rd2606, 387276957; | |
mov.u64 %rd2605, 3041712726; | |
mov.u64 %rd2604, 3986602516; | |
mov.u64 %rd2603, 2835769497; | |
mov.u64 %rd2602, 3668340011; | |
mov.u64 %rd2600, 2027808484; | |
mov.u64 %rd2599, 1993301258; | |
mov.u64 %rd2597, 842468239; | |
mov.u64 %rd2596, 2654435769; | |
mov.u64 %rd2594, 3528531795; | |
mov.u64 %rd2593, 1013904242; | |
mov.u64 %rd2592, 3449720151; | |
// LBB79_29: join of the two arms selected by %p8. Consumes %rd2591-%rd2609 and %r330-%r332,
// derives a value in [0, 1), uses it to gate one term, adds that term to the running
// f32 sum (%f10 -> %f11), then prepares and branches for the next step.
LBB79_29: | |
// Chain of 64-bit multiply / shift / xor steps. Every multiply uses %rd2592 or %rd2594;
// each step xors in one of the constants %rd2593-%rd2609.
shr.u64 %rd1481, %rd2601, 32; | |
shr.u64 %rd1482, %rd2591, 32; | |
mul.lo.s64 %rd1483, %rd1482, %rd2592; | |
and.b64 %rd1484, %rd1483, 4294967295; | |
xor.b64 %rd1485, %rd1484, %rd1481; | |
xor.b64 %rd1486, %rd1485, %rd2593; | |
mul.lo.s64 %rd1487, %rd1486, %rd2594; | |
shr.u64 %rd1488, %rd1487, 32; | |
shr.u64 %rd1489, %rd1483, 32; | |
and.b64 %rd1490, %rd2595, 4294967295; | |
xor.b64 %rd1491, %rd1490, %rd1489; | |
xor.b64 %rd1492, %rd1491, %rd2596; | |
mul.lo.s64 %rd1493, %rd1492, %rd2594; | |
and.b64 %rd1494, %rd1493, 4294967295; | |
xor.b64 %rd1495, %rd1494, %rd1488; | |
xor.b64 %rd1496, %rd1495, %rd2597; | |
mul.lo.s64 %rd1497, %rd1496, %rd2592; | |
shr.u64 %rd1498, %rd1497, 32; | |
shr.u64 %rd1499, %rd1493, 32; | |
and.b64 %rd1500, %rd2598, 4294967295; | |
xor.b64 %rd1501, %rd1500, %rd1499; | |
xor.b64 %rd1502, %rd1501, %rd2599; | |
mul.lo.s64 %rd1503, %rd1502, %rd2592; | |
and.b64 %rd1504, %rd1503, 4294967295; | |
xor.b64 %rd1505, %rd1504, %rd1498; | |
xor.b64 %rd1506, %rd1505, %rd2600; | |
mul.lo.s64 %rd1507, %rd1506, %rd2594; | |
shr.u64 %rd1508, %rd1507, 32; | |
shr.u64 %rd1509, %rd1503, 32; | |
and.b64 %rd1510, %rd2601, 4294967295; | |
xor.b64 %rd1511, %rd1510, %rd1509; | |
xor.b64 %rd1512, %rd1511, %rd2602; | |
mul.lo.s64 %rd1513, %rd1512, %rd2594; | |
and.b64 %rd1514, %rd1513, 4294967295; | |
xor.b64 %rd1515, %rd1514, %rd1508; | |
xor.b64 %rd1516, %rd1515, %rd2603; | |
mul.lo.s64 %rd1517, %rd1516, %rd2592; | |
shr.u64 %rd1518, %rd1517, 32; | |
shr.u64 %rd1519, %rd1513, 32; | |
and.b64 %rd1520, %rd1487, 4294967295; | |
xor.b64 %rd1521, %rd1520, %rd1519; | |
xor.b64 %rd1522, %rd1521, %rd2604; | |
mul.lo.s64 %rd1523, %rd1522, %rd2592; | |
and.b64 %rd1524, %rd1523, 4294967295; | |
xor.b64 %rd1525, %rd1524, %rd1518; | |
xor.b64 %rd1526, %rd1525, %rd2605; | |
mul.lo.s64 %rd1527, %rd1526, %rd2594; | |
shr.u64 %rd1528, %rd1527, 32; | |
shr.u64 %rd1529, %rd1523, 32; | |
and.b64 %rd1530, %rd1497, 4294967295; | |
xor.b64 %rd1531, %rd1530, %rd1529; | |
xor.b64 %rd1532, %rd1531, %rd2606; | |
mul.lo.s64 %rd1533, %rd1532, %rd2594; | |
and.b64 %rd1534, %rd1533, 4294967295; | |
xor.b64 %rd1535, %rd1534, %rd1528; | |
xor.b64 %rd1536, %rd1535, %rd2607; | |
mul.lo.s64 %rd1537, %rd1536, %rd2592; | |
shr.u64 %rd1538, %rd1537, 32; | |
shr.u64 %rd1539, %rd1533, 32; | |
and.b64 %rd1540, %rd1507, 4294967295; | |
xor.b64 %rd1541, %rd1540, %rd1539; | |
xor.b64 %rd1542, %rd1541, %rd2608; | |
mul.lo.s64 %rd1543, %rd1542, %rd2592; | |
and.b64 %rd1544, %rd1543, 4294967295; | |
xor.b64 %rd1545, %rd1544, %rd1538; | |
xor.b64 %rd1546, %rd1545, %rd2609; | |
mul.lo.s64 %rd1547, %rd1546, %rd2594; | |
// Fold to 32 bits:
//   %r171 = ((%r330 ^ lo32((%rd1543 >> 32) ^ %rd1517)) * %r331) ^ lo32(%rd1547 >> 32) ^ %r332
shr.u64 %rd1548, %rd1547, 32; | |
cvt.u32.u64 %r166, %rd1548; | |
shr.u64 %rd1549, %rd1543, 32; | |
xor.b64 %rd1550, %rd1549, %rd1517; | |
cvt.u32.u64 %r167, %rd1550; | |
xor.b32 %r168, %r330, %r167; | |
mul.lo.s32 %r169, %r168, %r331; | |
xor.b32 %r170, %r169, %r166; | |
xor.b32 %r171, %r170, %r332; | |
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): %f128 lies in [0, 1).
shr.u32 %r172, %r171, 9; | |
cvt.rn.f32.u32 %f127, %r172; | |
mul.rn.f32 %f128, %f127, 0f34000000; | |
// Compare in f16 against 0x2E66 (~0.1): %p36 = (value >= threshold).
cvt.rn.f16.f32 %h73, %f128; | |
mov.b16 %h74, 0x2E66; | |
setp.ge.f16 %p36, %h73, %h74; | |
// %h80 = %p36 ? (%h75 + f16(%f129)) * 0x3C72 : 0, where 0x3C72 is ~1.111 in f16.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout(rate=0.1) - confirm.
ld.global.nc.b16 %h75, [%rd44+1024]; | |
ld.global.nc.f32 %f129, [%rd45+2048]; | |
cvt.rn.f16.f32 %h76, %f129; | |
add.rn.f16 %h77, %h75, %h76; | |
mov.b16 %h78, 0x3C72; | |
mul.rn.f16 %h79, %h77, %h78; | |
selp.b16 %h80, %h79, 0x0000, %p36; | |
cvt.f32.f16 %f130, %h80; | |
// %f139 = (%f1*%f132)*%f131 + (%f135 - %f2*(%f1*%f132)) + %f130; accumulate: %f11 = %f10 + %f139.
ld.global.nc.b16 %h81, [%rd46+1024]; | |
cvt.f32.f16 %f131, %h81; | |
ld.global.nc.f32 %f132, [%rd47+2048]; | |
mul.rn.f32 %f133, %f1, %f132; | |
mul.rn.f32 %f134, %f133, %f131; | |
ld.global.nc.f32 %f135, [%rd48+2048]; | |
mul.rn.f32 %f136, %f2, %f133; | |
sub.rn.f32 %f137, %f135, %f136; | |
add.rn.f32 %f138, %f134, %f137; | |
add.rn.f32 %f139, %f138, %f130; | |
add.rn.f32 %f11, %f10, %f139; | |
// Next step: %rd268 = %rd11 + ((%r3 | 513 | %r4) >> 2).
// Branch to LBB79_31 when ((%r3 | 513) & 3) != 1.
or.b32 %r173, %r3, 513; | |
or.b32 %r174, %r173, %r4; | |
and.b32 %r175, %r173, 3; | |
shr.u32 %r176, %r174, 2; | |
setp.ne.s32 %p37, %r175, 1; | |
cvt.u64.u32 %rd1551, %r176; | |
add.s64 %rd268, %rd11, %rd1551; | |
@%p37 bra LBB79_31; | |
and.b64 %rd1591, %rd268, 4294967295; | |
mul.lo.s64 %rd2614, %rd1591, 3528531795; | |
setp.lt.u64 %p39, %rd268, %rd11; | |
selp.u64 %rd1592, 1, 0, %p39; | |
add.s64 %rd1593, %rd2461, %rd1592; | |
xor.b64 %rd1594, %rd1593, %rd2614; | |
shr.u64 %rd1595, %rd1594, 32; | |
mul.lo.s64 %rd2617, %rd1595, 3449720151; | |
shr.u64 %rd1596, %rd2617, 32; | |
and.b64 %rd1597, %rd1593, 4294967295; | |
mul.lo.s64 %rd1598, %rd1597, 3449720151; | |
and.b64 %rd1599, %rd1598, 4294967295; | |
xor.b64 %rd1600, %rd1599, %rd1596; | |
xor.b64 %rd1601, %rd1600, 2654435769; | |
mul.lo.s64 %rd2620, %rd1601, 3528531795; | |
xor.b64 %rd2610, %rd1598, %rd268; | |
mov.u32 %r334, -845247145; | |
mov.u32 %r333, -616729560; | |
mov.u64 %rd2627, 3041712726; | |
mov.u64 %rd2626, 1401181199; | |
mov.u64 %rd2625, 2835769497; | |
mov.u64 %rd2624, 1684936478; | |
mov.u64 %rd2623, 2027808484; | |
mov.u64 %rd2622, 387276957; | |
mov.u64 %rd2621, 842468239; | |
mov.u64 %rd2619, 3986602516; | |
mov.u64 %rd2618, 1013904242; | |
mov.u64 %rd2616, 3668340011; | |
mov.u64 %rd2615, 3144134277; | |
mov.u64 %rd2613, 3449720151; | |
mov.u64 %rd2612, 1993301258; | |
mov.u64 %rd2611, 3528531795; | |
bra.uni LBB79_32; | |
LBB79_31: | |
setp.lt.u64 %p38, %rd268, %rd11; | |
selp.u64 %rd1566, 1, 0, %p38; | |
add.s64 %rd1567, %rd2461, %rd1566; | |
and.b64 %rd1568, %rd1567, 4294967295; | |
mul.lo.s64 %rd2614, %rd1568, 3449720151; | |
xor.b64 %rd1569, %rd2614, %rd268; | |
shr.u64 %rd1570, %rd1569, 32; | |
mul.lo.s64 %rd2617, %rd1570, 3528531795; | |
shr.u64 %rd1571, %rd2617, 32; | |
and.b64 %rd1572, %rd268, 4294967295; | |
mul.lo.s64 %rd1573, %rd1572, 3528531795; | |
and.b64 %rd1574, %rd1573, 4294967295; | |
xor.b64 %rd1575, %rd1574, %rd1571; | |
xor.b64 %rd1576, %rd1575, 3144134277; | |
mul.lo.s64 %rd2620, %rd1576, 3449720151; | |
xor.b64 %rd2610, %rd1567, %rd1573; | |
mov.u32 %r334, -766435501; | |
mov.u32 %r333, -239350328; | |
mov.u64 %rd2627, 1684936478; | |
mov.u64 %rd2626, 534103459; | |
mov.u64 %rd2625, 387276957; | |
mov.u64 %rd2624, 3041712726; | |
mov.u64 %rd2623, 3986602516; | |
mov.u64 %rd2622, 2835769497; | |
mov.u64 %rd2621, 3668340011; | |
mov.u64 %rd2619, 2027808484; | |
mov.u64 %rd2618, 1993301258; | |
mov.u64 %rd2616, 842468239; | |
mov.u64 %rd2615, 2654435769; | |
mov.u64 %rd2613, 3528531795; | |
mov.u64 %rd2612, 1013904242; | |
mov.u64 %rd2611, 3449720151; | |
LBB79_32: | |
// Join point of the two setup arms for element offset 513.
// Mixing chain: each step isolates a 32-bit half of an earlier product
// (shr 32 / and 0xFFFFFFFF), XORs it with another half and one per-arm
// constant (%rd2612..%rd2627), then multiplies by %rd2611 or %rd2613.
// NOTE(review): structure and constants look like unrolled Philox rounds --
// confirm against the code generator.
shr.u64 %rd1602, %rd2620, 32; | |
shr.u64 %rd1603, %rd2610, 32; | |
mul.lo.s64 %rd1604, %rd1603, %rd2611; | |
and.b64 %rd1605, %rd1604, 4294967295; | |
xor.b64 %rd1606, %rd1605, %rd1602; | |
xor.b64 %rd1607, %rd1606, %rd2612; | |
mul.lo.s64 %rd1608, %rd1607, %rd2613; | |
shr.u64 %rd1609, %rd1608, 32; | |
shr.u64 %rd1610, %rd1604, 32; | |
and.b64 %rd1611, %rd2614, 4294967295; | |
xor.b64 %rd1612, %rd1611, %rd1610; | |
xor.b64 %rd1613, %rd1612, %rd2615; | |
mul.lo.s64 %rd1614, %rd1613, %rd2613; | |
and.b64 %rd1615, %rd1614, 4294967295; | |
xor.b64 %rd1616, %rd1615, %rd1609; | |
xor.b64 %rd1617, %rd1616, %rd2616; | |
mul.lo.s64 %rd1618, %rd1617, %rd2611; | |
shr.u64 %rd1619, %rd1618, 32; | |
shr.u64 %rd1620, %rd1614, 32; | |
and.b64 %rd1621, %rd2617, 4294967295; | |
xor.b64 %rd1622, %rd1621, %rd1620; | |
xor.b64 %rd1623, %rd1622, %rd2618; | |
mul.lo.s64 %rd1624, %rd1623, %rd2611; | |
and.b64 %rd1625, %rd1624, 4294967295; | |
xor.b64 %rd1626, %rd1625, %rd1619; | |
xor.b64 %rd1627, %rd1626, %rd2619; | |
mul.lo.s64 %rd1628, %rd1627, %rd2613; | |
shr.u64 %rd1629, %rd1628, 32; | |
shr.u64 %rd1630, %rd1624, 32; | |
and.b64 %rd1631, %rd2620, 4294967295; | |
xor.b64 %rd1632, %rd1631, %rd1630; | |
xor.b64 %rd1633, %rd1632, %rd2621; | |
mul.lo.s64 %rd1634, %rd1633, %rd2613; | |
and.b64 %rd1635, %rd1634, 4294967295; | |
xor.b64 %rd1636, %rd1635, %rd1629; | |
xor.b64 %rd1637, %rd1636, %rd2622; | |
mul.lo.s64 %rd1638, %rd1637, %rd2611; | |
shr.u64 %rd1639, %rd1638, 32; | |
shr.u64 %rd1640, %rd1634, 32; | |
and.b64 %rd1641, %rd1608, 4294967295; | |
xor.b64 %rd1642, %rd1641, %rd1640; | |
xor.b64 %rd1643, %rd1642, %rd2623; | |
mul.lo.s64 %rd1644, %rd1643, %rd2611; | |
and.b64 %rd1645, %rd1644, 4294967295; | |
xor.b64 %rd1646, %rd1645, %rd1639; | |
xor.b64 %rd1647, %rd1646, %rd2624; | |
mul.lo.s64 %rd1648, %rd1647, %rd2613; | |
shr.u64 %rd1649, %rd1648, 32; | |
shr.u64 %rd1650, %rd1644, 32; | |
and.b64 %rd1651, %rd1618, 4294967295; | |
xor.b64 %rd1652, %rd1651, %rd1650; | |
xor.b64 %rd1653, %rd1652, %rd2625; | |
mul.lo.s64 %rd1654, %rd1653, %rd2613; | |
and.b64 %rd1655, %rd1654, 4294967295; | |
xor.b64 %rd1656, %rd1655, %rd1649; | |
xor.b64 %rd1657, %rd1656, %rd2626; | |
mul.lo.s64 %rd1658, %rd1657, %rd2611; | |
shr.u64 %rd1659, %rd1658, 32; | |
shr.u64 %rd1660, %rd1654, 32; | |
xor.b64 %rd1661, %rd1628, %rd1660; | |
xor.b64 %rd1662, %rd1661, %rd2627; | |
mul.lo.s64 %rd1663, %rd1662, %rd2611; | |
xor.b64 %rd1664, %rd1659, %rd1663; | |
// Only the low 32 bits of the chain result are kept; they are XORed with
// %r333 and multiplied by %r334 (both set by whichever arm ran).
cvt.u32.u64 %r181, %rd1664; | |
xor.b32 %r182, %r333, %r181; | |
mul.lo.s32 %r183, %r182, %r334; | |
// Keep the top 23 bits (shift right by 9) and scale by 0f34000000 (2^-23),
// giving an f32 in [0, 1), which is then rounded to f16.
shr.u32 %r184, %r183, 9; | |
cvt.rn.f32.u32 %f140, %r184; | |
mul.rn.f32 %f141, %f140, 0f34000000; | |
cvt.rn.f16.f32 %h82, %f141; | |
// 0x2E66 is approximately 0.1 as an f16; %p41 is true when the random value
// is >= that threshold.
mov.b16 %h83, 0x2E66; | |
setp.ge.f16 %p41, %h82, %h83; | |
// f16 sum of the value at [%rd44+1026] and the f32 at [%rd45+2052] (rounded
// to f16), multiplied by 0x3C72 (approximately 1.111 as an f16).
// Byte offsets 1026 and 2052 are 513*2 and 513*4.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout --
// confirm against the model source.
ld.global.nc.b16 %h84, [%rd44+1026]; | |
ld.global.nc.f32 %f142, [%rd45+2052]; | |
cvt.rn.f16.f32 %h85, %f142; | |
add.rn.f16 %h86, %h84, %h85; | |
mov.b16 %h87, 0x3C72; | |
mul.rn.f16 %h88, %h86, %h87; | |
// selp yields the scaled value when %p41 is true and 0x0000 otherwise.
selp.b16 %h89, %h88, 0x0000, %p41; | |
cvt.f32.f16 %f143, %h89; | |
// f32 term: with t = %f1 * [%rd47+2052],
//   %f151 = t * f32([%rd46+1026]) + ([%rd48+2052] - %f2 * t)
// %f1 and %f2 are defined earlier in the kernel, outside this span.
ld.global.nc.b16 %h90, [%rd46+1026]; | |
cvt.f32.f16 %f144, %h90; | |
ld.global.nc.f32 %f145, [%rd47+2052]; | |
mul.rn.f32 %f146, %f1, %f145; | |
mul.rn.f32 %f147, %f146, %f144; | |
ld.global.nc.f32 %f148, [%rd48+2052]; | |
mul.rn.f32 %f149, %f2, %f146; | |
sub.rn.f32 %f150, %f148, %f149; | |
add.rn.f32 %f151, %f147, %f150; | |
// Add the masked f16 term and fold into the running sum: %f12 = %f11 + %f152.
add.rn.f32 %f152, %f151, %f143; | |
add.rn.f32 %f12, %f11, %f152; | |
// ---- Unrolled step, element offset 640: build the inputs of the mixing chain ----
// ((%r73 | 640) >> 2) is zero-extended and added to the 64-bit base %rd11 to
// form %rd295. The arm is chosen by %p8.
// %r73, %p8, %rd11 and %rd2461 are defined earlier in the kernel, outside this span.
or.b32 %r186, %r73, 640; | |
shr.u32 %r187, %r186, 2; | |
cvt.u64.u32 %rd1665, %r187; | |
add.s64 %rd295, %rd11, %rd1665; | |
@%p8 bra LBB79_34; | |
// Fall-through arm (%p8 false).
// NOTE(review): 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) match the
// published Philox4x32 multipliers, and 2654435769 (0x9E3779B9) / 3144134277
// (0xBB67AE85) match its key increments -- presumably a Philox-style
// counter-based RNG; confirm against the code generator.
and.b64 %rd1707, %rd295, 4294967295; | |
mul.lo.s64 %rd2632, %rd1707, 3528531795; | |
// Unsigned (%rd295 < %rd11) detects wrap-around of the add that produced
// %rd295; the resulting 0/1 is added to %rd2461.
setp.lt.u64 %p43, %rd295, %rd11; | |
selp.u64 %rd1708, 1, 0, %p43; | |
add.s64 %rd1709, %rd2461, %rd1708; | |
xor.b64 %rd1710, %rd1709, %rd2632; | |
shr.u64 %rd1711, %rd1710, 32; | |
mul.lo.s64 %rd2635, %rd1711, 3449720151; | |
shr.u64 %rd1712, %rd2635, 32; | |
and.b64 %rd1713, %rd1709, 4294967295; | |
mul.lo.s64 %rd1714, %rd1713, 3449720151; | |
and.b64 %rd1715, %rd1714, 4294967295; | |
xor.b64 %rd1716, %rd1715, %rd1712; | |
xor.b64 %rd1717, %rd1716, 2654435769; | |
mul.lo.s64 %rd2638, %rd1717, 3528531795; | |
xor.b64 %rd2628, %rd1714, %rd295; | |
// Per-arm constants read by the shared code at LBB79_35. The other arm loads
// the same registers (%r335..%r337, %rd2629..%rd2646) with different values.
mov.u32 %r337, -1879881855; | |
mov.u32 %r336, -845247145; | |
mov.u32 %r335, 534103459; | |
mov.u64 %rd2646, 3678237736; | |
mov.u64 %rd2645, 3041712726; | |
mov.u64 %rd2644, 1401181199; | |
mov.u64 %rd2643, 2835769497; | |
mov.u64 %rd2642, 1684936478; | |
mov.u64 %rd2641, 2027808484; | |
mov.u64 %rd2640, 387276957; | |
mov.u64 %rd2639, 842468239; | |
mov.u64 %rd2637, 3986602516; | |
mov.u64 %rd2636, 1013904242; | |
mov.u64 %rd2634, 3668340011; | |
mov.u64 %rd2633, 3144134277; | |
mov.u64 %rd2631, 3449720151; | |
mov.u64 %rd2630, 1993301258; | |
mov.u64 %rd2629, 3528531795; | |
bra.uni LBB79_35; | |
LBB79_34: | |
// Alternate arm (%p8 true). Mirror image of the fall-through arm: the two
// multipliers, the two XOR constants, and the roles of %rd295 and
// (%rd2461 + carry) are exchanged. Falls through into LBB79_35.
setp.lt.u64 %p42, %rd295, %rd11; | |
selp.u64 %rd1681, 1, 0, %p42; | |
add.s64 %rd1682, %rd2461, %rd1681; | |
and.b64 %rd1683, %rd1682, 4294967295; | |
mul.lo.s64 %rd2632, %rd1683, 3449720151; | |
xor.b64 %rd1684, %rd2632, %rd295; | |
shr.u64 %rd1685, %rd1684, 32; | |
mul.lo.s64 %rd2635, %rd1685, 3528531795; | |
shr.u64 %rd1686, %rd2635, 32; | |
and.b64 %rd1687, %rd295, 4294967295; | |
mul.lo.s64 %rd1688, %rd1687, 3528531795; | |
and.b64 %rd1689, %rd1688, 4294967295; | |
xor.b64 %rd1690, %rd1689, %rd1686; | |
xor.b64 %rd1691, %rd1690, 3144134277; | |
mul.lo.s64 %rd2638, %rd1691, 3449720151; | |
xor.b64 %rd2628, %rd1682, %rd1688; | |
// Same register set as the other arm, different constant assignment.
mov.u32 %r337, -1767562579; | |
mov.u32 %r336, -766435501; | |
mov.u32 %r335, 1401181199; | |
mov.u64 %rd2646, 4055616968; | |
mov.u64 %rd2645, 1684936478; | |
mov.u64 %rd2644, 534103459; | |
mov.u64 %rd2643, 387276957; | |
mov.u64 %rd2642, 3041712726; | |
mov.u64 %rd2641, 3986602516; | |
mov.u64 %rd2640, 2835769497; | |
mov.u64 %rd2639, 3668340011; | |
mov.u64 %rd2637, 2027808484; | |
mov.u64 %rd2636, 1993301258; | |
mov.u64 %rd2634, 842468239; | |
mov.u64 %rd2633, 2654435769; | |
mov.u64 %rd2631, 3528531795; | |
mov.u64 %rd2630, 1013904242; | |
mov.u64 %rd2629, 3449720151; | |
LBB79_35: | |
// Join point of the two setup arms for element offset 640.
// Mixing chain: each step isolates a 32-bit half of an earlier product
// (shr 32 / and 0xFFFFFFFF), XORs it with another half and one per-arm
// constant (%rd2630..%rd2646), then multiplies by %rd2629 or %rd2631.
// NOTE(review): structure and constants look like unrolled Philox rounds --
// confirm against the code generator.
shr.u64 %rd1718, %rd2638, 32; | |
shr.u64 %rd1719, %rd2628, 32; | |
mul.lo.s64 %rd1720, %rd1719, %rd2629; | |
and.b64 %rd1721, %rd1720, 4294967295; | |
xor.b64 %rd1722, %rd1721, %rd1718; | |
xor.b64 %rd1723, %rd1722, %rd2630; | |
mul.lo.s64 %rd1724, %rd1723, %rd2631; | |
shr.u64 %rd1725, %rd1724, 32; | |
shr.u64 %rd1726, %rd1720, 32; | |
and.b64 %rd1727, %rd2632, 4294967295; | |
xor.b64 %rd1728, %rd1727, %rd1726; | |
xor.b64 %rd1729, %rd1728, %rd2633; | |
mul.lo.s64 %rd1730, %rd1729, %rd2631; | |
and.b64 %rd1731, %rd1730, 4294967295; | |
xor.b64 %rd1732, %rd1731, %rd1725; | |
xor.b64 %rd1733, %rd1732, %rd2634; | |
mul.lo.s64 %rd1734, %rd1733, %rd2629; | |
shr.u64 %rd1735, %rd1734, 32; | |
shr.u64 %rd1736, %rd1730, 32; | |
and.b64 %rd1737, %rd2635, 4294967295; | |
xor.b64 %rd1738, %rd1737, %rd1736; | |
xor.b64 %rd1739, %rd1738, %rd2636; | |
mul.lo.s64 %rd1740, %rd1739, %rd2629; | |
and.b64 %rd1741, %rd1740, 4294967295; | |
xor.b64 %rd1742, %rd1741, %rd1735; | |
xor.b64 %rd1743, %rd1742, %rd2637; | |
mul.lo.s64 %rd1744, %rd1743, %rd2631; | |
shr.u64 %rd1745, %rd1744, 32; | |
shr.u64 %rd1746, %rd1740, 32; | |
and.b64 %rd1747, %rd2638, 4294967295; | |
xor.b64 %rd1748, %rd1747, %rd1746; | |
xor.b64 %rd1749, %rd1748, %rd2639; | |
mul.lo.s64 %rd1750, %rd1749, %rd2631; | |
and.b64 %rd1751, %rd1750, 4294967295; | |
xor.b64 %rd1752, %rd1751, %rd1745; | |
xor.b64 %rd1753, %rd1752, %rd2640; | |
mul.lo.s64 %rd1754, %rd1753, %rd2629; | |
shr.u64 %rd1755, %rd1754, 32; | |
shr.u64 %rd1756, %rd1750, 32; | |
and.b64 %rd1757, %rd1724, 4294967295; | |
xor.b64 %rd1758, %rd1757, %rd1756; | |
xor.b64 %rd1759, %rd1758, %rd2641; | |
mul.lo.s64 %rd1760, %rd1759, %rd2629; | |
and.b64 %rd1761, %rd1760, 4294967295; | |
xor.b64 %rd1762, %rd1761, %rd1755; | |
xor.b64 %rd1763, %rd1762, %rd2642; | |
mul.lo.s64 %rd1764, %rd1763, %rd2631; | |
shr.u64 %rd1765, %rd1764, 32; | |
shr.u64 %rd1766, %rd1760, 32; | |
and.b64 %rd1767, %rd1734, 4294967295; | |
xor.b64 %rd1768, %rd1767, %rd1766; | |
xor.b64 %rd1769, %rd1768, %rd2643; | |
mul.lo.s64 %rd1770, %rd1769, %rd2631; | |
and.b64 %rd1771, %rd1770, 4294967295; | |
xor.b64 %rd1772, %rd1771, %rd1765; | |
xor.b64 %rd1773, %rd1772, %rd2644; | |
mul.lo.s64 %rd1774, %rd1773, %rd2629; | |
shr.u64 %rd1775, %rd1774, 32; | |
shr.u64 %rd1776, %rd1770, 32; | |
and.b64 %rd1777, %rd1744, 4294967295; | |
xor.b64 %rd1778, %rd1777, %rd1776; | |
xor.b64 %rd1779, %rd1778, %rd2645; | |
mul.lo.s64 %rd1780, %rd1779, %rd2629; | |
and.b64 %rd1781, %rd1780, 4294967295; | |
xor.b64 %rd1782, %rd1781, %rd1775; | |
xor.b64 %rd1783, %rd1782, %rd2646; | |
mul.lo.s64 %rd1784, %rd1783, %rd2631; | |
// Final 32-bit combine: %r194 is the high half of the last product; %r195 is
// the low 32 bits of (%rd1780 >> 32) ^ %rd1754. They are mixed with the
// per-arm 32-bit constants %r335, %r336 and %r337.
shr.u64 %rd1785, %rd1784, 32; | |
cvt.u32.u64 %r194, %rd1785; | |
shr.u64 %rd1786, %rd1780, 32; | |
xor.b64 %rd1787, %rd1786, %rd1754; | |
cvt.u32.u64 %r195, %rd1787; | |
xor.b32 %r196, %r335, %r195; | |
mul.lo.s32 %r197, %r196, %r336; | |
xor.b32 %r198, %r197, %r194; | |
xor.b32 %r199, %r198, %r337; | |
// Keep the top 23 bits (shift right by 9) and scale by 0f34000000 (2^-23),
// giving an f32 in [0, 1), which is then rounded to f16.
shr.u32 %r200, %r199, 9; | |
cvt.rn.f32.u32 %f153, %r200; | |
mul.rn.f32 %f154, %f153, 0f34000000; | |
cvt.rn.f16.f32 %h91, %f154; | |
// 0x2E66 is approximately 0.1 as an f16; %p44 is true when the random value
// is >= that threshold.
mov.b16 %h92, 0x2E66; | |
setp.ge.f16 %p44, %h91, %h92; | |
// f16 sum of the value at [%rd44+1280] and the f32 at [%rd45+2560] (rounded
// to f16), multiplied by 0x3C72 (approximately 1.111 as an f16).
// Byte offsets 1280 and 2560 are 640*2 and 640*4.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout --
// confirm against the model source.
ld.global.nc.b16 %h93, [%rd44+1280]; | |
ld.global.nc.f32 %f155, [%rd45+2560]; | |
cvt.rn.f16.f32 %h94, %f155; | |
add.rn.f16 %h95, %h93, %h94; | |
mov.b16 %h96, 0x3C72; | |
mul.rn.f16 %h97, %h95, %h96; | |
// selp yields the scaled value when %p44 is true and 0x0000 otherwise.
selp.b16 %h98, %h97, 0x0000, %p44; | |
cvt.f32.f16 %f156, %h98; | |
// f32 term: with t = %f1 * [%rd47+2560],
//   %f164 = t * f32([%rd46+1280]) + ([%rd48+2560] - %f2 * t)
// %f1 and %f2 are defined earlier in the kernel, outside this span.
ld.global.nc.b16 %h99, [%rd46+1280]; | |
cvt.f32.f16 %f157, %h99; | |
ld.global.nc.f32 %f158, [%rd47+2560]; | |
mul.rn.f32 %f159, %f1, %f158; | |
mul.rn.f32 %f160, %f159, %f157; | |
ld.global.nc.f32 %f161, [%rd48+2560]; | |
mul.rn.f32 %f162, %f2, %f159; | |
sub.rn.f32 %f163, %f161, %f162; | |
add.rn.f32 %f164, %f160, %f163; | |
// Add the masked f16 term and fold into the running sum: %f13 = %f12 + %f165.
add.rn.f32 %f165, %f164, %f156; | |
add.rn.f32 %f13, %f12, %f165; | |
// ---- Unrolled step, element offset 641: build the inputs of the mixing chain ----
// %r201 = %r3 | 641 and %r202 = %r201 | %r4. (%r202 >> 2) is zero-extended and
// added to the 64-bit base %rd11 to form %rd323; the low two bits of %r201
// choose between the two setup arms below.
// %r3, %r4, %rd11 and %rd2461 are defined earlier in the kernel, outside this span.
or.b32 %r201, %r3, 641; | |
or.b32 %r202, %r201, %r4; | |
and.b32 %r203, %r201, 3; | |
shr.u32 %r204, %r202, 2; | |
// %p45 is true when (%r201 & 3) != 1, which routes control to LBB79_37.
setp.ne.s32 %p45, %r203, 1; | |
cvt.u64.u32 %rd1788, %r204; | |
add.s64 %rd323, %rd11, %rd1788; | |
@%p45 bra LBB79_37; | |
// Fall-through arm: (%r201 & 3) == 1.
// NOTE(review): 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) match the
// published Philox4x32 multipliers, and 2654435769 (0x9E3779B9) / 3144134277
// (0xBB67AE85) match its key increments -- presumably a Philox-style
// counter-based RNG; confirm against the code generator.
and.b64 %rd1828, %rd323, 4294967295; | |
mul.lo.s64 %rd2651, %rd1828, 3528531795; | |
// Unsigned (%rd323 < %rd11) detects wrap-around of the add that produced
// %rd323; the resulting 0/1 is added to %rd2461.
setp.lt.u64 %p47, %rd323, %rd11; | |
selp.u64 %rd1829, 1, 0, %p47; | |
add.s64 %rd1830, %rd2461, %rd1829; | |
xor.b64 %rd1831, %rd1830, %rd2651; | |
shr.u64 %rd1832, %rd1831, 32; | |
mul.lo.s64 %rd2654, %rd1832, 3449720151; | |
shr.u64 %rd1833, %rd2654, 32; | |
and.b64 %rd1834, %rd1830, 4294967295; | |
mul.lo.s64 %rd1835, %rd1834, 3449720151; | |
and.b64 %rd1836, %rd1835, 4294967295; | |
xor.b64 %rd1837, %rd1836, %rd1833; | |
xor.b64 %rd1838, %rd1837, 2654435769; | |
mul.lo.s64 %rd2657, %rd1838, 3528531795; | |
xor.b64 %rd2647, %rd1835, %rd323; | |
// Per-arm constants read by the shared code at LBB79_38. The other arm loads
// the same registers (%r338, %r339, %rd2648..%rd2664) with different values.
mov.u32 %r339, -845247145; | |
mov.u32 %r338, -616729560; | |
mov.u64 %rd2664, 3041712726; | |
mov.u64 %rd2663, 1401181199; | |
mov.u64 %rd2662, 2835769497; | |
mov.u64 %rd2661, 1684936478; | |
mov.u64 %rd2660, 2027808484; | |
mov.u64 %rd2659, 387276957; | |
mov.u64 %rd2658, 842468239; | |
mov.u64 %rd2656, 3986602516; | |
mov.u64 %rd2655, 1013904242; | |
mov.u64 %rd2653, 3668340011; | |
mov.u64 %rd2652, 3144134277; | |
mov.u64 %rd2650, 3449720151; | |
mov.u64 %rd2649, 1993301258; | |
mov.u64 %rd2648, 3528531795; | |
bra.uni LBB79_38; | |
LBB79_37: | |
// Alternate arm: (%r201 & 3) != 1. Mirror image of the fall-through arm: the
// two multipliers, the two XOR constants, and the roles of %rd323 and
// (%rd2461 + carry) are exchanged. Falls through into LBB79_38.
setp.lt.u64 %p46, %rd323, %rd11; | |
selp.u64 %rd1803, 1, 0, %p46; | |
add.s64 %rd1804, %rd2461, %rd1803; | |
and.b64 %rd1805, %rd1804, 4294967295; | |
mul.lo.s64 %rd2651, %rd1805, 3449720151; | |
xor.b64 %rd1806, %rd2651, %rd323; | |
shr.u64 %rd1807, %rd1806, 32; | |
mul.lo.s64 %rd2654, %rd1807, 3528531795; | |
shr.u64 %rd1808, %rd2654, 32; | |
and.b64 %rd1809, %rd323, 4294967295; | |
mul.lo.s64 %rd1810, %rd1809, 3528531795; | |
and.b64 %rd1811, %rd1810, 4294967295; | |
xor.b64 %rd1812, %rd1811, %rd1808; | |
xor.b64 %rd1813, %rd1812, 3144134277; | |
mul.lo.s64 %rd2657, %rd1813, 3449720151; | |
xor.b64 %rd2647, %rd1804, %rd1810; | |
// Same register set as the other arm, different constant assignment.
mov.u32 %r339, -766435501; | |
mov.u32 %r338, -239350328; | |
mov.u64 %rd2664, 1684936478; | |
mov.u64 %rd2663, 534103459; | |
mov.u64 %rd2662, 387276957; | |
mov.u64 %rd2661, 3041712726; | |
mov.u64 %rd2660, 3986602516; | |
mov.u64 %rd2659, 2835769497; | |
mov.u64 %rd2658, 3668340011; | |
mov.u64 %rd2656, 2027808484; | |
mov.u64 %rd2655, 1993301258; | |
mov.u64 %rd2653, 842468239; | |
mov.u64 %rd2652, 2654435769; | |
mov.u64 %rd2650, 3528531795; | |
mov.u64 %rd2649, 1013904242; | |
mov.u64 %rd2648, 3449720151; | |
LBB79_38: | |
// Join point of the two setup arms for element offset 641.
// Mixing chain: each step isolates a 32-bit half of an earlier product
// (shr 32 / and 0xFFFFFFFF), XORs it with another half and one per-arm
// constant (%rd2649..%rd2664), then multiplies by %rd2648 or %rd2650.
// NOTE(review): structure and constants look like unrolled Philox rounds --
// confirm against the code generator.
shr.u64 %rd1839, %rd2657, 32; | |
shr.u64 %rd1840, %rd2647, 32; | |
mul.lo.s64 %rd1841, %rd1840, %rd2648; | |
and.b64 %rd1842, %rd1841, 4294967295; | |
xor.b64 %rd1843, %rd1842, %rd1839; | |
xor.b64 %rd1844, %rd1843, %rd2649; | |
mul.lo.s64 %rd1845, %rd1844, %rd2650; | |
shr.u64 %rd1846, %rd1845, 32; | |
shr.u64 %rd1847, %rd1841, 32; | |
and.b64 %rd1848, %rd2651, 4294967295; | |
xor.b64 %rd1849, %rd1848, %rd1847; | |
xor.b64 %rd1850, %rd1849, %rd2652; | |
mul.lo.s64 %rd1851, %rd1850, %rd2650; | |
and.b64 %rd1852, %rd1851, 4294967295; | |
xor.b64 %rd1853, %rd1852, %rd1846; | |
xor.b64 %rd1854, %rd1853, %rd2653; | |
mul.lo.s64 %rd1855, %rd1854, %rd2648; | |
shr.u64 %rd1856, %rd1855, 32; | |
shr.u64 %rd1857, %rd1851, 32; | |
and.b64 %rd1858, %rd2654, 4294967295; | |
xor.b64 %rd1859, %rd1858, %rd1857; | |
xor.b64 %rd1860, %rd1859, %rd2655; | |
mul.lo.s64 %rd1861, %rd1860, %rd2648; | |
and.b64 %rd1862, %rd1861, 4294967295; | |
xor.b64 %rd1863, %rd1862, %rd1856; | |
xor.b64 %rd1864, %rd1863, %rd2656; | |
mul.lo.s64 %rd1865, %rd1864, %rd2650; | |
shr.u64 %rd1866, %rd1865, 32; | |
shr.u64 %rd1867, %rd1861, 32; | |
and.b64 %rd1868, %rd2657, 4294967295; | |
xor.b64 %rd1869, %rd1868, %rd1867; | |
xor.b64 %rd1870, %rd1869, %rd2658; | |
mul.lo.s64 %rd1871, %rd1870, %rd2650; | |
and.b64 %rd1872, %rd1871, 4294967295; | |
xor.b64 %rd1873, %rd1872, %rd1866; | |
xor.b64 %rd1874, %rd1873, %rd2659; | |
mul.lo.s64 %rd1875, %rd1874, %rd2648; | |
shr.u64 %rd1876, %rd1875, 32; | |
shr.u64 %rd1877, %rd1871, 32; | |
and.b64 %rd1878, %rd1845, 4294967295; | |
xor.b64 %rd1879, %rd1878, %rd1877; | |
xor.b64 %rd1880, %rd1879, %rd2660; | |
mul.lo.s64 %rd1881, %rd1880, %rd2648; | |
and.b64 %rd1882, %rd1881, 4294967295; | |
xor.b64 %rd1883, %rd1882, %rd1876; | |
xor.b64 %rd1884, %rd1883, %rd2661; | |
mul.lo.s64 %rd1885, %rd1884, %rd2650; | |
shr.u64 %rd1886, %rd1885, 32; | |
shr.u64 %rd1887, %rd1881, 32; | |
and.b64 %rd1888, %rd1855, 4294967295; | |
xor.b64 %rd1889, %rd1888, %rd1887; | |
xor.b64 %rd1890, %rd1889, %rd2662; | |
mul.lo.s64 %rd1891, %rd1890, %rd2650; | |
and.b64 %rd1892, %rd1891, 4294967295; | |
xor.b64 %rd1893, %rd1892, %rd1886; | |
xor.b64 %rd1894, %rd1893, %rd2663; | |
mul.lo.s64 %rd1895, %rd1894, %rd2648; | |
shr.u64 %rd1896, %rd1895, 32; | |
shr.u64 %rd1897, %rd1891, 32; | |
xor.b64 %rd1898, %rd1865, %rd1897; | |
xor.b64 %rd1899, %rd1898, %rd2664; | |
mul.lo.s64 %rd1900, %rd1899, %rd2648; | |
xor.b64 %rd1901, %rd1896, %rd1900; | |
// Only the low 32 bits of the chain result are kept; they are XORed with
// %r338 and multiplied by %r339 (both set by whichever arm ran).
cvt.u32.u64 %r209, %rd1901; | |
xor.b32 %r210, %r338, %r209; | |
mul.lo.s32 %r211, %r210, %r339; | |
// Keep the top 23 bits (shift right by 9) and scale by 0f34000000 (2^-23),
// giving an f32 in [0, 1), which is then rounded to f16.
shr.u32 %r212, %r211, 9; | |
cvt.rn.f32.u32 %f166, %r212; | |
mul.rn.f32 %f167, %f166, 0f34000000; | |
cvt.rn.f16.f32 %h100, %f167; | |
// 0x2E66 is approximately 0.1 as an f16; %p49 is true when the random value
// is >= that threshold.
mov.b16 %h101, 0x2E66; | |
setp.ge.f16 %p49, %h100, %h101; | |
// f16 sum of the value at [%rd44+1282] and the f32 at [%rd45+2564] (rounded
// to f16), multiplied by 0x3C72 (approximately 1.111 as an f16).
// Byte offsets 1282 and 2564 are 641*2 and 641*4.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout --
// confirm against the model source.
ld.global.nc.b16 %h102, [%rd44+1282]; | |
ld.global.nc.f32 %f168, [%rd45+2564]; | |
cvt.rn.f16.f32 %h103, %f168; | |
add.rn.f16 %h104, %h102, %h103; | |
mov.b16 %h105, 0x3C72; | |
mul.rn.f16 %h106, %h104, %h105; | |
// selp yields the scaled value when %p49 is true and 0x0000 otherwise.
selp.b16 %h107, %h106, 0x0000, %p49; | |
cvt.f32.f16 %f169, %h107; | |
// f32 term: with t = %f1 * [%rd47+2564],
//   %f177 = t * f32([%rd46+1282]) + ([%rd48+2564] - %f2 * t)
// %f1 and %f2 are defined earlier in the kernel, outside this span.
ld.global.nc.b16 %h108, [%rd46+1282]; | |
cvt.f32.f16 %f170, %h108; | |
ld.global.nc.f32 %f171, [%rd47+2564]; | |
mul.rn.f32 %f172, %f1, %f171; | |
mul.rn.f32 %f173, %f172, %f170; | |
ld.global.nc.f32 %f174, [%rd48+2564]; | |
mul.rn.f32 %f175, %f2, %f172; | |
sub.rn.f32 %f176, %f174, %f175; | |
add.rn.f32 %f177, %f173, %f176; | |
// Add the masked f16 term and fold into the running sum: %f14 = %f13 + %f178.
add.rn.f32 %f178, %f177, %f169; | |
add.rn.f32 %f14, %f13, %f178; | |
// ---- Unrolled step, element offset 768: build the inputs of the mixing chain ----
// ((%r73 | 768) >> 2) is zero-extended and added to the 64-bit base %rd11 to
// form %rd350. The arm is chosen by %p8.
// %r73, %p8, %rd11 and %rd2461 are defined earlier in the kernel, outside this span.
or.b32 %r214, %r73, 768; | |
shr.u32 %r215, %r214, 2; | |
cvt.u64.u32 %rd1902, %r215; | |
add.s64 %rd350, %rd11, %rd1902; | |
@%p8 bra LBB79_40; | |
// Fall-through arm (%p8 false).
// NOTE(review): 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) match the
// published Philox4x32 multipliers, and 2654435769 (0x9E3779B9) / 3144134277
// (0xBB67AE85) match its key increments -- presumably a Philox-style
// counter-based RNG; confirm against the code generator.
and.b64 %rd1944, %rd350, 4294967295; | |
mul.lo.s64 %rd2669, %rd1944, 3528531795; | |
// Unsigned (%rd350 < %rd11) detects wrap-around of the add that produced
// %rd350; the resulting 0/1 is added to %rd2461.
setp.lt.u64 %p51, %rd350, %rd11; | |
selp.u64 %rd1945, 1, 0, %p51; | |
add.s64 %rd1946, %rd2461, %rd1945; | |
xor.b64 %rd1947, %rd1946, %rd2669; | |
shr.u64 %rd1948, %rd1947, 32; | |
mul.lo.s64 %rd2672, %rd1948, 3449720151; | |
shr.u64 %rd1949, %rd2672, 32; | |
and.b64 %rd1950, %rd1946, 4294967295; | |
mul.lo.s64 %rd1951, %rd1950, 3449720151; | |
and.b64 %rd1952, %rd1951, 4294967295; | |
xor.b64 %rd1953, %rd1952, %rd1949; | |
xor.b64 %rd1954, %rd1953, 2654435769; | |
mul.lo.s64 %rd2675, %rd1954, 3528531795; | |
xor.b64 %rd2665, %rd1951, %rd350; | |
// Per-arm constants read by the shared code at LBB79_41. The other arm loads
// the same registers (%r340..%r342, %rd2666..%rd2683) with different values.
mov.u32 %r342, -1879881855; | |
mov.u32 %r341, -845247145; | |
mov.u32 %r340, 534103459; | |
mov.u64 %rd2683, 3678237736; | |
mov.u64 %rd2682, 3041712726; | |
mov.u64 %rd2681, 1401181199; | |
mov.u64 %rd2680, 2835769497; | |
mov.u64 %rd2679, 1684936478; | |
mov.u64 %rd2678, 2027808484; | |
mov.u64 %rd2677, 387276957; | |
mov.u64 %rd2676, 842468239; | |
mov.u64 %rd2674, 3986602516; | |
mov.u64 %rd2673, 1013904242; | |
mov.u64 %rd2671, 3668340011; | |
mov.u64 %rd2670, 3144134277; | |
mov.u64 %rd2668, 3449720151; | |
mov.u64 %rd2667, 1993301258; | |
mov.u64 %rd2666, 3528531795; | |
bra.uni LBB79_41; | |
LBB79_40: | |
// Alternate arm (%p8 true). Mirror image of the fall-through arm: the two
// multipliers, the two XOR constants, and the roles of %rd350 and
// (%rd2461 + carry) are exchanged. Falls through into LBB79_41.
setp.lt.u64 %p50, %rd350, %rd11; | |
selp.u64 %rd1918, 1, 0, %p50; | |
add.s64 %rd1919, %rd2461, %rd1918; | |
and.b64 %rd1920, %rd1919, 4294967295; | |
mul.lo.s64 %rd2669, %rd1920, 3449720151; | |
xor.b64 %rd1921, %rd2669, %rd350; | |
shr.u64 %rd1922, %rd1921, 32; | |
mul.lo.s64 %rd2672, %rd1922, 3528531795; | |
shr.u64 %rd1923, %rd2672, 32; | |
and.b64 %rd1924, %rd350, 4294967295; | |
mul.lo.s64 %rd1925, %rd1924, 3528531795; | |
and.b64 %rd1926, %rd1925, 4294967295; | |
xor.b64 %rd1927, %rd1926, %rd1923; | |
xor.b64 %rd1928, %rd1927, 3144134277; | |
mul.lo.s64 %rd2675, %rd1928, 3449720151; | |
xor.b64 %rd2665, %rd1919, %rd1925; | |
// Same register set as the other arm, different constant assignment.
mov.u32 %r342, -1767562579; | |
mov.u32 %r341, -766435501; | |
mov.u32 %r340, 1401181199; | |
mov.u64 %rd2683, 4055616968; | |
mov.u64 %rd2682, 1684936478; | |
mov.u64 %rd2681, 534103459; | |
mov.u64 %rd2680, 387276957; | |
mov.u64 %rd2679, 3041712726; | |
mov.u64 %rd2678, 3986602516; | |
mov.u64 %rd2677, 2835769497; | |
mov.u64 %rd2676, 3668340011; | |
mov.u64 %rd2674, 2027808484; | |
mov.u64 %rd2673, 1993301258; | |
mov.u64 %rd2671, 842468239; | |
mov.u64 %rd2670, 2654435769; | |
mov.u64 %rd2668, 3528531795; | |
mov.u64 %rd2667, 1013904242; | |
mov.u64 %rd2666, 3449720151; | |
LBB79_41: | |
// Join point of the two setup arms for element offset 768.
// Mixing chain: each step isolates a 32-bit half of an earlier product
// (shr 32 / and 0xFFFFFFFF), XORs it with another half and one per-arm
// constant (%rd2667..%rd2683), then multiplies by %rd2666 or %rd2668.
// NOTE(review): structure and constants look like unrolled Philox rounds --
// confirm against the code generator.
shr.u64 %rd1955, %rd2675, 32; | |
shr.u64 %rd1956, %rd2665, 32; | |
mul.lo.s64 %rd1957, %rd1956, %rd2666; | |
and.b64 %rd1958, %rd1957, 4294967295; | |
xor.b64 %rd1959, %rd1958, %rd1955; | |
xor.b64 %rd1960, %rd1959, %rd2667; | |
mul.lo.s64 %rd1961, %rd1960, %rd2668; | |
shr.u64 %rd1962, %rd1961, 32; | |
shr.u64 %rd1963, %rd1957, 32; | |
and.b64 %rd1964, %rd2669, 4294967295; | |
xor.b64 %rd1965, %rd1964, %rd1963; | |
xor.b64 %rd1966, %rd1965, %rd2670; | |
mul.lo.s64 %rd1967, %rd1966, %rd2668; | |
and.b64 %rd1968, %rd1967, 4294967295; | |
xor.b64 %rd1969, %rd1968, %rd1962; | |
xor.b64 %rd1970, %rd1969, %rd2671; | |
mul.lo.s64 %rd1971, %rd1970, %rd2666; | |
shr.u64 %rd1972, %rd1971, 32; | |
shr.u64 %rd1973, %rd1967, 32; | |
and.b64 %rd1974, %rd2672, 4294967295; | |
xor.b64 %rd1975, %rd1974, %rd1973; | |
xor.b64 %rd1976, %rd1975, %rd2673; | |
mul.lo.s64 %rd1977, %rd1976, %rd2666; | |
and.b64 %rd1978, %rd1977, 4294967295; | |
xor.b64 %rd1979, %rd1978, %rd1972; | |
xor.b64 %rd1980, %rd1979, %rd2674; | |
mul.lo.s64 %rd1981, %rd1980, %rd2668; | |
shr.u64 %rd1982, %rd1981, 32; | |
shr.u64 %rd1983, %rd1977, 32; | |
and.b64 %rd1984, %rd2675, 4294967295; | |
xor.b64 %rd1985, %rd1984, %rd1983; | |
xor.b64 %rd1986, %rd1985, %rd2676; | |
mul.lo.s64 %rd1987, %rd1986, %rd2668; | |
and.b64 %rd1988, %rd1987, 4294967295; | |
xor.b64 %rd1989, %rd1988, %rd1982; | |
xor.b64 %rd1990, %rd1989, %rd2677; | |
mul.lo.s64 %rd1991, %rd1990, %rd2666; | |
shr.u64 %rd1992, %rd1991, 32; | |
shr.u64 %rd1993, %rd1987, 32; | |
and.b64 %rd1994, %rd1961, 4294967295; | |
xor.b64 %rd1995, %rd1994, %rd1993; | |
xor.b64 %rd1996, %rd1995, %rd2678; | |
mul.lo.s64 %rd1997, %rd1996, %rd2666; | |
and.b64 %rd1998, %rd1997, 4294967295; | |
xor.b64 %rd1999, %rd1998, %rd1992; | |
xor.b64 %rd2000, %rd1999, %rd2679; | |
mul.lo.s64 %rd2001, %rd2000, %rd2668; | |
shr.u64 %rd2002, %rd2001, 32; | |
shr.u64 %rd2003, %rd1997, 32; | |
and.b64 %rd2004, %rd1971, 4294967295; | |
xor.b64 %rd2005, %rd2004, %rd2003; | |
xor.b64 %rd2006, %rd2005, %rd2680; | |
mul.lo.s64 %rd2007, %rd2006, %rd2668; | |
and.b64 %rd2008, %rd2007, 4294967295; | |
xor.b64 %rd2009, %rd2008, %rd2002; | |
xor.b64 %rd2010, %rd2009, %rd2681; | |
mul.lo.s64 %rd2011, %rd2010, %rd2666; | |
shr.u64 %rd2012, %rd2011, 32; | |
shr.u64 %rd2013, %rd2007, 32; | |
and.b64 %rd2014, %rd1981, 4294967295; | |
xor.b64 %rd2015, %rd2014, %rd2013; | |
xor.b64 %rd2016, %rd2015, %rd2682; | |
mul.lo.s64 %rd2017, %rd2016, %rd2666; | |
and.b64 %rd2018, %rd2017, 4294967295; | |
xor.b64 %rd2019, %rd2018, %rd2012; | |
xor.b64 %rd2020, %rd2019, %rd2683; | |
mul.lo.s64 %rd2021, %rd2020, %rd2668; | |
// Final 32-bit combine: %r222 is the high half of the last product; %r223 is
// the low 32 bits of (%rd2017 >> 32) ^ %rd1991. They are mixed with the
// per-arm 32-bit constants %r340, %r341 and %r342.
shr.u64 %rd2022, %rd2021, 32; | |
cvt.u32.u64 %r222, %rd2022; | |
shr.u64 %rd2023, %rd2017, 32; | |
xor.b64 %rd2024, %rd2023, %rd1991; | |
cvt.u32.u64 %r223, %rd2024; | |
xor.b32 %r224, %r340, %r223; | |
mul.lo.s32 %r225, %r224, %r341; | |
xor.b32 %r226, %r225, %r222; | |
xor.b32 %r227, %r226, %r342; | |
// Keep the top 23 bits (shift right by 9) and scale by 0f34000000 (2^-23),
// giving an f32 in [0, 1), which is then rounded to f16.
shr.u32 %r228, %r227, 9; | |
cvt.rn.f32.u32 %f179, %r228; | |
mul.rn.f32 %f180, %f179, 0f34000000; | |
cvt.rn.f16.f32 %h109, %f180; | |
// 0x2E66 is approximately 0.1 as an f16; %p52 is true when the random value
// is >= that threshold.
mov.b16 %h110, 0x2E66; | |
setp.ge.f16 %p52, %h109, %h110; | |
// f16 sum of the value at [%rd44+1536] and the f32 at [%rd45+3072] (rounded
// to f16), multiplied by 0x3C72 (approximately 1.111 as an f16).
// Byte offsets 1536 and 3072 are 768*2 and 768*4.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout --
// confirm against the model source.
ld.global.nc.b16 %h111, [%rd44+1536]; | |
ld.global.nc.f32 %f181, [%rd45+3072]; | |
cvt.rn.f16.f32 %h112, %f181; | |
add.rn.f16 %h113, %h111, %h112; | |
mov.b16 %h114, 0x3C72; | |
mul.rn.f16 %h115, %h113, %h114; | |
// selp yields the scaled value when %p52 is true and 0x0000 otherwise.
selp.b16 %h116, %h115, 0x0000, %p52; | |
cvt.f32.f16 %f182, %h116; | |
// f32 term: with t = %f1 * [%rd47+3072],
//   %f190 = t * f32([%rd46+1536]) + ([%rd48+3072] - %f2 * t)
// %f1 and %f2 are defined earlier in the kernel, outside this span.
ld.global.nc.b16 %h117, [%rd46+1536]; | |
cvt.f32.f16 %f183, %h117; | |
ld.global.nc.f32 %f184, [%rd47+3072]; | |
mul.rn.f32 %f185, %f1, %f184; | |
mul.rn.f32 %f186, %f185, %f183; | |
ld.global.nc.f32 %f187, [%rd48+3072]; | |
mul.rn.f32 %f188, %f2, %f185; | |
sub.rn.f32 %f189, %f187, %f188; | |
add.rn.f32 %f190, %f186, %f189; | |
// Add the masked f16 term and fold into the running sum: %f15 = %f14 + %f191.
add.rn.f32 %f191, %f190, %f182; | |
add.rn.f32 %f15, %f14, %f191; | |
// ---- Unrolled step, element offset 769: build the inputs of the mixing chain ----
// %r229 = %r3 | 769 and %r230 = %r229 | %r4. (%r230 >> 2) is zero-extended and
// added to the 64-bit base %rd11 to form %rd378; the low two bits of %r229
// choose between the two setup arms below.
// %r3, %r4, %rd11 and %rd2461 are defined earlier in the kernel, outside this span.
or.b32 %r229, %r3, 769; | |
or.b32 %r230, %r229, %r4; | |
and.b32 %r231, %r229, 3; | |
shr.u32 %r232, %r230, 2; | |
// %p53 is true when (%r229 & 3) != 1, which routes control to LBB79_43.
setp.ne.s32 %p53, %r231, 1; | |
cvt.u64.u32 %rd2025, %r232; | |
add.s64 %rd378, %rd11, %rd2025; | |
@%p53 bra LBB79_43; | |
// Fall-through arm: (%r229 & 3) == 1.
// NOTE(review): 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) match the
// published Philox4x32 multipliers, and 2654435769 (0x9E3779B9) / 3144134277
// (0xBB67AE85) match its key increments -- presumably a Philox-style
// counter-based RNG; confirm against the code generator.
and.b64 %rd2065, %rd378, 4294967295; | |
mul.lo.s64 %rd2688, %rd2065, 3528531795; | |
// Unsigned (%rd378 < %rd11) detects wrap-around of the add that produced
// %rd378; the resulting 0/1 is added to %rd2461.
setp.lt.u64 %p55, %rd378, %rd11; | |
selp.u64 %rd2066, 1, 0, %p55; | |
add.s64 %rd2067, %rd2461, %rd2066; | |
xor.b64 %rd2068, %rd2067, %rd2688; | |
shr.u64 %rd2069, %rd2068, 32; | |
mul.lo.s64 %rd2691, %rd2069, 3449720151; | |
shr.u64 %rd2070, %rd2691, 32; | |
and.b64 %rd2071, %rd2067, 4294967295; | |
mul.lo.s64 %rd2072, %rd2071, 3449720151; | |
and.b64 %rd2073, %rd2072, 4294967295; | |
xor.b64 %rd2074, %rd2073, %rd2070; | |
xor.b64 %rd2075, %rd2074, 2654435769; | |
mul.lo.s64 %rd2694, %rd2075, 3528531795; | |
xor.b64 %rd2684, %rd2072, %rd378; | |
// Per-arm constants read by the shared code at LBB79_44. The other arm loads
// the same registers (%r343, %r344, %rd2685..%rd2701) with different values.
mov.u32 %r344, -845247145; | |
mov.u32 %r343, -616729560; | |
mov.u64 %rd2701, 3041712726; | |
mov.u64 %rd2700, 1401181199; | |
mov.u64 %rd2699, 2835769497; | |
mov.u64 %rd2698, 1684936478; | |
mov.u64 %rd2697, 2027808484; | |
mov.u64 %rd2696, 387276957; | |
mov.u64 %rd2695, 842468239; | |
mov.u64 %rd2693, 3986602516; | |
mov.u64 %rd2692, 1013904242; | |
mov.u64 %rd2690, 3668340011; | |
mov.u64 %rd2689, 3144134277; | |
mov.u64 %rd2687, 3449720151; | |
mov.u64 %rd2686, 1993301258; | |
mov.u64 %rd2685, 3528531795; | |
bra.uni LBB79_44; | |
LBB79_43: | |
// Alternate arm: (%r229 & 3) != 1. Mirror image of the fall-through arm: the
// two multipliers, the two XOR constants, and the roles of %rd378 and
// (%rd2461 + carry) are exchanged. Falls through into LBB79_44.
setp.lt.u64 %p54, %rd378, %rd11; | |
selp.u64 %rd2040, 1, 0, %p54; | |
add.s64 %rd2041, %rd2461, %rd2040; | |
and.b64 %rd2042, %rd2041, 4294967295; | |
mul.lo.s64 %rd2688, %rd2042, 3449720151; | |
xor.b64 %rd2043, %rd2688, %rd378; | |
shr.u64 %rd2044, %rd2043, 32; | |
mul.lo.s64 %rd2691, %rd2044, 3528531795; | |
shr.u64 %rd2045, %rd2691, 32; | |
and.b64 %rd2046, %rd378, 4294967295; | |
mul.lo.s64 %rd2047, %rd2046, 3528531795; | |
and.b64 %rd2048, %rd2047, 4294967295; | |
xor.b64 %rd2049, %rd2048, %rd2045; | |
xor.b64 %rd2050, %rd2049, 3144134277; | |
mul.lo.s64 %rd2694, %rd2050, 3449720151; | |
xor.b64 %rd2684, %rd2041, %rd2047; | |
// Same register set as the other arm, different constant assignment.
mov.u32 %r344, -766435501; | |
mov.u32 %r343, -239350328; | |
mov.u64 %rd2701, 1684936478; | |
mov.u64 %rd2700, 534103459; | |
mov.u64 %rd2699, 387276957; | |
mov.u64 %rd2698, 3041712726; | |
mov.u64 %rd2697, 3986602516; | |
mov.u64 %rd2696, 2835769497; | |
mov.u64 %rd2695, 3668340011; | |
mov.u64 %rd2693, 2027808484; | |
mov.u64 %rd2692, 1993301258; | |
mov.u64 %rd2690, 842468239; | |
mov.u64 %rd2689, 2654435769; | |
mov.u64 %rd2687, 3528531795; | |
mov.u64 %rd2686, 1013904242; | |
mov.u64 %rd2685, 3449720151; | |
LBB79_44: | |
// Join point of the two setup arms for element offset 769.
// Mixing chain: each step isolates a 32-bit half of an earlier product
// (shr 32 / and 0xFFFFFFFF), XORs it with another half and one per-arm
// constant (%rd2686..%rd2701), then multiplies by %rd2685 or %rd2687.
// NOTE(review): structure and constants look like unrolled Philox rounds --
// confirm against the code generator.
shr.u64 %rd2076, %rd2694, 32; | |
shr.u64 %rd2077, %rd2684, 32; | |
mul.lo.s64 %rd2078, %rd2077, %rd2685; | |
and.b64 %rd2079, %rd2078, 4294967295; | |
xor.b64 %rd2080, %rd2079, %rd2076; | |
xor.b64 %rd2081, %rd2080, %rd2686; | |
mul.lo.s64 %rd2082, %rd2081, %rd2687; | |
shr.u64 %rd2083, %rd2082, 32; | |
shr.u64 %rd2084, %rd2078, 32; | |
and.b64 %rd2085, %rd2688, 4294967295; | |
xor.b64 %rd2086, %rd2085, %rd2084; | |
xor.b64 %rd2087, %rd2086, %rd2689; | |
mul.lo.s64 %rd2088, %rd2087, %rd2687; | |
and.b64 %rd2089, %rd2088, 4294967295; | |
xor.b64 %rd2090, %rd2089, %rd2083; | |
xor.b64 %rd2091, %rd2090, %rd2690; | |
mul.lo.s64 %rd2092, %rd2091, %rd2685; | |
shr.u64 %rd2093, %rd2092, 32; | |
shr.u64 %rd2094, %rd2088, 32; | |
and.b64 %rd2095, %rd2691, 4294967295; | |
xor.b64 %rd2096, %rd2095, %rd2094; | |
xor.b64 %rd2097, %rd2096, %rd2692; | |
mul.lo.s64 %rd2098, %rd2097, %rd2685; | |
and.b64 %rd2099, %rd2098, 4294967295; | |
xor.b64 %rd2100, %rd2099, %rd2093; | |
xor.b64 %rd2101, %rd2100, %rd2693; | |
mul.lo.s64 %rd2102, %rd2101, %rd2687; | |
shr.u64 %rd2103, %rd2102, 32; | |
shr.u64 %rd2104, %rd2098, 32; | |
and.b64 %rd2105, %rd2694, 4294967295; | |
xor.b64 %rd2106, %rd2105, %rd2104; | |
xor.b64 %rd2107, %rd2106, %rd2695; | |
mul.lo.s64 %rd2108, %rd2107, %rd2687; | |
and.b64 %rd2109, %rd2108, 4294967295; | |
xor.b64 %rd2110, %rd2109, %rd2103; | |
xor.b64 %rd2111, %rd2110, %rd2696; | |
mul.lo.s64 %rd2112, %rd2111, %rd2685; | |
shr.u64 %rd2113, %rd2112, 32; | |
shr.u64 %rd2114, %rd2108, 32; | |
and.b64 %rd2115, %rd2082, 4294967295; | |
xor.b64 %rd2116, %rd2115, %rd2114; | |
xor.b64 %rd2117, %rd2116, %rd2697; | |
mul.lo.s64 %rd2118, %rd2117, %rd2685; | |
and.b64 %rd2119, %rd2118, 4294967295; | |
xor.b64 %rd2120, %rd2119, %rd2113; | |
xor.b64 %rd2121, %rd2120, %rd2698; | |
mul.lo.s64 %rd2122, %rd2121, %rd2687; | |
shr.u64 %rd2123, %rd2122, 32; | |
shr.u64 %rd2124, %rd2118, 32; | |
and.b64 %rd2125, %rd2092, 4294967295; | |
xor.b64 %rd2126, %rd2125, %rd2124; | |
xor.b64 %rd2127, %rd2126, %rd2699; | |
mul.lo.s64 %rd2128, %rd2127, %rd2687; | |
and.b64 %rd2129, %rd2128, 4294967295; | |
xor.b64 %rd2130, %rd2129, %rd2123; | |
xor.b64 %rd2131, %rd2130, %rd2700; | |
mul.lo.s64 %rd2132, %rd2131, %rd2685; | |
shr.u64 %rd2133, %rd2132, 32; | |
shr.u64 %rd2134, %rd2128, 32; | |
xor.b64 %rd2135, %rd2102, %rd2134; | |
xor.b64 %rd2136, %rd2135, %rd2701; | |
mul.lo.s64 %rd2137, %rd2136, %rd2685; | |
xor.b64 %rd2138, %rd2133, %rd2137; | |
// Only the low 32 bits of the chain result are kept; they are XORed with
// %r343 and multiplied by %r344 (both set by whichever arm ran).
cvt.u32.u64 %r237, %rd2138; | |
xor.b32 %r238, %r343, %r237; | |
mul.lo.s32 %r239, %r238, %r344; | |
// Keep the top 23 bits (shift right by 9) and scale by 0f34000000 (2^-23),
// giving an f32 in [0, 1), which is then rounded to f16.
shr.u32 %r240, %r239, 9; | |
cvt.rn.f32.u32 %f192, %r240; | |
mul.rn.f32 %f193, %f192, 0f34000000; | |
cvt.rn.f16.f32 %h118, %f193; | |
// 0x2E66 is approximately 0.1 as an f16; %p57 is true when the random value
// is >= that threshold.
mov.b16 %h119, 0x2E66; | |
setp.ge.f16 %p57, %h118, %h119; | |
// f16 sum of the value at [%rd44+1538] and the f32 at [%rd45+3076] (rounded
// to f16), multiplied by 0x3C72 (approximately 1.111 as an f16).
// Byte offsets 1538 and 3076 are 769*2 and 769*4.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout --
// confirm against the model source.
ld.global.nc.b16 %h120, [%rd44+1538]; | |
ld.global.nc.f32 %f194, [%rd45+3076]; | |
cvt.rn.f16.f32 %h121, %f194; | |
add.rn.f16 %h122, %h120, %h121; | |
mov.b16 %h123, 0x3C72; | |
mul.rn.f16 %h124, %h122, %h123; | |
// selp yields the scaled value when %p57 is true and 0x0000 otherwise.
selp.b16 %h125, %h124, 0x0000, %p57; | |
cvt.f32.f16 %f195, %h125; | |
// f32 term: with t = %f1 * [%rd47+3076],
//   %f203 = t * f32([%rd46+1538]) + ([%rd48+3076] - %f2 * t)
// %f1 and %f2 are defined earlier in the kernel, outside this span.
ld.global.nc.b16 %h126, [%rd46+1538]; | |
cvt.f32.f16 %f196, %h126; | |
ld.global.nc.f32 %f197, [%rd47+3076]; | |
mul.rn.f32 %f198, %f1, %f197; | |
mul.rn.f32 %f199, %f198, %f196; | |
ld.global.nc.f32 %f200, [%rd48+3076]; | |
mul.rn.f32 %f201, %f2, %f198; | |
sub.rn.f32 %f202, %f200, %f201; | |
add.rn.f32 %f203, %f199, %f202; | |
// Add the masked f16 term and fold into the running sum: %f16 = %f15 + %f204.
add.rn.f32 %f204, %f203, %f195; | |
add.rn.f32 %f16, %f15, %f204; | |
or.b32 %r242, %r73, 896; | |
shr.u32 %r243, %r242, 2; | |
cvt.u64.u32 %rd2139, %r243; | |
add.s64 %rd405, %rd11, %rd2139; | |
@%p8 bra LBB79_46; | |
mov.u32 %r347, -1879881855; | |
mov.u32 %r345, 534103459; | |
mov.u64 %rd2720, 3678237736; | |
and.b64 %rd2181, %rd405, 4294967295; | |
mul.lo.s64 %rd2706, %rd2181, 3528531795; | |
setp.lt.u64 %p59, %rd405, %rd11; | |
selp.u64 %rd2182, 1, 0, %p59; | |
add.s64 %rd2183, %rd2461, %rd2182; | |
xor.b64 %rd2184, %rd2183, %rd2706; | |
shr.u64 %rd2185, %rd2184, 32; | |
mul.lo.s64 %rd2709, %rd2185, 3449720151; | |
shr.u64 %rd2186, %rd2709, 32; | |
and.b64 %rd2187, %rd2183, 4294967295; | |
mul.lo.s64 %rd2188, %rd2187, 3449720151; | |
and.b64 %rd2189, %rd2188, 4294967295; | |
xor.b64 %rd2190, %rd2189, %rd2186; | |
xor.b64 %rd2191, %rd2190, 2654435769; | |
mul.lo.s64 %rd2712, %rd2191, 3528531795; | |
xor.b64 %rd2702, %rd2188, %rd405; | |
mov.u32 %r346, -845247145; | |
mov.u64 %rd2719, 3041712726; | |
mov.u64 %rd2718, 1401181199; | |
mov.u64 %rd2717, 2835769497; | |
mov.u64 %rd2716, 1684936478; | |
mov.u64 %rd2715, 2027808484; | |
mov.u64 %rd2714, 387276957; | |
mov.u64 %rd2713, 842468239; | |
mov.u64 %rd2711, 3986602516; | |
mov.u64 %rd2710, 1013904242; | |
mov.u64 %rd2708, 3668340011; | |
mov.u64 %rd2707, 3144134277; | |
mov.u64 %rd2705, 3449720151; | |
mov.u64 %rd2704, 1993301258; | |
mov.u64 %rd2703, 3528531795; | |
bra.uni LBB79_47; | |
LBB79_46: | |
setp.lt.u64 %p58, %rd405, %rd11; | |
selp.u64 %rd2155, 1, 0, %p58; | |
add.s64 %rd2156, %rd2461, %rd2155; | |
and.b64 %rd2157, %rd2156, 4294967295; | |
mul.lo.s64 %rd2706, %rd2157, 3449720151; | |
xor.b64 %rd2158, %rd2706, %rd405; | |
shr.u64 %rd2159, %rd2158, 32; | |
mul.lo.s64 %rd2709, %rd2159, 3528531795; | |
shr.u64 %rd2160, %rd2709, 32; | |
and.b64 %rd2161, %rd405, 4294967295; | |
mul.lo.s64 %rd2162, %rd2161, 3528531795; | |
and.b64 %rd2163, %rd2162, 4294967295; | |
xor.b64 %rd2164, %rd2163, %rd2160; | |
xor.b64 %rd2165, %rd2164, 3144134277; | |
mul.lo.s64 %rd2712, %rd2165, 3449720151; | |
xor.b64 %rd2702, %rd2156, %rd2162; | |
mov.u32 %r347, -1767562579; | |
mov.u32 %r346, -766435501; | |
mov.u32 %r345, 1401181199; | |
mov.u64 %rd2720, 4055616968; | |
mov.u64 %rd2719, 1684936478; | |
mov.u64 %rd2718, 534103459; | |
mov.u64 %rd2717, 387276957; | |
mov.u64 %rd2716, 3041712726; | |
mov.u64 %rd2715, 3986602516; | |
mov.u64 %rd2714, 2835769497; | |
mov.u64 %rd2713, 3668340011; | |
mov.u64 %rd2711, 2027808484; | |
mov.u64 %rd2710, 1993301258; | |
mov.u64 %rd2708, 842468239; | |
mov.u64 %rd2707, 2654435769; | |
mov.u64 %rd2705, 3528531795; | |
mov.u64 %rd2704, 1013904242; | |
mov.u64 %rd2703, 3449720151; | |
LBB79_47: | |
shr.u64 %rd2192, %rd2712, 32; | |
shr.u64 %rd2193, %rd2702, 32; | |
mul.lo.s64 %rd2194, %rd2193, %rd2703; | |
and.b64 %rd2195, %rd2194, 4294967295; | |
xor.b64 %rd2196, %rd2195, %rd2192; | |
xor.b64 %rd2197, %rd2196, %rd2704; | |
mul.lo.s64 %rd2198, %rd2197, %rd2705; | |
shr.u64 %rd2199, %rd2198, 32; | |
shr.u64 %rd2200, %rd2194, 32; | |
and.b64 %rd2201, %rd2706, 4294967295; | |
xor.b64 %rd2202, %rd2201, %rd2200; | |
xor.b64 %rd2203, %rd2202, %rd2707; | |
mul.lo.s64 %rd2204, %rd2203, %rd2705; | |
and.b64 %rd2205, %rd2204, 4294967295; | |
xor.b64 %rd2206, %rd2205, %rd2199; | |
xor.b64 %rd2207, %rd2206, %rd2708; | |
mul.lo.s64 %rd2208, %rd2207, %rd2703; | |
shr.u64 %rd2209, %rd2208, 32; | |
shr.u64 %rd2210, %rd2204, 32; | |
and.b64 %rd2211, %rd2709, 4294967295; | |
xor.b64 %rd2212, %rd2211, %rd2210; | |
xor.b64 %rd2213, %rd2212, %rd2710; | |
mul.lo.s64 %rd2214, %rd2213, %rd2703; | |
and.b64 %rd2215, %rd2214, 4294967295; | |
xor.b64 %rd2216, %rd2215, %rd2209; | |
xor.b64 %rd2217, %rd2216, %rd2711; | |
mul.lo.s64 %rd2218, %rd2217, %rd2705; | |
shr.u64 %rd2219, %rd2218, 32; | |
shr.u64 %rd2220, %rd2214, 32; | |
and.b64 %rd2221, %rd2712, 4294967295; | |
xor.b64 %rd2222, %rd2221, %rd2220; | |
xor.b64 %rd2223, %rd2222, %rd2713; | |
mul.lo.s64 %rd2224, %rd2223, %rd2705; | |
and.b64 %rd2225, %rd2224, 4294967295; | |
xor.b64 %rd2226, %rd2225, %rd2219; | |
xor.b64 %rd2227, %rd2226, %rd2714; | |
mul.lo.s64 %rd2228, %rd2227, %rd2703; | |
shr.u64 %rd2229, %rd2228, 32; | |
shr.u64 %rd2230, %rd2224, 32; | |
and.b64 %rd2231, %rd2198, 4294967295; | |
xor.b64 %rd2232, %rd2231, %rd2230; | |
xor.b64 %rd2233, %rd2232, %rd2715; | |
mul.lo.s64 %rd2234, %rd2233, %rd2703; | |
and.b64 %rd2235, %rd2234, 4294967295; | |
xor.b64 %rd2236, %rd2235, %rd2229; | |
xor.b64 %rd2237, %rd2236, %rd2716; | |
mul.lo.s64 %rd2238, %rd2237, %rd2705; | |
shr.u64 %rd2239, %rd2238, 32; | |
shr.u64 %rd2240, %rd2234, 32; | |
and.b64 %rd2241, %rd2208, 4294967295; | |
xor.b64 %rd2242, %rd2241, %rd2240; | |
xor.b64 %rd2243, %rd2242, %rd2717; | |
mul.lo.s64 %rd2244, %rd2243, %rd2705; | |
and.b64 %rd2245, %rd2244, 4294967295; | |
xor.b64 %rd2246, %rd2245, %rd2239; | |
xor.b64 %rd2247, %rd2246, %rd2718; | |
mul.lo.s64 %rd2248, %rd2247, %rd2703; | |
shr.u64 %rd2249, %rd2248, 32; | |
shr.u64 %rd2250, %rd2244, 32; | |
and.b64 %rd2251, %rd2218, 4294967295; | |
xor.b64 %rd2252, %rd2251, %rd2250; | |
xor.b64 %rd2253, %rd2252, %rd2719; | |
mul.lo.s64 %rd2254, %rd2253, %rd2703; | |
and.b64 %rd2255, %rd2254, 4294967295; | |
xor.b64 %rd2256, %rd2255, %rd2249; | |
xor.b64 %rd2257, %rd2256, %rd2720; | |
mul.lo.s64 %rd2258, %rd2257, %rd2705; | |
shr.u64 %rd2259, %rd2258, 32; | |
cvt.u32.u64 %r250, %rd2259; | |
shr.u64 %rd2260, %rd2254, 32; | |
xor.b64 %rd2261, %rd2260, %rd2228; | |
cvt.u32.u64 %r251, %rd2261; | |
xor.b32 %r252, %r345, %r251; | |
mul.lo.s32 %r253, %r252, %r346; | |
xor.b32 %r254, %r253, %r250; | |
xor.b32 %r255, %r254, %r347; | |
shr.u32 %r256, %r255, 9; | |
cvt.rn.f32.u32 %f205, %r256; | |
mul.rn.f32 %f206, %f205, 0f34000000; | |
cvt.rn.f16.f32 %h127, %f206; | |
mov.b16 %h128, 0x2E66; | |
setp.ge.f16 %p60, %h127, %h128; | |
ld.global.nc.b16 %h129, [%rd44+1792]; | |
ld.global.nc.f32 %f207, [%rd45+3584]; | |
cvt.rn.f16.f32 %h130, %f207; | |
add.rn.f16 %h131, %h129, %h130; | |
mov.b16 %h132, 0x3C72; | |
mul.rn.f16 %h133, %h131, %h132; | |
selp.b16 %h134, %h133, 0x0000, %p60; | |
cvt.f32.f16 %f208, %h134; | |
ld.global.nc.b16 %h135, [%rd46+1792]; | |
cvt.f32.f16 %f209, %h135; | |
ld.global.nc.f32 %f210, [%rd47+3584]; | |
mul.rn.f32 %f211, %f1, %f210; | |
mul.rn.f32 %f212, %f211, %f209; | |
ld.global.nc.f32 %f213, [%rd48+3584]; | |
mul.rn.f32 %f214, %f2, %f211; | |
sub.rn.f32 %f215, %f213, %f214; | |
add.rn.f32 %f216, %f212, %f215; | |
add.rn.f32 %f217, %f216, %f208; | |
add.rn.f32 %f17, %f16, %f217; | |
or.b32 %r257, %r3, 897; | |
or.b32 %r258, %r257, %r4; | |
and.b32 %r259, %r257, 3; | |
shr.u32 %r260, %r258, 2; | |
setp.ne.s32 %p61, %r259, 1; | |
cvt.u64.u32 %rd2262, %r260; | |
add.s64 %rd433, %rd11, %rd2262; | |
@%p61 bra LBB79_49; | |
mov.u32 %r349, -845247145; | |
mov.u64 %rd2737, 1401181199; | |
mov.u64 %rd2726, 3144134277; | |
mov.u32 %r348, -616729560; | |
and.b64 %rd2302, %rd433, 4294967295; | |
mul.lo.s64 %rd2725, %rd2302, 3528531795; | |
setp.lt.u64 %p63, %rd433, %rd11; | |
selp.u64 %rd2303, 1, 0, %p63; | |
add.s64 %rd2304, %rd2461, %rd2303; | |
xor.b64 %rd2305, %rd2304, %rd2725; | |
shr.u64 %rd2306, %rd2305, 32; | |
mul.lo.s64 %rd2728, %rd2306, 3449720151; | |
shr.u64 %rd2307, %rd2728, 32; | |
and.b64 %rd2308, %rd2304, 4294967295; | |
mul.lo.s64 %rd2309, %rd2308, 3449720151; | |
and.b64 %rd2310, %rd2309, 4294967295; | |
xor.b64 %rd2311, %rd2310, %rd2307; | |
xor.b64 %rd2312, %rd2311, 2654435769; | |
mul.lo.s64 %rd2731, %rd2312, 3528531795; | |
xor.b64 %rd2721, %rd2309, %rd433; | |
mov.u64 %rd2738, 3041712726; | |
mov.u64 %rd2736, 2835769497; | |
mov.u64 %rd2735, 1684936478; | |
mov.u64 %rd2734, 2027808484; | |
mov.u64 %rd2733, 387276957; | |
mov.u64 %rd2732, 842468239; | |
mov.u64 %rd2730, 3986602516; | |
mov.u64 %rd2729, 1013904242; | |
mov.u64 %rd2727, 3668340011; | |
mov.u64 %rd2724, 3449720151; | |
mov.u64 %rd2723, 1993301258; | |
mov.u64 %rd2722, 3528531795; | |
bra.uni LBB79_50; | |
LBB79_49: | |
setp.lt.u64 %p62, %rd433, %rd11; | |
selp.u64 %rd2277, 1, 0, %p62; | |
add.s64 %rd2278, %rd2461, %rd2277; | |
and.b64 %rd2279, %rd2278, 4294967295; | |
mul.lo.s64 %rd2725, %rd2279, 3449720151; | |
xor.b64 %rd2280, %rd2725, %rd433; | |
shr.u64 %rd2281, %rd2280, 32; | |
mul.lo.s64 %rd2728, %rd2281, 3528531795; | |
shr.u64 %rd2282, %rd2728, 32; | |
and.b64 %rd2283, %rd433, 4294967295; | |
mul.lo.s64 %rd2284, %rd2283, 3528531795; | |
and.b64 %rd2285, %rd2284, 4294967295; | |
xor.b64 %rd2286, %rd2285, %rd2282; | |
xor.b64 %rd2287, %rd2286, 3144134277; | |
mul.lo.s64 %rd2731, %rd2287, 3449720151; | |
xor.b64 %rd2721, %rd2278, %rd2284; | |
mov.u32 %r349, -766435501; | |
mov.u32 %r348, -239350328; | |
mov.u64 %rd2738, 1684936478; | |
mov.u64 %rd2737, 534103459; | |
mov.u64 %rd2736, 387276957; | |
mov.u64 %rd2735, 3041712726; | |
mov.u64 %rd2734, 3986602516; | |
mov.u64 %rd2733, 2835769497; | |
mov.u64 %rd2732, 3668340011; | |
mov.u64 %rd2730, 2027808484; | |
mov.u64 %rd2729, 1993301258; | |
mov.u64 %rd2727, 842468239; | |
mov.u64 %rd2726, 2654435769; | |
mov.u64 %rd2724, 3528531795; | |
mov.u64 %rd2723, 1013904242; | |
mov.u64 %rd2722, 3449720151; | |
LBB79_50: | |
shr.u64 %rd2313, %rd2731, 32; | |
shr.u64 %rd2314, %rd2721, 32; | |
mul.lo.s64 %rd2315, %rd2314, %rd2722; | |
and.b64 %rd2316, %rd2315, 4294967295; | |
xor.b64 %rd2317, %rd2316, %rd2313; | |
xor.b64 %rd2318, %rd2317, %rd2723; | |
mul.lo.s64 %rd2319, %rd2318, %rd2724; | |
shr.u64 %rd2320, %rd2319, 32; | |
shr.u64 %rd2321, %rd2315, 32; | |
and.b64 %rd2322, %rd2725, 4294967295; | |
xor.b64 %rd2323, %rd2322, %rd2321; | |
xor.b64 %rd2324, %rd2323, %rd2726; | |
mul.lo.s64 %rd2325, %rd2324, %rd2724; | |
and.b64 %rd2326, %rd2325, 4294967295; | |
xor.b64 %rd2327, %rd2326, %rd2320; | |
xor.b64 %rd2328, %rd2327, %rd2727; | |
mul.lo.s64 %rd2329, %rd2328, %rd2722; | |
shr.u64 %rd2330, %rd2329, 32; | |
shr.u64 %rd2331, %rd2325, 32; | |
and.b64 %rd2332, %rd2728, 4294967295; | |
xor.b64 %rd2333, %rd2332, %rd2331; | |
xor.b64 %rd2334, %rd2333, %rd2729; | |
mul.lo.s64 %rd2335, %rd2334, %rd2722; | |
and.b64 %rd2336, %rd2335, 4294967295; | |
xor.b64 %rd2337, %rd2336, %rd2330; | |
xor.b64 %rd2338, %rd2337, %rd2730; | |
mul.lo.s64 %rd2339, %rd2338, %rd2724; | |
shr.u64 %rd2340, %rd2339, 32; | |
shr.u64 %rd2341, %rd2335, 32; | |
and.b64 %rd2342, %rd2731, 4294967295; | |
xor.b64 %rd2343, %rd2342, %rd2341; | |
xor.b64 %rd2344, %rd2343, %rd2732; | |
mul.lo.s64 %rd2345, %rd2344, %rd2724; | |
and.b64 %rd2346, %rd2345, 4294967295; | |
xor.b64 %rd2347, %rd2346, %rd2340; | |
xor.b64 %rd2348, %rd2347, %rd2733; | |
mul.lo.s64 %rd2349, %rd2348, %rd2722; | |
shr.u64 %rd2350, %rd2349, 32; | |
shr.u64 %rd2351, %rd2345, 32; | |
and.b64 %rd2352, %rd2319, 4294967295; | |
xor.b64 %rd2353, %rd2352, %rd2351; | |
xor.b64 %rd2354, %rd2353, %rd2734; | |
mul.lo.s64 %rd2355, %rd2354, %rd2722; | |
and.b64 %rd2356, %rd2355, 4294967295; | |
xor.b64 %rd2357, %rd2356, %rd2350; | |
xor.b64 %rd2358, %rd2357, %rd2735; | |
mul.lo.s64 %rd2359, %rd2358, %rd2724; | |
shr.u64 %rd2360, %rd2359, 32; | |
shr.u64 %rd2361, %rd2355, 32; | |
and.b64 %rd2362, %rd2329, 4294967295; | |
xor.b64 %rd2363, %rd2362, %rd2361; | |
xor.b64 %rd2364, %rd2363, %rd2736; | |
mul.lo.s64 %rd2365, %rd2364, %rd2724; | |
and.b64 %rd2366, %rd2365, 4294967295; | |
xor.b64 %rd2367, %rd2366, %rd2360; | |
xor.b64 %rd2368, %rd2367, %rd2737; | |
mul.lo.s64 %rd2369, %rd2368, %rd2722; | |
shr.u64 %rd2370, %rd2369, 32; | |
shr.u64 %rd2371, %rd2365, 32; | |
xor.b64 %rd2372, %rd2339, %rd2371; | |
xor.b64 %rd2373, %rd2372, %rd2738; | |
mul.lo.s64 %rd2374, %rd2373, %rd2722; | |
xor.b64 %rd2375, %rd2370, %rd2374; | |
cvt.u32.u64 %r265, %rd2375; | |
xor.b32 %r266, %r348, %r265; | |
mul.lo.s32 %r267, %r266, %r349; | |
shr.u32 %r268, %r267, 9; | |
cvt.rn.f32.u32 %f218, %r268; | |
mul.rn.f32 %f219, %f218, 0f34000000; | |
cvt.rn.f16.f32 %h136, %f219; | |
mov.b16 %h137, 0x2E66; | |
setp.ge.f16 %p64, %h136, %h137; | |
ld.global.nc.b16 %h138, [%rd44+1794]; | |
ld.global.nc.f32 %f220, [%rd45+3588]; | |
cvt.rn.f16.f32 %h139, %f220; | |
add.rn.f16 %h140, %h138, %h139; | |
mov.b16 %h141, 0x3C72; | |
mul.rn.f16 %h142, %h140, %h141; | |
selp.b16 %h143, %h142, 0x0000, %p64; | |
cvt.f32.f16 %f221, %h143; | |
ld.global.nc.b16 %h144, [%rd46+1794]; | |
cvt.f32.f16 %f222, %h144; | |
ld.global.nc.f32 %f223, [%rd47+3588]; | |
mul.rn.f32 %f224, %f1, %f223; | |
mul.rn.f32 %f225, %f224, %f222; | |
ld.global.nc.f32 %f226, [%rd48+3588]; | |
mul.rn.f32 %f227, %f2, %f224; | |
sub.rn.f32 %f228, %f226, %f227; | |
add.rn.f32 %f229, %f225, %f228; | |
add.rn.f32 %f230, %f229, %f221; | |
add.rn.f32 %f231, %f17, %f230; | |
and.b32 %r46, %r1, 31; | |
shfl.sync.down.b32 %f232, %f231, 16, 31, -1; | |
add.rn.f32 %f233, %f232, %f231; | |
shfl.sync.down.b32 %f234, %f233, 8, 31, -1; | |
add.rn.f32 %f235, %f234, %f233; | |
shfl.sync.down.b32 %f236, %f235, 4, 31, -1; | |
add.rn.f32 %f237, %f236, %f235; | |
shfl.sync.down.b32 %f238, %f237, 2, 31, -1; | |
add.rn.f32 %f239, %f238, %f237; | |
shfl.sync.down.b32 %f240, %f239, 1, 31, -1; | |
shr.u32 %r47, %r1, 5; | |
setp.ne.s32 %p65, %r46, 0; | |
mov.u64 %rd2378, shared_cache_018; | |
@%p65 bra LBB79_2; | |
mul.wide.u32 %rd2377, %r47, 4; | |
add.s64 %rd461, %rd2378, %rd2377; | |
add.rn.f32 %f18, %f240, %f239; | |
st.shared.f32 [%rd461], %f18; | |
LBB79_2: | |
bar.sync 0; | |
setp.eq.s32 %p66, %r47, 0; | |
@%p66 bra LBB79_52; | |
bra.uni LBB79_3; | |
LBB79_52: | |
add.u64 %rd472, %SP, 0; | |
add.u64 %rd10, %SPL, 0; | |
mul.wide.u32 %rd2379, %r46, 4; | |
add.s64 %rd462, %rd2378, %rd2379; | |
cvta.shared.u64 %rd2381, %rd462; | |
mov.u32 %r269, 0; | |
st.local.u32 [%rd10], %r269; | |
setp.lt.u32 %p67, %r1, 2; | |
selp.b64 %rd2383, %rd2381, %rd472, %p67; | |
ld.f32 %f241, [%rd2383]; | |
shfl.sync.down.b32 %f242, %f241, 16, 31, -1; | |
add.rn.f32 %f243, %f241, %f242; | |
shfl.sync.down.b32 %f244, %f243, 8, 31, -1; | |
add.rn.f32 %f245, %f243, %f244; | |
shfl.sync.down.b32 %f246, %f245, 4, 31, -1; | |
add.rn.f32 %f247, %f245, %f246; | |
shfl.sync.down.b32 %f248, %f247, 2, 31, -1; | |
add.rn.f32 %f249, %f247, %f248; | |
shfl.sync.down.b32 %f250, %f249, 1, 31, -1; | |
add.rn.f32 %f251, %f249, %f250; | |
st.f32 [%rd2383], %f251; | |
setp.ne.s32 %p68, %r1, 0; | |
@%p68 bra LBB79_3; | |
ld.param.u64 %rd469, [fusion_2180_param_3]; | |
cvt.u64.u32 %rd43, %r2; | |
cvta.to.global.u64 %rd6, %rd469; | |
shl.b64 %rd2376, %rd43, 2; | |
add.s64 %rd460, %rd6, %rd2376; | |
ld.shared.f32 %f252, [%rd462]; | |
atom.global.add.f32 %f253, [%rd460], %f252; | |
LBB79_3: | |
ret; | |
} | |
// .globl fusion_2178
.visible .entry fusion_2178( | |
.param .u64 fusion_2178_param_0, | |
.param .u64 fusion_2178_param_1, | |
.param .u64 fusion_2178_param_2, | |
.param .u64 fusion_2178_param_3, | |
.param .u64 fusion_2178_param_4, | |
.param .u64 fusion_2178_param_5, | |
.param .u64 fusion_2178_param_6, | |
.param .u64 fusion_2178_param_7, | |
.param .u64 fusion_2178_param_8, | |
.param .u64 fusion_2178_param_9, | |
.param .u64 fusion_2178_param_10 | |
) | |
.reqntid 64, 1, 1 | |
{ | |
.local .align 4 .b8 __local_depot80[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<75>; | |
.reg .b16 %h<145>; | |
.reg .f32 %f<288>; | |
.reg .b32 %r<350>; | |
.reg .b64 %rd<2742>; | |
mov.u64 %SPL, __local_depot80; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd464, [fusion_2178_param_0]; | |
ld.param.u64 %rd465, [fusion_2178_param_9]; | |
cvta.to.global.u64 %rd1, %rd465; | |
ld.param.u64 %rd466, [fusion_2178_param_1]; | |
ld.param.u64 %rd467, [fusion_2178_param_8]; | |
cvta.to.global.u64 %rd2, %rd467; | |
ld.param.u64 %rd468, [fusion_2178_param_2]; | |
ld.param.u64 %rd469, [fusion_2178_param_7]; | |
cvta.to.global.u64 %rd3, %rd469; | |
ld.param.u64 %rd471, [fusion_2178_param_6]; | |
cvta.to.global.u64 %rd4, %rd471; | |
ld.param.u64 %rd472, [fusion_2178_param_4]; | |
ld.param.u64 %rd473, [fusion_2178_param_5]; | |
cvta.to.global.u64 %rd5, %rd473; | |
cvta.to.global.u64 %rd6, %rd472; | |
cvta.to.global.u64 %rd8, %rd468; | |
cvta.to.global.u64 %rd9, %rd466; | |
cvta.to.global.u64 %rd10, %rd464; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r2, %ctaid.x; | |
shl.b32 %r3, %r1, 1; | |
shl.b32 %r4, %r2, 10; | |
or.b32 %r48, %r4, %r3; | |
shr.u32 %r49, %r48, 2; | |
and.b32 %r5, %r1, 1; | |
setp.eq.s32 %p1, %r5, 0; | |
ld.global.nc.u64 %rd12, [%rd8]; | |
cvt.u64.u32 %rd475, %r49; | |
add.s64 %rd13, %rd12, %rd475; | |
setp.lt.u64 %p69, %rd13, %rd12; | |
and.b64 %rd2387, %rd13, 4294967295; | |
@%p1 bra LBB80_1; | |
bra.uni LBB80_4; | |
LBB80_1: | |
mul.lo.s64 %rd2449, %rd2387, 3528531795; | |
ld.global.nc.u64 %rd2464, [%rd8+8]; | |
selp.u64 %rd518, 1, 0, %p69; | |
add.s64 %rd519, %rd2464, %rd518; | |
xor.b64 %rd520, %rd519, %rd2449; | |
shr.u64 %rd521, %rd520, 32; | |
mul.lo.s64 %rd2452, %rd521, 3449720151; | |
shr.u64 %rd522, %rd2452, 32; | |
and.b64 %rd523, %rd519, 4294967295; | |
mul.lo.s64 %rd524, %rd523, 3449720151; | |
and.b64 %rd525, %rd524, 4294967295; | |
xor.b64 %rd526, %rd525, %rd522; | |
xor.b64 %rd527, %rd526, 2654435769; | |
mul.lo.s64 %rd2455, %rd527, 3528531795; | |
xor.b64 %rd2445, %rd524, %rd13; | |
mov.u32 %r312, -1879881855; | |
mov.u32 %r311, -845247145; | |
mov.u32 %r310, 534103459; | |
mov.u64 %rd2463, 3678237736; | |
mov.u64 %rd2462, 3041712726; | |
mov.u64 %rd2461, 1401181199; | |
mov.u64 %rd2460, 2835769497; | |
mov.u64 %rd2459, 1684936478; | |
mov.u64 %rd2458, 2027808484; | |
mov.u64 %rd2457, 387276957; | |
mov.u64 %rd2456, 842468239; | |
mov.u64 %rd2454, 3986602516; | |
mov.u64 %rd2453, 1013904242; | |
mov.u64 %rd2451, 3668340011; | |
mov.u64 %rd2450, 3144134277; | |
mov.u64 %rd2448, 3449720151; | |
mov.u64 %rd2447, 1993301258; | |
mov.u64 %rd2446, 3528531795; | |
bra.uni LBB80_5; | |
LBB80_4: | |
mov.u32 %r311, -766435501; | |
mov.u64 %rd2462, 1684936478; | |
mov.u64 %rd2461, 534103459; | |
mov.u64 %rd2460, 387276957; | |
mov.u64 %rd2459, 3041712726; | |
mov.u64 %rd2458, 3986602516; | |
mov.u64 %rd2457, 2835769497; | |
mov.u64 %rd2456, 3668340011; | |
mov.u64 %rd2454, 2027808484; | |
mov.u64 %rd2453, 1993301258; | |
mov.u64 %rd2451, 842468239; | |
mov.u64 %rd2450, 2654435769; | |
mov.u64 %rd2448, 3528531795; | |
mov.u64 %rd2447, 1013904242; | |
mov.u64 %rd2446, 3449720151; | |
mov.u32 %r312, -1767562579; | |
mov.u32 %r310, 1401181199; | |
mov.u64 %rd2463, 4055616968; | |
ld.global.nc.u64 %rd2464, [%rd8+8]; | |
selp.u64 %rd491, 1, 0, %p69; | |
add.s64 %rd492, %rd2464, %rd491; | |
and.b64 %rd493, %rd492, 4294967295; | |
mul.lo.s64 %rd2449, %rd493, 3449720151; | |
xor.b64 %rd494, %rd2449, %rd13; | |
shr.u64 %rd495, %rd494, 32; | |
mul.lo.s64 %rd2452, %rd495, 3528531795; | |
shr.u64 %rd496, %rd2452, 32; | |
mul.lo.s64 %rd498, %rd2387, 3528531795; | |
and.b64 %rd499, %rd498, 4294967295; | |
xor.b64 %rd500, %rd499, %rd496; | |
xor.b64 %rd501, %rd500, 3144134277; | |
mul.lo.s64 %rd2455, %rd501, 3449720151; | |
xor.b64 %rd2445, %rd492, %rd498; | |
LBB80_5: | |
shr.u64 %rd528, %rd2455, 32; | |
shr.u64 %rd529, %rd2445, 32; | |
mul.lo.s64 %rd530, %rd529, %rd2446; | |
and.b64 %rd531, %rd530, 4294967295; | |
xor.b64 %rd532, %rd531, %rd528; | |
xor.b64 %rd533, %rd532, %rd2447; | |
mul.lo.s64 %rd534, %rd533, %rd2448; | |
shr.u64 %rd535, %rd534, 32; | |
shr.u64 %rd536, %rd530, 32; | |
and.b64 %rd537, %rd2449, 4294967295; | |
xor.b64 %rd538, %rd537, %rd536; | |
xor.b64 %rd539, %rd538, %rd2450; | |
mul.lo.s64 %rd540, %rd539, %rd2448; | |
and.b64 %rd541, %rd540, 4294967295; | |
xor.b64 %rd542, %rd541, %rd535; | |
xor.b64 %rd543, %rd542, %rd2451; | |
mul.lo.s64 %rd544, %rd543, %rd2446; | |
shr.u64 %rd545, %rd544, 32; | |
shr.u64 %rd546, %rd540, 32; | |
and.b64 %rd547, %rd2452, 4294967295; | |
xor.b64 %rd548, %rd547, %rd546; | |
xor.b64 %rd549, %rd548, %rd2453; | |
mul.lo.s64 %rd550, %rd549, %rd2446; | |
and.b64 %rd551, %rd550, 4294967295; | |
xor.b64 %rd552, %rd551, %rd545; | |
xor.b64 %rd553, %rd552, %rd2454; | |
mul.lo.s64 %rd554, %rd553, %rd2448; | |
shr.u64 %rd555, %rd554, 32; | |
shr.u64 %rd556, %rd550, 32; | |
and.b64 %rd557, %rd2455, 4294967295; | |
xor.b64 %rd558, %rd557, %rd556; | |
xor.b64 %rd559, %rd558, %rd2456; | |
mul.lo.s64 %rd560, %rd559, %rd2448; | |
and.b64 %rd561, %rd560, 4294967295; | |
xor.b64 %rd562, %rd561, %rd555; | |
xor.b64 %rd563, %rd562, %rd2457; | |
mul.lo.s64 %rd564, %rd563, %rd2446; | |
shr.u64 %rd565, %rd564, 32; | |
shr.u64 %rd566, %rd560, 32; | |
and.b64 %rd567, %rd534, 4294967295; | |
xor.b64 %rd568, %rd567, %rd566; | |
xor.b64 %rd569, %rd568, %rd2458; | |
mul.lo.s64 %rd570, %rd569, %rd2446; | |
and.b64 %rd571, %rd570, 4294967295; | |
xor.b64 %rd572, %rd571, %rd565; | |
xor.b64 %rd573, %rd572, %rd2459; | |
mul.lo.s64 %rd574, %rd573, %rd2448; | |
shr.u64 %rd575, %rd574, 32; | |
shr.u64 %rd576, %rd570, 32; | |
and.b64 %rd577, %rd544, 4294967295; | |
xor.b64 %rd578, %rd577, %rd576; | |
xor.b64 %rd579, %rd578, %rd2460; | |
mul.lo.s64 %rd580, %rd579, %rd2448; | |
and.b64 %rd581, %rd580, 4294967295; | |
xor.b64 %rd582, %rd581, %rd575; | |
xor.b64 %rd583, %rd582, %rd2461; | |
mul.lo.s64 %rd584, %rd583, %rd2446; | |
shr.u64 %rd585, %rd584, 32; | |
shr.u64 %rd586, %rd580, 32; | |
and.b64 %rd587, %rd554, 4294967295; | |
xor.b64 %rd588, %rd587, %rd586; | |
xor.b64 %rd589, %rd588, %rd2462; | |
mul.lo.s64 %rd590, %rd589, %rd2446; | |
and.b64 %rd591, %rd590, 4294967295; | |
xor.b64 %rd592, %rd591, %rd585; | |
xor.b64 %rd593, %rd592, %rd2463; | |
mul.lo.s64 %rd594, %rd593, %rd2448; | |
shr.u64 %rd595, %rd594, 32; | |
cvt.u32.u64 %r56, %rd595; | |
shr.u64 %rd596, %rd590, 32; | |
xor.b64 %rd597, %rd596, %rd564; | |
cvt.u32.u64 %r57, %rd597; | |
xor.b32 %r58, %r310, %r57; | |
mul.lo.s32 %r59, %r58, %r311; | |
xor.b32 %r60, %r59, %r56; | |
xor.b32 %r61, %r60, %r312; | |
shr.u32 %r62, %r61, 9; | |
cvt.rn.f32.u32 %f20, %r62; | |
mul.rn.f32 %f21, %f20, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f21; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p4, %h1, %h2; | |
mul.wide.u32 %rd598, %r2, 2048; | |
add.s64 %rd599, %rd10, %rd598; | |
mul.wide.u32 %rd600, %r3, 2; | |
add.s64 %rd45, %rd599, %rd600; | |
ld.global.nc.b16 %h3, [%rd45]; | |
mul.wide.u32 %rd601, %r3, 4; | |
add.s64 %rd46, %rd1, %rd601; | |
ld.global.nc.f32 %f22, [%rd46]; | |
cvt.rn.f16.f32 %h4, %f22; | |
add.rn.f16 %h5, %h3, %h4; | |
mov.b16 %h6, 0x3C72; | |
mul.rn.f16 %h7, %h5, %h6; | |
selp.b16 %h8, %h7, 0x0000, %p4; | |
cvt.f32.f16 %f23, %h8; | |
add.s64 %rd602, %rd9, %rd598; | |
add.s64 %rd47, %rd602, %rd600; | |
ld.global.nc.b16 %h9, [%rd47]; | |
cvt.f32.f16 %f24, %h9; | |
mul.wide.u32 %rd603, %r2, 4; | |
add.s64 %rd604, %rd5, %rd603; | |
ld.global.nc.f32 %f25, [%rd604]; | |
mul.rn.f32 %f26, %f25, 0f3A800000; | |
add.rn.f32 %f27, %f26, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f1, %f27; | |
add.s64 %rd48, %rd2, %rd601; | |
ld.global.nc.f32 %f28, [%rd48]; | |
mul.rn.f32 %f29, %f1, %f28; | |
mul.rn.f32 %f30, %f29, %f24; | |
add.s64 %rd49, %rd3, %rd601; | |
ld.global.nc.f32 %f31, [%rd49]; | |
add.s64 %rd605, %rd4, %rd603; | |
ld.global.nc.f32 %f32, [%rd605]; | |
mul.rn.f32 %f2, %f32, 0f3A800000; | |
mul.rn.f32 %f33, %f29, %f2; | |
sub.rn.f32 %f34, %f31, %f33; | |
add.rn.f32 %f35, %f30, %f34; | |
add.rn.f32 %f36, %f35, %f23; | |
add.s64 %rd606, %rd6, %rd603; | |
ld.global.nc.f32 %f37, [%rd606]; | |
mul.rn.f32 %f3, %f37, 0f3A800000; | |
sub.rn.f32 %f38, %f36, %f3; | |
mul.rn.f32 %f39, %f38, %f38; | |
add.rn.f32 %f4, %f39, 0f00000000; | |
or.b32 %r63, %r3, 1; | |
and.b32 %r64, %r63, 3; | |
setp.ne.s32 %p5, %r64, 1; | |
@%p5 bra LBB80_7; | |
mul.lo.s64 %rd2469, %rd2387, 3528531795; | |
selp.u64 %rd647, 1, 0, %p69; | |
add.s64 %rd648, %rd2464, %rd647; | |
xor.b64 %rd649, %rd648, %rd2469; | |
shr.u64 %rd650, %rd649, 32; | |
mul.lo.s64 %rd2472, %rd650, 3449720151; | |
shr.u64 %rd651, %rd2472, 32; | |
and.b64 %rd652, %rd648, 4294967295; | |
mul.lo.s64 %rd653, %rd652, 3449720151; | |
and.b64 %rd654, %rd653, 4294967295; | |
xor.b64 %rd655, %rd654, %rd651; | |
xor.b64 %rd656, %rd655, 2654435769; | |
mul.lo.s64 %rd2475, %rd656, 3528531795; | |
xor.b64 %rd2465, %rd653, %rd13; | |
mov.u32 %r314, -845247145; | |
mov.u32 %r313, -616729560; | |
mov.u64 %rd2482, 3041712726; | |
mov.u64 %rd2481, 1401181199; | |
mov.u64 %rd2480, 2835769497; | |
mov.u64 %rd2479, 1684936478; | |
mov.u64 %rd2478, 2027808484; | |
mov.u64 %rd2477, 387276957; | |
mov.u64 %rd2476, 842468239; | |
mov.u64 %rd2474, 3986602516; | |
mov.u64 %rd2473, 1013904242; | |
mov.u64 %rd2471, 3668340011; | |
mov.u64 %rd2470, 3144134277; | |
mov.u64 %rd2468, 3449720151; | |
mov.u64 %rd2467, 1993301258; | |
mov.u64 %rd2466, 3528531795; | |
bra.uni LBB80_8; | |
LBB80_7: | |
mov.u32 %r313, -239350328; | |
selp.u64 %rd621, 1, 0, %p69; | |
add.s64 %rd622, %rd2464, %rd621; | |
and.b64 %rd623, %rd622, 4294967295; | |
mul.lo.s64 %rd2469, %rd623, 3449720151; | |
xor.b64 %rd624, %rd2469, %rd13; | |
shr.u64 %rd625, %rd624, 32; | |
mul.lo.s64 %rd2472, %rd625, 3528531795; | |
shr.u64 %rd626, %rd2472, 32; | |
mul.lo.s64 %rd628, %rd2387, 3528531795; | |
and.b64 %rd629, %rd628, 4294967295; | |
xor.b64 %rd630, %rd629, %rd626; | |
xor.b64 %rd631, %rd630, 3144134277; | |
mul.lo.s64 %rd2475, %rd631, 3449720151; | |
xor.b64 %rd2465, %rd622, %rd628; | |
mov.u32 %r314, -766435501; | |
mov.u64 %rd2482, 1684936478; | |
mov.u64 %rd2481, 534103459; | |
mov.u64 %rd2480, 387276957; | |
mov.u64 %rd2479, 3041712726; | |
mov.u64 %rd2478, 3986602516; | |
mov.u64 %rd2477, 2835769497; | |
mov.u64 %rd2476, 3668340011; | |
mov.u64 %rd2474, 2027808484; | |
mov.u64 %rd2473, 1993301258; | |
mov.u64 %rd2471, 842468239; | |
mov.u64 %rd2470, 2654435769; | |
mov.u64 %rd2468, 3528531795; | |
mov.u64 %rd2467, 1013904242; | |
mov.u64 %rd2466, 3449720151; | |
LBB80_8: | |
setp.ne.s32 %p8, %r5, 0; | |
shr.u64 %rd657, %rd2475, 32; | |
shr.u64 %rd658, %rd2465, 32; | |
mul.lo.s64 %rd659, %rd658, %rd2466; | |
and.b64 %rd660, %rd659, 4294967295; | |
xor.b64 %rd661, %rd660, %rd657; | |
xor.b64 %rd662, %rd661, %rd2467; | |
mul.lo.s64 %rd663, %rd662, %rd2468; | |
shr.u64 %rd664, %rd663, 32; | |
shr.u64 %rd665, %rd659, 32; | |
and.b64 %rd666, %rd2469, 4294967295; | |
xor.b64 %rd667, %rd666, %rd665; | |
xor.b64 %rd668, %rd667, %rd2470; | |
mul.lo.s64 %rd669, %rd668, %rd2468; | |
and.b64 %rd670, %rd669, 4294967295; | |
xor.b64 %rd671, %rd670, %rd664; | |
xor.b64 %rd672, %rd671, %rd2471; | |
mul.lo.s64 %rd673, %rd672, %rd2466; | |
shr.u64 %rd674, %rd673, 32; | |
shr.u64 %rd675, %rd669, 32; | |
and.b64 %rd676, %rd2472, 4294967295; | |
xor.b64 %rd677, %rd676, %rd675; | |
xor.b64 %rd678, %rd677, %rd2473; | |
mul.lo.s64 %rd679, %rd678, %rd2466; | |
and.b64 %rd680, %rd679, 4294967295; | |
xor.b64 %rd681, %rd680, %rd674; | |
xor.b64 %rd682, %rd681, %rd2474; | |
mul.lo.s64 %rd683, %rd682, %rd2468; | |
shr.u64 %rd684, %rd683, 32; | |
shr.u64 %rd685, %rd679, 32; | |
and.b64 %rd686, %rd2475, 4294967295; | |
xor.b64 %rd687, %rd686, %rd685; | |
xor.b64 %rd688, %rd687, %rd2476; | |
mul.lo.s64 %rd689, %rd688, %rd2468; | |
and.b64 %rd690, %rd689, 4294967295; | |
xor.b64 %rd691, %rd690, %rd684; | |
xor.b64 %rd692, %rd691, %rd2477; | |
mul.lo.s64 %rd693, %rd692, %rd2466; | |
shr.u64 %rd694, %rd693, 32; | |
shr.u64 %rd695, %rd689, 32; | |
and.b64 %rd696, %rd663, 4294967295; | |
xor.b64 %rd697, %rd696, %rd695; | |
xor.b64 %rd698, %rd697, %rd2478; | |
mul.lo.s64 %rd699, %rd698, %rd2466; | |
and.b64 %rd700, %rd699, 4294967295; | |
xor.b64 %rd701, %rd700, %rd694; | |
xor.b64 %rd702, %rd701, %rd2479; | |
mul.lo.s64 %rd703, %rd702, %rd2468; | |
shr.u64 %rd704, %rd703, 32; | |
shr.u64 %rd705, %rd699, 32; | |
and.b64 %rd706, %rd673, 4294967295; | |
xor.b64 %rd707, %rd706, %rd705; | |
xor.b64 %rd708, %rd707, %rd2480; | |
mul.lo.s64 %rd709, %rd708, %rd2468; | |
and.b64 %rd710, %rd709, 4294967295; | |
xor.b64 %rd711, %rd710, %rd704; | |
xor.b64 %rd712, %rd711, %rd2481; | |
mul.lo.s64 %rd713, %rd712, %rd2466; | |
shr.u64 %rd714, %rd713, 32; | |
shr.u64 %rd715, %rd709, 32; | |
xor.b64 %rd716, %rd683, %rd715; | |
xor.b64 %rd717, %rd716, %rd2482; | |
mul.lo.s64 %rd718, %rd717, %rd2466; | |
xor.b64 %rd719, %rd714, %rd718; | |
cvt.u32.u64 %r69, %rd719; | |
xor.b32 %r70, %r313, %r69; | |
mul.lo.s32 %r71, %r70, %r314; | |
shr.u32 %r72, %r71, 9; | |
cvt.rn.f32.u32 %f40, %r72; | |
mul.rn.f32 %f41, %f40, 0f34000000; | |
cvt.rn.f16.f32 %h10, %f41; | |
mov.b16 %h11, 0x2E66; | |
setp.ge.f16 %p9, %h10, %h11; | |
ld.global.nc.b16 %h12, [%rd45+2]; | |
ld.global.nc.f32 %f42, [%rd46+4]; | |
cvt.rn.f16.f32 %h13, %f42; | |
add.rn.f16 %h14, %h12, %h13; | |
mov.b16 %h15, 0x3C72; | |
mul.rn.f16 %h16, %h14, %h15; | |
selp.b16 %h17, %h16, 0x0000, %p9; | |
cvt.f32.f16 %f43, %h17; | |
ld.global.nc.b16 %h18, [%rd47+2]; | |
cvt.f32.f16 %f44, %h18; | |
ld.global.nc.f32 %f45, [%rd48+4]; | |
mul.rn.f32 %f46, %f1, %f45; | |
mul.rn.f32 %f47, %f46, %f44; | |
ld.global.nc.f32 %f48, [%rd49+4]; | |
mul.rn.f32 %f49, %f2, %f46; | |
sub.rn.f32 %f50, %f48, %f49; | |
add.rn.f32 %f51, %f47, %f50; | |
add.rn.f32 %f52, %f51, %f43; | |
sub.rn.f32 %f53, %f52, %f3; | |
mul.rn.f32 %f54, %f53, %f53; | |
add.rn.f32 %f5, %f4, %f54; | |
or.b32 %r73, %r3, %r4; | |
or.b32 %r74, %r73, 128; | |
shr.u32 %r75, %r74, 2; | |
cvt.u64.u32 %rd720, %r75; | |
add.s64 %rd76, %rd12, %rd720; | |
and.b64 %rd2436, %rd76, 4294967295; | |
setp.lt.u64 %p74, %rd76, %rd12; | |
@%p8 bra LBB80_10; | |
mul.lo.s64 %rd2487, %rd2436, 3528531795; | |
selp.u64 %rd763, 1, 0, %p74; | |
add.s64 %rd764, %rd2464, %rd763; | |
xor.b64 %rd765, %rd764, %rd2487; | |
shr.u64 %rd766, %rd765, 32; | |
mul.lo.s64 %rd2490, %rd766, 3449720151; | |
shr.u64 %rd767, %rd2490, 32; | |
and.b64 %rd768, %rd764, 4294967295; | |
mul.lo.s64 %rd769, %rd768, 3449720151; | |
and.b64 %rd770, %rd769, 4294967295; | |
xor.b64 %rd771, %rd770, %rd767; | |
xor.b64 %rd772, %rd771, 2654435769; | |
mul.lo.s64 %rd2493, %rd772, 3528531795; | |
xor.b64 %rd2483, %rd769, %rd76; | |
mov.u32 %r317, -1879881855; | |
mov.u32 %r316, -845247145; | |
mov.u32 %r315, 534103459; | |
mov.u64 %rd2501, 3678237736; | |
mov.u64 %rd2500, 3041712726; | |
mov.u64 %rd2499, 1401181199; | |
mov.u64 %rd2498, 2835769497; | |
mov.u64 %rd2497, 1684936478; | |
mov.u64 %rd2496, 2027808484; | |
mov.u64 %rd2495, 387276957; | |
mov.u64 %rd2494, 842468239; | |
mov.u64 %rd2492, 3986602516; | |
mov.u64 %rd2491, 1013904242; | |
mov.u64 %rd2489, 3668340011; | |
mov.u64 %rd2488, 3144134277; | |
mov.u64 %rd2486, 3449720151; | |
mov.u64 %rd2485, 1993301258; | |
mov.u64 %rd2484, 3528531795; | |
bra.uni LBB80_11; | |
LBB80_10: | |
selp.u64 %rd736, 1, 0, %p74; | |
add.s64 %rd737, %rd2464, %rd736; | |
and.b64 %rd738, %rd737, 4294967295; | |
mul.lo.s64 %rd2487, %rd738, 3449720151; | |
xor.b64 %rd739, %rd2487, %rd76; | |
shr.u64 %rd740, %rd739, 32; | |
mul.lo.s64 %rd2490, %rd740, 3528531795; | |
shr.u64 %rd741, %rd2490, 32; | |
mul.lo.s64 %rd743, %rd2436, 3528531795; | |
and.b64 %rd744, %rd743, 4294967295; | |
xor.b64 %rd745, %rd744, %rd741; | |
xor.b64 %rd746, %rd745, 3144134277; | |
mul.lo.s64 %rd2493, %rd746, 3449720151; | |
xor.b64 %rd2483, %rd737, %rd743; | |
mov.u32 %r317, -1767562579; | |
mov.u32 %r316, -766435501; | |
mov.u32 %r315, 1401181199; | |
mov.u64 %rd2501, 4055616968; | |
mov.u64 %rd2500, 1684936478; | |
mov.u64 %rd2499, 534103459; | |
mov.u64 %rd2498, 387276957; | |
mov.u64 %rd2497, 3041712726; | |
mov.u64 %rd2496, 3986602516; | |
mov.u64 %rd2495, 2835769497; | |
mov.u64 %rd2494, 3668340011; | |
mov.u64 %rd2492, 2027808484; | |
mov.u64 %rd2491, 1993301258; | |
mov.u64 %rd2489, 842468239; | |
mov.u64 %rd2488, 2654435769; | |
mov.u64 %rd2486, 3528531795; | |
mov.u64 %rd2485, 1013904242; | |
mov.u64 %rd2484, 3449720151; | |
LBB80_11: | |
shr.u64 %rd773, %rd2493, 32; | |
shr.u64 %rd774, %rd2483, 32; | |
mul.lo.s64 %rd775, %rd774, %rd2484; | |
and.b64 %rd776, %rd775, 4294967295; | |
xor.b64 %rd777, %rd776, %rd773; | |
xor.b64 %rd778, %rd777, %rd2485; | |
mul.lo.s64 %rd779, %rd778, %rd2486; | |
shr.u64 %rd780, %rd779, 32; | |
shr.u64 %rd781, %rd775, 32; | |
and.b64 %rd782, %rd2487, 4294967295; | |
xor.b64 %rd783, %rd782, %rd781; | |
xor.b64 %rd784, %rd783, %rd2488; | |
mul.lo.s64 %rd785, %rd784, %rd2486; | |
and.b64 %rd786, %rd785, 4294967295; | |
xor.b64 %rd787, %rd786, %rd780; | |
xor.b64 %rd788, %rd787, %rd2489; | |
mul.lo.s64 %rd789, %rd788, %rd2484; | |
shr.u64 %rd790, %rd789, 32; | |
shr.u64 %rd791, %rd785, 32; | |
and.b64 %rd792, %rd2490, 4294967295; | |
xor.b64 %rd793, %rd792, %rd791; | |
xor.b64 %rd794, %rd793, %rd2491; | |
mul.lo.s64 %rd795, %rd794, %rd2484; | |
and.b64 %rd796, %rd795, 4294967295; | |
xor.b64 %rd797, %rd796, %rd790; | |
xor.b64 %rd798, %rd797, %rd2492; | |
mul.lo.s64 %rd799, %rd798, %rd2486; | |
shr.u64 %rd800, %rd799, 32; | |
shr.u64 %rd801, %rd795, 32; | |
and.b64 %rd802, %rd2493, 4294967295; | |
xor.b64 %rd803, %rd802, %rd801; | |
xor.b64 %rd804, %rd803, %rd2494; | |
mul.lo.s64 %rd805, %rd804, %rd2486; | |
and.b64 %rd806, %rd805, 4294967295; | |
xor.b64 %rd807, %rd806, %rd800; | |
xor.b64 %rd808, %rd807, %rd2495; | |
mul.lo.s64 %rd809, %rd808, %rd2484; | |
shr.u64 %rd810, %rd809, 32; | |
shr.u64 %rd811, %rd805, 32; | |
and.b64 %rd812, %rd779, 4294967295; | |
xor.b64 %rd813, %rd812, %rd811; | |
xor.b64 %rd814, %rd813, %rd2496; | |
mul.lo.s64 %rd815, %rd814, %rd2484; | |
and.b64 %rd816, %rd815, 4294967295; | |
xor.b64 %rd817, %rd816, %rd810; | |
xor.b64 %rd818, %rd817, %rd2497; | |
mul.lo.s64 %rd819, %rd818, %rd2486; | |
shr.u64 %rd820, %rd819, 32; | |
shr.u64 %rd821, %rd815, 32; | |
and.b64 %rd822, %rd789, 4294967295; | |
xor.b64 %rd823, %rd822, %rd821; | |
xor.b64 %rd824, %rd823, %rd2498; | |
mul.lo.s64 %rd825, %rd824, %rd2486; | |
and.b64 %rd826, %rd825, 4294967295; | |
xor.b64 %rd827, %rd826, %rd820; | |
xor.b64 %rd828, %rd827, %rd2499; | |
mul.lo.s64 %rd829, %rd828, %rd2484; | |
shr.u64 %rd830, %rd829, 32; | |
shr.u64 %rd831, %rd825, 32; | |
and.b64 %rd832, %rd799, 4294967295; | |
xor.b64 %rd833, %rd832, %rd831; | |
xor.b64 %rd834, %rd833, %rd2500; | |
mul.lo.s64 %rd835, %rd834, %rd2484; | |
and.b64 %rd836, %rd835, 4294967295; | |
xor.b64 %rd837, %rd836, %rd830; | |
xor.b64 %rd838, %rd837, %rd2501; | |
mul.lo.s64 %rd839, %rd838, %rd2486; | |
shr.u64 %rd840, %rd839, 32; | |
cvt.u32.u64 %r82, %rd840; | |
shr.u64 %rd841, %rd835, 32; | |
xor.b64 %rd842, %rd841, %rd809; | |
cvt.u32.u64 %r83, %rd842; | |
xor.b32 %r84, %r315, %r83; | |
mul.lo.s32 %r85, %r84, %r316; | |
xor.b32 %r86, %r85, %r82; | |
xor.b32 %r87, %r86, %r317; | |
shr.u32 %r88, %r87, 9; | |
cvt.rn.f32.u32 %f55, %r88; | |
mul.rn.f32 %f56, %f55, 0f34000000; | |
cvt.rn.f16.f32 %h19, %f56; | |
mov.b16 %h20, 0x2E66; | |
setp.ge.f16 %p12, %h19, %h20; | |
ld.global.nc.b16 %h21, [%rd45+256]; | |
ld.global.nc.f32 %f57, [%rd46+512]; | |
cvt.rn.f16.f32 %h22, %f57; | |
add.rn.f16 %h23, %h21, %h22; | |
mov.b16 %h24, 0x3C72; | |
mul.rn.f16 %h25, %h23, %h24; | |
selp.b16 %h26, %h25, 0x0000, %p12; | |
cvt.f32.f16 %f58, %h26; | |
ld.global.nc.b16 %h27, [%rd47+256]; | |
cvt.f32.f16 %f59, %h27; | |
ld.global.nc.f32 %f60, [%rd48+512]; | |
mul.rn.f32 %f61, %f1, %f60; | |
mul.rn.f32 %f62, %f61, %f59; | |
ld.global.nc.f32 %f63, [%rd49+512]; | |
mul.rn.f32 %f64, %f2, %f61; | |
sub.rn.f32 %f65, %f63, %f64; | |
add.rn.f32 %f66, %f62, %f65; | |
add.rn.f32 %f67, %f66, %f58; | |
sub.rn.f32 %f68, %f67, %f3; | |
mul.rn.f32 %f69, %f68, %f68; | |
add.rn.f32 %f6, %f5, %f69; | |
or.b32 %r89, %r3, 129; | |
or.b32 %r90, %r89, %r4; | |
and.b32 %r91, %r89, 3; | |
shr.u32 %r92, %r90, 2; | |
setp.ne.s32 %p13, %r91, 1; | |
cvt.u64.u32 %rd843, %r92; | |
add.s64 %rd104, %rd12, %rd843; | |
and.b64 %rd2433, %rd104, 4294967295; | |
setp.lt.u64 %p73, %rd104, %rd12; | |
@%p13 bra LBB80_13; | |
mul.lo.s64 %rd2506, %rd2433, 3528531795; | |
selp.u64 %rd884, 1, 0, %p73; | |
add.s64 %rd885, %rd2464, %rd884; | |
xor.b64 %rd886, %rd885, %rd2506; | |
shr.u64 %rd887, %rd886, 32; | |
mul.lo.s64 %rd2509, %rd887, 3449720151; | |
shr.u64 %rd888, %rd2509, 32; | |
and.b64 %rd889, %rd885, 4294967295; | |
mul.lo.s64 %rd890, %rd889, 3449720151; | |
and.b64 %rd891, %rd890, 4294967295; | |
xor.b64 %rd892, %rd891, %rd888; | |
xor.b64 %rd893, %rd892, 2654435769; | |
mul.lo.s64 %rd2512, %rd893, 3528531795; | |
xor.b64 %rd2502, %rd890, %rd104; | |
mov.u32 %r319, -845247145; | |
mov.u32 %r318, -616729560; | |
mov.u64 %rd2519, 3041712726; | |
mov.u64 %rd2518, 1401181199; | |
mov.u64 %rd2517, 2835769497; | |
mov.u64 %rd2516, 1684936478; | |
mov.u64 %rd2515, 2027808484; | |
mov.u64 %rd2514, 387276957; | |
mov.u64 %rd2513, 842468239; | |
mov.u64 %rd2511, 3986602516; | |
mov.u64 %rd2510, 1013904242; | |
mov.u64 %rd2508, 3668340011; | |
mov.u64 %rd2507, 3144134277; | |
mov.u64 %rd2505, 3449720151; | |
mov.u64 %rd2504, 1993301258; | |
mov.u64 %rd2503, 3528531795; | |
bra.uni LBB80_14; | |
LBB80_13: | |
selp.u64 %rd858, 1, 0, %p73; | |
add.s64 %rd859, %rd2464, %rd858; | |
and.b64 %rd860, %rd859, 4294967295; | |
mul.lo.s64 %rd2506, %rd860, 3449720151; | |
xor.b64 %rd861, %rd2506, %rd104; | |
shr.u64 %rd862, %rd861, 32; | |
mul.lo.s64 %rd2509, %rd862, 3528531795; | |
shr.u64 %rd863, %rd2509, 32; | |
mul.lo.s64 %rd865, %rd2433, 3528531795; | |
and.b64 %rd866, %rd865, 4294967295; | |
xor.b64 %rd867, %rd866, %rd863; | |
xor.b64 %rd868, %rd867, 3144134277; | |
mul.lo.s64 %rd2512, %rd868, 3449720151; | |
xor.b64 %rd2502, %rd859, %rd865; | |
mov.u32 %r319, -766435501; | |
mov.u32 %r318, -239350328; | |
mov.u64 %rd2519, 1684936478; | |
mov.u64 %rd2518, 534103459; | |
mov.u64 %rd2517, 387276957; | |
mov.u64 %rd2516, 3041712726; | |
mov.u64 %rd2515, 3986602516; | |
mov.u64 %rd2514, 2835769497; | |
mov.u64 %rd2513, 3668340011; | |
mov.u64 %rd2511, 2027808484; | |
mov.u64 %rd2510, 1993301258; | |
mov.u64 %rd2508, 842468239; | |
mov.u64 %rd2507, 2654435769; | |
mov.u64 %rd2505, 3528531795; | |
mov.u64 %rd2504, 1013904242; | |
mov.u64 %rd2503, 3449720151; | |
LBB80_14: | |
shr.u64 %rd894, %rd2512, 32; | |
shr.u64 %rd895, %rd2502, 32; | |
mul.lo.s64 %rd896, %rd895, %rd2503; | |
and.b64 %rd897, %rd896, 4294967295; | |
xor.b64 %rd898, %rd897, %rd894; | |
xor.b64 %rd899, %rd898, %rd2504; | |
mul.lo.s64 %rd900, %rd899, %rd2505; | |
shr.u64 %rd901, %rd900, 32; | |
shr.u64 %rd902, %rd896, 32; | |
and.b64 %rd903, %rd2506, 4294967295; | |
xor.b64 %rd904, %rd903, %rd902; | |
xor.b64 %rd905, %rd904, %rd2507; | |
mul.lo.s64 %rd906, %rd905, %rd2505; | |
and.b64 %rd907, %rd906, 4294967295; | |
xor.b64 %rd908, %rd907, %rd901; | |
xor.b64 %rd909, %rd908, %rd2508; | |
mul.lo.s64 %rd910, %rd909, %rd2503; | |
shr.u64 %rd911, %rd910, 32; | |
shr.u64 %rd912, %rd906, 32; | |
and.b64 %rd913, %rd2509, 4294967295; | |
xor.b64 %rd914, %rd913, %rd912; | |
xor.b64 %rd915, %rd914, %rd2510; | |
mul.lo.s64 %rd916, %rd915, %rd2503; | |
and.b64 %rd917, %rd916, 4294967295; | |
xor.b64 %rd918, %rd917, %rd911; | |
xor.b64 %rd919, %rd918, %rd2511; | |
mul.lo.s64 %rd920, %rd919, %rd2505; | |
shr.u64 %rd921, %rd920, 32; | |
shr.u64 %rd922, %rd916, 32; | |
and.b64 %rd923, %rd2512, 4294967295; | |
xor.b64 %rd924, %rd923, %rd922; | |
xor.b64 %rd925, %rd924, %rd2513; | |
mul.lo.s64 %rd926, %rd925, %rd2505; | |
and.b64 %rd927, %rd926, 4294967295; | |
xor.b64 %rd928, %rd927, %rd921; | |
xor.b64 %rd929, %rd928, %rd2514; | |
mul.lo.s64 %rd930, %rd929, %rd2503; | |
shr.u64 %rd931, %rd930, 32; | |
shr.u64 %rd932, %rd926, 32; | |
and.b64 %rd933, %rd900, 4294967295; | |
xor.b64 %rd934, %rd933, %rd932; | |
xor.b64 %rd935, %rd934, %rd2515; | |
mul.lo.s64 %rd936, %rd935, %rd2503; | |
and.b64 %rd937, %rd936, 4294967295; | |
xor.b64 %rd938, %rd937, %rd931; | |
xor.b64 %rd939, %rd938, %rd2516; | |
mul.lo.s64 %rd940, %rd939, %rd2505; | |
shr.u64 %rd941, %rd940, 32; | |
shr.u64 %rd942, %rd936, 32; | |
and.b64 %rd943, %rd910, 4294967295; | |
xor.b64 %rd944, %rd943, %rd942; | |
xor.b64 %rd945, %rd944, %rd2517; | |
mul.lo.s64 %rd946, %rd945, %rd2505; | |
and.b64 %rd947, %rd946, 4294967295; | |
xor.b64 %rd948, %rd947, %rd941; | |
xor.b64 %rd949, %rd948, %rd2518; | |
mul.lo.s64 %rd950, %rd949, %rd2503; | |
shr.u64 %rd951, %rd950, 32; | |
shr.u64 %rd952, %rd946, 32; | |
xor.b64 %rd953, %rd920, %rd952; | |
xor.b64 %rd954, %rd953, %rd2519; | |
mul.lo.s64 %rd955, %rd954, %rd2503; | |
xor.b64 %rd956, %rd951, %rd955; | |
cvt.u32.u64 %r97, %rd956; | |
xor.b32 %r98, %r318, %r97; | |
mul.lo.s32 %r99, %r98, %r319; | |
shr.u32 %r100, %r99, 9; | |
cvt.rn.f32.u32 %f70, %r100; | |
mul.rn.f32 %f71, %f70, 0f34000000; | |
cvt.rn.f16.f32 %h28, %f71; | |
mov.b16 %h29, 0x2E66; | |
setp.ge.f16 %p17, %h28, %h29; | |
ld.global.nc.b16 %h30, [%rd45+258]; | |
ld.global.nc.f32 %f72, [%rd46+516]; | |
cvt.rn.f16.f32 %h31, %f72; | |
add.rn.f16 %h32, %h30, %h31; | |
mov.b16 %h33, 0x3C72; | |
mul.rn.f16 %h34, %h32, %h33; | |
selp.b16 %h35, %h34, 0x0000, %p17; | |
cvt.f32.f16 %f73, %h35; | |
ld.global.nc.b16 %h36, [%rd47+258]; | |
cvt.f32.f16 %f74, %h36; | |
ld.global.nc.f32 %f75, [%rd48+516]; | |
mul.rn.f32 %f76, %f1, %f75; | |
mul.rn.f32 %f77, %f76, %f74; | |
ld.global.nc.f32 %f78, [%rd49+516]; | |
mul.rn.f32 %f79, %f2, %f76; | |
sub.rn.f32 %f80, %f78, %f79; | |
add.rn.f32 %f81, %f77, %f80; | |
add.rn.f32 %f82, %f81, %f73; | |
sub.rn.f32 %f83, %f82, %f3; | |
mul.rn.f32 %f84, %f83, %f83; | |
add.rn.f32 %f7, %f6, %f84; | |
or.b32 %r102, %r73, 256; | |
shr.u32 %r103, %r102, 2; | |
cvt.u64.u32 %rd957, %r103; | |
add.s64 %rd131, %rd12, %rd957; | |
and.b64 %rd2429, %rd131, 4294967295; | |
setp.lt.u64 %p72, %rd131, %rd12; | |
@%p8 bra LBB80_16; | |
mul.lo.s64 %rd2524, %rd2429, 3528531795; | |
selp.u64 %rd1000, 1, 0, %p72; | |
add.s64 %rd1001, %rd2464, %rd1000; | |
xor.b64 %rd1002, %rd1001, %rd2524; | |
shr.u64 %rd1003, %rd1002, 32; | |
mul.lo.s64 %rd2527, %rd1003, 3449720151; | |
shr.u64 %rd1004, %rd2527, 32; | |
and.b64 %rd1005, %rd1001, 4294967295; | |
mul.lo.s64 %rd1006, %rd1005, 3449720151; | |
and.b64 %rd1007, %rd1006, 4294967295; | |
xor.b64 %rd1008, %rd1007, %rd1004; | |
xor.b64 %rd1009, %rd1008, 2654435769; | |
mul.lo.s64 %rd2530, %rd1009, 3528531795; | |
xor.b64 %rd2520, %rd1006, %rd131; | |
mov.u32 %r322, -1879881855; | |
mov.u32 %r321, -845247145; | |
mov.u32 %r320, 534103459; | |
mov.u64 %rd2538, 3678237736; | |
mov.u64 %rd2537, 3041712726; | |
mov.u64 %rd2536, 1401181199; | |
mov.u64 %rd2535, 2835769497; | |
mov.u64 %rd2534, 1684936478; | |
mov.u64 %rd2533, 2027808484; | |
mov.u64 %rd2532, 387276957; | |
mov.u64 %rd2531, 842468239; | |
mov.u64 %rd2529, 3986602516; | |
mov.u64 %rd2528, 1013904242; | |
mov.u64 %rd2526, 3668340011; | |
mov.u64 %rd2525, 3144134277; | |
mov.u64 %rd2523, 3449720151; | |
mov.u64 %rd2522, 1993301258; | |
mov.u64 %rd2521, 3528531795; | |
bra.uni LBB80_17; | |
LBB80_16: | |
selp.u64 %rd973, 1, 0, %p72; | |
add.s64 %rd974, %rd2464, %rd973; | |
and.b64 %rd975, %rd974, 4294967295; | |
mul.lo.s64 %rd2524, %rd975, 3449720151; | |
xor.b64 %rd976, %rd2524, %rd131; | |
shr.u64 %rd977, %rd976, 32; | |
mul.lo.s64 %rd2527, %rd977, 3528531795; | |
shr.u64 %rd978, %rd2527, 32; | |
mul.lo.s64 %rd980, %rd2429, 3528531795; | |
and.b64 %rd981, %rd980, 4294967295; | |
xor.b64 %rd982, %rd981, %rd978; | |
xor.b64 %rd983, %rd982, 3144134277; | |
mul.lo.s64 %rd2530, %rd983, 3449720151; | |
xor.b64 %rd2520, %rd974, %rd980; | |
mov.u32 %r322, -1767562579; | |
mov.u32 %r321, -766435501; | |
mov.u32 %r320, 1401181199; | |
mov.u64 %rd2538, 4055616968; | |
mov.u64 %rd2537, 1684936478; | |
mov.u64 %rd2536, 534103459; | |
mov.u64 %rd2535, 387276957; | |
mov.u64 %rd2534, 3041712726; | |
mov.u64 %rd2533, 3986602516; | |
mov.u64 %rd2532, 2835769497; | |
mov.u64 %rd2531, 3668340011; | |
mov.u64 %rd2529, 2027808484; | |
mov.u64 %rd2528, 1993301258; | |
mov.u64 %rd2526, 842468239; | |
mov.u64 %rd2525, 2654435769; | |
mov.u64 %rd2523, 3528531795; | |
mov.u64 %rd2522, 1013904242; | |
mov.u64 %rd2521, 3449720151; | |
LBB80_17: | |
shr.u64 %rd1010, %rd2530, 32; | |
shr.u64 %rd1011, %rd2520, 32; | |
mul.lo.s64 %rd1012, %rd1011, %rd2521; | |
and.b64 %rd1013, %rd1012, 4294967295; | |
xor.b64 %rd1014, %rd1013, %rd1010; | |
xor.b64 %rd1015, %rd1014, %rd2522; | |
mul.lo.s64 %rd1016, %rd1015, %rd2523; | |
shr.u64 %rd1017, %rd1016, 32; | |
shr.u64 %rd1018, %rd1012, 32; | |
and.b64 %rd1019, %rd2524, 4294967295; | |
xor.b64 %rd1020, %rd1019, %rd1018; | |
xor.b64 %rd1021, %rd1020, %rd2525; | |
mul.lo.s64 %rd1022, %rd1021, %rd2523; | |
and.b64 %rd1023, %rd1022, 4294967295; | |
xor.b64 %rd1024, %rd1023, %rd1017; | |
xor.b64 %rd1025, %rd1024, %rd2526; | |
mul.lo.s64 %rd1026, %rd1025, %rd2521; | |
shr.u64 %rd1027, %rd1026, 32; | |
shr.u64 %rd1028, %rd1022, 32; | |
and.b64 %rd1029, %rd2527, 4294967295; | |
xor.b64 %rd1030, %rd1029, %rd1028; | |
xor.b64 %rd1031, %rd1030, %rd2528; | |
mul.lo.s64 %rd1032, %rd1031, %rd2521; | |
and.b64 %rd1033, %rd1032, 4294967295; | |
xor.b64 %rd1034, %rd1033, %rd1027; | |
xor.b64 %rd1035, %rd1034, %rd2529; | |
mul.lo.s64 %rd1036, %rd1035, %rd2523; | |
shr.u64 %rd1037, %rd1036, 32; | |
shr.u64 %rd1038, %rd1032, 32; | |
and.b64 %rd1039, %rd2530, 4294967295; | |
xor.b64 %rd1040, %rd1039, %rd1038; | |
xor.b64 %rd1041, %rd1040, %rd2531; | |
mul.lo.s64 %rd1042, %rd1041, %rd2523; | |
and.b64 %rd1043, %rd1042, 4294967295; | |
xor.b64 %rd1044, %rd1043, %rd1037; | |
xor.b64 %rd1045, %rd1044, %rd2532; | |
mul.lo.s64 %rd1046, %rd1045, %rd2521; | |
shr.u64 %rd1047, %rd1046, 32; | |
shr.u64 %rd1048, %rd1042, 32; | |
and.b64 %rd1049, %rd1016, 4294967295; | |
xor.b64 %rd1050, %rd1049, %rd1048; | |
xor.b64 %rd1051, %rd1050, %rd2533; | |
mul.lo.s64 %rd1052, %rd1051, %rd2521; | |
and.b64 %rd1053, %rd1052, 4294967295; | |
xor.b64 %rd1054, %rd1053, %rd1047; | |
xor.b64 %rd1055, %rd1054, %rd2534; | |
mul.lo.s64 %rd1056, %rd1055, %rd2523; | |
shr.u64 %rd1057, %rd1056, 32; | |
shr.u64 %rd1058, %rd1052, 32; | |
and.b64 %rd1059, %rd1026, 4294967295; | |
xor.b64 %rd1060, %rd1059, %rd1058; | |
xor.b64 %rd1061, %rd1060, %rd2535; | |
mul.lo.s64 %rd1062, %rd1061, %rd2523; | |
and.b64 %rd1063, %rd1062, 4294967295; | |
xor.b64 %rd1064, %rd1063, %rd1057; | |
xor.b64 %rd1065, %rd1064, %rd2536; | |
mul.lo.s64 %rd1066, %rd1065, %rd2521; | |
shr.u64 %rd1067, %rd1066, 32; | |
shr.u64 %rd1068, %rd1062, 32; | |
and.b64 %rd1069, %rd1036, 4294967295; | |
xor.b64 %rd1070, %rd1069, %rd1068; | |
xor.b64 %rd1071, %rd1070, %rd2537; | |
mul.lo.s64 %rd1072, %rd1071, %rd2521; | |
and.b64 %rd1073, %rd1072, 4294967295; | |
xor.b64 %rd1074, %rd1073, %rd1067; | |
xor.b64 %rd1075, %rd1074, %rd2538; | |
mul.lo.s64 %rd1076, %rd1075, %rd2523; | |
shr.u64 %rd1077, %rd1076, 32; | |
cvt.u32.u64 %r110, %rd1077; | |
shr.u64 %rd1078, %rd1072, 32; | |
xor.b64 %rd1079, %rd1078, %rd1046; | |
cvt.u32.u64 %r111, %rd1079; | |
xor.b32 %r112, %r320, %r111; | |
mul.lo.s32 %r113, %r112, %r321; | |
xor.b32 %r114, %r113, %r110; | |
xor.b32 %r115, %r114, %r322; | |
shr.u32 %r116, %r115, 9; | |
cvt.rn.f32.u32 %f85, %r116; | |
mul.rn.f32 %f86, %f85, 0f34000000; | |
cvt.rn.f16.f32 %h37, %f86; | |
mov.b16 %h38, 0x2E66; | |
setp.ge.f16 %p20, %h37, %h38; | |
ld.global.nc.b16 %h39, [%rd45+512]; | |
ld.global.nc.f32 %f87, [%rd46+1024]; | |
cvt.rn.f16.f32 %h40, %f87; | |
add.rn.f16 %h41, %h39, %h40; | |
mov.b16 %h42, 0x3C72; | |
mul.rn.f16 %h43, %h41, %h42; | |
selp.b16 %h44, %h43, 0x0000, %p20; | |
cvt.f32.f16 %f88, %h44; | |
ld.global.nc.b16 %h45, [%rd47+512]; | |
cvt.f32.f16 %f89, %h45; | |
ld.global.nc.f32 %f90, [%rd48+1024]; | |
mul.rn.f32 %f91, %f1, %f90; | |
mul.rn.f32 %f92, %f91, %f89; | |
ld.global.nc.f32 %f93, [%rd49+1024]; | |
mul.rn.f32 %f94, %f2, %f91; | |
sub.rn.f32 %f95, %f93, %f94; | |
add.rn.f32 %f96, %f92, %f95; | |
add.rn.f32 %f97, %f96, %f88; | |
sub.rn.f32 %f98, %f97, %f3; | |
mul.rn.f32 %f99, %f98, %f98; | |
add.rn.f32 %f8, %f7, %f99; | |
or.b32 %r117, %r3, 257; | |
or.b32 %r118, %r117, %r4; | |
and.b32 %r119, %r117, 3; | |
shr.u32 %r120, %r118, 2; | |
setp.ne.s32 %p21, %r119, 1; | |
cvt.u64.u32 %rd1080, %r120; | |
add.s64 %rd159, %rd12, %rd1080; | |
and.b64 %rd2426, %rd159, 4294967295; | |
setp.lt.u64 %p71, %rd159, %rd12; | |
@%p21 bra LBB80_19; | |
mul.lo.s64 %rd2543, %rd2426, 3528531795; | |
selp.u64 %rd1121, 1, 0, %p71; | |
add.s64 %rd1122, %rd2464, %rd1121; | |
xor.b64 %rd1123, %rd1122, %rd2543; | |
shr.u64 %rd1124, %rd1123, 32; | |
mul.lo.s64 %rd2546, %rd1124, 3449720151; | |
shr.u64 %rd1125, %rd2546, 32; | |
and.b64 %rd1126, %rd1122, 4294967295; | |
mul.lo.s64 %rd1127, %rd1126, 3449720151; | |
and.b64 %rd1128, %rd1127, 4294967295; | |
xor.b64 %rd1129, %rd1128, %rd1125; | |
xor.b64 %rd1130, %rd1129, 2654435769; | |
mul.lo.s64 %rd2549, %rd1130, 3528531795; | |
xor.b64 %rd2539, %rd1127, %rd159; | |
mov.u32 %r324, -845247145; | |
mov.u32 %r323, -616729560; | |
mov.u64 %rd2556, 3041712726; | |
mov.u64 %rd2555, 1401181199; | |
mov.u64 %rd2554, 2835769497; | |
mov.u64 %rd2553, 1684936478; | |
mov.u64 %rd2552, 2027808484; | |
mov.u64 %rd2551, 387276957; | |
mov.u64 %rd2550, 842468239; | |
mov.u64 %rd2548, 3986602516; | |
mov.u64 %rd2547, 1013904242; | |
mov.u64 %rd2545, 3668340011; | |
mov.u64 %rd2544, 3144134277; | |
mov.u64 %rd2542, 3449720151; | |
mov.u64 %rd2541, 1993301258; | |
mov.u64 %rd2540, 3528531795; | |
bra.uni LBB80_20; | |
LBB80_19: | |
selp.u64 %rd1095, 1, 0, %p71; | |
add.s64 %rd1096, %rd2464, %rd1095; | |
and.b64 %rd1097, %rd1096, 4294967295; | |
mul.lo.s64 %rd2543, %rd1097, 3449720151; | |
xor.b64 %rd1098, %rd2543, %rd159; | |
shr.u64 %rd1099, %rd1098, 32; | |
mul.lo.s64 %rd2546, %rd1099, 3528531795; | |
shr.u64 %rd1100, %rd2546, 32; | |
mul.lo.s64 %rd1102, %rd2426, 3528531795; | |
and.b64 %rd1103, %rd1102, 4294967295; | |
xor.b64 %rd1104, %rd1103, %rd1100; | |
xor.b64 %rd1105, %rd1104, 3144134277; | |
mul.lo.s64 %rd2549, %rd1105, 3449720151; | |
xor.b64 %rd2539, %rd1096, %rd1102; | |
mov.u32 %r324, -766435501; | |
mov.u32 %r323, -239350328; | |
mov.u64 %rd2556, 1684936478; | |
mov.u64 %rd2555, 534103459; | |
mov.u64 %rd2554, 387276957; | |
mov.u64 %rd2553, 3041712726; | |
mov.u64 %rd2552, 3986602516; | |
mov.u64 %rd2551, 2835769497; | |
mov.u64 %rd2550, 3668340011; | |
mov.u64 %rd2548, 2027808484; | |
mov.u64 %rd2547, 1993301258; | |
mov.u64 %rd2545, 842468239; | |
mov.u64 %rd2544, 2654435769; | |
mov.u64 %rd2542, 3528531795; | |
mov.u64 %rd2541, 1013904242; | |
mov.u64 %rd2540, 3449720151; | |
LBB80_20: | |
shr.u64 %rd1131, %rd2549, 32; | |
shr.u64 %rd1132, %rd2539, 32; | |
mul.lo.s64 %rd1133, %rd1132, %rd2540; | |
and.b64 %rd1134, %rd1133, 4294967295; | |
xor.b64 %rd1135, %rd1134, %rd1131; | |
xor.b64 %rd1136, %rd1135, %rd2541; | |
mul.lo.s64 %rd1137, %rd1136, %rd2542; | |
shr.u64 %rd1138, %rd1137, 32; | |
shr.u64 %rd1139, %rd1133, 32; | |
and.b64 %rd1140, %rd2543, 4294967295; | |
xor.b64 %rd1141, %rd1140, %rd1139; | |
xor.b64 %rd1142, %rd1141, %rd2544; | |
mul.lo.s64 %rd1143, %rd1142, %rd2542; | |
and.b64 %rd1144, %rd1143, 4294967295; | |
xor.b64 %rd1145, %rd1144, %rd1138; | |
xor.b64 %rd1146, %rd1145, %rd2545; | |
mul.lo.s64 %rd1147, %rd1146, %rd2540; | |
shr.u64 %rd1148, %rd1147, 32; | |
shr.u64 %rd1149, %rd1143, 32; | |
and.b64 %rd1150, %rd2546, 4294967295; | |
xor.b64 %rd1151, %rd1150, %rd1149; | |
xor.b64 %rd1152, %rd1151, %rd2547; | |
mul.lo.s64 %rd1153, %rd1152, %rd2540; | |
and.b64 %rd1154, %rd1153, 4294967295; | |
xor.b64 %rd1155, %rd1154, %rd1148; | |
xor.b64 %rd1156, %rd1155, %rd2548; | |
mul.lo.s64 %rd1157, %rd1156, %rd2542; | |
shr.u64 %rd1158, %rd1157, 32; | |
shr.u64 %rd1159, %rd1153, 32; | |
and.b64 %rd1160, %rd2549, 4294967295; | |
xor.b64 %rd1161, %rd1160, %rd1159; | |
xor.b64 %rd1162, %rd1161, %rd2550; | |
mul.lo.s64 %rd1163, %rd1162, %rd2542; | |
and.b64 %rd1164, %rd1163, 4294967295; | |
xor.b64 %rd1165, %rd1164, %rd1158; | |
xor.b64 %rd1166, %rd1165, %rd2551; | |
mul.lo.s64 %rd1167, %rd1166, %rd2540; | |
shr.u64 %rd1168, %rd1167, 32; | |
shr.u64 %rd1169, %rd1163, 32; | |
and.b64 %rd1170, %rd1137, 4294967295; | |
xor.b64 %rd1171, %rd1170, %rd1169; | |
xor.b64 %rd1172, %rd1171, %rd2552; | |
mul.lo.s64 %rd1173, %rd1172, %rd2540; | |
and.b64 %rd1174, %rd1173, 4294967295; | |
xor.b64 %rd1175, %rd1174, %rd1168; | |
xor.b64 %rd1176, %rd1175, %rd2553; | |
mul.lo.s64 %rd1177, %rd1176, %rd2542; | |
shr.u64 %rd1178, %rd1177, 32; | |
shr.u64 %rd1179, %rd1173, 32; | |
and.b64 %rd1180, %rd1147, 4294967295; | |
xor.b64 %rd1181, %rd1180, %rd1179; | |
xor.b64 %rd1182, %rd1181, %rd2554; | |
mul.lo.s64 %rd1183, %rd1182, %rd2542; | |
and.b64 %rd1184, %rd1183, 4294967295; | |
xor.b64 %rd1185, %rd1184, %rd1178; | |
xor.b64 %rd1186, %rd1185, %rd2555; | |
mul.lo.s64 %rd1187, %rd1186, %rd2540; | |
shr.u64 %rd1188, %rd1187, 32; | |
shr.u64 %rd1189, %rd1183, 32; | |
xor.b64 %rd1190, %rd1157, %rd1189; | |
xor.b64 %rd1191, %rd1190, %rd2556; | |
mul.lo.s64 %rd1192, %rd1191, %rd2540; | |
xor.b64 %rd1193, %rd1188, %rd1192; | |
cvt.u32.u64 %r125, %rd1193; | |
xor.b32 %r126, %r323, %r125; | |
mul.lo.s32 %r127, %r126, %r324; | |
shr.u32 %r128, %r127, 9; | |
cvt.rn.f32.u32 %f100, %r128; | |
mul.rn.f32 %f101, %f100, 0f34000000; | |
cvt.rn.f16.f32 %h46, %f101; | |
mov.b16 %h47, 0x2E66; | |
setp.ge.f16 %p25, %h46, %h47; | |
ld.global.nc.b16 %h48, [%rd45+514]; | |
ld.global.nc.f32 %f102, [%rd46+1028]; | |
cvt.rn.f16.f32 %h49, %f102; | |
add.rn.f16 %h50, %h48, %h49; | |
mov.b16 %h51, 0x3C72; | |
mul.rn.f16 %h52, %h50, %h51; | |
selp.b16 %h53, %h52, 0x0000, %p25; | |
cvt.f32.f16 %f103, %h53; | |
ld.global.nc.b16 %h54, [%rd47+514]; | |
cvt.f32.f16 %f104, %h54; | |
ld.global.nc.f32 %f105, [%rd48+1028]; | |
mul.rn.f32 %f106, %f1, %f105; | |
mul.rn.f32 %f107, %f106, %f104; | |
ld.global.nc.f32 %f108, [%rd49+1028]; | |
mul.rn.f32 %f109, %f2, %f106; | |
sub.rn.f32 %f110, %f108, %f109; | |
add.rn.f32 %f111, %f107, %f110; | |
add.rn.f32 %f112, %f111, %f103; | |
sub.rn.f32 %f113, %f112, %f3; | |
mul.rn.f32 %f114, %f113, %f113; | |
add.rn.f32 %f9, %f8, %f114; | |
or.b32 %r130, %r73, 384; | |
shr.u32 %r131, %r130, 2; | |
cvt.u64.u32 %rd1194, %r131; | |
add.s64 %rd186, %rd12, %rd1194; | |
and.b64 %rd2422, %rd186, 4294967295; | |
setp.lt.u64 %p70, %rd186, %rd12; | |
@%p8 bra LBB80_22; | |
mul.lo.s64 %rd2561, %rd2422, 3528531795; | |
selp.u64 %rd1237, 1, 0, %p70; | |
add.s64 %rd1238, %rd2464, %rd1237; | |
xor.b64 %rd1239, %rd1238, %rd2561; | |
shr.u64 %rd1240, %rd1239, 32; | |
mul.lo.s64 %rd2564, %rd1240, 3449720151; | |
shr.u64 %rd1241, %rd2564, 32; | |
and.b64 %rd1242, %rd1238, 4294967295; | |
mul.lo.s64 %rd1243, %rd1242, 3449720151; | |
and.b64 %rd1244, %rd1243, 4294967295; | |
xor.b64 %rd1245, %rd1244, %rd1241; | |
xor.b64 %rd1246, %rd1245, 2654435769; | |
mul.lo.s64 %rd2567, %rd1246, 3528531795; | |
xor.b64 %rd2557, %rd1243, %rd186; | |
mov.u32 %r327, -1879881855; | |
mov.u32 %r326, -845247145; | |
mov.u32 %r325, 534103459; | |
mov.u64 %rd2575, 3678237736; | |
mov.u64 %rd2574, 3041712726; | |
mov.u64 %rd2573, 1401181199; | |
mov.u64 %rd2572, 2835769497; | |
mov.u64 %rd2571, 1684936478; | |
mov.u64 %rd2570, 2027808484; | |
mov.u64 %rd2569, 387276957; | |
mov.u64 %rd2568, 842468239; | |
mov.u64 %rd2566, 3986602516; | |
mov.u64 %rd2565, 1013904242; | |
mov.u64 %rd2563, 3668340011; | |
mov.u64 %rd2562, 3144134277; | |
mov.u64 %rd2560, 3449720151; | |
mov.u64 %rd2559, 1993301258; | |
mov.u64 %rd2558, 3528531795; | |
bra.uni LBB80_23; | |
LBB80_22: | |
selp.u64 %rd1210, 1, 0, %p70; | |
add.s64 %rd1211, %rd2464, %rd1210; | |
and.b64 %rd1212, %rd1211, 4294967295; | |
mul.lo.s64 %rd2561, %rd1212, 3449720151; | |
xor.b64 %rd1213, %rd2561, %rd186; | |
shr.u64 %rd1214, %rd1213, 32; | |
mul.lo.s64 %rd2564, %rd1214, 3528531795; | |
shr.u64 %rd1215, %rd2564, 32; | |
mul.lo.s64 %rd1217, %rd2422, 3528531795; | |
and.b64 %rd1218, %rd1217, 4294967295; | |
xor.b64 %rd1219, %rd1218, %rd1215; | |
xor.b64 %rd1220, %rd1219, 3144134277; | |
mul.lo.s64 %rd2567, %rd1220, 3449720151; | |
xor.b64 %rd2557, %rd1211, %rd1217; | |
mov.u32 %r327, -1767562579; | |
mov.u32 %r326, -766435501; | |
mov.u32 %r325, 1401181199; | |
mov.u64 %rd2575, 4055616968; | |
mov.u64 %rd2574, 1684936478; | |
mov.u64 %rd2573, 534103459; | |
mov.u64 %rd2572, 387276957; | |
mov.u64 %rd2571, 3041712726; | |
mov.u64 %rd2570, 3986602516; | |
mov.u64 %rd2569, 2835769497; | |
mov.u64 %rd2568, 3668340011; | |
mov.u64 %rd2566, 2027808484; | |
mov.u64 %rd2565, 1993301258; | |
mov.u64 %rd2563, 842468239; | |
mov.u64 %rd2562, 2654435769; | |
mov.u64 %rd2560, 3528531795; | |
mov.u64 %rd2559, 1013904242; | |
mov.u64 %rd2558, 3449720151; | |
LBB80_23: | |
shr.u64 %rd1247, %rd2567, 32; | |
shr.u64 %rd1248, %rd2557, 32; | |
mul.lo.s64 %rd1249, %rd1248, %rd2558; | |
and.b64 %rd1250, %rd1249, 4294967295; | |
xor.b64 %rd1251, %rd1250, %rd1247; | |
xor.b64 %rd1252, %rd1251, %rd2559; | |
mul.lo.s64 %rd1253, %rd1252, %rd2560; | |
shr.u64 %rd1254, %rd1253, 32; | |
shr.u64 %rd1255, %rd1249, 32; | |
and.b64 %rd1256, %rd2561, 4294967295; | |
xor.b64 %rd1257, %rd1256, %rd1255; | |
xor.b64 %rd1258, %rd1257, %rd2562; | |
mul.lo.s64 %rd1259, %rd1258, %rd2560; | |
and.b64 %rd1260, %rd1259, 4294967295; | |
xor.b64 %rd1261, %rd1260, %rd1254; | |
xor.b64 %rd1262, %rd1261, %rd2563; | |
mul.lo.s64 %rd1263, %rd1262, %rd2558; | |
shr.u64 %rd1264, %rd1263, 32; | |
shr.u64 %rd1265, %rd1259, 32; | |
and.b64 %rd1266, %rd2564, 4294967295; | |
xor.b64 %rd1267, %rd1266, %rd1265; | |
xor.b64 %rd1268, %rd1267, %rd2565; | |
mul.lo.s64 %rd1269, %rd1268, %rd2558; | |
and.b64 %rd1270, %rd1269, 4294967295; | |
xor.b64 %rd1271, %rd1270, %rd1264; | |
xor.b64 %rd1272, %rd1271, %rd2566; | |
mul.lo.s64 %rd1273, %rd1272, %rd2560; | |
shr.u64 %rd1274, %rd1273, 32; | |
shr.u64 %rd1275, %rd1269, 32; | |
and.b64 %rd1276, %rd2567, 4294967295; | |
xor.b64 %rd1277, %rd1276, %rd1275; | |
xor.b64 %rd1278, %rd1277, %rd2568; | |
mul.lo.s64 %rd1279, %rd1278, %rd2560; | |
and.b64 %rd1280, %rd1279, 4294967295; | |
xor.b64 %rd1281, %rd1280, %rd1274; | |
xor.b64 %rd1282, %rd1281, %rd2569; | |
mul.lo.s64 %rd1283, %rd1282, %rd2558; | |
shr.u64 %rd1284, %rd1283, 32; | |
shr.u64 %rd1285, %rd1279, 32; | |
and.b64 %rd1286, %rd1253, 4294967295; | |
xor.b64 %rd1287, %rd1286, %rd1285; | |
xor.b64 %rd1288, %rd1287, %rd2570; | |
mul.lo.s64 %rd1289, %rd1288, %rd2558; | |
and.b64 %rd1290, %rd1289, 4294967295; | |
xor.b64 %rd1291, %rd1290, %rd1284; | |
xor.b64 %rd1292, %rd1291, %rd2571; | |
mul.lo.s64 %rd1293, %rd1292, %rd2560; | |
shr.u64 %rd1294, %rd1293, 32; | |
shr.u64 %rd1295, %rd1289, 32; | |
and.b64 %rd1296, %rd1263, 4294967295; | |
xor.b64 %rd1297, %rd1296, %rd1295; | |
xor.b64 %rd1298, %rd1297, %rd2572; | |
mul.lo.s64 %rd1299, %rd1298, %rd2560; | |
and.b64 %rd1300, %rd1299, 4294967295; | |
xor.b64 %rd1301, %rd1300, %rd1294; | |
xor.b64 %rd1302, %rd1301, %rd2573; | |
mul.lo.s64 %rd1303, %rd1302, %rd2558; | |
shr.u64 %rd1304, %rd1303, 32; | |
shr.u64 %rd1305, %rd1299, 32; | |
and.b64 %rd1306, %rd1273, 4294967295; | |
xor.b64 %rd1307, %rd1306, %rd1305; | |
xor.b64 %rd1308, %rd1307, %rd2574; | |
mul.lo.s64 %rd1309, %rd1308, %rd2558; | |
and.b64 %rd1310, %rd1309, 4294967295; | |
xor.b64 %rd1311, %rd1310, %rd1304; | |
xor.b64 %rd1312, %rd1311, %rd2575; | |
mul.lo.s64 %rd1313, %rd1312, %rd2560; | |
shr.u64 %rd1314, %rd1313, 32; | |
cvt.u32.u64 %r138, %rd1314; | |
shr.u64 %rd1315, %rd1309, 32; | |
xor.b64 %rd1316, %rd1315, %rd1283; | |
cvt.u32.u64 %r139, %rd1316; | |
xor.b32 %r140, %r325, %r139; | |
mul.lo.s32 %r141, %r140, %r326; | |
xor.b32 %r142, %r141, %r138; | |
xor.b32 %r143, %r142, %r327; | |
shr.u32 %r144, %r143, 9; | |
cvt.rn.f32.u32 %f115, %r144; | |
mul.rn.f32 %f116, %f115, 0f34000000; | |
cvt.rn.f16.f32 %h55, %f116; | |
mov.b16 %h56, 0x2E66; | |
setp.ge.f16 %p28, %h55, %h56; | |
ld.global.nc.b16 %h57, [%rd45+768]; | |
ld.global.nc.f32 %f117, [%rd46+1536]; | |
cvt.rn.f16.f32 %h58, %f117; | |
add.rn.f16 %h59, %h57, %h58; | |
mov.b16 %h60, 0x3C72; | |
mul.rn.f16 %h61, %h59, %h60; | |
selp.b16 %h62, %h61, 0x0000, %p28; | |
cvt.f32.f16 %f118, %h62; | |
ld.global.nc.b16 %h63, [%rd47+768]; | |
cvt.f32.f16 %f119, %h63; | |
ld.global.nc.f32 %f120, [%rd48+1536]; | |
mul.rn.f32 %f121, %f1, %f120; | |
mul.rn.f32 %f122, %f121, %f119; | |
ld.global.nc.f32 %f123, [%rd49+1536]; | |
mul.rn.f32 %f124, %f2, %f121; | |
sub.rn.f32 %f125, %f123, %f124; | |
add.rn.f32 %f126, %f122, %f125; | |
add.rn.f32 %f127, %f126, %f118; | |
sub.rn.f32 %f128, %f127, %f3; | |
mul.rn.f32 %f129, %f128, %f128; | |
add.rn.f32 %f10, %f9, %f129; | |
or.b32 %r145, %r3, 385; | |
or.b32 %r146, %r145, %r4; | |
and.b32 %r147, %r145, 3; | |
shr.u32 %r148, %r146, 2; | |
setp.ne.s32 %p29, %r147, 1; | |
cvt.u64.u32 %rd1317, %r148; | |
add.s64 %rd214, %rd12, %rd1317; | |
@%p29 bra LBB80_25; | |
and.b64 %rd1357, %rd214, 4294967295; | |
mul.lo.s64 %rd2580, %rd1357, 3528531795; | |
setp.lt.u64 %p31, %rd214, %rd12; | |
selp.u64 %rd1358, 1, 0, %p31; | |
add.s64 %rd1359, %rd2464, %rd1358; | |
xor.b64 %rd1360, %rd1359, %rd2580; | |
shr.u64 %rd1361, %rd1360, 32; | |
mul.lo.s64 %rd2583, %rd1361, 3449720151; | |
shr.u64 %rd1362, %rd2583, 32; | |
and.b64 %rd1363, %rd1359, 4294967295; | |
mul.lo.s64 %rd1364, %rd1363, 3449720151; | |
and.b64 %rd1365, %rd1364, 4294967295; | |
xor.b64 %rd1366, %rd1365, %rd1362; | |
xor.b64 %rd1367, %rd1366, 2654435769; | |
mul.lo.s64 %rd2586, %rd1367, 3528531795; | |
xor.b64 %rd2576, %rd1364, %rd214; | |
mov.u32 %r329, -845247145; | |
mov.u32 %r328, -616729560; | |
mov.u64 %rd2593, 3041712726; | |
mov.u64 %rd2592, 1401181199; | |
mov.u64 %rd2591, 2835769497; | |
mov.u64 %rd2590, 1684936478; | |
mov.u64 %rd2589, 2027808484; | |
mov.u64 %rd2588, 387276957; | |
mov.u64 %rd2587, 842468239; | |
mov.u64 %rd2585, 3986602516; | |
mov.u64 %rd2584, 1013904242; | |
mov.u64 %rd2582, 3668340011; | |
mov.u64 %rd2581, 3144134277; | |
mov.u64 %rd2579, 3449720151; | |
mov.u64 %rd2578, 1993301258; | |
mov.u64 %rd2577, 3528531795; | |
bra.uni LBB80_26; | |
// LBB80_25: taken arm of the `@%p29 bra` split. It fills the same registers as the
// fall-through arm above it (%rd2576..%rd2593, %r328, %r329), which LBB80_26 then
// consumes. Compared with the fall-through arm, the multipliers 3449720151
// (0xCD9E8D57) and 3528531795 (0xD2511F53) and the paired XOR constants trade places.
// NOTE(review): these values match the published Philox4x32 multipliers and key
// constants, so this looks like an inlined Philox RNG round setup -- confirm.
LBB80_25: | |
// %rd214 was formed as %rd12 + offset before the branch; %rd214 < %rd12 (unsigned)
// means that add wrapped, so 1 is carried into %rd2464.
setp.lt.u64 %p30, %rd214, %rd12; | |
selp.u64 %rd1332, 1, 0, %p30; | |
add.s64 %rd1333, %rd2464, %rd1332; | |
// 32x32->64 products: `and ..., 4294967295` keeps the low word, `shr ..., 32` the high word.
and.b64 %rd1334, %rd1333, 4294967295; | |
mul.lo.s64 %rd2580, %rd1334, 3449720151; | |
xor.b64 %rd1335, %rd2580, %rd214; | |
shr.u64 %rd1336, %rd1335, 32; | |
mul.lo.s64 %rd2583, %rd1336, 3528531795; | |
shr.u64 %rd1337, %rd2583, 32; | |
and.b64 %rd1338, %rd214, 4294967295; | |
mul.lo.s64 %rd1339, %rd1338, 3528531795; | |
and.b64 %rd1340, %rd1339, 4294967295; | |
xor.b64 %rd1341, %rd1340, %rd1337; | |
xor.b64 %rd1342, %rd1341, 3144134277; | |
mul.lo.s64 %rd2586, %rd1342, 3449720151; | |
xor.b64 %rd2576, %rd1333, %rd1339; | |
// Constants selected for this arm; read by the shared code at LBB80_26.
mov.u32 %r329, -766435501; | |
mov.u32 %r328, -239350328; | |
mov.u64 %rd2593, 1684936478; | |
mov.u64 %rd2592, 534103459; | |
mov.u64 %rd2591, 387276957; | |
mov.u64 %rd2590, 3041712726; | |
mov.u64 %rd2589, 3986602516; | |
mov.u64 %rd2588, 2835769497; | |
mov.u64 %rd2587, 3668340011; | |
mov.u64 %rd2585, 2027808484; | |
mov.u64 %rd2584, 1993301258; | |
mov.u64 %rd2582, 842468239; | |
mov.u64 %rd2581, 2654435769; | |
mov.u64 %rd2579, 3528531795; | |
mov.u64 %rd2578, 1013904242; | |
mov.u64 %rd2577, 3449720151; | |
// LBB80_26: join point of the two arms above. It runs the remaining multiply/XOR
// rounds with whichever constants the arm selected (%rd2577..%rd2593, %r328, %r329),
// converts the result to a keep/zero predicate, and adds one squared term to the
// running f32 sum (%f10 -> %f11). The tail computes the next counter, branches on %p8,
// and holds the fall-through arm that pairs with LBB80_28.
LBB80_26: | |
// Rounds: each step multiplies by %rd2577 or %rd2579, splits the product into
// high (`shr 32`) and low (`and 4294967295`) words, and XORs in a round constant.
shr.u64 %rd1368, %rd2586, 32; | |
shr.u64 %rd1369, %rd2576, 32; | |
mul.lo.s64 %rd1370, %rd1369, %rd2577; | |
and.b64 %rd1371, %rd1370, 4294967295; | |
xor.b64 %rd1372, %rd1371, %rd1368; | |
xor.b64 %rd1373, %rd1372, %rd2578; | |
mul.lo.s64 %rd1374, %rd1373, %rd2579; | |
shr.u64 %rd1375, %rd1374, 32; | |
shr.u64 %rd1376, %rd1370, 32; | |
and.b64 %rd1377, %rd2580, 4294967295; | |
xor.b64 %rd1378, %rd1377, %rd1376; | |
xor.b64 %rd1379, %rd1378, %rd2581; | |
mul.lo.s64 %rd1380, %rd1379, %rd2579; | |
and.b64 %rd1381, %rd1380, 4294967295; | |
xor.b64 %rd1382, %rd1381, %rd1375; | |
xor.b64 %rd1383, %rd1382, %rd2582; | |
mul.lo.s64 %rd1384, %rd1383, %rd2577; | |
shr.u64 %rd1385, %rd1384, 32; | |
shr.u64 %rd1386, %rd1380, 32; | |
and.b64 %rd1387, %rd2583, 4294967295; | |
xor.b64 %rd1388, %rd1387, %rd1386; | |
xor.b64 %rd1389, %rd1388, %rd2584; | |
mul.lo.s64 %rd1390, %rd1389, %rd2577; | |
and.b64 %rd1391, %rd1390, 4294967295; | |
xor.b64 %rd1392, %rd1391, %rd1385; | |
xor.b64 %rd1393, %rd1392, %rd2585; | |
mul.lo.s64 %rd1394, %rd1393, %rd2579; | |
shr.u64 %rd1395, %rd1394, 32; | |
shr.u64 %rd1396, %rd1390, 32; | |
and.b64 %rd1397, %rd2586, 4294967295; | |
xor.b64 %rd1398, %rd1397, %rd1396; | |
xor.b64 %rd1399, %rd1398, %rd2587; | |
mul.lo.s64 %rd1400, %rd1399, %rd2579; | |
and.b64 %rd1401, %rd1400, 4294967295; | |
xor.b64 %rd1402, %rd1401, %rd1395; | |
xor.b64 %rd1403, %rd1402, %rd2588; | |
mul.lo.s64 %rd1404, %rd1403, %rd2577; | |
shr.u64 %rd1405, %rd1404, 32; | |
shr.u64 %rd1406, %rd1400, 32; | |
and.b64 %rd1407, %rd1374, 4294967295; | |
xor.b64 %rd1408, %rd1407, %rd1406; | |
xor.b64 %rd1409, %rd1408, %rd2589; | |
mul.lo.s64 %rd1410, %rd1409, %rd2577; | |
and.b64 %rd1411, %rd1410, 4294967295; | |
xor.b64 %rd1412, %rd1411, %rd1405; | |
xor.b64 %rd1413, %rd1412, %rd2590; | |
mul.lo.s64 %rd1414, %rd1413, %rd2579; | |
shr.u64 %rd1415, %rd1414, 32; | |
shr.u64 %rd1416, %rd1410, 32; | |
and.b64 %rd1417, %rd1384, 4294967295; | |
xor.b64 %rd1418, %rd1417, %rd1416; | |
xor.b64 %rd1419, %rd1418, %rd2591; | |
mul.lo.s64 %rd1420, %rd1419, %rd2579; | |
and.b64 %rd1421, %rd1420, 4294967295; | |
xor.b64 %rd1422, %rd1421, %rd1415; | |
xor.b64 %rd1423, %rd1422, %rd2592; | |
mul.lo.s64 %rd1424, %rd1423, %rd2577; | |
shr.u64 %rd1425, %rd1424, 32; | |
shr.u64 %rd1426, %rd1420, 32; | |
xor.b64 %rd1427, %rd1394, %rd1426; | |
xor.b64 %rd1428, %rd1427, %rd2593; | |
mul.lo.s64 %rd1429, %rd1428, %rd2577; | |
// Fold to one 32-bit word and apply the arm-selected 32-bit constants.
xor.b64 %rd1430, %rd1425, %rd1429; | |
cvt.u32.u64 %r153, %rd1430; | |
xor.b32 %r154, %r328, %r153; | |
mul.lo.s32 %r155, %r154, %r329; | |
// Keep the top 23 bits; 0f34000000 is 2^-23, so %f131 lies in [0, 1).
shr.u32 %r156, %r155, 9; | |
cvt.rn.f32.u32 %f130, %r156; | |
mul.rn.f32 %f131, %f130, 0f34000000; | |
cvt.rn.f16.f32 %h64, %f131; | |
// 0x2E66 is f16 ~0.1: %p33 is true when the random value is >= ~0.1.
mov.b16 %h65, 0x2E66; | |
setp.ge.f16 %p33, %h64, %h65; | |
// Byte offsets +770 (2-byte loads) and +1540 (4-byte loads) both equal element index 385.
ld.global.nc.b16 %h66, [%rd45+770]; | |
ld.global.nc.f32 %f132, [%rd46+1540]; | |
cvt.rn.f16.f32 %h67, %f132; | |
add.rn.f16 %h68, %h66, %h67; | |
// 0x3C72 is f16 ~1.111 (about 1/0.9); the scaled sum is kept when %p33 holds, else 0.
// NOTE(review): looks like dropout with rate ~0.1 -- confirm against the HLO.
mov.b16 %h69, 0x3C72; | |
mul.rn.f16 %h70, %h68, %h69; | |
selp.b16 %h71, %h70, 0x0000, %p33; | |
cvt.f32.f16 %f133, %h71; | |
ld.global.nc.b16 %h72, [%rd47+770]; | |
cvt.f32.f16 %f134, %h72; | |
ld.global.nc.f32 %f135, [%rd48+1540]; | |
mul.rn.f32 %f136, %f1, %f135; | |
mul.rn.f32 %f137, %f136, %f134; | |
ld.global.nc.f32 %f138, [%rd49+1540]; | |
mul.rn.f32 %f139, %f2, %f136; | |
sub.rn.f32 %f140, %f138, %f139; | |
add.rn.f32 %f141, %f137, %f140; | |
add.rn.f32 %f142, %f141, %f133; | |
// %f11 = %f10 + (%f142 - %f3)^2 : running sum of squared differences.
sub.rn.f32 %f143, %f142, %f3; | |
mul.rn.f32 %f144, %f143, %f143; | |
add.rn.f32 %f11, %f10, %f144; | |
// Counter for the next element: %rd241 = %rd12 + ((%r73 | 512) >> 2).
or.b32 %r158, %r73, 512; | |
shr.u32 %r159, %r158, 2; | |
cvt.u64.u32 %rd1431, %r159; | |
add.s64 %rd241, %rd12, %rd1431; | |
@%p8 bra LBB80_28; | |
// Fall-through arm (%p8 clear): counterpart of LBB80_28 with the multiplier roles swapped.
and.b64 %rd1473, %rd241, 4294967295; | |
mul.lo.s64 %rd2598, %rd1473, 3528531795; | |
// Carry into %rd2464 if %rd12 + offset wrapped.
setp.lt.u64 %p35, %rd241, %rd12; | |
selp.u64 %rd1474, 1, 0, %p35; | |
add.s64 %rd1475, %rd2464, %rd1474; | |
xor.b64 %rd1476, %rd1475, %rd2598; | |
shr.u64 %rd1477, %rd1476, 32; | |
mul.lo.s64 %rd2601, %rd1477, 3449720151; | |
shr.u64 %rd1478, %rd2601, 32; | |
and.b64 %rd1479, %rd1475, 4294967295; | |
mul.lo.s64 %rd1480, %rd1479, 3449720151; | |
and.b64 %rd1481, %rd1480, 4294967295; | |
xor.b64 %rd1482, %rd1481, %rd1478; | |
xor.b64 %rd1483, %rd1482, 2654435769; | |
mul.lo.s64 %rd2604, %rd1483, 3528531795; | |
xor.b64 %rd2594, %rd1480, %rd241; | |
// Constants selected for this arm; read by the shared code at LBB80_29.
mov.u32 %r332, -1879881855; | |
mov.u32 %r331, -845247145; | |
mov.u32 %r330, 534103459; | |
mov.u64 %rd2612, 3678237736; | |
mov.u64 %rd2611, 3041712726; | |
mov.u64 %rd2610, 1401181199; | |
mov.u64 %rd2609, 2835769497; | |
mov.u64 %rd2608, 1684936478; | |
mov.u64 %rd2607, 2027808484; | |
mov.u64 %rd2606, 387276957; | |
mov.u64 %rd2605, 842468239; | |
mov.u64 %rd2603, 3986602516; | |
mov.u64 %rd2602, 1013904242; | |
mov.u64 %rd2600, 3668340011; | |
mov.u64 %rd2599, 3144134277; | |
mov.u64 %rd2597, 3449720151; | |
mov.u64 %rd2596, 1993301258; | |
mov.u64 %rd2595, 3528531795; | |
bra.uni LBB80_29; | |
// LBB80_28: taken arm of the `@%p8 bra` split. It fills the same registers as the
// fall-through arm above it (%rd2594..%rd2612, %r330..%r332), which LBB80_29 then
// consumes, with the multipliers 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53)
// and the paired XOR constants in swapped roles.
// NOTE(review): the constants match published Philox4x32 values -- confirm.
LBB80_28: | |
// %rd241 was formed as %rd12 + offset before the branch; an unsigned %rd241 < %rd12
// means that add wrapped, so 1 is carried into %rd2464.
setp.lt.u64 %p34, %rd241, %rd12; | |
selp.u64 %rd1447, 1, 0, %p34; | |
add.s64 %rd1448, %rd2464, %rd1447; | |
// 32x32->64 products: `and ..., 4294967295` keeps the low word, `shr ..., 32` the high word.
and.b64 %rd1449, %rd1448, 4294967295; | |
mul.lo.s64 %rd2598, %rd1449, 3449720151; | |
xor.b64 %rd1450, %rd2598, %rd241; | |
shr.u64 %rd1451, %rd1450, 32; | |
mul.lo.s64 %rd2601, %rd1451, 3528531795; | |
shr.u64 %rd1452, %rd2601, 32; | |
and.b64 %rd1453, %rd241, 4294967295; | |
mul.lo.s64 %rd1454, %rd1453, 3528531795; | |
and.b64 %rd1455, %rd1454, 4294967295; | |
xor.b64 %rd1456, %rd1455, %rd1452; | |
xor.b64 %rd1457, %rd1456, 3144134277; | |
mul.lo.s64 %rd2604, %rd1457, 3449720151; | |
xor.b64 %rd2594, %rd1448, %rd1454; | |
// Constants selected for this arm; read by the shared code at LBB80_29.
mov.u32 %r332, -1767562579; | |
mov.u32 %r331, -766435501; | |
mov.u32 %r330, 1401181199; | |
mov.u64 %rd2612, 4055616968; | |
mov.u64 %rd2611, 1684936478; | |
mov.u64 %rd2610, 534103459; | |
mov.u64 %rd2609, 387276957; | |
mov.u64 %rd2608, 3041712726; | |
mov.u64 %rd2607, 3986602516; | |
mov.u64 %rd2606, 2835769497; | |
mov.u64 %rd2605, 3668340011; | |
mov.u64 %rd2603, 2027808484; | |
mov.u64 %rd2602, 1993301258; | |
mov.u64 %rd2600, 842468239; | |
mov.u64 %rd2599, 2654435769; | |
mov.u64 %rd2597, 3528531795; | |
mov.u64 %rd2596, 1013904242; | |
mov.u64 %rd2595, 3449720151; | |
// LBB80_29: join point of the two arms above. It runs the remaining multiply/XOR
// rounds with whichever constants the arm selected (%rd2595..%rd2612, %r330..%r332),
// converts the result to a keep/zero predicate, and adds one squared term to the
// running f32 sum (%f11 -> %f12). The tail computes the next counter, branches on %p37,
// and holds the fall-through arm that pairs with LBB80_31.
LBB80_29: | |
// Rounds: each step multiplies by %rd2595 or %rd2597, splits the product into
// high (`shr 32`) and low (`and 4294967295`) words, and XORs in a round constant.
shr.u64 %rd1484, %rd2604, 32; | |
shr.u64 %rd1485, %rd2594, 32; | |
mul.lo.s64 %rd1486, %rd1485, %rd2595; | |
and.b64 %rd1487, %rd1486, 4294967295; | |
xor.b64 %rd1488, %rd1487, %rd1484; | |
xor.b64 %rd1489, %rd1488, %rd2596; | |
mul.lo.s64 %rd1490, %rd1489, %rd2597; | |
shr.u64 %rd1491, %rd1490, 32; | |
shr.u64 %rd1492, %rd1486, 32; | |
and.b64 %rd1493, %rd2598, 4294967295; | |
xor.b64 %rd1494, %rd1493, %rd1492; | |
xor.b64 %rd1495, %rd1494, %rd2599; | |
mul.lo.s64 %rd1496, %rd1495, %rd2597; | |
and.b64 %rd1497, %rd1496, 4294967295; | |
xor.b64 %rd1498, %rd1497, %rd1491; | |
xor.b64 %rd1499, %rd1498, %rd2600; | |
mul.lo.s64 %rd1500, %rd1499, %rd2595; | |
shr.u64 %rd1501, %rd1500, 32; | |
shr.u64 %rd1502, %rd1496, 32; | |
and.b64 %rd1503, %rd2601, 4294967295; | |
xor.b64 %rd1504, %rd1503, %rd1502; | |
xor.b64 %rd1505, %rd1504, %rd2602; | |
mul.lo.s64 %rd1506, %rd1505, %rd2595; | |
and.b64 %rd1507, %rd1506, 4294967295; | |
xor.b64 %rd1508, %rd1507, %rd1501; | |
xor.b64 %rd1509, %rd1508, %rd2603; | |
mul.lo.s64 %rd1510, %rd1509, %rd2597; | |
shr.u64 %rd1511, %rd1510, 32; | |
shr.u64 %rd1512, %rd1506, 32; | |
and.b64 %rd1513, %rd2604, 4294967295; | |
xor.b64 %rd1514, %rd1513, %rd1512; | |
xor.b64 %rd1515, %rd1514, %rd2605; | |
mul.lo.s64 %rd1516, %rd1515, %rd2597; | |
and.b64 %rd1517, %rd1516, 4294967295; | |
xor.b64 %rd1518, %rd1517, %rd1511; | |
xor.b64 %rd1519, %rd1518, %rd2606; | |
mul.lo.s64 %rd1520, %rd1519, %rd2595; | |
shr.u64 %rd1521, %rd1520, 32; | |
shr.u64 %rd1522, %rd1516, 32; | |
and.b64 %rd1523, %rd1490, 4294967295; | |
xor.b64 %rd1524, %rd1523, %rd1522; | |
xor.b64 %rd1525, %rd1524, %rd2607; | |
mul.lo.s64 %rd1526, %rd1525, %rd2595; | |
and.b64 %rd1527, %rd1526, 4294967295; | |
xor.b64 %rd1528, %rd1527, %rd1521; | |
xor.b64 %rd1529, %rd1528, %rd2608; | |
mul.lo.s64 %rd1530, %rd1529, %rd2597; | |
shr.u64 %rd1531, %rd1530, 32; | |
shr.u64 %rd1532, %rd1526, 32; | |
and.b64 %rd1533, %rd1500, 4294967295; | |
xor.b64 %rd1534, %rd1533, %rd1532; | |
xor.b64 %rd1535, %rd1534, %rd2609; | |
mul.lo.s64 %rd1536, %rd1535, %rd2597; | |
and.b64 %rd1537, %rd1536, 4294967295; | |
xor.b64 %rd1538, %rd1537, %rd1531; | |
xor.b64 %rd1539, %rd1538, %rd2610; | |
mul.lo.s64 %rd1540, %rd1539, %rd2595; | |
shr.u64 %rd1541, %rd1540, 32; | |
shr.u64 %rd1542, %rd1536, 32; | |
and.b64 %rd1543, %rd1510, 4294967295; | |
xor.b64 %rd1544, %rd1543, %rd1542; | |
xor.b64 %rd1545, %rd1544, %rd2611; | |
mul.lo.s64 %rd1546, %rd1545, %rd2595; | |
and.b64 %rd1547, %rd1546, 4294967295; | |
xor.b64 %rd1548, %rd1547, %rd1541; | |
xor.b64 %rd1549, %rd1548, %rd2612; | |
mul.lo.s64 %rd1550, %rd1549, %rd2597; | |
// Fold to one 32-bit word and apply the arm-selected 32-bit constants.
shr.u64 %rd1551, %rd1550, 32; | |
cvt.u32.u64 %r166, %rd1551; | |
shr.u64 %rd1552, %rd1546, 32; | |
xor.b64 %rd1553, %rd1552, %rd1520; | |
cvt.u32.u64 %r167, %rd1553; | |
xor.b32 %r168, %r330, %r167; | |
mul.lo.s32 %r169, %r168, %r331; | |
xor.b32 %r170, %r169, %r166; | |
xor.b32 %r171, %r170, %r332; | |
// Keep the top 23 bits; 0f34000000 is 2^-23, so %f146 lies in [0, 1).
shr.u32 %r172, %r171, 9; | |
cvt.rn.f32.u32 %f145, %r172; | |
mul.rn.f32 %f146, %f145, 0f34000000; | |
cvt.rn.f16.f32 %h73, %f146; | |
// 0x2E66 is f16 ~0.1: %p36 is true when the random value is >= ~0.1.
mov.b16 %h74, 0x2E66; | |
setp.ge.f16 %p36, %h73, %h74; | |
// Byte offsets +1024 (2-byte loads) and +2048 (4-byte loads) both equal element index 512.
ld.global.nc.b16 %h75, [%rd45+1024]; | |
ld.global.nc.f32 %f147, [%rd46+2048]; | |
cvt.rn.f16.f32 %h76, %f147; | |
add.rn.f16 %h77, %h75, %h76; | |
// 0x3C72 is f16 ~1.111 (about 1/0.9); the scaled sum is kept when %p36 holds, else 0.
// NOTE(review): looks like dropout with rate ~0.1 -- confirm against the HLO.
mov.b16 %h78, 0x3C72; | |
mul.rn.f16 %h79, %h77, %h78; | |
selp.b16 %h80, %h79, 0x0000, %p36; | |
cvt.f32.f16 %f148, %h80; | |
ld.global.nc.b16 %h81, [%rd47+1024]; | |
cvt.f32.f16 %f149, %h81; | |
ld.global.nc.f32 %f150, [%rd48+2048]; | |
mul.rn.f32 %f151, %f1, %f150; | |
mul.rn.f32 %f152, %f151, %f149; | |
ld.global.nc.f32 %f153, [%rd49+2048]; | |
mul.rn.f32 %f154, %f2, %f151; | |
sub.rn.f32 %f155, %f153, %f154; | |
add.rn.f32 %f156, %f152, %f155; | |
add.rn.f32 %f157, %f156, %f148; | |
// %f12 = %f11 + (%f157 - %f3)^2 : running sum of squared differences.
sub.rn.f32 %f158, %f157, %f3; | |
mul.rn.f32 %f159, %f158, %f158; | |
add.rn.f32 %f12, %f11, %f159; | |
// Next element (index 513): counter %rd269 = %rd12 + ((%r3 | 513 | %r4) >> 2);
// %p37 = ((%r3 | 513) & 3) != 1 selects the arm.
or.b32 %r173, %r3, 513; | |
or.b32 %r174, %r173, %r4; | |
and.b32 %r175, %r173, 3; | |
shr.u32 %r176, %r174, 2; | |
setp.ne.s32 %p37, %r175, 1; | |
cvt.u64.u32 %rd1554, %r176; | |
add.s64 %rd269, %rd12, %rd1554; | |
@%p37 bra LBB80_31; | |
// Fall-through arm (%p37 clear): counterpart of LBB80_31 with the multiplier roles swapped.
and.b64 %rd1594, %rd269, 4294967295; | |
mul.lo.s64 %rd2617, %rd1594, 3528531795; | |
// Carry into %rd2464 if %rd12 + offset wrapped.
setp.lt.u64 %p39, %rd269, %rd12; | |
selp.u64 %rd1595, 1, 0, %p39; | |
add.s64 %rd1596, %rd2464, %rd1595; | |
xor.b64 %rd1597, %rd1596, %rd2617; | |
shr.u64 %rd1598, %rd1597, 32; | |
mul.lo.s64 %rd2620, %rd1598, 3449720151; | |
shr.u64 %rd1599, %rd2620, 32; | |
and.b64 %rd1600, %rd1596, 4294967295; | |
mul.lo.s64 %rd1601, %rd1600, 3449720151; | |
and.b64 %rd1602, %rd1601, 4294967295; | |
xor.b64 %rd1603, %rd1602, %rd1599; | |
xor.b64 %rd1604, %rd1603, 2654435769; | |
mul.lo.s64 %rd2623, %rd1604, 3528531795; | |
xor.b64 %rd2613, %rd1601, %rd269; | |
// Constants selected for this arm; read by the shared code at LBB80_32.
mov.u32 %r334, -845247145; | |
mov.u32 %r333, -616729560; | |
mov.u64 %rd2630, 3041712726; | |
mov.u64 %rd2629, 1401181199; | |
mov.u64 %rd2628, 2835769497; | |
mov.u64 %rd2627, 1684936478; | |
mov.u64 %rd2626, 2027808484; | |
mov.u64 %rd2625, 387276957; | |
mov.u64 %rd2624, 842468239; | |
mov.u64 %rd2622, 3986602516; | |
mov.u64 %rd2621, 1013904242; | |
mov.u64 %rd2619, 3668340011; | |
mov.u64 %rd2618, 3144134277; | |
mov.u64 %rd2616, 3449720151; | |
mov.u64 %rd2615, 1993301258; | |
mov.u64 %rd2614, 3528531795; | |
bra.uni LBB80_32; | |
// LBB80_31: taken arm of the `@%p37 bra` split. It fills the same registers as the
// fall-through arm above it (%rd2613..%rd2630, %r333, %r334), which LBB80_32 then
// consumes, with the multipliers 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53)
// and the paired XOR constants in swapped roles.
// NOTE(review): the constants match published Philox4x32 values -- confirm.
LBB80_31: | |
// %rd269 was formed as %rd12 + offset before the branch; an unsigned %rd269 < %rd12
// means that add wrapped, so 1 is carried into %rd2464.
setp.lt.u64 %p38, %rd269, %rd12; | |
selp.u64 %rd1569, 1, 0, %p38; | |
add.s64 %rd1570, %rd2464, %rd1569; | |
// 32x32->64 products: `and ..., 4294967295` keeps the low word, `shr ..., 32` the high word.
and.b64 %rd1571, %rd1570, 4294967295; | |
mul.lo.s64 %rd2617, %rd1571, 3449720151; | |
xor.b64 %rd1572, %rd2617, %rd269; | |
shr.u64 %rd1573, %rd1572, 32; | |
mul.lo.s64 %rd2620, %rd1573, 3528531795; | |
shr.u64 %rd1574, %rd2620, 32; | |
and.b64 %rd1575, %rd269, 4294967295; | |
mul.lo.s64 %rd1576, %rd1575, 3528531795; | |
and.b64 %rd1577, %rd1576, 4294967295; | |
xor.b64 %rd1578, %rd1577, %rd1574; | |
xor.b64 %rd1579, %rd1578, 3144134277; | |
mul.lo.s64 %rd2623, %rd1579, 3449720151; | |
xor.b64 %rd2613, %rd1570, %rd1576; | |
// Constants selected for this arm; read by the shared code at LBB80_32.
mov.u32 %r334, -766435501; | |
mov.u32 %r333, -239350328; | |
mov.u64 %rd2630, 1684936478; | |
mov.u64 %rd2629, 534103459; | |
mov.u64 %rd2628, 387276957; | |
mov.u64 %rd2627, 3041712726; | |
mov.u64 %rd2626, 3986602516; | |
mov.u64 %rd2625, 2835769497; | |
mov.u64 %rd2624, 3668340011; | |
mov.u64 %rd2622, 2027808484; | |
mov.u64 %rd2621, 1993301258; | |
mov.u64 %rd2619, 842468239; | |
mov.u64 %rd2618, 2654435769; | |
mov.u64 %rd2616, 3528531795; | |
mov.u64 %rd2615, 1013904242; | |
mov.u64 %rd2614, 3449720151; | |
// LBB80_32: join point of the two arms above. It runs the remaining multiply/XOR
// rounds with whichever constants the arm selected (%rd2614..%rd2630, %r333, %r334),
// converts the result to a keep/zero predicate, and adds one squared term to the
// running f32 sum (%f12 -> %f13). The tail computes the next counter, branches on %p8,
// and holds the fall-through arm that pairs with LBB80_34.
LBB80_32: | |
// Rounds: each step multiplies by %rd2614 or %rd2616, splits the product into
// high (`shr 32`) and low (`and 4294967295`) words, and XORs in a round constant.
shr.u64 %rd1605, %rd2623, 32; | |
shr.u64 %rd1606, %rd2613, 32; | |
mul.lo.s64 %rd1607, %rd1606, %rd2614; | |
and.b64 %rd1608, %rd1607, 4294967295; | |
xor.b64 %rd1609, %rd1608, %rd1605; | |
xor.b64 %rd1610, %rd1609, %rd2615; | |
mul.lo.s64 %rd1611, %rd1610, %rd2616; | |
shr.u64 %rd1612, %rd1611, 32; | |
shr.u64 %rd1613, %rd1607, 32; | |
and.b64 %rd1614, %rd2617, 4294967295; | |
xor.b64 %rd1615, %rd1614, %rd1613; | |
xor.b64 %rd1616, %rd1615, %rd2618; | |
mul.lo.s64 %rd1617, %rd1616, %rd2616; | |
and.b64 %rd1618, %rd1617, 4294967295; | |
xor.b64 %rd1619, %rd1618, %rd1612; | |
xor.b64 %rd1620, %rd1619, %rd2619; | |
mul.lo.s64 %rd1621, %rd1620, %rd2614; | |
shr.u64 %rd1622, %rd1621, 32; | |
shr.u64 %rd1623, %rd1617, 32; | |
and.b64 %rd1624, %rd2620, 4294967295; | |
xor.b64 %rd1625, %rd1624, %rd1623; | |
xor.b64 %rd1626, %rd1625, %rd2621; | |
mul.lo.s64 %rd1627, %rd1626, %rd2614; | |
and.b64 %rd1628, %rd1627, 4294967295; | |
xor.b64 %rd1629, %rd1628, %rd1622; | |
xor.b64 %rd1630, %rd1629, %rd2622; | |
mul.lo.s64 %rd1631, %rd1630, %rd2616; | |
shr.u64 %rd1632, %rd1631, 32; | |
shr.u64 %rd1633, %rd1627, 32; | |
and.b64 %rd1634, %rd2623, 4294967295; | |
xor.b64 %rd1635, %rd1634, %rd1633; | |
xor.b64 %rd1636, %rd1635, %rd2624; | |
mul.lo.s64 %rd1637, %rd1636, %rd2616; | |
and.b64 %rd1638, %rd1637, 4294967295; | |
xor.b64 %rd1639, %rd1638, %rd1632; | |
xor.b64 %rd1640, %rd1639, %rd2625; | |
mul.lo.s64 %rd1641, %rd1640, %rd2614; | |
shr.u64 %rd1642, %rd1641, 32; | |
shr.u64 %rd1643, %rd1637, 32; | |
and.b64 %rd1644, %rd1611, 4294967295; | |
xor.b64 %rd1645, %rd1644, %rd1643; | |
xor.b64 %rd1646, %rd1645, %rd2626; | |
mul.lo.s64 %rd1647, %rd1646, %rd2614; | |
and.b64 %rd1648, %rd1647, 4294967295; | |
xor.b64 %rd1649, %rd1648, %rd1642; | |
xor.b64 %rd1650, %rd1649, %rd2627; | |
mul.lo.s64 %rd1651, %rd1650, %rd2616; | |
shr.u64 %rd1652, %rd1651, 32; | |
shr.u64 %rd1653, %rd1647, 32; | |
and.b64 %rd1654, %rd1621, 4294967295; | |
xor.b64 %rd1655, %rd1654, %rd1653; | |
xor.b64 %rd1656, %rd1655, %rd2628; | |
mul.lo.s64 %rd1657, %rd1656, %rd2616; | |
and.b64 %rd1658, %rd1657, 4294967295; | |
xor.b64 %rd1659, %rd1658, %rd1652; | |
xor.b64 %rd1660, %rd1659, %rd2629; | |
mul.lo.s64 %rd1661, %rd1660, %rd2614; | |
shr.u64 %rd1662, %rd1661, 32; | |
shr.u64 %rd1663, %rd1657, 32; | |
xor.b64 %rd1664, %rd1631, %rd1663; | |
xor.b64 %rd1665, %rd1664, %rd2630; | |
mul.lo.s64 %rd1666, %rd1665, %rd2614; | |
// Fold to one 32-bit word and apply the arm-selected 32-bit constants.
xor.b64 %rd1667, %rd1662, %rd1666; | |
cvt.u32.u64 %r181, %rd1667; | |
xor.b32 %r182, %r333, %r181; | |
mul.lo.s32 %r183, %r182, %r334; | |
// Keep the top 23 bits; 0f34000000 is 2^-23, so %f161 lies in [0, 1).
shr.u32 %r184, %r183, 9; | |
cvt.rn.f32.u32 %f160, %r184; | |
mul.rn.f32 %f161, %f160, 0f34000000; | |
cvt.rn.f16.f32 %h82, %f161; | |
// 0x2E66 is f16 ~0.1: %p41 is true when the random value is >= ~0.1.
mov.b16 %h83, 0x2E66; | |
setp.ge.f16 %p41, %h82, %h83; | |
// Byte offsets +1026 (2-byte loads) and +2052 (4-byte loads) both equal element index 513.
ld.global.nc.b16 %h84, [%rd45+1026]; | |
ld.global.nc.f32 %f162, [%rd46+2052]; | |
cvt.rn.f16.f32 %h85, %f162; | |
add.rn.f16 %h86, %h84, %h85; | |
// 0x3C72 is f16 ~1.111 (about 1/0.9); the scaled sum is kept when %p41 holds, else 0.
// NOTE(review): looks like dropout with rate ~0.1 -- confirm against the HLO.
mov.b16 %h87, 0x3C72; | |
mul.rn.f16 %h88, %h86, %h87; | |
selp.b16 %h89, %h88, 0x0000, %p41; | |
cvt.f32.f16 %f163, %h89; | |
ld.global.nc.b16 %h90, [%rd47+1026]; | |
cvt.f32.f16 %f164, %h90; | |
ld.global.nc.f32 %f165, [%rd48+2052]; | |
mul.rn.f32 %f166, %f1, %f165; | |
mul.rn.f32 %f167, %f166, %f164; | |
ld.global.nc.f32 %f168, [%rd49+2052]; | |
mul.rn.f32 %f169, %f2, %f166; | |
sub.rn.f32 %f170, %f168, %f169; | |
add.rn.f32 %f171, %f167, %f170; | |
add.rn.f32 %f172, %f171, %f163; | |
// %f13 = %f12 + (%f172 - %f3)^2 : running sum of squared differences.
sub.rn.f32 %f173, %f172, %f3; | |
mul.rn.f32 %f174, %f173, %f173; | |
add.rn.f32 %f13, %f12, %f174; | |
// Counter for the next element: %rd296 = %rd12 + ((%r73 | 640) >> 2).
or.b32 %r186, %r73, 640; | |
shr.u32 %r187, %r186, 2; | |
cvt.u64.u32 %rd1668, %r187; | |
add.s64 %rd296, %rd12, %rd1668; | |
@%p8 bra LBB80_34; | |
// Fall-through arm (%p8 clear): counterpart of LBB80_34 with the multiplier roles swapped.
and.b64 %rd1710, %rd296, 4294967295; | |
mul.lo.s64 %rd2635, %rd1710, 3528531795; | |
// Carry into %rd2464 if %rd12 + offset wrapped.
setp.lt.u64 %p43, %rd296, %rd12; | |
selp.u64 %rd1711, 1, 0, %p43; | |
add.s64 %rd1712, %rd2464, %rd1711; | |
xor.b64 %rd1713, %rd1712, %rd2635; | |
shr.u64 %rd1714, %rd1713, 32; | |
mul.lo.s64 %rd2638, %rd1714, 3449720151; | |
shr.u64 %rd1715, %rd2638, 32; | |
and.b64 %rd1716, %rd1712, 4294967295; | |
mul.lo.s64 %rd1717, %rd1716, 3449720151; | |
and.b64 %rd1718, %rd1717, 4294967295; | |
xor.b64 %rd1719, %rd1718, %rd1715; | |
xor.b64 %rd1720, %rd1719, 2654435769; | |
mul.lo.s64 %rd2641, %rd1720, 3528531795; | |
xor.b64 %rd2631, %rd1717, %rd296; | |
// Constants selected for this arm; read by the shared code at LBB80_35.
mov.u32 %r337, -1879881855; | |
mov.u32 %r336, -845247145; | |
mov.u32 %r335, 534103459; | |
mov.u64 %rd2649, 3678237736; | |
mov.u64 %rd2648, 3041712726; | |
mov.u64 %rd2647, 1401181199; | |
mov.u64 %rd2646, 2835769497; | |
mov.u64 %rd2645, 1684936478; | |
mov.u64 %rd2644, 2027808484; | |
mov.u64 %rd2643, 387276957; | |
mov.u64 %rd2642, 842468239; | |
mov.u64 %rd2640, 3986602516; | |
mov.u64 %rd2639, 1013904242; | |
mov.u64 %rd2637, 3668340011; | |
mov.u64 %rd2636, 3144134277; | |
mov.u64 %rd2634, 3449720151; | |
mov.u64 %rd2633, 1993301258; | |
mov.u64 %rd2632, 3528531795; | |
bra.uni LBB80_35; | |
// LBB80_34: taken arm of the `@%p8 bra` split. It fills the same registers as the
// fall-through arm above it (%rd2631..%rd2649, %r335..%r337), which LBB80_35 then
// consumes, with the multipliers 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53)
// and the paired XOR constants in swapped roles.
// NOTE(review): the constants match published Philox4x32 values -- confirm.
LBB80_34: | |
// %rd296 was formed as %rd12 + offset before the branch; an unsigned %rd296 < %rd12
// means that add wrapped, so 1 is carried into %rd2464.
setp.lt.u64 %p42, %rd296, %rd12; | |
selp.u64 %rd1684, 1, 0, %p42; | |
add.s64 %rd1685, %rd2464, %rd1684; | |
// 32x32->64 products: `and ..., 4294967295` keeps the low word, `shr ..., 32` the high word.
and.b64 %rd1686, %rd1685, 4294967295; | |
mul.lo.s64 %rd2635, %rd1686, 3449720151; | |
xor.b64 %rd1687, %rd2635, %rd296; | |
shr.u64 %rd1688, %rd1687, 32; | |
mul.lo.s64 %rd2638, %rd1688, 3528531795; | |
shr.u64 %rd1689, %rd2638, 32; | |
and.b64 %rd1690, %rd296, 4294967295; | |
mul.lo.s64 %rd1691, %rd1690, 3528531795; | |
and.b64 %rd1692, %rd1691, 4294967295; | |
xor.b64 %rd1693, %rd1692, %rd1689; | |
xor.b64 %rd1694, %rd1693, 3144134277; | |
mul.lo.s64 %rd2641, %rd1694, 3449720151; | |
xor.b64 %rd2631, %rd1685, %rd1691; | |
// Constants selected for this arm; read by the shared code at LBB80_35.
mov.u32 %r337, -1767562579; | |
mov.u32 %r336, -766435501; | |
mov.u32 %r335, 1401181199; | |
mov.u64 %rd2649, 4055616968; | |
mov.u64 %rd2648, 1684936478; | |
mov.u64 %rd2647, 534103459; | |
mov.u64 %rd2646, 387276957; | |
mov.u64 %rd2645, 3041712726; | |
mov.u64 %rd2644, 3986602516; | |
mov.u64 %rd2643, 2835769497; | |
mov.u64 %rd2642, 3668340011; | |
mov.u64 %rd2640, 2027808484; | |
mov.u64 %rd2639, 1993301258; | |
mov.u64 %rd2637, 842468239; | |
mov.u64 %rd2636, 2654435769; | |
mov.u64 %rd2634, 3528531795; | |
mov.u64 %rd2633, 1013904242; | |
mov.u64 %rd2632, 3449720151; | |
// LBB80_35: join point of the two arms above. It runs the remaining multiply/XOR
// rounds with whichever constants the arm selected (%rd2632..%rd2649, %r335..%r337),
// converts the result to a keep/zero predicate, and adds one squared term to the
// running f32 sum (%f13 -> %f14). The tail computes the next counter, branches on %p45,
// and holds the fall-through arm that pairs with LBB80_37.
LBB80_35: | |
// Rounds: each step multiplies by %rd2632 or %rd2634, splits the product into
// high (`shr 32`) and low (`and 4294967295`) words, and XORs in a round constant.
shr.u64 %rd1721, %rd2641, 32; | |
shr.u64 %rd1722, %rd2631, 32; | |
mul.lo.s64 %rd1723, %rd1722, %rd2632; | |
and.b64 %rd1724, %rd1723, 4294967295; | |
xor.b64 %rd1725, %rd1724, %rd1721; | |
xor.b64 %rd1726, %rd1725, %rd2633; | |
mul.lo.s64 %rd1727, %rd1726, %rd2634; | |
shr.u64 %rd1728, %rd1727, 32; | |
shr.u64 %rd1729, %rd1723, 32; | |
and.b64 %rd1730, %rd2635, 4294967295; | |
xor.b64 %rd1731, %rd1730, %rd1729; | |
xor.b64 %rd1732, %rd1731, %rd2636; | |
mul.lo.s64 %rd1733, %rd1732, %rd2634; | |
and.b64 %rd1734, %rd1733, 4294967295; | |
xor.b64 %rd1735, %rd1734, %rd1728; | |
xor.b64 %rd1736, %rd1735, %rd2637; | |
mul.lo.s64 %rd1737, %rd1736, %rd2632; | |
shr.u64 %rd1738, %rd1737, 32; | |
shr.u64 %rd1739, %rd1733, 32; | |
and.b64 %rd1740, %rd2638, 4294967295; | |
xor.b64 %rd1741, %rd1740, %rd1739; | |
xor.b64 %rd1742, %rd1741, %rd2639; | |
mul.lo.s64 %rd1743, %rd1742, %rd2632; | |
and.b64 %rd1744, %rd1743, 4294967295; | |
xor.b64 %rd1745, %rd1744, %rd1738; | |
xor.b64 %rd1746, %rd1745, %rd2640; | |
mul.lo.s64 %rd1747, %rd1746, %rd2634; | |
shr.u64 %rd1748, %rd1747, 32; | |
shr.u64 %rd1749, %rd1743, 32; | |
and.b64 %rd1750, %rd2641, 4294967295; | |
xor.b64 %rd1751, %rd1750, %rd1749; | |
xor.b64 %rd1752, %rd1751, %rd2642; | |
mul.lo.s64 %rd1753, %rd1752, %rd2634; | |
and.b64 %rd1754, %rd1753, 4294967295; | |
xor.b64 %rd1755, %rd1754, %rd1748; | |
xor.b64 %rd1756, %rd1755, %rd2643; | |
mul.lo.s64 %rd1757, %rd1756, %rd2632; | |
shr.u64 %rd1758, %rd1757, 32; | |
shr.u64 %rd1759, %rd1753, 32; | |
and.b64 %rd1760, %rd1727, 4294967295; | |
xor.b64 %rd1761, %rd1760, %rd1759; | |
xor.b64 %rd1762, %rd1761, %rd2644; | |
mul.lo.s64 %rd1763, %rd1762, %rd2632; | |
and.b64 %rd1764, %rd1763, 4294967295; | |
xor.b64 %rd1765, %rd1764, %rd1758; | |
xor.b64 %rd1766, %rd1765, %rd2645; | |
mul.lo.s64 %rd1767, %rd1766, %rd2634; | |
shr.u64 %rd1768, %rd1767, 32; | |
shr.u64 %rd1769, %rd1763, 32; | |
and.b64 %rd1770, %rd1737, 4294967295; | |
xor.b64 %rd1771, %rd1770, %rd1769; | |
xor.b64 %rd1772, %rd1771, %rd2646; | |
mul.lo.s64 %rd1773, %rd1772, %rd2634; | |
and.b64 %rd1774, %rd1773, 4294967295; | |
xor.b64 %rd1775, %rd1774, %rd1768; | |
xor.b64 %rd1776, %rd1775, %rd2647; | |
mul.lo.s64 %rd1777, %rd1776, %rd2632; | |
shr.u64 %rd1778, %rd1777, 32; | |
shr.u64 %rd1779, %rd1773, 32; | |
and.b64 %rd1780, %rd1747, 4294967295; | |
xor.b64 %rd1781, %rd1780, %rd1779; | |
xor.b64 %rd1782, %rd1781, %rd2648; | |
mul.lo.s64 %rd1783, %rd1782, %rd2632; | |
and.b64 %rd1784, %rd1783, 4294967295; | |
xor.b64 %rd1785, %rd1784, %rd1778; | |
xor.b64 %rd1786, %rd1785, %rd2649; | |
mul.lo.s64 %rd1787, %rd1786, %rd2634; | |
// Fold to one 32-bit word and apply the arm-selected 32-bit constants.
shr.u64 %rd1788, %rd1787, 32; | |
cvt.u32.u64 %r194, %rd1788; | |
shr.u64 %rd1789, %rd1783, 32; | |
xor.b64 %rd1790, %rd1789, %rd1757; | |
cvt.u32.u64 %r195, %rd1790; | |
xor.b32 %r196, %r335, %r195; | |
mul.lo.s32 %r197, %r196, %r336; | |
xor.b32 %r198, %r197, %r194; | |
xor.b32 %r199, %r198, %r337; | |
// Keep the top 23 bits; 0f34000000 is 2^-23, so %f176 lies in [0, 1).
shr.u32 %r200, %r199, 9; | |
cvt.rn.f32.u32 %f175, %r200; | |
mul.rn.f32 %f176, %f175, 0f34000000; | |
cvt.rn.f16.f32 %h91, %f176; | |
// 0x2E66 is f16 ~0.1: %p44 is true when the random value is >= ~0.1.
mov.b16 %h92, 0x2E66; | |
setp.ge.f16 %p44, %h91, %h92; | |
// Byte offsets +1280 (2-byte loads) and +2560 (4-byte loads) both equal element index 640.
ld.global.nc.b16 %h93, [%rd45+1280]; | |
ld.global.nc.f32 %f177, [%rd46+2560]; | |
cvt.rn.f16.f32 %h94, %f177; | |
add.rn.f16 %h95, %h93, %h94; | |
// 0x3C72 is f16 ~1.111 (about 1/0.9); the scaled sum is kept when %p44 holds, else 0.
// NOTE(review): looks like dropout with rate ~0.1 -- confirm against the HLO.
mov.b16 %h96, 0x3C72; | |
mul.rn.f16 %h97, %h95, %h96; | |
selp.b16 %h98, %h97, 0x0000, %p44; | |
cvt.f32.f16 %f178, %h98; | |
ld.global.nc.b16 %h99, [%rd47+1280]; | |
cvt.f32.f16 %f179, %h99; | |
ld.global.nc.f32 %f180, [%rd48+2560]; | |
mul.rn.f32 %f181, %f1, %f180; | |
mul.rn.f32 %f182, %f181, %f179; | |
ld.global.nc.f32 %f183, [%rd49+2560]; | |
mul.rn.f32 %f184, %f2, %f181; | |
sub.rn.f32 %f185, %f183, %f184; | |
add.rn.f32 %f186, %f182, %f185; | |
add.rn.f32 %f187, %f186, %f178; | |
// %f14 = %f13 + (%f187 - %f3)^2 : running sum of squared differences.
sub.rn.f32 %f188, %f187, %f3; | |
mul.rn.f32 %f189, %f188, %f188; | |
add.rn.f32 %f14, %f13, %f189; | |
// Next element (index 641): counter %rd324 = %rd12 + ((%r3 | 641 | %r4) >> 2);
// %p45 = ((%r3 | 641) & 3) != 1 selects the arm.
or.b32 %r201, %r3, 641; | |
or.b32 %r202, %r201, %r4; | |
and.b32 %r203, %r201, 3; | |
shr.u32 %r204, %r202, 2; | |
setp.ne.s32 %p45, %r203, 1; | |
cvt.u64.u32 %rd1791, %r204; | |
add.s64 %rd324, %rd12, %rd1791; | |
@%p45 bra LBB80_37; | |
// Fall-through arm (%p45 clear): counterpart of LBB80_37 with the multiplier roles swapped.
and.b64 %rd1831, %rd324, 4294967295; | |
mul.lo.s64 %rd2654, %rd1831, 3528531795; | |
// Carry into %rd2464 if %rd12 + offset wrapped.
setp.lt.u64 %p47, %rd324, %rd12; | |
selp.u64 %rd1832, 1, 0, %p47; | |
add.s64 %rd1833, %rd2464, %rd1832; | |
xor.b64 %rd1834, %rd1833, %rd2654; | |
shr.u64 %rd1835, %rd1834, 32; | |
mul.lo.s64 %rd2657, %rd1835, 3449720151; | |
shr.u64 %rd1836, %rd2657, 32; | |
and.b64 %rd1837, %rd1833, 4294967295; | |
mul.lo.s64 %rd1838, %rd1837, 3449720151; | |
and.b64 %rd1839, %rd1838, 4294967295; | |
xor.b64 %rd1840, %rd1839, %rd1836; | |
xor.b64 %rd1841, %rd1840, 2654435769; | |
mul.lo.s64 %rd2660, %rd1841, 3528531795; | |
xor.b64 %rd2650, %rd1838, %rd324; | |
// Constants selected for this arm; read by the shared code at LBB80_38.
mov.u32 %r339, -845247145; | |
mov.u32 %r338, -616729560; | |
mov.u64 %rd2667, 3041712726; | |
mov.u64 %rd2666, 1401181199; | |
mov.u64 %rd2665, 2835769497; | |
mov.u64 %rd2664, 1684936478; | |
mov.u64 %rd2663, 2027808484; | |
mov.u64 %rd2662, 387276957; | |
mov.u64 %rd2661, 842468239; | |
mov.u64 %rd2659, 3986602516; | |
mov.u64 %rd2658, 1013904242; | |
mov.u64 %rd2656, 3668340011; | |
mov.u64 %rd2655, 3144134277; | |
mov.u64 %rd2653, 3449720151; | |
mov.u64 %rd2652, 1993301258; | |
mov.u64 %rd2651, 3528531795; | |
bra.uni LBB80_38; | |
// LBB80_37: taken arm of the `@%p45 bra` split. It fills the same registers as the
// fall-through arm above it (%rd2650..%rd2667, %r338, %r339), which LBB80_38 then
// consumes, with the multipliers 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53)
// and the paired XOR constants in swapped roles.
// NOTE(review): the constants match published Philox4x32 values -- confirm.
LBB80_37: | |
// %rd324 was formed as %rd12 + offset before the branch; an unsigned %rd324 < %rd12
// means that add wrapped, so 1 is carried into %rd2464.
setp.lt.u64 %p46, %rd324, %rd12; | |
selp.u64 %rd1806, 1, 0, %p46; | |
add.s64 %rd1807, %rd2464, %rd1806; | |
// 32x32->64 products: `and ..., 4294967295` keeps the low word, `shr ..., 32` the high word.
and.b64 %rd1808, %rd1807, 4294967295; | |
mul.lo.s64 %rd2654, %rd1808, 3449720151; | |
xor.b64 %rd1809, %rd2654, %rd324; | |
shr.u64 %rd1810, %rd1809, 32; | |
mul.lo.s64 %rd2657, %rd1810, 3528531795; | |
shr.u64 %rd1811, %rd2657, 32; | |
and.b64 %rd1812, %rd324, 4294967295; | |
mul.lo.s64 %rd1813, %rd1812, 3528531795; | |
and.b64 %rd1814, %rd1813, 4294967295; | |
xor.b64 %rd1815, %rd1814, %rd1811; | |
xor.b64 %rd1816, %rd1815, 3144134277; | |
mul.lo.s64 %rd2660, %rd1816, 3449720151; | |
xor.b64 %rd2650, %rd1807, %rd1813; | |
// Constants selected for this arm; read by the shared code at LBB80_38.
mov.u32 %r339, -766435501; | |
mov.u32 %r338, -239350328; | |
mov.u64 %rd2667, 1684936478; | |
mov.u64 %rd2666, 534103459; | |
mov.u64 %rd2665, 387276957; | |
mov.u64 %rd2664, 3041712726; | |
mov.u64 %rd2663, 3986602516; | |
mov.u64 %rd2662, 2835769497; | |
mov.u64 %rd2661, 3668340011; | |
mov.u64 %rd2659, 2027808484; | |
mov.u64 %rd2658, 1993301258; | |
mov.u64 %rd2656, 842468239; | |
mov.u64 %rd2655, 2654435769; | |
mov.u64 %rd2653, 3528531795; | |
mov.u64 %rd2652, 1013904242; | |
mov.u64 %rd2651, 3449720151; | |
// LBB80_38: join point of the two arms above. It runs the remaining multiply/XOR
// rounds with whichever constants the arm selected (%rd2651..%rd2667, %r338, %r339),
// converts the result to a keep/zero predicate, and adds one squared term to the
// running f32 sum (%f14 -> %f15). The tail computes the next counter, branches on %p8,
// and holds the fall-through arm that pairs with LBB80_40.
LBB80_38: | |
// Rounds: each step multiplies by %rd2651 or %rd2653, splits the product into
// high (`shr 32`) and low (`and 4294967295`) words, and XORs in a round constant.
shr.u64 %rd1842, %rd2660, 32; | |
shr.u64 %rd1843, %rd2650, 32; | |
mul.lo.s64 %rd1844, %rd1843, %rd2651; | |
and.b64 %rd1845, %rd1844, 4294967295; | |
xor.b64 %rd1846, %rd1845, %rd1842; | |
xor.b64 %rd1847, %rd1846, %rd2652; | |
mul.lo.s64 %rd1848, %rd1847, %rd2653; | |
shr.u64 %rd1849, %rd1848, 32; | |
shr.u64 %rd1850, %rd1844, 32; | |
and.b64 %rd1851, %rd2654, 4294967295; | |
xor.b64 %rd1852, %rd1851, %rd1850; | |
xor.b64 %rd1853, %rd1852, %rd2655; | |
mul.lo.s64 %rd1854, %rd1853, %rd2653; | |
and.b64 %rd1855, %rd1854, 4294967295; | |
xor.b64 %rd1856, %rd1855, %rd1849; | |
xor.b64 %rd1857, %rd1856, %rd2656; | |
mul.lo.s64 %rd1858, %rd1857, %rd2651; | |
shr.u64 %rd1859, %rd1858, 32; | |
shr.u64 %rd1860, %rd1854, 32; | |
and.b64 %rd1861, %rd2657, 4294967295; | |
xor.b64 %rd1862, %rd1861, %rd1860; | |
xor.b64 %rd1863, %rd1862, %rd2658; | |
mul.lo.s64 %rd1864, %rd1863, %rd2651; | |
and.b64 %rd1865, %rd1864, 4294967295; | |
xor.b64 %rd1866, %rd1865, %rd1859; | |
xor.b64 %rd1867, %rd1866, %rd2659; | |
mul.lo.s64 %rd1868, %rd1867, %rd2653; | |
shr.u64 %rd1869, %rd1868, 32; | |
shr.u64 %rd1870, %rd1864, 32; | |
and.b64 %rd1871, %rd2660, 4294967295; | |
xor.b64 %rd1872, %rd1871, %rd1870; | |
xor.b64 %rd1873, %rd1872, %rd2661; | |
mul.lo.s64 %rd1874, %rd1873, %rd2653; | |
and.b64 %rd1875, %rd1874, 4294967295; | |
xor.b64 %rd1876, %rd1875, %rd1869; | |
xor.b64 %rd1877, %rd1876, %rd2662; | |
mul.lo.s64 %rd1878, %rd1877, %rd2651; | |
shr.u64 %rd1879, %rd1878, 32; | |
shr.u64 %rd1880, %rd1874, 32; | |
and.b64 %rd1881, %rd1848, 4294967295; | |
xor.b64 %rd1882, %rd1881, %rd1880; | |
xor.b64 %rd1883, %rd1882, %rd2663; | |
mul.lo.s64 %rd1884, %rd1883, %rd2651; | |
and.b64 %rd1885, %rd1884, 4294967295; | |
xor.b64 %rd1886, %rd1885, %rd1879; | |
xor.b64 %rd1887, %rd1886, %rd2664; | |
mul.lo.s64 %rd1888, %rd1887, %rd2653; | |
shr.u64 %rd1889, %rd1888, 32; | |
shr.u64 %rd1890, %rd1884, 32; | |
and.b64 %rd1891, %rd1858, 4294967295; | |
xor.b64 %rd1892, %rd1891, %rd1890; | |
xor.b64 %rd1893, %rd1892, %rd2665; | |
mul.lo.s64 %rd1894, %rd1893, %rd2653; | |
and.b64 %rd1895, %rd1894, 4294967295; | |
xor.b64 %rd1896, %rd1895, %rd1889; | |
xor.b64 %rd1897, %rd1896, %rd2666; | |
mul.lo.s64 %rd1898, %rd1897, %rd2651; | |
shr.u64 %rd1899, %rd1898, 32; | |
shr.u64 %rd1900, %rd1894, 32; | |
xor.b64 %rd1901, %rd1868, %rd1900; | |
xor.b64 %rd1902, %rd1901, %rd2667; | |
mul.lo.s64 %rd1903, %rd1902, %rd2651; | |
// Fold to one 32-bit word and apply the arm-selected 32-bit constants.
xor.b64 %rd1904, %rd1899, %rd1903; | |
cvt.u32.u64 %r209, %rd1904; | |
xor.b32 %r210, %r338, %r209; | |
mul.lo.s32 %r211, %r210, %r339; | |
// Keep the top 23 bits; 0f34000000 is 2^-23, so %f191 lies in [0, 1).
shr.u32 %r212, %r211, 9; | |
cvt.rn.f32.u32 %f190, %r212; | |
mul.rn.f32 %f191, %f190, 0f34000000; | |
cvt.rn.f16.f32 %h100, %f191; | |
// 0x2E66 is f16 ~0.1: %p49 is true when the random value is >= ~0.1.
mov.b16 %h101, 0x2E66; | |
setp.ge.f16 %p49, %h100, %h101; | |
// Byte offsets +1282 (2-byte loads) and +2564 (4-byte loads) both equal element index 641.
ld.global.nc.b16 %h102, [%rd45+1282]; | |
ld.global.nc.f32 %f192, [%rd46+2564]; | |
cvt.rn.f16.f32 %h103, %f192; | |
add.rn.f16 %h104, %h102, %h103; | |
// 0x3C72 is f16 ~1.111 (about 1/0.9); the scaled sum is kept when %p49 holds, else 0.
// NOTE(review): looks like dropout with rate ~0.1 -- confirm against the HLO.
mov.b16 %h105, 0x3C72; | |
mul.rn.f16 %h106, %h104, %h105; | |
selp.b16 %h107, %h106, 0x0000, %p49; | |
cvt.f32.f16 %f193, %h107; | |
ld.global.nc.b16 %h108, [%rd47+1282]; | |
cvt.f32.f16 %f194, %h108; | |
ld.global.nc.f32 %f195, [%rd48+2564]; | |
mul.rn.f32 %f196, %f1, %f195; | |
mul.rn.f32 %f197, %f196, %f194; | |
ld.global.nc.f32 %f198, [%rd49+2564]; | |
mul.rn.f32 %f199, %f2, %f196; | |
sub.rn.f32 %f200, %f198, %f199; | |
add.rn.f32 %f201, %f197, %f200; | |
add.rn.f32 %f202, %f201, %f193; | |
// %f15 = %f14 + (%f202 - %f3)^2 : running sum of squared differences.
sub.rn.f32 %f203, %f202, %f3; | |
mul.rn.f32 %f204, %f203, %f203; | |
add.rn.f32 %f15, %f14, %f204; | |
// Counter for the next element: %rd351 = %rd12 + ((%r73 | 768) >> 2).
or.b32 %r214, %r73, 768; | |
shr.u32 %r215, %r214, 2; | |
cvt.u64.u32 %rd1905, %r215; | |
add.s64 %rd351, %rd12, %rd1905; | |
@%p8 bra LBB80_40; | |
// Fall-through arm (%p8 clear): counterpart of LBB80_40 with the multiplier roles swapped.
and.b64 %rd1947, %rd351, 4294967295; | |
mul.lo.s64 %rd2672, %rd1947, 3528531795; | |
// Carry into %rd2464 if %rd12 + offset wrapped.
setp.lt.u64 %p51, %rd351, %rd12; | |
selp.u64 %rd1948, 1, 0, %p51; | |
add.s64 %rd1949, %rd2464, %rd1948; | |
xor.b64 %rd1950, %rd1949, %rd2672; | |
shr.u64 %rd1951, %rd1950, 32; | |
mul.lo.s64 %rd2675, %rd1951, 3449720151; | |
shr.u64 %rd1952, %rd2675, 32; | |
and.b64 %rd1953, %rd1949, 4294967295; | |
mul.lo.s64 %rd1954, %rd1953, 3449720151; | |
and.b64 %rd1955, %rd1954, 4294967295; | |
xor.b64 %rd1956, %rd1955, %rd1952; | |
xor.b64 %rd1957, %rd1956, 2654435769; | |
mul.lo.s64 %rd2678, %rd1957, 3528531795; | |
xor.b64 %rd2668, %rd1954, %rd351; | |
// Constants selected for this arm; presumably read after the join at LBB80_41 (not visible here).
mov.u32 %r342, -1879881855; | |
mov.u32 %r341, -845247145; | |
mov.u32 %r340, 534103459; | |
mov.u64 %rd2686, 3678237736; | |
mov.u64 %rd2685, 3041712726; | |
mov.u64 %rd2684, 1401181199; | |
mov.u64 %rd2683, 2835769497; | |
mov.u64 %rd2682, 1684936478; | |
mov.u64 %rd2681, 2027808484; | |
mov.u64 %rd2680, 387276957; | |
mov.u64 %rd2679, 842468239; | |
mov.u64 %rd2677, 3986602516; | |
mov.u64 %rd2676, 1013904242; | |
mov.u64 %rd2674, 3668340011; | |
mov.u64 %rd2673, 3144134277; | |
mov.u64 %rd2671, 3449720151; | |
mov.u64 %rd2670, 1993301258; | |
mov.u64 %rd2669, 3528531795; | |
bra.uni LBB80_41; | |
LBB80_40: | |
setp.lt.u64 %p50, %rd351, %rd12; | |
selp.u64 %rd1921, 1, 0, %p50; | |
add.s64 %rd1922, %rd2464, %rd1921; | |
and.b64 %rd1923, %rd1922, 4294967295; | |
mul.lo.s64 %rd2672, %rd1923, 3449720151; | |
xor.b64 %rd1924, %rd2672, %rd351; | |
shr.u64 %rd1925, %rd1924, 32; | |
mul.lo.s64 %rd2675, %rd1925, 3528531795; | |
shr.u64 %rd1926, %rd2675, 32; | |
and.b64 %rd1927, %rd351, 4294967295; | |
mul.lo.s64 %rd1928, %rd1927, 3528531795; | |
and.b64 %rd1929, %rd1928, 4294967295; | |
xor.b64 %rd1930, %rd1929, %rd1926; | |
xor.b64 %rd1931, %rd1930, 3144134277; | |
mul.lo.s64 %rd2678, %rd1931, 3449720151; | |
xor.b64 %rd2668, %rd1922, %rd1928; | |
mov.u32 %r342, -1767562579; | |
mov.u32 %r341, -766435501; | |
mov.u32 %r340, 1401181199; | |
mov.u64 %rd2686, 4055616968; | |
mov.u64 %rd2685, 1684936478; | |
mov.u64 %rd2684, 534103459; | |
mov.u64 %rd2683, 387276957; | |
mov.u64 %rd2682, 3041712726; | |
mov.u64 %rd2681, 3986602516; | |
mov.u64 %rd2680, 2835769497; | |
mov.u64 %rd2679, 3668340011; | |
mov.u64 %rd2677, 2027808484; | |
mov.u64 %rd2676, 1993301258; | |
mov.u64 %rd2674, 842468239; | |
mov.u64 %rd2673, 2654435769; | |
mov.u64 %rd2671, 3528531795; | |
mov.u64 %rd2670, 1013904242; | |
mov.u64 %rd2669, 3449720151; | |
LBB80_41: | |
shr.u64 %rd1958, %rd2678, 32; | |
shr.u64 %rd1959, %rd2668, 32; | |
mul.lo.s64 %rd1960, %rd1959, %rd2669; | |
and.b64 %rd1961, %rd1960, 4294967295; | |
xor.b64 %rd1962, %rd1961, %rd1958; | |
xor.b64 %rd1963, %rd1962, %rd2670; | |
mul.lo.s64 %rd1964, %rd1963, %rd2671; | |
shr.u64 %rd1965, %rd1964, 32; | |
shr.u64 %rd1966, %rd1960, 32; | |
and.b64 %rd1967, %rd2672, 4294967295; | |
xor.b64 %rd1968, %rd1967, %rd1966; | |
xor.b64 %rd1969, %rd1968, %rd2673; | |
mul.lo.s64 %rd1970, %rd1969, %rd2671; | |
and.b64 %rd1971, %rd1970, 4294967295; | |
xor.b64 %rd1972, %rd1971, %rd1965; | |
xor.b64 %rd1973, %rd1972, %rd2674; | |
mul.lo.s64 %rd1974, %rd1973, %rd2669; | |
shr.u64 %rd1975, %rd1974, 32; | |
shr.u64 %rd1976, %rd1970, 32; | |
and.b64 %rd1977, %rd2675, 4294967295; | |
xor.b64 %rd1978, %rd1977, %rd1976; | |
xor.b64 %rd1979, %rd1978, %rd2676; | |
mul.lo.s64 %rd1980, %rd1979, %rd2669; | |
and.b64 %rd1981, %rd1980, 4294967295; | |
xor.b64 %rd1982, %rd1981, %rd1975; | |
xor.b64 %rd1983, %rd1982, %rd2677; | |
mul.lo.s64 %rd1984, %rd1983, %rd2671; | |
shr.u64 %rd1985, %rd1984, 32; | |
shr.u64 %rd1986, %rd1980, 32; | |
and.b64 %rd1987, %rd2678, 4294967295; | |
xor.b64 %rd1988, %rd1987, %rd1986; | |
xor.b64 %rd1989, %rd1988, %rd2679; | |
mul.lo.s64 %rd1990, %rd1989, %rd2671; | |
and.b64 %rd1991, %rd1990, 4294967295; | |
xor.b64 %rd1992, %rd1991, %rd1985; | |
xor.b64 %rd1993, %rd1992, %rd2680; | |
mul.lo.s64 %rd1994, %rd1993, %rd2669; | |
shr.u64 %rd1995, %rd1994, 32; | |
shr.u64 %rd1996, %rd1990, 32; | |
and.b64 %rd1997, %rd1964, 4294967295; | |
xor.b64 %rd1998, %rd1997, %rd1996; | |
xor.b64 %rd1999, %rd1998, %rd2681; | |
mul.lo.s64 %rd2000, %rd1999, %rd2669; | |
and.b64 %rd2001, %rd2000, 4294967295; | |
xor.b64 %rd2002, %rd2001, %rd1995; | |
xor.b64 %rd2003, %rd2002, %rd2682; | |
mul.lo.s64 %rd2004, %rd2003, %rd2671; | |
shr.u64 %rd2005, %rd2004, 32; | |
shr.u64 %rd2006, %rd2000, 32; | |
and.b64 %rd2007, %rd1974, 4294967295; | |
xor.b64 %rd2008, %rd2007, %rd2006; | |
xor.b64 %rd2009, %rd2008, %rd2683; | |
mul.lo.s64 %rd2010, %rd2009, %rd2671; | |
and.b64 %rd2011, %rd2010, 4294967295; | |
xor.b64 %rd2012, %rd2011, %rd2005; | |
xor.b64 %rd2013, %rd2012, %rd2684; | |
mul.lo.s64 %rd2014, %rd2013, %rd2669; | |
shr.u64 %rd2015, %rd2014, 32; | |
shr.u64 %rd2016, %rd2010, 32; | |
and.b64 %rd2017, %rd1984, 4294967295; | |
xor.b64 %rd2018, %rd2017, %rd2016; | |
xor.b64 %rd2019, %rd2018, %rd2685; | |
mul.lo.s64 %rd2020, %rd2019, %rd2669; | |
and.b64 %rd2021, %rd2020, 4294967295; | |
xor.b64 %rd2022, %rd2021, %rd2015; | |
xor.b64 %rd2023, %rd2022, %rd2686; | |
mul.lo.s64 %rd2024, %rd2023, %rd2671; | |
shr.u64 %rd2025, %rd2024, 32; | |
cvt.u32.u64 %r222, %rd2025; | |
shr.u64 %rd2026, %rd2020, 32; | |
xor.b64 %rd2027, %rd2026, %rd1994; | |
cvt.u32.u64 %r223, %rd2027; | |
xor.b32 %r224, %r340, %r223; | |
mul.lo.s32 %r225, %r224, %r341; | |
xor.b32 %r226, %r225, %r222; | |
xor.b32 %r227, %r226, %r342; | |
shr.u32 %r228, %r227, 9; | |
cvt.rn.f32.u32 %f205, %r228; | |
mul.rn.f32 %f206, %f205, 0f34000000; | |
cvt.rn.f16.f32 %h109, %f206; | |
mov.b16 %h110, 0x2E66; | |
setp.ge.f16 %p52, %h109, %h110; | |
ld.global.nc.b16 %h111, [%rd45+1536]; | |
ld.global.nc.f32 %f207, [%rd46+3072]; | |
cvt.rn.f16.f32 %h112, %f207; | |
add.rn.f16 %h113, %h111, %h112; | |
mov.b16 %h114, 0x3C72; | |
mul.rn.f16 %h115, %h113, %h114; | |
selp.b16 %h116, %h115, 0x0000, %p52; | |
cvt.f32.f16 %f208, %h116; | |
ld.global.nc.b16 %h117, [%rd47+1536]; | |
cvt.f32.f16 %f209, %h117; | |
ld.global.nc.f32 %f210, [%rd48+3072]; | |
mul.rn.f32 %f211, %f1, %f210; | |
mul.rn.f32 %f212, %f211, %f209; | |
ld.global.nc.f32 %f213, [%rd49+3072]; | |
mul.rn.f32 %f214, %f2, %f211; | |
sub.rn.f32 %f215, %f213, %f214; | |
add.rn.f32 %f216, %f212, %f215; | |
add.rn.f32 %f217, %f216, %f208; | |
sub.rn.f32 %f218, %f217, %f3; | |
mul.rn.f32 %f219, %f218, %f218; | |
add.rn.f32 %f16, %f15, %f219; | |
or.b32 %r229, %r3, 769; | |
or.b32 %r230, %r229, %r4; | |
and.b32 %r231, %r229, 3; | |
shr.u32 %r232, %r230, 2; | |
setp.ne.s32 %p53, %r231, 1; | |
cvt.u64.u32 %rd2028, %r232; | |
add.s64 %rd379, %rd12, %rd2028; | |
@%p53 bra LBB80_43; | |
and.b64 %rd2068, %rd379, 4294967295; | |
mul.lo.s64 %rd2691, %rd2068, 3528531795; | |
setp.lt.u64 %p55, %rd379, %rd12; | |
selp.u64 %rd2069, 1, 0, %p55; | |
add.s64 %rd2070, %rd2464, %rd2069; | |
xor.b64 %rd2071, %rd2070, %rd2691; | |
shr.u64 %rd2072, %rd2071, 32; | |
mul.lo.s64 %rd2694, %rd2072, 3449720151; | |
shr.u64 %rd2073, %rd2694, 32; | |
and.b64 %rd2074, %rd2070, 4294967295; | |
mul.lo.s64 %rd2075, %rd2074, 3449720151; | |
and.b64 %rd2076, %rd2075, 4294967295; | |
xor.b64 %rd2077, %rd2076, %rd2073; | |
xor.b64 %rd2078, %rd2077, 2654435769; | |
mul.lo.s64 %rd2697, %rd2078, 3528531795; | |
xor.b64 %rd2687, %rd2075, %rd379; | |
mov.u32 %r344, -845247145; | |
mov.u32 %r343, -616729560; | |
mov.u64 %rd2704, 3041712726; | |
mov.u64 %rd2703, 1401181199; | |
mov.u64 %rd2702, 2835769497; | |
mov.u64 %rd2701, 1684936478; | |
mov.u64 %rd2700, 2027808484; | |
mov.u64 %rd2699, 387276957; | |
mov.u64 %rd2698, 842468239; | |
mov.u64 %rd2696, 3986602516; | |
mov.u64 %rd2695, 1013904242; | |
mov.u64 %rd2693, 3668340011; | |
mov.u64 %rd2692, 3144134277; | |
mov.u64 %rd2690, 3449720151; | |
mov.u64 %rd2689, 1993301258; | |
mov.u64 %rd2688, 3528531795; | |
bra.uni LBB80_44; | |
LBB80_43: | |
setp.lt.u64 %p54, %rd379, %rd12; | |
selp.u64 %rd2043, 1, 0, %p54; | |
add.s64 %rd2044, %rd2464, %rd2043; | |
and.b64 %rd2045, %rd2044, 4294967295; | |
mul.lo.s64 %rd2691, %rd2045, 3449720151; | |
xor.b64 %rd2046, %rd2691, %rd379; | |
shr.u64 %rd2047, %rd2046, 32; | |
mul.lo.s64 %rd2694, %rd2047, 3528531795; | |
shr.u64 %rd2048, %rd2694, 32; | |
and.b64 %rd2049, %rd379, 4294967295; | |
mul.lo.s64 %rd2050, %rd2049, 3528531795; | |
and.b64 %rd2051, %rd2050, 4294967295; | |
xor.b64 %rd2052, %rd2051, %rd2048; | |
xor.b64 %rd2053, %rd2052, 3144134277; | |
mul.lo.s64 %rd2697, %rd2053, 3449720151; | |
xor.b64 %rd2687, %rd2044, %rd2050; | |
mov.u32 %r344, -766435501; | |
mov.u32 %r343, -239350328; | |
mov.u64 %rd2704, 1684936478; | |
mov.u64 %rd2703, 534103459; | |
mov.u64 %rd2702, 387276957; | |
mov.u64 %rd2701, 3041712726; | |
mov.u64 %rd2700, 3986602516; | |
mov.u64 %rd2699, 2835769497; | |
mov.u64 %rd2698, 3668340011; | |
mov.u64 %rd2696, 2027808484; | |
mov.u64 %rd2695, 1993301258; | |
mov.u64 %rd2693, 842468239; | |
mov.u64 %rd2692, 2654435769; | |
mov.u64 %rd2690, 3528531795; | |
mov.u64 %rd2689, 1013904242; | |
mov.u64 %rd2688, 3449720151; | |
LBB80_44: | |
shr.u64 %rd2079, %rd2697, 32; | |
shr.u64 %rd2080, %rd2687, 32; | |
mul.lo.s64 %rd2081, %rd2080, %rd2688; | |
and.b64 %rd2082, %rd2081, 4294967295; | |
xor.b64 %rd2083, %rd2082, %rd2079; | |
xor.b64 %rd2084, %rd2083, %rd2689; | |
mul.lo.s64 %rd2085, %rd2084, %rd2690; | |
shr.u64 %rd2086, %rd2085, 32; | |
shr.u64 %rd2087, %rd2081, 32; | |
and.b64 %rd2088, %rd2691, 4294967295; | |
xor.b64 %rd2089, %rd2088, %rd2087; | |
xor.b64 %rd2090, %rd2089, %rd2692; | |
mul.lo.s64 %rd2091, %rd2090, %rd2690; | |
and.b64 %rd2092, %rd2091, 4294967295; | |
xor.b64 %rd2093, %rd2092, %rd2086; | |
xor.b64 %rd2094, %rd2093, %rd2693; | |
mul.lo.s64 %rd2095, %rd2094, %rd2688; | |
shr.u64 %rd2096, %rd2095, 32; | |
shr.u64 %rd2097, %rd2091, 32; | |
and.b64 %rd2098, %rd2694, 4294967295; | |
xor.b64 %rd2099, %rd2098, %rd2097; | |
xor.b64 %rd2100, %rd2099, %rd2695; | |
mul.lo.s64 %rd2101, %rd2100, %rd2688; | |
and.b64 %rd2102, %rd2101, 4294967295; | |
xor.b64 %rd2103, %rd2102, %rd2096; | |
xor.b64 %rd2104, %rd2103, %rd2696; | |
mul.lo.s64 %rd2105, %rd2104, %rd2690; | |
shr.u64 %rd2106, %rd2105, 32; | |
shr.u64 %rd2107, %rd2101, 32; | |
and.b64 %rd2108, %rd2697, 4294967295; | |
xor.b64 %rd2109, %rd2108, %rd2107; | |
xor.b64 %rd2110, %rd2109, %rd2698; | |
mul.lo.s64 %rd2111, %rd2110, %rd2690; | |
and.b64 %rd2112, %rd2111, 4294967295; | |
xor.b64 %rd2113, %rd2112, %rd2106; | |
xor.b64 %rd2114, %rd2113, %rd2699; | |
mul.lo.s64 %rd2115, %rd2114, %rd2688; | |
shr.u64 %rd2116, %rd2115, 32; | |
shr.u64 %rd2117, %rd2111, 32; | |
and.b64 %rd2118, %rd2085, 4294967295; | |
xor.b64 %rd2119, %rd2118, %rd2117; | |
xor.b64 %rd2120, %rd2119, %rd2700; | |
mul.lo.s64 %rd2121, %rd2120, %rd2688; | |
and.b64 %rd2122, %rd2121, 4294967295; | |
xor.b64 %rd2123, %rd2122, %rd2116; | |
xor.b64 %rd2124, %rd2123, %rd2701; | |
mul.lo.s64 %rd2125, %rd2124, %rd2690; | |
shr.u64 %rd2126, %rd2125, 32; | |
shr.u64 %rd2127, %rd2121, 32; | |
and.b64 %rd2128, %rd2095, 4294967295; | |
xor.b64 %rd2129, %rd2128, %rd2127; | |
xor.b64 %rd2130, %rd2129, %rd2702; | |
mul.lo.s64 %rd2131, %rd2130, %rd2690; | |
and.b64 %rd2132, %rd2131, 4294967295; | |
xor.b64 %rd2133, %rd2132, %rd2126; | |
xor.b64 %rd2134, %rd2133, %rd2703; | |
mul.lo.s64 %rd2135, %rd2134, %rd2688; | |
shr.u64 %rd2136, %rd2135, 32; | |
shr.u64 %rd2137, %rd2131, 32; | |
xor.b64 %rd2138, %rd2105, %rd2137; | |
xor.b64 %rd2139, %rd2138, %rd2704; | |
mul.lo.s64 %rd2140, %rd2139, %rd2688; | |
xor.b64 %rd2141, %rd2136, %rd2140; | |
cvt.u32.u64 %r237, %rd2141; | |
xor.b32 %r238, %r343, %r237; | |
mul.lo.s32 %r239, %r238, %r344; | |
shr.u32 %r240, %r239, 9; | |
cvt.rn.f32.u32 %f220, %r240; | |
mul.rn.f32 %f221, %f220, 0f34000000; | |
cvt.rn.f16.f32 %h118, %f221; | |
mov.b16 %h119, 0x2E66; | |
setp.ge.f16 %p57, %h118, %h119; | |
ld.global.nc.b16 %h120, [%rd45+1538]; | |
ld.global.nc.f32 %f222, [%rd46+3076]; | |
cvt.rn.f16.f32 %h121, %f222; | |
add.rn.f16 %h122, %h120, %h121; | |
mov.b16 %h123, 0x3C72; | |
mul.rn.f16 %h124, %h122, %h123; | |
selp.b16 %h125, %h124, 0x0000, %p57; | |
cvt.f32.f16 %f223, %h125; | |
ld.global.nc.b16 %h126, [%rd47+1538]; | |
cvt.f32.f16 %f224, %h126; | |
ld.global.nc.f32 %f225, [%rd48+3076]; | |
mul.rn.f32 %f226, %f1, %f225; | |
mul.rn.f32 %f227, %f226, %f224; | |
ld.global.nc.f32 %f228, [%rd49+3076]; | |
mul.rn.f32 %f229, %f2, %f226; | |
sub.rn.f32 %f230, %f228, %f229; | |
add.rn.f32 %f231, %f227, %f230; | |
add.rn.f32 %f232, %f231, %f223; | |
sub.rn.f32 %f233, %f232, %f3; | |
mul.rn.f32 %f234, %f233, %f233; | |
add.rn.f32 %f17, %f16, %f234; | |
or.b32 %r242, %r73, 896; | |
shr.u32 %r243, %r242, 2; | |
cvt.u64.u32 %rd2142, %r243; | |
add.s64 %rd406, %rd12, %rd2142; | |
@%p8 bra LBB80_46; | |
mov.u32 %r347, -1879881855; | |
mov.u32 %r345, 534103459; | |
mov.u64 %rd2723, 3678237736; | |
and.b64 %rd2184, %rd406, 4294967295; | |
mul.lo.s64 %rd2709, %rd2184, 3528531795; | |
setp.lt.u64 %p59, %rd406, %rd12; | |
selp.u64 %rd2185, 1, 0, %p59; | |
add.s64 %rd2186, %rd2464, %rd2185; | |
xor.b64 %rd2187, %rd2186, %rd2709; | |
shr.u64 %rd2188, %rd2187, 32; | |
mul.lo.s64 %rd2712, %rd2188, 3449720151; | |
shr.u64 %rd2189, %rd2712, 32; | |
and.b64 %rd2190, %rd2186, 4294967295; | |
mul.lo.s64 %rd2191, %rd2190, 3449720151; | |
and.b64 %rd2192, %rd2191, 4294967295; | |
xor.b64 %rd2193, %rd2192, %rd2189; | |
xor.b64 %rd2194, %rd2193, 2654435769; | |
mul.lo.s64 %rd2715, %rd2194, 3528531795; | |
xor.b64 %rd2705, %rd2191, %rd406; | |
mov.u32 %r346, -845247145; | |
mov.u64 %rd2722, 3041712726; | |
mov.u64 %rd2721, 1401181199; | |
mov.u64 %rd2720, 2835769497; | |
mov.u64 %rd2719, 1684936478; | |
mov.u64 %rd2718, 2027808484; | |
mov.u64 %rd2717, 387276957; | |
mov.u64 %rd2716, 842468239; | |
mov.u64 %rd2714, 3986602516; | |
mov.u64 %rd2713, 1013904242; | |
mov.u64 %rd2711, 3668340011; | |
mov.u64 %rd2710, 3144134277; | |
mov.u64 %rd2708, 3449720151; | |
mov.u64 %rd2707, 1993301258; | |
mov.u64 %rd2706, 3528531795; | |
bra.uni LBB80_47; | |
LBB80_46: | |
setp.lt.u64 %p58, %rd406, %rd12; | |
selp.u64 %rd2158, 1, 0, %p58; | |
add.s64 %rd2159, %rd2464, %rd2158; | |
and.b64 %rd2160, %rd2159, 4294967295; | |
mul.lo.s64 %rd2709, %rd2160, 3449720151; | |
xor.b64 %rd2161, %rd2709, %rd406; | |
shr.u64 %rd2162, %rd2161, 32; | |
mul.lo.s64 %rd2712, %rd2162, 3528531795; | |
shr.u64 %rd2163, %rd2712, 32; | |
and.b64 %rd2164, %rd406, 4294967295; | |
mul.lo.s64 %rd2165, %rd2164, 3528531795; | |
and.b64 %rd2166, %rd2165, 4294967295; | |
xor.b64 %rd2167, %rd2166, %rd2163; | |
xor.b64 %rd2168, %rd2167, 3144134277; | |
mul.lo.s64 %rd2715, %rd2168, 3449720151; | |
xor.b64 %rd2705, %rd2159, %rd2165; | |
mov.u32 %r347, -1767562579; | |
mov.u32 %r346, -766435501; | |
mov.u32 %r345, 1401181199; | |
mov.u64 %rd2723, 4055616968; | |
mov.u64 %rd2722, 1684936478; | |
mov.u64 %rd2721, 534103459; | |
mov.u64 %rd2720, 387276957; | |
mov.u64 %rd2719, 3041712726; | |
mov.u64 %rd2718, 3986602516; | |
mov.u64 %rd2717, 2835769497; | |
mov.u64 %rd2716, 3668340011; | |
mov.u64 %rd2714, 2027808484; | |
mov.u64 %rd2713, 1993301258; | |
mov.u64 %rd2711, 842468239; | |
mov.u64 %rd2710, 2654435769; | |
mov.u64 %rd2708, 3528531795; | |
mov.u64 %rd2707, 1013904242; | |
mov.u64 %rd2706, 3449720151; | |
LBB80_47: | |
shr.u64 %rd2195, %rd2715, 32; | |
shr.u64 %rd2196, %rd2705, 32; | |
mul.lo.s64 %rd2197, %rd2196, %rd2706; | |
and.b64 %rd2198, %rd2197, 4294967295; | |
xor.b64 %rd2199, %rd2198, %rd2195; | |
xor.b64 %rd2200, %rd2199, %rd2707; | |
mul.lo.s64 %rd2201, %rd2200, %rd2708; | |
shr.u64 %rd2202, %rd2201, 32; | |
shr.u64 %rd2203, %rd2197, 32; | |
and.b64 %rd2204, %rd2709, 4294967295; | |
xor.b64 %rd2205, %rd2204, %rd2203; | |
xor.b64 %rd2206, %rd2205, %rd2710; | |
mul.lo.s64 %rd2207, %rd2206, %rd2708; | |
and.b64 %rd2208, %rd2207, 4294967295; | |
xor.b64 %rd2209, %rd2208, %rd2202; | |
xor.b64 %rd2210, %rd2209, %rd2711; | |
mul.lo.s64 %rd2211, %rd2210, %rd2706; | |
shr.u64 %rd2212, %rd2211, 32; | |
shr.u64 %rd2213, %rd2207, 32; | |
and.b64 %rd2214, %rd2712, 4294967295; | |
xor.b64 %rd2215, %rd2214, %rd2213; | |
xor.b64 %rd2216, %rd2215, %rd2713; | |
mul.lo.s64 %rd2217, %rd2216, %rd2706; | |
and.b64 %rd2218, %rd2217, 4294967295; | |
xor.b64 %rd2219, %rd2218, %rd2212; | |
xor.b64 %rd2220, %rd2219, %rd2714; | |
mul.lo.s64 %rd2221, %rd2220, %rd2708; | |
shr.u64 %rd2222, %rd2221, 32; | |
shr.u64 %rd2223, %rd2217, 32; | |
and.b64 %rd2224, %rd2715, 4294967295; | |
xor.b64 %rd2225, %rd2224, %rd2223; | |
xor.b64 %rd2226, %rd2225, %rd2716; | |
mul.lo.s64 %rd2227, %rd2226, %rd2708; | |
and.b64 %rd2228, %rd2227, 4294967295; | |
xor.b64 %rd2229, %rd2228, %rd2222; | |
xor.b64 %rd2230, %rd2229, %rd2717; | |
mul.lo.s64 %rd2231, %rd2230, %rd2706; | |
shr.u64 %rd2232, %rd2231, 32; | |
shr.u64 %rd2233, %rd2227, 32; | |
and.b64 %rd2234, %rd2201, 4294967295; | |
xor.b64 %rd2235, %rd2234, %rd2233; | |
xor.b64 %rd2236, %rd2235, %rd2718; | |
mul.lo.s64 %rd2237, %rd2236, %rd2706; | |
and.b64 %rd2238, %rd2237, 4294967295; | |
xor.b64 %rd2239, %rd2238, %rd2232; | |
xor.b64 %rd2240, %rd2239, %rd2719; | |
mul.lo.s64 %rd2241, %rd2240, %rd2708; | |
shr.u64 %rd2242, %rd2241, 32; | |
shr.u64 %rd2243, %rd2237, 32; | |
and.b64 %rd2244, %rd2211, 4294967295; | |
xor.b64 %rd2245, %rd2244, %rd2243; | |
xor.b64 %rd2246, %rd2245, %rd2720; | |
mul.lo.s64 %rd2247, %rd2246, %rd2708; | |
and.b64 %rd2248, %rd2247, 4294967295; | |
xor.b64 %rd2249, %rd2248, %rd2242; | |
xor.b64 %rd2250, %rd2249, %rd2721; | |
mul.lo.s64 %rd2251, %rd2250, %rd2706; | |
shr.u64 %rd2252, %rd2251, 32; | |
shr.u64 %rd2253, %rd2247, 32; | |
and.b64 %rd2254, %rd2221, 4294967295; | |
xor.b64 %rd2255, %rd2254, %rd2253; | |
xor.b64 %rd2256, %rd2255, %rd2722; | |
mul.lo.s64 %rd2257, %rd2256, %rd2706; | |
and.b64 %rd2258, %rd2257, 4294967295; | |
xor.b64 %rd2259, %rd2258, %rd2252; | |
xor.b64 %rd2260, %rd2259, %rd2723; | |
mul.lo.s64 %rd2261, %rd2260, %rd2708; | |
shr.u64 %rd2262, %rd2261, 32; | |
cvt.u32.u64 %r250, %rd2262; | |
shr.u64 %rd2263, %rd2257, 32; | |
xor.b64 %rd2264, %rd2263, %rd2231; | |
cvt.u32.u64 %r251, %rd2264; | |
xor.b32 %r252, %r345, %r251; | |
mul.lo.s32 %r253, %r252, %r346; | |
xor.b32 %r254, %r253, %r250; | |
xor.b32 %r255, %r254, %r347; | |
shr.u32 %r256, %r255, 9; | |
cvt.rn.f32.u32 %f235, %r256; | |
mul.rn.f32 %f236, %f235, 0f34000000; | |
cvt.rn.f16.f32 %h127, %f236; | |
mov.b16 %h128, 0x2E66; | |
setp.ge.f16 %p60, %h127, %h128; | |
ld.global.nc.b16 %h129, [%rd45+1792]; | |
ld.global.nc.f32 %f237, [%rd46+3584]; | |
cvt.rn.f16.f32 %h130, %f237; | |
add.rn.f16 %h131, %h129, %h130; | |
mov.b16 %h132, 0x3C72; | |
mul.rn.f16 %h133, %h131, %h132; | |
selp.b16 %h134, %h133, 0x0000, %p60; | |
cvt.f32.f16 %f238, %h134; | |
ld.global.nc.b16 %h135, [%rd47+1792]; | |
cvt.f32.f16 %f239, %h135; | |
ld.global.nc.f32 %f240, [%rd48+3584]; | |
mul.rn.f32 %f241, %f1, %f240; | |
mul.rn.f32 %f242, %f241, %f239; | |
ld.global.nc.f32 %f243, [%rd49+3584]; | |
mul.rn.f32 %f244, %f2, %f241; | |
sub.rn.f32 %f245, %f243, %f244; | |
add.rn.f32 %f246, %f242, %f245; | |
add.rn.f32 %f247, %f246, %f238; | |
sub.rn.f32 %f248, %f247, %f3; | |
mul.rn.f32 %f249, %f248, %f248; | |
add.rn.f32 %f18, %f17, %f249; | |
or.b32 %r257, %r3, 897; | |
or.b32 %r258, %r257, %r4; | |
and.b32 %r259, %r257, 3; | |
shr.u32 %r260, %r258, 2; | |
setp.ne.s32 %p61, %r259, 1; | |
cvt.u64.u32 %rd2265, %r260; | |
add.s64 %rd434, %rd12, %rd2265; | |
@%p61 bra LBB80_49; | |
mov.u32 %r349, -845247145; | |
mov.u64 %rd2740, 1401181199; | |
mov.u64 %rd2729, 3144134277; | |
mov.u32 %r348, -616729560; | |
and.b64 %rd2305, %rd434, 4294967295; | |
mul.lo.s64 %rd2728, %rd2305, 3528531795; | |
setp.lt.u64 %p63, %rd434, %rd12; | |
selp.u64 %rd2306, 1, 0, %p63; | |
add.s64 %rd2307, %rd2464, %rd2306; | |
xor.b64 %rd2308, %rd2307, %rd2728; | |
shr.u64 %rd2309, %rd2308, 32; | |
mul.lo.s64 %rd2731, %rd2309, 3449720151; | |
shr.u64 %rd2310, %rd2731, 32; | |
and.b64 %rd2311, %rd2307, 4294967295; | |
mul.lo.s64 %rd2312, %rd2311, 3449720151; | |
and.b64 %rd2313, %rd2312, 4294967295; | |
xor.b64 %rd2314, %rd2313, %rd2310; | |
xor.b64 %rd2315, %rd2314, 2654435769; | |
mul.lo.s64 %rd2734, %rd2315, 3528531795; | |
xor.b64 %rd2724, %rd2312, %rd434; | |
mov.u64 %rd2741, 3041712726; | |
mov.u64 %rd2739, 2835769497; | |
mov.u64 %rd2738, 1684936478; | |
mov.u64 %rd2737, 2027808484; | |
mov.u64 %rd2736, 387276957; | |
mov.u64 %rd2735, 842468239; | |
mov.u64 %rd2733, 3986602516; | |
mov.u64 %rd2732, 1013904242; | |
mov.u64 %rd2730, 3668340011; | |
mov.u64 %rd2727, 3449720151; | |
mov.u64 %rd2726, 1993301258; | |
mov.u64 %rd2725, 3528531795; | |
bra.uni LBB80_50; | |
LBB80_49: | |
setp.lt.u64 %p62, %rd434, %rd12; | |
selp.u64 %rd2280, 1, 0, %p62; | |
add.s64 %rd2281, %rd2464, %rd2280; | |
and.b64 %rd2282, %rd2281, 4294967295; | |
mul.lo.s64 %rd2728, %rd2282, 3449720151; | |
xor.b64 %rd2283, %rd2728, %rd434; | |
shr.u64 %rd2284, %rd2283, 32; | |
mul.lo.s64 %rd2731, %rd2284, 3528531795; | |
shr.u64 %rd2285, %rd2731, 32; | |
and.b64 %rd2286, %rd434, 4294967295; | |
mul.lo.s64 %rd2287, %rd2286, 3528531795; | |
and.b64 %rd2288, %rd2287, 4294967295; | |
xor.b64 %rd2289, %rd2288, %rd2285; | |
xor.b64 %rd2290, %rd2289, 3144134277; | |
mul.lo.s64 %rd2734, %rd2290, 3449720151; | |
xor.b64 %rd2724, %rd2281, %rd2287; | |
mov.u32 %r349, -766435501; | |
mov.u32 %r348, -239350328; | |
mov.u64 %rd2741, 1684936478; | |
mov.u64 %rd2740, 534103459; | |
mov.u64 %rd2739, 387276957; | |
mov.u64 %rd2738, 3041712726; | |
mov.u64 %rd2737, 3986602516; | |
mov.u64 %rd2736, 2835769497; | |
mov.u64 %rd2735, 3668340011; | |
mov.u64 %rd2733, 2027808484; | |
mov.u64 %rd2732, 1993301258; | |
mov.u64 %rd2730, 842468239; | |
mov.u64 %rd2729, 2654435769; | |
mov.u64 %rd2727, 3528531795; | |
mov.u64 %rd2726, 1013904242; | |
mov.u64 %rd2725, 3449720151; | |
LBB80_50: | |
shr.u64 %rd2316, %rd2734, 32; | |
shr.u64 %rd2317, %rd2724, 32; | |
mul.lo.s64 %rd2318, %rd2317, %rd2725; | |
and.b64 %rd2319, %rd2318, 4294967295; | |
xor.b64 %rd2320, %rd2319, %rd2316; | |
xor.b64 %rd2321, %rd2320, %rd2726; | |
mul.lo.s64 %rd2322, %rd2321, %rd2727; | |
shr.u64 %rd2323, %rd2322, 32; | |
shr.u64 %rd2324, %rd2318, 32; | |
and.b64 %rd2325, %rd2728, 4294967295; | |
xor.b64 %rd2326, %rd2325, %rd2324; | |
xor.b64 %rd2327, %rd2326, %rd2729; | |
mul.lo.s64 %rd2328, %rd2327, %rd2727; | |
and.b64 %rd2329, %rd2328, 4294967295; | |
xor.b64 %rd2330, %rd2329, %rd2323; | |
xor.b64 %rd2331, %rd2330, %rd2730; | |
mul.lo.s64 %rd2332, %rd2331, %rd2725; | |
shr.u64 %rd2333, %rd2332, 32; | |
shr.u64 %rd2334, %rd2328, 32; | |
and.b64 %rd2335, %rd2731, 4294967295; | |
xor.b64 %rd2336, %rd2335, %rd2334; | |
xor.b64 %rd2337, %rd2336, %rd2732; | |
mul.lo.s64 %rd2338, %rd2337, %rd2725; | |
and.b64 %rd2339, %rd2338, 4294967295; | |
xor.b64 %rd2340, %rd2339, %rd2333; | |
xor.b64 %rd2341, %rd2340, %rd2733; | |
mul.lo.s64 %rd2342, %rd2341, %rd2727; | |
shr.u64 %rd2343, %rd2342, 32; | |
shr.u64 %rd2344, %rd2338, 32; | |
and.b64 %rd2345, %rd2734, 4294967295; | |
xor.b64 %rd2346, %rd2345, %rd2344; | |
xor.b64 %rd2347, %rd2346, %rd2735; | |
mul.lo.s64 %rd2348, %rd2347, %rd2727; | |
and.b64 %rd2349, %rd2348, 4294967295; | |
xor.b64 %rd2350, %rd2349, %rd2343; | |
xor.b64 %rd2351, %rd2350, %rd2736; | |
mul.lo.s64 %rd2352, %rd2351, %rd2725; | |
shr.u64 %rd2353, %rd2352, 32; | |
shr.u64 %rd2354, %rd2348, 32; | |
and.b64 %rd2355, %rd2322, 4294967295; | |
xor.b64 %rd2356, %rd2355, %rd2354; | |
xor.b64 %rd2357, %rd2356, %rd2737; | |
mul.lo.s64 %rd2358, %rd2357, %rd2725; | |
and.b64 %rd2359, %rd2358, 4294967295; | |
xor.b64 %rd2360, %rd2359, %rd2353; | |
xor.b64 %rd2361, %rd2360, %rd2738; | |
mul.lo.s64 %rd2362, %rd2361, %rd2727; | |
shr.u64 %rd2363, %rd2362, 32; | |
shr.u64 %rd2364, %rd2358, 32; | |
and.b64 %rd2365, %rd2332, 4294967295; | |
xor.b64 %rd2366, %rd2365, %rd2364; | |
xor.b64 %rd2367, %rd2366, %rd2739; | |
mul.lo.s64 %rd2368, %rd2367, %rd2727; | |
and.b64 %rd2369, %rd2368, 4294967295; | |
xor.b64 %rd2370, %rd2369, %rd2363; | |
xor.b64 %rd2371, %rd2370, %rd2740; | |
mul.lo.s64 %rd2372, %rd2371, %rd2725; | |
shr.u64 %rd2373, %rd2372, 32; | |
shr.u64 %rd2374, %rd2368, 32; | |
xor.b64 %rd2375, %rd2342, %rd2374; | |
xor.b64 %rd2376, %rd2375, %rd2741; | |
mul.lo.s64 %rd2377, %rd2376, %rd2725; | |
xor.b64 %rd2378, %rd2373, %rd2377; | |
cvt.u32.u64 %r265, %rd2378; | |
xor.b32 %r266, %r348, %r265; | |
mul.lo.s32 %r267, %r266, %r349; | |
shr.u32 %r268, %r267, 9; | |
cvt.rn.f32.u32 %f250, %r268; | |
mul.rn.f32 %f251, %f250, 0f34000000; | |
cvt.rn.f16.f32 %h136, %f251; | |
mov.b16 %h137, 0x2E66; | |
setp.ge.f16 %p64, %h136, %h137; | |
ld.global.nc.b16 %h138, [%rd45+1794]; | |
ld.global.nc.f32 %f252, [%rd46+3588]; | |
cvt.rn.f16.f32 %h139, %f252; | |
add.rn.f16 %h140, %h138, %h139; | |
mov.b16 %h141, 0x3C72; | |
mul.rn.f16 %h142, %h140, %h141; | |
selp.b16 %h143, %h142, 0x0000, %p64; | |
cvt.f32.f16 %f253, %h143; | |
ld.global.nc.b16 %h144, [%rd47+1794]; | |
cvt.f32.f16 %f254, %h144; | |
ld.global.nc.f32 %f255, [%rd48+3588]; | |
mul.rn.f32 %f256, %f1, %f255; | |
mul.rn.f32 %f257, %f256, %f254; | |
ld.global.nc.f32 %f258, [%rd49+3588]; | |
mul.rn.f32 %f259, %f2, %f256; | |
sub.rn.f32 %f260, %f258, %f259; | |
add.rn.f32 %f261, %f257, %f260; | |
add.rn.f32 %f262, %f261, %f253; | |
sub.rn.f32 %f263, %f262, %f3; | |
mul.rn.f32 %f264, %f263, %f263; | |
add.rn.f32 %f265, %f18, %f264; | |
and.b32 %r46, %r1, 31; | |
shfl.sync.down.b32 %f266, %f265, 16, 31, -1; | |
add.rn.f32 %f267, %f266, %f265; | |
shfl.sync.down.b32 %f268, %f267, 8, 31, -1; | |
add.rn.f32 %f269, %f268, %f267; | |
shfl.sync.down.b32 %f270, %f269, 4, 31, -1; | |
add.rn.f32 %f271, %f270, %f269; | |
shfl.sync.down.b32 %f272, %f271, 2, 31, -1; | |
add.rn.f32 %f273, %f272, %f271; | |
shfl.sync.down.b32 %f274, %f273, 1, 31, -1; | |
shr.u32 %r47, %r1, 5; | |
setp.ne.s32 %p65, %r46, 0; | |
mov.u64 %rd2381, shared_cache_019; | |
@%p65 bra LBB80_2; | |
mul.wide.u32 %rd2380, %r47, 4; | |
add.s64 %rd462, %rd2381, %rd2380; | |
add.rn.f32 %f19, %f274, %f273; | |
st.shared.f32 [%rd462], %f19; | |
LBB80_2: | |
bar.sync 0; | |
setp.eq.s32 %p66, %r47, 0; | |
@%p66 bra LBB80_52; | |
bra.uni LBB80_3; | |
LBB80_52: | |
add.u64 %rd474, %SP, 0; | |
add.u64 %rd11, %SPL, 0; | |
mul.wide.u32 %rd2382, %r46, 4; | |
add.s64 %rd463, %rd2381, %rd2382; | |
cvta.shared.u64 %rd2384, %rd463; | |
mov.u32 %r269, 0; | |
st.local.u32 [%rd11], %r269; | |
setp.lt.u32 %p67, %r1, 2; | |
selp.b64 %rd2386, %rd2384, %rd474, %p67; | |
ld.f32 %f275, [%rd2386]; | |
shfl.sync.down.b32 %f276, %f275, 16, 31, -1; | |
add.rn.f32 %f277, %f275, %f276; | |
shfl.sync.down.b32 %f278, %f277, 8, 31, -1; | |
add.rn.f32 %f279, %f277, %f278; | |
shfl.sync.down.b32 %f280, %f279, 4, 31, -1; | |
add.rn.f32 %f281, %f279, %f280; | |
shfl.sync.down.b32 %f282, %f281, 2, 31, -1; | |
add.rn.f32 %f283, %f281, %f282; | |
shfl.sync.down.b32 %f284, %f283, 1, 31, -1; | |
add.rn.f32 %f285, %f283, %f284; | |
st.f32 [%rd2386], %f285; | |
setp.ne.s32 %p68, %r1, 0; | |
@%p68 bra LBB80_3; | |
ld.param.u64 %rd470, [fusion_2178_param_3]; | |
cvt.u64.u32 %rd44, %r2; | |
cvta.to.global.u64 %rd7, %rd470; | |
shl.b64 %rd2379, %rd44, 2; | |
add.s64 %rd461, %rd7, %rd2379; | |
ld.shared.f32 %f286, [%rd463]; | |
atom.global.add.f32 %f287, [%rd461], %f286; | |
LBB80_3: | |
ret; | |
} | |
// .globl fusion_2175 | |
//
// fusion_2175: fused element-wise kernel producing 4 consecutive f16 outputs
// per thread (256 threads per block => 1024 outputs per block).
//
// Index terms used below (all derived from the visible address arithmetic):
//   idx = ctaid.x*1024 + tid.x*4   linear element index of lane 0
//   col = tid.x*4                  index into the per-column f32 inputs
//   row = ctaid.x                  index into the per-row f32 inputs
//
// For each lane k in 0..3 the instruction stream computes:
//   u   = f16( (23 random bits) * 2^-23 )
//   d   = (u >= 0x2E66) ? f32( f16(p1[idx+k] + f16(p11[col+k])) * 0x3C72 ) : 0
//   a   = rsqrt(p6[row] * 2^-10 + eps) * p10[col+k]
//   t   = a * f32(p2[idx+k]) + (p9[col+k] - a * (p7[row] * 2^-10)) + d
//   b   = rsqrt(p4[row] * 2^-10 + eps) * p8[col+k]
//   out = f16( (p12[col+k] - b * (p5[row] * 2^-10)) + b * t )
// with eps = 0f2B8CBCCC (~1e-12), 0x2E66 ~= 0.1 and 0x3C72 ~= 1.111 in f16.
//
// Parameters (pN = fusion_2175_param_N):
//   p0          f16 output, 4 values stored at idx
//   p1, p2      f16 inputs, 4 values loaded at idx
//   p3          two u64 words read as a v2.u64 (RNG counter/state words)
//   p4..p7      f32 inputs indexed by row
//   p8..p12     f32 inputs indexed by col
//   p13         declared but never read in this kernel
//
// NOTE(review): the shape looks like dropout (keep prob ~0.9, scale ~1/0.9)
// on a bias-added tensor, followed by two normalization-style affine steps
// over rows of 1024 elements -- confirm against the originating HLO.
// NOTE(review): there is no bounds predicate; the launch grid presumably
// covers the buffers exactly -- verify against the launcher.
.visible .entry fusion_2175( | |
.param .u64 fusion_2175_param_0, | |
.param .u64 fusion_2175_param_1, | |
.param .u64 fusion_2175_param_2, | |
.param .u64 fusion_2175_param_3, | |
.param .u64 fusion_2175_param_4, | |
.param .u64 fusion_2175_param_5, | |
.param .u64 fusion_2175_param_6, | |
.param .u64 fusion_2175_param_7, | |
.param .u64 fusion_2175_param_8, | |
.param .u64 fusion_2175_param_9, | |
.param .u64 fusion_2175_param_10, | |
.param .u64 fusion_2175_param_11, | |
.param .u64 fusion_2175_param_12, | |
.param .u64 fusion_2175_param_13 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .pred %p<6>; | |
.reg .b16 %h<39>; | |
.reg .b32 %hh<5>; | |
.reg .f32 %f<97>; | |
.reg .b32 %r<31>; | |
.reg .b64 %rd<162>; | |
// ---- Load parameters and convert generic pointers to global addresses ----
// Resulting bases: rd26=p0, rd25=p1, rd24=p2, rd23=p3, rd22=p4, rd21=p5,
// rd20=p6, rd18=p7, rd15=p8, rd12=p9, rd9=p10, rd6=p11, rd3=p12.
ld.param.u64 %rd1, [fusion_2175_param_0]; | |
ld.param.u64 %rd2, [fusion_2175_param_12]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2175_param_1]; | |
ld.param.u64 %rd5, [fusion_2175_param_11]; | |
cvta.to.global.u64 %rd6, %rd5; | |
ld.param.u64 %rd7, [fusion_2175_param_2]; | |
ld.param.u64 %rd8, [fusion_2175_param_10]; | |
cvta.to.global.u64 %rd9, %rd8; | |
ld.param.u64 %rd10, [fusion_2175_param_3]; | |
ld.param.u64 %rd11, [fusion_2175_param_9]; | |
cvta.to.global.u64 %rd12, %rd11; | |
ld.param.u64 %rd13, [fusion_2175_param_4]; | |
ld.param.u64 %rd14, [fusion_2175_param_8]; | |
cvta.to.global.u64 %rd15, %rd14; | |
ld.param.u64 %rd16, [fusion_2175_param_5]; | |
ld.param.u64 %rd17, [fusion_2175_param_7]; | |
cvta.to.global.u64 %rd18, %rd17; | |
ld.param.u64 %rd19, [fusion_2175_param_6]; | |
cvta.to.global.u64 %rd20, %rd19; | |
cvta.to.global.u64 %rd21, %rd16; | |
cvta.to.global.u64 %rd22, %rd13; | |
cvta.to.global.u64 %rd23, %rd10; | |
cvta.to.global.u64 %rd24, %rd7; | |
cvta.to.global.u64 %rd25, %rd4; | |
cvta.to.global.u64 %rd26, %rd1; | |
// ---- Thread indexing ----
// r1 = row (ctaid.x), r4 = col (tid.x*4), r5 = idx; r6/r7 = col+1 / col+2.
// The ORs act as additions because the low bits of r4 and r3 are zero and
// tid.x < 256 (.reqntid), so r4 < 1024.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
or.b32 %r6, %r4, 1; | |
or.b32 %r7, %r4, 2; | |
// r8 = idx / 4: one counter value per thread.
shr.u32 %r8, %r5, 2; | |
// ---- Counter-based random bit generation ----
// Load two u64 words from p3, add the per-thread counter to the first word
// and propagate the unsigned carry (p1) into the second word.
ld.global.nc.v2.u64 {%rd27, %rd28}, [%rd23]; | |
cvt.u64.u32 %rd29, %r8; | |
add.s64 %rd30, %rd27, %rd29; | |
setp.lt.u64 %p1, %rd30, %rd27; | |
// Each round below multiplies a 32-bit value (held in a 64-bit register) by
// 3528531795 (0xD2511F53) or 3449720151 (0xCD9E8D57), splits the 64-bit
// product into high (shr 32) and low (and 0xFFFFFFFF) halves, and XORs them
// with other words and a round constant.
// NOTE(review): these multipliers and the 0x9E3779B9 / 0xBB67AE85 constants
// match Philox-4x32 -- presumably a Philox generator; confirm the variant
// and round count against the emitter.
and.b64 %rd31, %rd30, 4294967295; | |
mul.lo.s64 %rd32, %rd31, 3528531795; | |
selp.u64 %rd33, 1, 0, %p1; | |
add.s64 %rd34, %rd28, %rd33; | |
xor.b64 %rd35, %rd34, %rd32; | |
shr.u64 %rd36, %rd35, 32; | |
mul.lo.s64 %rd37, %rd36, 3449720151; | |
shr.u64 %rd38, %rd37, 32; | |
and.b64 %rd39, %rd34, 4294967295; | |
mul.lo.s64 %rd40, %rd39, 3449720151; | |
and.b64 %rd41, %rd40, 4294967295; | |
xor.b64 %rd42, %rd41, %rd38; | |
xor.b64 %rd43, %rd42, 2654435769; | |
mul.lo.s64 %rd44, %rd43, 3528531795; | |
shr.u64 %rd45, %rd44, 32; | |
xor.b64 %rd46, %rd40, %rd30; | |
shr.u64 %rd47, %rd46, 32; | |
mul.lo.s64 %rd48, %rd47, 3528531795; | |
and.b64 %rd49, %rd48, 4294967295; | |
xor.b64 %rd50, %rd49, %rd45; | |
xor.b64 %rd51, %rd50, 1993301258; | |
mul.lo.s64 %rd52, %rd51, 3449720151; | |
shr.u64 %rd53, %rd52, 32; | |
shr.u64 %rd54, %rd48, 32; | |
and.b64 %rd55, %rd32, 4294967295; | |
xor.b64 %rd56, %rd55, %rd54; | |
xor.b64 %rd57, %rd56, 3144134277; | |
mul.lo.s64 %rd58, %rd57, 3449720151; | |
and.b64 %rd59, %rd58, 4294967295; | |
xor.b64 %rd60, %rd59, %rd53; | |
xor.b64 %rd61, %rd60, 3668340011; | |
mul.lo.s64 %rd62, %rd61, 3528531795; | |
shr.u64 %rd63, %rd62, 32; | |
shr.u64 %rd64, %rd58, 32; | |
and.b64 %rd65, %rd37, 4294967295; | |
xor.b64 %rd66, %rd65, %rd64; | |
xor.b64 %rd67, %rd66, 1013904242; | |
mul.lo.s64 %rd68, %rd67, 3528531795; | |
and.b64 %rd69, %rd68, 4294967295; | |
xor.b64 %rd70, %rd69, %rd63; | |
xor.b64 %rd71, %rd70, 3986602516; | |
mul.lo.s64 %rd72, %rd71, 3449720151; | |
shr.u64 %rd73, %rd72, 32; | |
shr.u64 %rd74, %rd68, 32; | |
and.b64 %rd75, %rd44, 4294967295; | |
xor.b64 %rd76, %rd75, %rd74; | |
xor.b64 %rd77, %rd76, 842468239; | |
mul.lo.s64 %rd78, %rd77, 3449720151; | |
and.b64 %rd79, %rd78, 4294967295; | |
xor.b64 %rd80, %rd79, %rd73; | |
xor.b64 %rd81, %rd80, 387276957; | |
mul.lo.s64 %rd82, %rd81, 3528531795; | |
shr.u64 %rd83, %rd82, 32; | |
shr.u64 %rd84, %rd78, 32; | |
and.b64 %rd85, %rd52, 4294967295; | |
xor.b64 %rd86, %rd85, %rd84; | |
xor.b64 %rd87, %rd86, 2027808484; | |
mul.lo.s64 %rd88, %rd87, 3528531795; | |
and.b64 %rd89, %rd88, 4294967295; | |
shr.u64 %rd90, %rd88, 32; | |
and.b64 %rd91, %rd62, 4294967295; | |
xor.b64 %rd92, %rd91, %rd90; | |
xor.b64 %rd93, %rd92, 2835769497; | |
mul.lo.s64 %rd94, %rd93, 3449720151; | |
and.b64 %rd95, %rd94, 4294967295; | |
shr.u64 %rd96, %rd94, 32; | |
and.b64 %rd97, %rd72, 4294967295; | |
xor.b64 %rd98, %rd97, %rd96; | |
xor.b64 %rd99, %rd98, 3041712726; | |
mul.lo.s64 %rd100, %rd99, 3528531795; | |
and.b64 %rd101, %rd100, 4294967295; | |
xor.b64 %rd102, %rd89, %rd83; | |
xor.b64 %rd103, %rd102, 1684936478; | |
mul.lo.s64 %rd104, %rd103, 3449720151; | |
shr.u64 %rd105, %rd104, 32; | |
xor.b64 %rd106, %rd95, %rd105; | |
xor.b64 %rd107, %rd106, 1401181199; | |
mul.lo.s64 %rd108, %rd107, 3528531795; | |
shr.u64 %rd109, %rd108, 32; | |
xor.b64 %rd110, %rd101, %rd109; | |
xor.b64 %rd111, %rd110, 3678237736; | |
mul.lo.s64 %rd112, %rd111, 3449720151; | |
shr.u64 %rd113, %rd112, 32; | |
// ---- Lane 0: random word -> f16 uniform -> predicate p2 ----
// Keep the top 23 bits (shr 9), convert to f32 and scale by 0f34000000
// (2^-23), then round to f16 and compare against h2 = 0x2E66 (~0.1).
cvt.u32.u64 %r9, %rd113; | |
shr.u64 %rd114, %rd100, 32; | |
xor.b64 %rd115, %rd114, %rd82; | |
cvt.u32.u64 %r10, %rd115; | |
xor.b32 %r11, %r10, 534103459; | |
mul.lo.s32 %r12, %r11, -845247145; | |
xor.b32 %r13, %r12, %r9; | |
shr.u32 %r14, %r13, 9; | |
xor.b32 %r15, %r14, 4716963; | |
cvt.rn.f32.u32 %f1, %r15; | |
mul.rn.f32 %f2, %f1, 0f34000000; | |
cvt.rn.f16.f32 %h1, %f2; | |
mov.b16 %h2, 0x2E66; | |
setp.ge.f16 %p2, %h1, %h2; | |
// ---- Lane 0: masked, scaled (p1 + f16(p11)) term ----
// rd116 = idx*2 (f16 byte offset); h7..h10 = the 4 f16 values of p1 at idx.
mul.wide.u32 %rd116, %r5, 2; | |
add.s64 %rd117, %rd25, %rd116; | |
ld.global.nc.v4.b16 {%h3, %h4, %h5, %h6}, [%rd117]; | |
mov.b32 %hh1, {%h5, %h6}; | |
mov.b32 %hh2, {%h3, %h4}; | |
mov.b32 {%h7, %h8}, %hh2; | |
mov.b32 {%h9, %h10}, %hh1; | |
// rd118 = col*4 (f32 byte offset), reused for every per-column input.
mul.wide.u32 %rd118, %r4, 4; | |
add.s64 %rd119, %rd6, %rd118; | |
ld.global.nc.f32 %f3, [%rd119]; | |
cvt.rn.f16.f32 %h11, %f3; | |
add.rn.f16 %h12, %h7, %h11; | |
// h13 = 0x3C72 (~1.111 in f16); the multiply is done in f16 before widening.
mov.b16 %h13, 0x3C72; | |
mul.rn.f16 %h14, %h12, %h13; | |
cvt.f32.f16 %f4, %h14; | |
// f5 = p2 ? f4 : 0.0
selp.f32 %f5, %f4, 0f00000000, %p2; | |
// ---- Lane 0: first scale/shift stage on p2 ----
// h19..h22 = the 4 f16 values of p2 at idx.
add.s64 %rd120, %rd24, %rd116; | |
ld.global.nc.v4.b16 {%h15, %h16, %h17, %h18}, [%rd120]; | |
mov.b32 %hh3, {%h17, %h18}; | |
mov.b32 %hh4, {%h15, %h16}; | |
mov.b32 {%h19, %h20}, %hh4; | |
mov.b32 {%h21, %h22}, %hh3; | |
cvt.f32.f16 %f6, %h19; | |
// rd121 = row*4 (f32 byte offset), reused for every per-row input.
mul.wide.u32 %rd121, %r1, 4; | |
add.s64 %rd122, %rd20, %rd121; | |
ld.global.nc.f32 %f7, [%rd122]; | |
// f10 = rsqrt(p6[row] * 2^-10 + eps); 0f3A800000 = 2^-10, 0f2B8CBCCC ~= 1e-12.
// f10 is shared by all four lanes.
mul.rn.f32 %f8, %f7, 0f3A800000; | |
add.rn.f32 %f9, %f8, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f10, %f9; | |
add.s64 %rd123, %rd9, %rd118; | |
ld.global.nc.f32 %f11, [%rd123]; | |
mul.rn.f32 %f12, %f10, %f11; | |
mul.rn.f32 %f13, %f12, %f6; | |
add.s64 %rd124, %rd12, %rd118; | |
ld.global.nc.f32 %f14, [%rd124]; | |
add.s64 %rd125, %rd18, %rd121; | |
ld.global.nc.f32 %f15, [%rd125]; | |
// f16 = p7[row] * 2^-10, shared by all four lanes.
mul.rn.f32 %f16, %f15, 0f3A800000; | |
mul.rn.f32 %f17, %f12, %f16; | |
sub.rn.f32 %f18, %f14, %f17; | |
add.rn.f32 %f19, %f13, %f18; | |
// Add the masked term from above.
add.rn.f32 %f20, %f19, %f5; | |
// ---- Lane 0: second scale/shift stage ----
// f24 = rsqrt(p4[row] * 2^-10 + eps), shared by all four lanes.
add.s64 %rd126, %rd22, %rd121; | |
ld.global.nc.f32 %f21, [%rd126]; | |
mul.rn.f32 %f22, %f21, 0f3A800000; | |
add.rn.f32 %f23, %f22, 0f2B8CBCCC; | |
rsqrt.approx.f32 %f24, %f23; | |
add.s64 %rd127, %rd15, %rd118; | |
ld.global.nc.f32 %f25, [%rd127]; | |
mul.rn.f32 %f26, %f24, %f25; | |
mul.rn.f32 %f27, %f26, %f20; | |
add.s64 %rd128, %rd3, %rd118; | |
ld.global.nc.f32 %f28, [%rd128]; | |
add.s64 %rd129, %rd21, %rd121; | |
ld.global.nc.f32 %f29, [%rd129]; | |
// f30 = p5[row] * 2^-10, shared by all four lanes.
mul.rn.f32 %f30, %f29, 0f3A800000; | |
mul.rn.f32 %f31, %f26, %f30; | |
sub.rn.f32 %f32, %f28, %f31; | |
add.rn.f32 %f33, %f32, %f27; | |
// h23 = lane-0 result.
cvt.rn.f16.f32 %h23, %f33; | |
// rd130 = output address (p0 + idx*2).
add.s64 %rd130, %rd26, %rd116; | |
// ---- Lane 1: random word -> predicate p3 ----
// Built from intermediates of the rounds above (rd72, rd96, rd109).
xor.b64 %rd131, %rd72, %rd96; | |
xor.b64 %rd132, %rd131, 3041712726; | |
mul.lo.s64 %rd133, %rd132, 3528531795; | |
xor.b64 %rd134, %rd109, %rd133; | |
cvt.u32.u64 %r16, %rd134; | |
xor.b32 %r17, %r16, -616729560; | |
mul.lo.s32 %r18, %r17, -845247145; | |
shr.u32 %r19, %r18, 9; | |
cvt.rn.f32.u32 %f34, %r19; | |
mul.rn.f32 %f35, %f34, 0f34000000; | |
cvt.rn.f16.f32 %h24, %f35; | |
setp.ge.f16 %p3, %h24, %h2; | |
// ---- Lane 1: same arithmetic as lane 0 at col+1 (rd135 = (col|1)*4) ----
mul.wide.u32 %rd135, %r6, 4; | |
add.s64 %rd136, %rd6, %rd135; | |
ld.global.nc.f32 %f36, [%rd136]; | |
cvt.rn.f16.f32 %h25, %f36; | |
add.rn.f16 %h26, %h8, %h25; | |
mul.rn.f16 %h27, %h26, %h13; | |
cvt.f32.f16 %f37, %h27; | |
selp.f32 %f38, %f37, 0f00000000, %p3; | |
cvt.f32.f16 %f39, %h20; | |
add.s64 %rd137, %rd9, %rd135; | |
ld.global.nc.f32 %f40, [%rd137]; | |
mul.rn.f32 %f41, %f10, %f40; | |
mul.rn.f32 %f42, %f41, %f39; | |
add.s64 %rd138, %rd12, %rd135; | |
ld.global.nc.f32 %f43, [%rd138]; | |
mul.rn.f32 %f44, %f16, %f41; | |
sub.rn.f32 %f45, %f43, %f44; | |
add.rn.f32 %f46, %f42, %f45; | |
add.rn.f32 %f47, %f46, %f38; | |
add.s64 %rd139, %rd15, %rd135; | |
ld.global.nc.f32 %f48, [%rd139]; | |
mul.rn.f32 %f49, %f24, %f48; | |
mul.rn.f32 %f50, %f49, %f47; | |
add.s64 %rd140, %rd3, %rd135; | |
ld.global.nc.f32 %f51, [%rd140]; | |
mul.rn.f32 %f52, %f30, %f49; | |
sub.rn.f32 %f53, %f51, %f52; | |
add.rn.f32 %f54, %f53, %f50; | |
// h28 = lane-1 result.
cvt.rn.f16.f32 %h28, %f54; | |
// ---- Lane 2: random word -> predicate p4 ----
// Built from intermediates of the rounds above (rd104, rd82, rd114, rd105, rd94).
and.b64 %rd141, %rd104, 4294967295; | |
and.b64 %rd142, %rd82, 4294967295; | |
xor.b64 %rd143, %rd142, %rd114; | |
xor.b64 %rd144, %rd143, 534103459; | |
mul.lo.s64 %rd145, %rd144, 3449720151; | |
shr.u64 %rd146, %rd145, 32; | |
xor.b64 %rd147, %rd141, %rd146; | |
xor.b64 %rd148, %rd147, 4055616968; | |
mul.lo.s64 %rd149, %rd148, 3528531795; | |
shr.u64 %rd150, %rd149, 32; | |
cvt.u32.u64 %r20, %rd150; | |
xor.b64 %rd151, %rd105, %rd94; | |
cvt.u32.u64 %r21, %rd151; | |
xor.b32 %r22, %r21, 1401181199; | |
mul.lo.s32 %r23, %r22, -766435501; | |
xor.b32 %r24, %r23, %r20; | |
shr.u32 %r25, %r24, 9; | |
xor.b32 %r26, %r25, 4936337; | |
cvt.rn.f32.u32 %f55, %r26; | |
mul.rn.f32 %f56, %f55, 0f34000000; | |
cvt.rn.f16.f32 %h29, %f56; | |
setp.ge.f16 %p4, %h29, %h2; | |
// ---- Lane 2: same arithmetic as lane 0 at col+2 (rd152 = (col|2)*4) ----
mul.wide.u32 %rd152, %r7, 4; | |
add.s64 %rd153, %rd6, %rd152; | |
ld.global.nc.f32 %f57, [%rd153]; | |
cvt.rn.f16.f32 %h30, %f57; | |
add.rn.f16 %h31, %h9, %h30; | |
mul.rn.f16 %h32, %h31, %h13; | |
cvt.f32.f16 %f58, %h32; | |
selp.f32 %f59, %f58, 0f00000000, %p4; | |
cvt.f32.f16 %f60, %h21; | |
add.s64 %rd154, %rd9, %rd152; | |
ld.global.nc.f32 %f61, [%rd154]; | |
mul.rn.f32 %f62, %f10, %f61; | |
mul.rn.f32 %f63, %f62, %f60; | |
add.s64 %rd155, %rd12, %rd152; | |
ld.global.nc.f32 %f64, [%rd155]; | |
mul.rn.f32 %f65, %f16, %f62; | |
sub.rn.f32 %f66, %f64, %f65; | |
add.rn.f32 %f67, %f63, %f66; | |
add.rn.f32 %f68, %f67, %f59; | |
add.s64 %rd156, %rd15, %rd152; | |
ld.global.nc.f32 %f69, [%rd156]; | |
mul.rn.f32 %f70, %f24, %f69; | |
mul.rn.f32 %f71, %f70, %f68; | |
add.s64 %rd157, %rd3, %rd152; | |
ld.global.nc.f32 %f72, [%rd157]; | |
mul.rn.f32 %f73, %f30, %f70; | |
sub.rn.f32 %f74, %f72, %f73; | |
add.rn.f32 %f75, %f74, %f71; | |
// h33 = lane-2 result.
cvt.rn.f16.f32 %h33, %f75; | |
// ---- Lane 3: random word -> predicate p5 ----
// Built from intermediates of the rounds above (rd83, rd88, rd146).
xor.b64 %rd158, %rd83, %rd88; | |
xor.b64 %rd159, %rd158, 1684936478; | |
mul.lo.s64 %rd160, %rd159, 3449720151; | |
xor.b64 %rd161, %rd146, %rd160; | |
cvt.u32.u64 %r27, %rd161; | |
xor.b32 %r28, %r27, -239350328; | |
mul.lo.s32 %r29, %r28, -766435501; | |
shr.u32 %r30, %r29, 9; | |
cvt.rn.f32.u32 %f76, %r30; | |
mul.rn.f32 %f77, %f76, 0f34000000; | |
cvt.rn.f16.f32 %h34, %f77; | |
setp.ge.f16 %p5, %h34, %h2; | |
// ---- Lane 3: same arithmetic as lane 0 at col+3 ----
// Reuses the lane-0 addresses with a +12 byte displacement (3 f32 elements).
ld.global.nc.f32 %f78, [%rd119+12]; | |
cvt.rn.f16.f32 %h35, %f78; | |
add.rn.f16 %h36, %h10, %h35; | |
mul.rn.f16 %h37, %h36, %h13; | |
cvt.f32.f16 %f79, %h37; | |
selp.f32 %f80, %f79, 0f00000000, %p5; | |
cvt.f32.f16 %f81, %h22; | |
ld.global.nc.f32 %f82, [%rd123+12]; | |
mul.rn.f32 %f83, %f10, %f82; | |
mul.rn.f32 %f84, %f83, %f81; | |
ld.global.nc.f32 %f85, [%rd124+12]; | |
mul.rn.f32 %f86, %f16, %f83; | |
sub.rn.f32 %f87, %f85, %f86; | |
add.rn.f32 %f88, %f84, %f87; | |
add.rn.f32 %f89, %f88, %f80; | |
ld.global.nc.f32 %f90, [%rd127+12]; | |
mul.rn.f32 %f91, %f24, %f90; | |
mul.rn.f32 %f92, %f91, %f89; | |
ld.global.nc.f32 %f93, [%rd128+12]; | |
mul.rn.f32 %f94, %f30, %f91; | |
sub.rn.f32 %f95, %f93, %f94; | |
add.rn.f32 %f96, %f95, %f92; | |
// h38 = lane-3 result.
cvt.rn.f16.f32 %h38, %f96; | |
// ---- Store the 4 f16 results to p0 at idx with one vector store ----
st.global.v4.b16 [%rd130], {%h23, %h28, %h33, %h38}; | |
ret; | |
} | |
// .globl fusion_2698 | |
//
// fusion_2698: element-wise f32 -> f16 conversion, 4 elements per thread.
//   param_0: f32 input, read as one v4.f32 at element index idx
//   param_1: f16 output, written as one v4.b16 at element index idx
//   param_2: declared but never read in this kernel
// where idx = ctaid.x*1024 + tid.x*4 (256 threads per block).
// NOTE(review): there is no bounds predicate; the launch grid presumably
// covers the buffers exactly -- verify against the launcher.
.visible .entry fusion_2698( | |
.param .u64 fusion_2698_param_0, | |
.param .u64 fusion_2698_param_1, | |
.param .u64 fusion_2698_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<5>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<6>; | |
.reg .b64 %rd<9>; | |
// rd4 = global address of param_0 (input), rd3 = global address of param_1 (output).
ld.param.u64 %rd1, [fusion_2698_param_0]; | |
ld.param.u64 %rd2, [fusion_2698_param_1]; | |
cvta.to.global.u64 %rd3, %rd2; | |
cvta.to.global.u64 %rd4, %rd1; | |
// r5 = idx. The OR acts as an addition: tid.x < 256 (.reqntid) so
// tid.x*4 < 1024, and ctaid.x*1024 has its low 10 bits clear.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
// Load 4 consecutive f32 values (byte offset idx*4) via the non-coherent path.
mul.wide.u32 %rd5, %r5, 4; | |
add.s64 %rd6, %rd4, %rd5; | |
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6]; | |
cvt.rn.f16.f32 %h1, %f1; | |
// Output address: byte offset idx*2 (f16 elements).
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd3, %rd7; | |
cvt.rn.f16.f32 %h2, %f2; | |
cvt.rn.f16.f32 %h3, %f3; | |
cvt.rn.f16.f32 %h4, %f4; | |
// Store the 4 converted values with one vector store.
st.global.v4.b16 [%rd8], {%h1, %h2, %h3, %h4}; | |
ret; | |
} | |
// .globl fusion_2171 | |
//
// fusion_2171: gathers f16 values from param_0, adds an f32 value from
// param_1 (rounded to f16 first), and writes 4 consecutive f16 results per
// thread into param_3 at a fixed byte offset.
//
// With idx = ctaid.x*1024 + tid.x*4, g = ctaid.x >> 5, and for lane k in 0..3
// c_k = ((tid.x*4) & 60) | k, the visible address arithmetic is:
//   row    = bits [6, 15) of idx            (9-bit field)
//   a_k    = f16 at param_0 + row*2048 + (g*64 + c_k)*2
//   b_k    = f32 at param_1 + g*256 + c_k*4
//   out_k  = a_k + f16(b_k)                 (f16 add)
//   store  : v4.b16 at param_3 + 444893184 + idx*2
// param_2 is declared but never read in this kernel.
// NOTE(review): this looks like a bias add combined with a layout change
// into a slice of a larger output buffer -- confirm against the originating
// HLO. There is no bounds predicate; verify the launch grid.
.visible .entry fusion_2171( | |
.param .u64 fusion_2171_param_0, | |
.param .u64 fusion_2171_param_1, | |
.param .u64 fusion_2171_param_2, | |
.param .u64 fusion_2171_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<13>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<20>; | |
.reg .b64 %rd<29>; | |
// rd6 = param_0, rd5 = param_1, rd3 = param_3 (all as global addresses).
ld.param.u64 %rd1, [fusion_2171_param_0]; | |
ld.param.u64 %rd2, [fusion_2171_param_3]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2171_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// r5 = idx (the OR acts as an addition because the bit ranges are disjoint).
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
// r6, r9, r11, r13 = c_0..c_3 (position within a group of 64); r7 = g.
and.b32 %r6, %r4, 60; | |
shr.u32 %r7, %r1, 5; | |
or.b32 %r8, %r4, 1; | |
and.b32 %r9, %r8, 61; | |
or.b32 %r10, %r4, 2; | |
and.b32 %r11, %r10, 62; | |
or.b32 %r12, %r4, 3; | |
and.b32 %r13, %r12, 63; | |
// r14 = row: 9-bit field of idx starting at bit 6.
bfe.u32 %r14, %r5, 6, 9; | |
// rd8 = param_3 + idx*2 (the fixed offset is applied at the store).
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd3, %rd7; | |
// r15 = g*64; r16 = g*64 + c_0.
shl.b32 %r15, %r7, 6; | |
or.b32 %r16, %r6, %r15; | |
// rd10 = param_0 + row*2048 bytes (row base).
mul.wide.u32 %rd9, %r14, 2048; | |
add.s64 %rd10, %rd6, %rd9; | |
// ---- Lane 0 ----
mul.wide.u32 %rd11, %r16, 2; | |
add.s64 %rd12, %rd10, %rd11; | |
ld.global.nc.b16 %h1, [%rd12]; | |
// rd14 = param_1 + g*256 bytes (base of the 64-entry f32 group).
mul.wide.u32 %rd13, %r7, 256; | |
add.s64 %rd14, %rd5, %rd13; | |
mul.wide.u32 %rd15, %r6, 4; | |
add.s64 %rd16, %rd14, %rd15; | |
ld.global.nc.f32 %f1, [%rd16]; | |
cvt.rn.f16.f32 %h2, %f1; | |
add.rn.f16 %h3, %h1, %h2; | |
// ---- Lane 1 ----
or.b32 %r17, %r9, %r15; | |
mul.wide.u32 %rd17, %r17, 2; | |
add.s64 %rd18, %rd10, %rd17; | |
ld.global.nc.b16 %h4, [%rd18]; | |
mul.wide.u32 %rd19, %r9, 4; | |
add.s64 %rd20, %rd14, %rd19; | |
ld.global.nc.f32 %f2, [%rd20]; | |
cvt.rn.f16.f32 %h5, %f2; | |
add.rn.f16 %h6, %h4, %h5; | |
// ---- Lane 2 ----
or.b32 %r18, %r11, %r15; | |
mul.wide.u32 %rd21, %r18, 2; | |
add.s64 %rd22, %rd10, %rd21; | |
ld.global.nc.b16 %h7, [%rd22]; | |
mul.wide.u32 %rd23, %r11, 4; | |
add.s64 %rd24, %rd14, %rd23; | |
ld.global.nc.f32 %f3, [%rd24]; | |
cvt.rn.f16.f32 %h8, %f3; | |
add.rn.f16 %h9, %h7, %h8; | |
// ---- Lane 3 ----
or.b32 %r19, %r13, %r15; | |
mul.wide.u32 %rd25, %r19, 2; | |
add.s64 %rd26, %rd10, %rd25; | |
ld.global.nc.b16 %h10, [%rd26]; | |
mul.wide.u32 %rd27, %r13, 4; | |
add.s64 %rd28, %rd14, %rd27; | |
ld.global.nc.f32 %f4, [%rd28]; | |
cvt.rn.f16.f32 %h11, %f4; | |
add.rn.f16 %h12, %h10, %h11; | |
// Store all 4 results; 444893184 is a byte offset into the param_3 buffer.
st.global.v4.b16 [%rd8+444893184], {%h3, %h6, %h9, %h12}; | |
ret; | |
} | |
// .globl fusion_2699 | |
//
// fusion_2699: element-wise f32 -> f16 conversion, 4 elements per thread.
//   param_0: f32 input, read as one v4.f32 at element index idx
//   param_1: f16 output, written as one v4.b16 at element index idx
//   param_2: declared but never read in this kernel
// where idx = ctaid.x*1024 + tid.x*4 (256 threads per block).
// NOTE(review): there is no bounds predicate; the launch grid presumably
// covers the buffers exactly -- verify against the launcher.
.visible .entry fusion_2699( | |
.param .u64 fusion_2699_param_0, | |
.param .u64 fusion_2699_param_1, | |
.param .u64 fusion_2699_param_2 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<5>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<6>; | |
.reg .b64 %rd<9>; | |
// rd4 = global address of param_0 (input), rd3 = global address of param_1 (output).
ld.param.u64 %rd1, [fusion_2699_param_0]; | |
ld.param.u64 %rd2, [fusion_2699_param_1]; | |
cvta.to.global.u64 %rd3, %rd2; | |
cvta.to.global.u64 %rd4, %rd1; | |
// r5 = idx. The OR acts as an addition: tid.x < 256 (.reqntid) so
// tid.x*4 < 1024, and ctaid.x*1024 has its low 10 bits clear.
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
// Load 4 consecutive f32 values (byte offset idx*4) via the non-coherent path.
mul.wide.u32 %rd5, %r5, 4; | |
add.s64 %rd6, %rd4, %rd5; | |
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6]; | |
cvt.rn.f16.f32 %h1, %f1; | |
// Output address: byte offset idx*2 (f16 elements).
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd3, %rd7; | |
cvt.rn.f16.f32 %h2, %f2; | |
cvt.rn.f16.f32 %h3, %f3; | |
cvt.rn.f16.f32 %h4, %f4; | |
// Store the 4 converted values with one vector store.
st.global.v4.b16 [%rd8], {%h1, %h2, %h3, %h4}; | |
ret; | |
} | |
// .globl fusion_2172 | |
//
// fusion_2172: gathers f16 values from param_0, adds an f32 value from
// param_1 (rounded to f16 first), and writes 4 consecutive f16 results per
// thread into param_3 at a fixed byte offset.
//
// With idx = ctaid.x*1024 + tid.x*4, g = ctaid.x >> 5, and for lane k in 0..3
// c_k = ((tid.x*4) & 60) | k, the visible address arithmetic is:
//   row    = bits [6, 15) of idx            (9-bit field)
//   a_k    = f16 at param_0 + row*2048 + (g*64 + c_k)*2
//   b_k    = f32 at param_1 + g*256 + c_k*4
//   out_k  = a_k + f16(b_k)                 (f16 add)
//   store  : v4.b16 at param_3 + 445941760 + idx*2
// param_2 is declared but never read in this kernel.
// NOTE(review): this looks like a bias add combined with a layout change
// into a slice of a larger output buffer -- confirm against the originating
// HLO. There is no bounds predicate; verify the launch grid.
.visible .entry fusion_2172( | |
.param .u64 fusion_2172_param_0, | |
.param .u64 fusion_2172_param_1, | |
.param .u64 fusion_2172_param_2, | |
.param .u64 fusion_2172_param_3 | |
) | |
.reqntid 256, 1, 1 | |
{ | |
.reg .b16 %h<13>; | |
.reg .f32 %f<5>; | |
.reg .b32 %r<20>; | |
.reg .b64 %rd<29>; | |
// rd6 = param_0, rd5 = param_1, rd3 = param_3 (all as global addresses).
ld.param.u64 %rd1, [fusion_2172_param_0]; | |
ld.param.u64 %rd2, [fusion_2172_param_3]; | |
cvta.to.global.u64 %rd3, %rd2; | |
ld.param.u64 %rd4, [fusion_2172_param_1]; | |
cvta.to.global.u64 %rd5, %rd4; | |
cvta.to.global.u64 %rd6, %rd1; | |
// r5 = idx (the OR acts as an addition because the bit ranges are disjoint).
mov.u32 %r1, %ctaid.x; | |
mov.u32 %r2, %tid.x; | |
shl.b32 %r3, %r1, 10; | |
shl.b32 %r4, %r2, 2; | |
or.b32 %r5, %r4, %r3; | |
// r6, r9, r11, r13 = c_0..c_3 (position within a group of 64); r7 = g.
and.b32 %r6, %r4, 60; | |
shr.u32 %r7, %r1, 5; | |
or.b32 %r8, %r4, 1; | |
and.b32 %r9, %r8, 61; | |
or.b32 %r10, %r4, 2; | |
and.b32 %r11, %r10, 62; | |
or.b32 %r12, %r4, 3; | |
and.b32 %r13, %r12, 63; | |
// r14 = row: 9-bit field of idx starting at bit 6.
bfe.u32 %r14, %r5, 6, 9; | |
// rd8 = param_3 + idx*2 (the fixed offset is applied at the store).
mul.wide.u32 %rd7, %r5, 2; | |
add.s64 %rd8, %rd3, %rd7; | |
// r15 = g*64; r16 = g*64 + c_0.
shl.b32 %r15, %r7, 6; | |
or.b32 %r16, %r6, %r15; | |
// rd10 = param_0 + row*2048 bytes (row base).
mul.wide.u32 %rd9, %r14, 2048; | |
add.s64 %rd10, %rd6, %rd9; | |
// ---- Lane 0 ----
mul.wide.u32 %rd11, %r16, 2; | |
add.s64 %rd12, %rd10, %rd11; | |
ld.global.nc.b16 %h1, [%rd12]; | |
// rd14 = param_1 + g*256 bytes (base of the 64-entry f32 group).
mul.wide.u32 %rd13, %r7, 256; | |
add.s64 %rd14, %rd5, %rd13; | |
mul.wide.u32 %rd15, %r6, 4; | |
add.s64 %rd16, %rd14, %rd15; | |
ld.global.nc.f32 %f1, [%rd16]; | |
cvt.rn.f16.f32 %h2, %f1; | |
add.rn.f16 %h3, %h1, %h2; | |
// ---- Lane 1 ----
or.b32 %r17, %r9, %r15; | |
mul.wide.u32 %rd17, %r17, 2; | |
add.s64 %rd18, %rd10, %rd17; | |
ld.global.nc.b16 %h4, [%rd18]; | |
mul.wide.u32 %rd19, %r9, 4; | |
add.s64 %rd20, %rd14, %rd19; | |
ld.global.nc.f32 %f2, [%rd20]; | |
cvt.rn.f16.f32 %h5, %f2; | |
add.rn.f16 %h6, %h4, %h5; | |
// ---- Lane 2 ----
or.b32 %r18, %r11, %r15; | |
mul.wide.u32 %rd21, %r18, 2; | |
add.s64 %rd22, %rd10, %rd21; | |
ld.global.nc.b16 %h7, [%rd22]; | |
mul.wide.u32 %rd23, %r11, 4; | |
add.s64 %rd24, %rd14, %rd23; | |
ld.global.nc.f32 %f3, [%rd24]; | |
cvt.rn.f16.f32 %h8, %f3; | |
add.rn.f16 %h9, %h7, %h8; | |
// ---- Lane 3 ----
or.b32 %r19, %r13, %r15; | |
mul.wide.u32 %rd25, %r19, 2; | |
add.s64 %rd26, %rd10, %rd25; | |
ld.global.nc.b16 %h10, [%rd26]; | |
mul.wide.u32 %rd27, %r13, 4; | |
add.s64 %rd28, %rd14, %rd27; | |
ld.global.nc.f32 %f4, [%rd28]; | |
cvt.rn.f16.f32 %h11, %f4; | |
add.rn.f16 %h12, %h10, %h11; | |
// Store all 4 results; 445941760 is a byte offset into the param_3 buffer.
st.global.v4.b16 [%rd8+445941760], {%h3, %h6, %h9, %h12}; | |
ret; | |
} | |
// .globl fusion_2169 | |
.visible .entry fusion_2169( | |
.param .u64 fusion_2169_param_0, | |
.param .u64 fusion_2169_param_1, | |
.param .u64 fusion_2169_param_2, | |
.param .u64 fusion_2169_param_3 | |
) | |
.reqntid 32, 1, 1 | |
{ | |
.local .align 4 .b8 __local_depot86[4]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<4>; | |
.reg .b16 %h<83>; | |
.reg .b32 %hh<9>; | |
.reg .f32 %f<57>; | |
.reg .b32 %r<37>; | |
.reg .b64 %rd<37>; | |
mov.u64 %SPL, __local_depot86; | |
cvta.local.u64 %SP, %SPL; | |
ld.param.u64 %rd4, [fusion_2169_param_0]; | |
ld.param.u64 %rd5, [fusion_2169_param_2]; | |
cvta.to.global.u64 %rd6, %rd5; | |
cvta.to.global.u64 %rd9, %rd4; | |
add.u64 %rd10, %SP, 0; | |
add.u64 %rd1, %SPL, 0; | |
mov.u32 %r1, %tid.x; | |
mov.u32 %r5, %ctaid.x; | |
shl.b32 %r6, %r1, 1; | |
shl.b32 %r7, %r5, 9; | |
or.b32 %r8, %r7, %r6; | |
mul.wide.u32 %rd11, %r8, 2; | |
add.s64 %rd12, %rd9, %rd11; | |
ld.global.nc.b32 %hh1, [%rd12]; | |
mov.b32 {%h1, %h2}, %hh1; | |
mul.wide.u32 %rd13, %r6, 4; | |
add.s64 %rd14, %rd6, %rd13; | |
ld.global.nc.v2.u32 {%r9, %r10}, [%rd14]; | |
cvt.rn.f16.s32 %h3, %r9; | |
mov.b16 %h4, 0x3C00; | |
sub.rn.f16 %h5, %h4, %h3; | |
mov.b16 %h6, 0x70E2; | |
mul.rn.f16 %h7, %h5, %h6; | |
sub.rn.f16 %h8, %h1, %h7; | |
cvt.f32.f16 %f2, %h8; | |
max.f32 %f3, %f2, 0fFF800000; | |
cvt.rn.f16.s32 %h9, %r10; | |
sub.rn.f16 %h10, %h4, %h9; | |
mul.rn.f16 %h11, %h10, %h6; | |
sub.rn.f16 %h12, %h2, %h11; | |
cvt.f32.f16 %f4, %h12; | |
max.f32 %f5, %f3, %f4; | |
or.b32 %r11, %r6, 64; | |
ld.global.nc.b32 %hh2, [%rd12+128]; | |
mov.b32 {%h13, %h14}, %hh2; | |
mul.wide.u32 %rd15, %r11, 4; | |
add.s64 %rd16, %rd6, %rd15; | |
ld.global.nc.u32 %r12, [%rd16]; | |
cvt.rn.f16.s32 %h15, %r12; | |
sub.rn.f16 %h16, %h4, %h15; | |
mul.rn.f16 %h17, %h16, %h6; | |
sub.rn.f16 %h18, %h13, %h17; | |
cvt.f32.f16 %f6, %h18; | |
max.f32 %f7, %f5, %f6; | |
ld.global.nc.u32 %r13, [%rd14+260]; | |
cvt.rn.f16.s32 %h19, %r13; | |
sub.rn.f16 %h20, %h4, %h19; | |
mul.rn.f16 %h21, %h20, %h6; | |
sub.rn.f16 %h22, %h14, %h21; | |
cvt.f32.f16 %f8, %h22; | |
max.f32 %f9, %f7, %f8; | |
or.b32 %r14, %r6, 128; | |
ld.global.nc.b32 %hh3, [%rd12+256]; | |
mov.b32 {%h23, %h24}, %hh3; | |
mul.wide.u32 %rd17, %r14, 4; | |
add.s64 %rd18, %rd6, %rd17; | |
ld.global.nc.u32 %r15, [%rd18]; | |
cvt.rn.f16.s32 %h25, %r15; | |
sub.rn.f16 %h26, %h4, %h25; | |
mul.rn.f16 %h27, %h26, %h6; | |
sub.rn.f16 %h28, %h23, %h27; | |
cvt.f32.f16 %f10, %h28; | |
max.f32 %f11, %f9, %f10; | |
ld.global.nc.u32 %r16, [%rd14+516]; | |
cvt.rn.f16.s32 %h29, %r16; | |
sub.rn.f16 %h30, %h4, %h29; | |
mul.rn.f16 %h31, %h30, %h6; | |
sub.rn.f16 %h32, %h24, %h31; | |
cvt.f32.f16 %f12, %h32; | |
max.f32 %f13, %f11, %f12; | |
or.b32 %r17, %r6, 192; | |
ld.global.nc.b32 %hh4, [%rd12+384]; | |
mov.b32 {%h33, %h34}, %hh4; | |
mul.wide.u32 %rd19, %r17, 4; | |
add.s64 %rd20, %rd6, %rd19; | |
ld.global.nc.u32 %r18, [%rd20]; | |
cvt.rn.f16.s32 %h35, %r18; | |
sub.rn.f16 %h36, %h4, %h35; | |
mul.rn.f16 %h37, %h36, %h6; | |
sub.rn.f16 %h38, %h33, %h37; | |
cvt.f32.f16 %f14, %h38; | |
max.f32 %f15, %f13, %f14; | |
ld.global.nc.u32 %r19, [%rd14+772]; | |
cvt.rn.f16.s32 %h39, %r19; | |
sub.rn.f16 %h40, %h4, %h39; | |
mul.rn.f16 %h41, %h40, %h6; | |
sub.rn.f16 %h42, %h34, %h41; | |
cvt.f32.f16 %f16, %h42; | |
max.f32 %f17, %f15, %f16; | |
or.b32 %r20, %r6, 256; | |
ld.global.nc.b32 %hh5, [%rd12+512]; | |
mov.b32 {%h43, %h44}, %hh5; | |
mul.wide.u32 %rd21, %r20, 4; | |
add.s64 %rd22, %rd6, %rd21; | |
ld.global.nc.u32 %r21, [%rd22]; | |
cvt.rn.f16.s32 %h45, %r21; | |
sub.rn.f16 %h46, %h4, %h45; | |
mul.rn.f16 %h47, %h46, %h6; | |
sub.rn.f16 %h48, %h43, %h47; | |
cvt.f32.f16 %f18, %h48; | |
max.f32 %f19, %f17, %f18; | |
ld.global.nc.u32 %r22, [%rd14+1028]; | |
cvt.rn.f16.s32 %h49, %r22; | |
sub.rn.f16 %h50, %h4, %h49; | |
mul.rn.f16 %h51, %h50, %h6; | |
sub.rn.f16 %h52, %h44, %h51; | |
cvt.f32.f16 %f20, %h52; | |
max.f32 %f21, %f19, %f20; | |
or.b32 %r23, %r6, 320; | |
ld.global.nc.b32 %hh6, [%rd12+640]; | |
mov.b32 {%h53, %h54}, %hh6; | |
mul.wide.u32 %rd23, %r23, 4; | |
add.s64 %rd24, %rd6, %rd23; | |
ld.global.nc.u32 %r24, [%rd24]; | |
cvt.rn.f16.s32 %h55, %r24; | |
sub.rn.f16 %h56, %h4, %h55; | |
mul.rn.f16 %h57, %h56, %h6; | |
sub.rn.f16 %h58, %h53, %h57; | |
cvt.f32.f16 %f22, %h58; | |
max.f32 %f23, %f21, %f22; | |
ld.global.nc.u32 %r25, [%rd14+1284]; | |
cvt.rn.f16.s32 %h59, %r25; | |
sub.rn.f16 %h60, %h4, %h59; | |
mul.rn.f16 %h61, %h60, %h6; | |
sub.rn.f16 %h62, %h54, %h61; | |
cvt.f32.f16 %f24, %h62; | |
max.f32 %f25, %f23, %f24; | |
or.b32 %r26, %r6, 384; | |
ld.global.nc.b32 %hh7, [%rd12+768]; | |
mov.b32 {%h63, %h64}, %hh7; | |
mul.wide.u32 %rd25, %r26, 4; | |
add.s64 %rd26, %rd6, %rd25; | |
ld.global.nc.u32 %r27, [%rd26]; | |
cvt.rn.f16.s32 %h65, %r27; | |
sub.rn.f16 %h66, %h4, %h65; | |
mul.rn.f16 %h67, %h66, %h6; | |
sub.rn.f16 %h68, %h63, %h67; | |
cvt.f32.f16 %f26, %h68; | |
max.f32 %f27, %f25, %f26; | |
ld.global.nc.u32 %r28, [%rd14+1540]; | |
cvt.rn.f16.s32 %h69, %r28; | |
sub.rn.f16 %h70, %h4, %h69; | |
mul.rn.f16 %h71, %h70, %h6; | |
sub.rn.f16 %h72, %h64, %h71; | |
cvt.f32.f16 %f28, %h72; | |
max.f32 %f29, %f27, %f28; | |
or.b32 %r29, %r6, 448; | |
ld.global.nc.b32 %hh8, [%rd12+896]; | |
mov.b32 {%h73, %h74}, %hh8; | |
mul.wide.u32 %rd27, %r29, 4; | |
add.s64 %rd28, %rd6, %rd27; | |
ld.global.nc.u32 %r30, [%rd28]; | |
cvt.rn.f16.s32 %h75, %r30; | |
sub.rn.f16 %h76, %h4, %h75; | |
mul.rn.f16 %h77, %h76, %h6; | |
sub.rn.f16 %h78, %h73, %h77; | |
cvt.f32.f16 %f30, %h78; | |
max.f32 %f31, %f29, %f30; | |
ld.global.nc.u32 %r31, [%rd14+1796]; | |
cvt.rn.f16.s32 %h79, %r31; | |
sub.rn.f16 %h80, %h4, %h79; | |
mul.rn.f16 %h81, %h80, %h6; | |
sub.rn.f16 %h82, %h74, %h81; | |
cvt.f32.f16 %f32, %h82; | |
max.f32 %f33, %f31, %f32; | |
shfl.sync.down.b32 %f34, %f33, 16, 31, -1; | |
max.f32 %f35, %f33, %f34; | |
shfl.sync.down.b32 %f36, %f35, 8, 31, -1; | |
max.f32 %f37, %f35, %f36; | |
shfl.sync.down.b32 %f38, %f37, 4, 31, -1; | |
m |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment