// Source: GitHub gist cheshire/2b1c002b61dec1a173163424602914f9 by @cheshire, created May 21, 2021 02:59.
// NOTE: This file has been truncated; the full file is available from the original gist.
//
// Generated by LLVM NVPTX Back-End
//
// Module-level directives: PTX ISA version 6.0, code generated for the sm_70
// target, 64-bit addresses.
.version 6.0
.target sm_70
.address_size 64
// .globl fusion_2287
// Module-scope shared-memory scratch buffers, declared as raw byte arrays.
// shared_cache_0 (128 bytes) is the buffer fusion_2283 uses to exchange
// per-warp partial sums.
.shared .align 4 .b8 shared_cache_0[128];
.shared .align 4 .b8 shared_cache_01[128];
// 16-byte global with a non-zero initializer in its first four bytes.
// NOTE(review): presumably random-number-generator state, judging by the name
// only -- no kernel visible in this part of the file reads it.
.global .align 16 .b8 rng_state[16] = {149, 35, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// 8-byte externally visible constant; first byte is 32, the rest are zero.
// NOTE(review): looks like the integer 32 stored little-endian -- confirm
// against the producer of this module.
.visible .global .align 64 .b8 buffer_for_constant_217[8] = {32, 0, 0, 0, 0, 0, 0, 0};
.shared .align 4 .b8 shared_cache_02[128];
.shared .align 4 .b8 shared_cache_03[128];
.shared .align 4 .b8 shared_cache_04[128];
.shared .align 4 .b8 shared_cache_05[128];
.shared .align 4 .b8 shared_cache_06[128];
.shared .align 4 .b8 shared_cache_07[128];
.shared .align 4 .b8 shared_cache_08[128];
.shared .align 4 .b8 shared_cache_09[128];
.shared .align 4 .b8 shared_cache_010[128];
.shared .align 4 .b8 shared_cache_011[128];
.shared .align 4 .b8 shared_cache_012[128];
.shared .align 4 .b8 shared_cache_013[128];
.shared .align 4 .b8 shared_cache_014[128];
.shared .align 4 .b8 shared_cache_015[128];
.shared .align 4 .b8 shared_cache_016[128];
.shared .align 4 .b8 shared_cache_017[128];
.shared .align 4 .b8 shared_cache_018[128];
.shared .align 4 .b8 shared_cache_019[128];
.shared .align 4 .b8 shared_cache_020[128];
.shared .align 4 .b8 shared_cache_021[128];
.shared .align 4 .b8 shared_cache_022[128];
.shared .align 4 .b8 shared_cache_023[128];
.shared .align 4 .b8 shared_cache_024[128];
.shared .align 4 .b8 shared_cache_025[128];
.shared .align 4 .b8 shared_cache_026[128];
.shared .align 4 .b8 shared_cache_027[128];
.shared .align 4 .b8 shared_cache_028[128];
.shared .align 4 .b8 shared_cache_029[128];
.shared .align 4 .b8 shared_cache_030[128];
.shared .align 4 .b8 shared_cache_031[128];
.shared .align 4 .b8 shared_cache_032[128];
.shared .align 4 .b8 shared_cache_033[128];
.shared .align 4 .b8 shared_cache_034[128];
.shared .align 4 .b8 shared_cache_035[128];
.shared .align 4 .b8 shared_cache_036[128];
.shared .align 4 .b8 shared_cache_037[128];
.shared .align 4 .b8 shared_cache_038[128];
.shared .align 4 .b8 shared_cache_039[128];
.shared .align 4 .b8 shared_cache_040[128];
.shared .align 4 .b8 shared_cache_041[128];
.shared .align 4 .b8 shared_cache_042[128];
.shared .align 4 .b8 shared_cache_043[128];
.shared .align 4 .b8 shared_cache_044[128];
.shared .align 4 .b8 shared_cache_045[128];
.shared .align 4 .b8 shared_cache_046[128];
.shared .align 4 .b8 shared_cache_047[128];
.shared .align 4 .b8 shared_cache_048[128];
.shared .align 4 .b8 shared_cache_049[128];
.shared .align 4 .b8 shared_cache_050[128];
.shared .align 4 .b8 shared_cache_051[128];
.shared .align 4 .b8 shared_cache_052[128];
.shared .align 4 .b8 shared_cache_053[128];
.shared .align 4 .b8 shared_cache_054[128];
.shared .align 4 .b8 shared_cache_055[128];
.shared .align 4 .b8 shared_cache_056[128];
.shared .align 4 .b8 shared_cache_057[128];
.shared .align 4 .b8 shared_cache_058[128];
.shared .align 4 .b8 shared_cache_059[128];
.shared .align 4 .b8 shared_cache_060[128];
.shared .align 4 .b8 shared_cache_061[128];
.shared .align 4 .b8 shared_cache_062[128];
.shared .align 4 .b8 shared_cache_063[128];
.shared .align 4 .b8 shared_cache_064[128];
.shared .align 4 .b8 shared_cache_065[128];
.shared .align 4 .b8 shared_cache_066[128];
.shared .align 4 .b8 shared_cache_067[128];
.shared .align 4 .b8 shared_cache_068[128];
.shared .align 4 .b8 shared_cache_069[128];
.shared .align 4 .b8 shared_cache_070[128];
.shared .align 4 .b8 shared_cache_071[128];
.shared .align 4 .b8 shared_cache_072[128];
.shared .align 4 .b8 shared_cache_073[128];
.shared .align 4 .b8 shared_cache_074[128];
.shared .align 4 .b8 shared_cache_075[128];
.shared .align 4 .b8 shared_cache_076[128];
.shared .align 4 .b8 shared_cache_077[128];
.shared .align 4 .b8 shared_cache_078[128];
.shared .align 4 .b8 shared_cache_079[128];
.shared .align 4 .b8 shared_cache_080[128];
.shared .align 4 .b8 shared_cache_081[128];
.shared .align 4 .b8 shared_cache_082[128];
.shared .align 4 .b8 shared_cache_083[128];
.shared .align 4 .b8 shared_cache_084[128];
.shared .align 4 .b8 shared_cache_085[128];
.shared .align 4 .b8 shared_cache_086[128];
.shared .align 4 .b8 shared_cache_087[128];
.shared .align 4 .b8 shared_cache_088[128];
.shared .align 4 .b8 shared_cache_089[128];
.shared .align 4 .b8 shared_cache_090[128];
.shared .align 4 .b8 shared_cache_091[128];
.shared .align 4 .b8 shared_cache_092[128];
.shared .align 4 .b8 shared_cache_093[128];
.shared .align 4 .b8 shared_cache_094[128];
.shared .align 4 .b8 shared_cache_095[128];
.shared .align 4 .b8 shared_cache_096[128];
.shared .align 4 .b8 shared_cache_097[128];
.shared .align 4 .b8 shared_cache_098[128];
.shared .align 4 .b8 shared_cache_099[128];
.shared .align 4 .b8 shared_cache_0100[128];
.shared .align 4 .b8 shared_cache_0101[128];
.shared .align 4 .b8 shared_cache_0102[128];
.shared .align 4 .b8 shared_cache_0103[128];
.shared .align 4 .b8 shared_cache_0104[128];
.shared .align 4 .b8 shared_cache_0105[128];
.shared .align 4 .b8 shared_cache_0106[128];
.shared .align 4 .b8 shared_cache_0107[128];
.shared .align 4 .b8 shared_cache_0108[128];
.shared .align 4 .b8 shared_cache_0109[128];
.shared .align 4 .b8 shared_cache_0110[128];
.shared .align 4 .b8 shared_cache_0111[128];
.shared .align 4 .b8 shared_cache_0112[128];
.shared .align 4 .b8 shared_cache_0113[128];
.shared .align 4 .b8 shared_cache_0114[128];
.shared .align 4 .b8 shared_cache_0115[128];
.shared .align 4 .b8 shared_cache_0116[128];
.shared .align 4 .b8 shared_cache_0117[128];
.shared .align 4 .b8 shared_cache_0118[128];
.shared .align 4 .b8 shared_cache_0119[128];
.shared .align 4 .b8 shared_cache_0120[128];
.shared .align 4 .b8 shared_cache_0121[128];
.shared .align 4 .b8 shared_cache_0122[128];
.shared .align 4 .b8 shared_cache_0123[128];
.shared .align 4 .b8 shared_cache_0124[128];
.shared .align 4 .b8 shared_cache_0125[128];
.shared .align 4 .b8 shared_cache_0126[128];
.shared .align 4 .b8 shared_cache_0127[128];
.shared .align 4 .b8 shared_cache_0128[128];
.shared .align 4 .b8 shared_cache_0129[128];
.shared .align 4 .b8 shared_cache_0130[128];
.shared .align 4 .b8 shared_cache_0131[128];
.shared .align 4 .b8 shared_cache_0132[128];
.shared .align 4 .b8 shared_cache_0133[128];
.shared .align 4 .b8 shared_cache_0134[128];
.shared .align 4 .b8 shared_cache_0135[128];
.shared .align 4 .b8 shared_cache_0136[128];
.shared .align 4 .b8 shared_cache_0137[128];
.shared .align 4 .b8 shared_cache_0138[128];
.shared .align 4 .b8 shared_cache_0139[128];
.shared .align 4 .b8 shared_cache_0140[128];
.shared .align 4 .b8 shared_cache_0141[128];
.shared .align 4 .b8 shared_cache_0142[128];
.shared .align 4 .b8 shared_cache_0143[128];
.shared .align 4 .b8 shared_cache_0144[128];
.shared .align 4 .b8 shared_cache_0145[128];
.shared .align 4 .b8 shared_cache_0146[4224];
.shared .align 4 .b8 shared_cache_0147[4224];
.visible .global .align 64 .b8 buffer_for_constant_6681[2] = {0, 252};
.shared .align 4 .b8 shared_cache_0148[128];
.shared .align 4 .b8 shared_cache_0149[128];
.shared .align 2 .b8 shared_cache_0150[64];
.shared .align 4 .b8 shared_cache_0151[128];
.shared .align 4 .b8 shared_cache_0152[128];
.shared .align 4 .b8 shared_cache_1[128];
.shared .align 8 .b8 shared_cache_0153[256];
.shared .align 4 .b8 shared_cache_0154[128];
.shared .align 4 .b8 shared_cache_1155[128];
.shared .align 4 .b8 shared_cache_2[128];
.shared .align 4 .b8 shared_cache_0156[128];
.shared .align 4 .b8 shared_cache_0157[128];
.shared .align 4 .b8 shared_cache_0158[4224];
.visible .global .align 64 .b8 buffer_for_constant_1375[2];
.shared .align 4 .b8 shared_cache_0159[128];
.shared .align 4 .b8 shared_cache_1160[128];
.shared .align 4 .b8 shared_cache_0161[8448];
.shared .align 4 .b8 shared_cache_2162[8448];
.shared .align 4 .b8 shared_cache_3[8448];
.shared .align 4 .b8 shared_cache_0163[128];
.shared .align 4 .b8 shared_cache_1164[128];
.shared .align 4 .b8 shared_cache_0165[128];
.shared .align 4 .b8 shared_cache_1166[128];
.shared .align 4 .b8 shared_cache_0167[128];
.shared .align 4 .b8 shared_cache_0168[128];
.shared .align 4 .b8 shared_cache_1169[128];
.shared .align 4 .b8 shared_cache_0170[128];
.shared .align 4 .b8 shared_cache_1171[128];
.shared .align 4 .b8 shared_cache_0172[128];
.shared .align 4 .b8 shared_cache_0173[128];
.shared .align 4 .b8 shared_cache_1174[128];
.shared .align 4 .b8 shared_cache_0175[128];
.shared .align 4 .b8 shared_cache_1176[128];
.shared .align 4 .b8 shared_cache_0177[128];
.shared .align 4 .b8 shared_cache_0178[128];
.shared .align 4 .b8 shared_cache_1179[128];
.shared .align 4 .b8 shared_cache_0180[128];
.shared .align 4 .b8 shared_cache_1181[128];
.shared .align 4 .b8 shared_cache_0182[128];
.shared .align 4 .b8 shared_cache_0183[128];
.shared .align 4 .b8 shared_cache_1184[128];
.shared .align 4 .b8 shared_cache_0185[128];
.shared .align 4 .b8 shared_cache_1186[128];
.shared .align 4 .b8 shared_cache_0187[128];
.shared .align 4 .b8 shared_cache_0188[128];
.shared .align 4 .b8 shared_cache_1189[128];
.shared .align 4 .b8 shared_cache_0190[128];
.shared .align 4 .b8 shared_cache_1191[128];
.shared .align 4 .b8 shared_cache_0192[128];
.shared .align 4 .b8 shared_cache_0193[128];
.shared .align 4 .b8 shared_cache_1194[128];
.shared .align 4 .b8 shared_cache_0195[128];
.shared .align 4 .b8 shared_cache_1196[128];
.shared .align 4 .b8 shared_cache_0197[128];
.shared .align 4 .b8 shared_cache_0198[128];
.shared .align 4 .b8 shared_cache_1199[128];
.shared .align 4 .b8 shared_cache_0200[128];
.shared .align 4 .b8 shared_cache_1201[128];
.shared .align 4 .b8 shared_cache_0202[128];
.shared .align 4 .b8 shared_cache_0203[128];
.shared .align 4 .b8 shared_cache_1204[128];
.shared .align 4 .b8 shared_cache_0205[128];
.shared .align 4 .b8 shared_cache_1206[128];
.shared .align 4 .b8 shared_cache_0207[128];
.shared .align 4 .b8 shared_cache_0208[128];
.shared .align 4 .b8 shared_cache_1209[128];
.shared .align 4 .b8 shared_cache_0210[128];
.shared .align 4 .b8 shared_cache_1211[128];
.shared .align 4 .b8 shared_cache_0212[128];
.shared .align 4 .b8 shared_cache_0213[128];
.shared .align 4 .b8 shared_cache_1214[128];
.shared .align 4 .b8 shared_cache_0215[128];
.shared .align 4 .b8 shared_cache_1216[128];
.shared .align 4 .b8 shared_cache_0217[128];
.shared .align 4 .b8 shared_cache_0218[128];
.shared .align 4 .b8 shared_cache_1219[128];
.shared .align 4 .b8 shared_cache_0220[128];
.shared .align 4 .b8 shared_cache_1221[128];
.shared .align 4 .b8 shared_cache_0222[128];
.shared .align 4 .b8 shared_cache_0223[128];
.shared .align 4 .b8 shared_cache_1224[128];
.shared .align 4 .b8 shared_cache_0225[128];
.shared .align 4 .b8 shared_cache_1226[128];
.shared .align 4 .b8 shared_cache_0227[128];
.shared .align 4 .b8 shared_cache_0228[128];
.shared .align 4 .b8 shared_cache_1229[128];
.shared .align 4 .b8 shared_cache_0230[128];
.shared .align 4 .b8 shared_cache_1231[128];
.shared .align 4 .b8 shared_cache_0232[128];
.shared .align 4 .b8 shared_cache_0233[128];
.shared .align 4 .b8 shared_cache_1234[128];
.shared .align 4 .b8 shared_cache_0235[128];
.shared .align 4 .b8 shared_cache_1236[128];
.shared .align 4 .b8 shared_cache_0237[128];
.shared .align 4 .b8 shared_cache_0238[128];
.shared .align 4 .b8 shared_cache_1239[128];
.shared .align 4 .b8 shared_cache_0240[128];
.shared .align 4 .b8 shared_cache_1241[128];
.shared .align 4 .b8 shared_cache_0242[128];
.shared .align 4 .b8 shared_cache_0243[128];
.shared .align 4 .b8 shared_cache_1244[128];
.shared .align 4 .b8 shared_cache_0245[128];
.shared .align 4 .b8 shared_cache_1246[128];
.shared .align 4 .b8 shared_cache_0247[128];
.shared .align 4 .b8 shared_cache_0248[128];
.shared .align 4 .b8 shared_cache_1249[128];
.shared .align 4 .b8 shared_cache_0250[128];
.shared .align 4 .b8 shared_cache_1251[128];
.shared .align 4 .b8 shared_cache_0252[128];
.shared .align 4 .b8 shared_cache_0253[128];
.shared .align 4 .b8 shared_cache_1254[128];
.shared .align 4 .b8 shared_cache_0255[128];
.shared .align 4 .b8 shared_cache_1256[128];
.shared .align 4 .b8 shared_cache_0257[128];
.shared .align 4 .b8 shared_cache_0258[128];
.shared .align 4 .b8 shared_cache_1259[128];
.shared .align 4 .b8 shared_cache_0260[128];
.shared .align 4 .b8 shared_cache_1261[128];
.shared .align 4 .b8 shared_cache_0262[128];
.shared .align 4 .b8 shared_cache_0263[128];
.shared .align 4 .b8 shared_cache_1264[128];
.shared .align 4 .b8 shared_cache_0265[128];
.shared .align 4 .b8 shared_cache_1266[128];
.shared .align 4 .b8 shared_cache_0267[128];
.shared .align 4 .b8 shared_cache_0268[128];
.shared .align 4 .b8 shared_cache_1269[128];
.shared .align 4 .b8 shared_cache_0270[128];
.shared .align 4 .b8 shared_cache_1271[128];
.shared .align 4 .b8 shared_cache_0272[128];
.shared .align 4 .b8 shared_cache_0273[128];
.shared .align 4 .b8 shared_cache_1274[128];
.shared .align 4 .b8 shared_cache_0275[128];
.shared .align 4 .b8 shared_cache_1276[128];
.shared .align 4 .b8 shared_cache_0277[128];
.shared .align 4 .b8 shared_cache_0278[128];
.shared .align 4 .b8 shared_cache_1279[128];
.shared .align 4 .b8 shared_cache_0280[128];
.shared .align 4 .b8 shared_cache_1281[128];
.shared .align 4 .b8 shared_cache_0282[128];
.shared .align 4 .b8 shared_cache_0283[128];
.shared .align 4 .b8 shared_cache_1284[128];
.shared .align 4 .b8 shared_cache_0285[4224];
.shared .align 4 .b8 shared_cache_1286[4224];
.shared .align 4 .b8 shared_cache_2287[4224];
.shared .align 4 .b8 shared_cache_3288[4224];
.shared .align 4 .b8 shared_cache_0289[4224];
.shared .align 4 .b8 shared_cache_1290[4224];
.shared .align 4 .b8 shared_cache_2291[4224];
.shared .align 4 .b8 shared_cache_3292[4224];
.shared .align 4 .b8 shared_cache_0293[4224];
.shared .align 4 .b8 shared_cache_1294[4224];
.shared .align 4 .b8 shared_cache_2295[4224];
.shared .align 4 .b8 shared_cache_3296[4224];
.shared .align 4 .b8 shared_cache_0297[4224];
.shared .align 4 .b8 shared_cache_1298[4224];
.shared .align 4 .b8 shared_cache_2299[4224];
.shared .align 4 .b8 shared_cache_3300[4224];
.shared .align 4 .b8 shared_cache_0301[4224];
.shared .align 4 .b8 shared_cache_1302[4224];
.shared .align 4 .b8 shared_cache_2303[4224];
.shared .align 4 .b8 shared_cache_3304[4224];
.shared .align 4 .b8 shared_cache_0305[4224];
.shared .align 4 .b8 shared_cache_1306[4224];
.shared .align 4 .b8 shared_cache_2307[4224];
.shared .align 4 .b8 shared_cache_3308[4224];
.shared .align 4 .b8 shared_cache_0309[4224];
.shared .align 4 .b8 shared_cache_1310[4224];
.shared .align 4 .b8 shared_cache_2311[4224];
.shared .align 4 .b8 shared_cache_3312[4224];
.shared .align 4 .b8 shared_cache_0313[4224];
.shared .align 4 .b8 shared_cache_1314[4224];
.shared .align 4 .b8 shared_cache_2315[4224];
.shared .align 4 .b8 shared_cache_3316[4224];
.shared .align 4 .b8 shared_cache_0317[4224];
.shared .align 4 .b8 shared_cache_1318[4224];
.shared .align 4 .b8 shared_cache_2319[4224];
.shared .align 4 .b8 shared_cache_3320[4224];
.shared .align 4 .b8 shared_cache_0321[4224];
.shared .align 4 .b8 shared_cache_1322[4224];
.shared .align 4 .b8 shared_cache_2323[4224];
.shared .align 4 .b8 shared_cache_3324[4224];
.shared .align 4 .b8 shared_cache_0325[4224];
.shared .align 4 .b8 shared_cache_1326[4224];
.shared .align 4 .b8 shared_cache_2327[4224];
.shared .align 4 .b8 shared_cache_3328[4224];
.shared .align 4 .b8 shared_cache_0329[4224];
.shared .align 4 .b8 shared_cache_1330[4224];
.shared .align 4 .b8 shared_cache_2331[4224];
.shared .align 4 .b8 shared_cache_3332[4224];
.shared .align 4 .b8 shared_cache_0333[4224];
.shared .align 4 .b8 shared_cache_1334[4224];
.shared .align 4 .b8 shared_cache_2335[4224];
.shared .align 4 .b8 shared_cache_3336[4224];
.shared .align 4 .b8 shared_cache_0337[4224];
.shared .align 4 .b8 shared_cache_1338[4224];
.shared .align 4 .b8 shared_cache_2339[4224];
.shared .align 4 .b8 shared_cache_3340[4224];
.shared .align 4 .b8 shared_cache_0341[4224];
.shared .align 4 .b8 shared_cache_1342[4224];
.shared .align 4 .b8 shared_cache_2343[4224];
.shared .align 4 .b8 shared_cache_3344[4224];
.shared .align 4 .b8 shared_cache_0345[4224];
.shared .align 4 .b8 shared_cache_1346[4224];
.shared .align 4 .b8 shared_cache_2347[4224];
.shared .align 4 .b8 shared_cache_3348[4224];
.shared .align 4 .b8 shared_cache_0349[4224];
.shared .align 4 .b8 shared_cache_1350[4224];
.shared .align 4 .b8 shared_cache_2351[4224];
.shared .align 4 .b8 shared_cache_3352[4224];
.shared .align 4 .b8 shared_cache_0353[4224];
.shared .align 4 .b8 shared_cache_1354[4224];
.shared .align 4 .b8 shared_cache_2355[4224];
.shared .align 4 .b8 shared_cache_3356[4224];
.shared .align 4 .b8 shared_cache_0357[4224];
.shared .align 4 .b8 shared_cache_1358[4224];
.shared .align 4 .b8 shared_cache_2359[4224];
.shared .align 4 .b8 shared_cache_3360[4224];
.shared .align 4 .b8 shared_cache_0361[4224];
.shared .align 4 .b8 shared_cache_1362[4224];
.shared .align 4 .b8 shared_cache_2363[4224];
.shared .align 4 .b8 shared_cache_3364[4224];
.shared .align 4 .b8 shared_cache_0365[4224];
.shared .align 4 .b8 shared_cache_1366[4224];
.shared .align 4 .b8 shared_cache_2367[4224];
.shared .align 4 .b8 shared_cache_3368[4224];
.shared .align 4 .b8 shared_cache_0369[4224];
.shared .align 4 .b8 shared_cache_1370[4224];
.shared .align 4 .b8 shared_cache_2371[4224];
.shared .align 4 .b8 shared_cache_3372[4224];
.shared .align 4 .b8 shared_cache_0373[4224];
.shared .align 4 .b8 shared_cache_1374[4224];
.shared .align 4 .b8 shared_cache_2375[4224];
.shared .align 4 .b8 shared_cache_3376[4224];
.shared .align 4 .b8 shared_cache_0377[4224];
.shared .align 4 .b8 shared_cache_1378[4224];
.shared .align 4 .b8 shared_cache_2379[4224];
.shared .align 4 .b8 shared_cache_3380[4224];
.shared .align 4 .b8 shared_cache_0381[4224];
.shared .align 4 .b8 shared_cache_1382[4224];
.visible .global .align 64 .b8 buffer_for_constant_519[4];
.shared .align 4 .b8 shared_cache_0383[128];
.shared .align 4 .b8 shared_cache_0384[4224];
.shared .align 4 .b8 shared_cache_0385[4224];
.shared .align 4 .b8 shared_cache_0386[4224];
.shared .align 4 .b8 shared_cache_0387[4224];
.shared .align 4 .b8 shared_cache_0388[4224];
.shared .align 4 .b8 shared_cache_0389[128];
.shared .align 4 .b8 shared_cache_0390[4224];
.shared .align 4 .b8 shared_cache_0391[4224];
.shared .align 4 .b8 shared_cache_0392[4224];
.shared .align 4 .b8 shared_cache_0393[4224];
.shared .align 4 .b8 shared_cache_0394[4224];
.shared .align 4 .b8 shared_cache_0395[128];
.shared .align 4 .b8 shared_cache_0396[4224];
.shared .align 4 .b8 shared_cache_0397[4224];
.shared .align 4 .b8 shared_cache_0398[4224];
.shared .align 4 .b8 shared_cache_0399[4224];
.shared .align 4 .b8 shared_cache_0400[4224];
.shared .align 4 .b8 shared_cache_0401[128];
.shared .align 4 .b8 shared_cache_0402[4224];
.shared .align 4 .b8 shared_cache_0403[4224];
.shared .align 4 .b8 shared_cache_0404[4224];
.shared .align 4 .b8 shared_cache_0405[4224];
.shared .align 4 .b8 shared_cache_0406[4224];
.shared .align 4 .b8 shared_cache_0407[128];
.shared .align 4 .b8 shared_cache_0408[4224];
.shared .align 4 .b8 shared_cache_0409[4224];
.shared .align 4 .b8 shared_cache_0410[4224];
.shared .align 4 .b8 shared_cache_0411[4224];
.shared .align 4 .b8 shared_cache_0412[4224];
.shared .align 4 .b8 shared_cache_0413[128];
.shared .align 4 .b8 shared_cache_0414[4224];
.shared .align 4 .b8 shared_cache_0415[4224];
.shared .align 4 .b8 shared_cache_0416[4224];
.shared .align 4 .b8 shared_cache_0417[4224];
.shared .align 4 .b8 shared_cache_0418[4224];
.shared .align 4 .b8 shared_cache_0419[128];
.shared .align 4 .b8 shared_cache_0420[4224];
.shared .align 4 .b8 shared_cache_0421[4224];
.shared .align 4 .b8 shared_cache_0422[4224];
.shared .align 4 .b8 shared_cache_0423[4224];
.shared .align 4 .b8 shared_cache_0424[4224];
.shared .align 4 .b8 shared_cache_0425[128];
.shared .align 4 .b8 shared_cache_0426[4224];
.shared .align 4 .b8 shared_cache_0427[4224];
.shared .align 4 .b8 shared_cache_0428[4224];
.shared .align 4 .b8 shared_cache_0429[4224];
.shared .align 4 .b8 shared_cache_0430[4224];
.shared .align 4 .b8 shared_cache_0431[128];
.shared .align 4 .b8 shared_cache_0432[4224];
.shared .align 4 .b8 shared_cache_0433[4224];
.shared .align 4 .b8 shared_cache_0434[4224];
.shared .align 4 .b8 shared_cache_0435[4224];
.shared .align 4 .b8 shared_cache_0436[4224];
.shared .align 4 .b8 shared_cache_0437[128];
.shared .align 4 .b8 shared_cache_0438[4224];
.shared .align 4 .b8 shared_cache_0439[4224];
.shared .align 4 .b8 shared_cache_0440[4224];
.shared .align 4 .b8 shared_cache_0441[4224];
.shared .align 4 .b8 shared_cache_0442[4224];
.shared .align 4 .b8 shared_cache_0443[128];
.shared .align 4 .b8 shared_cache_0444[4224];
.shared .align 4 .b8 shared_cache_0445[4224];
.shared .align 4 .b8 shared_cache_0446[4224];
.shared .align 4 .b8 shared_cache_0447[4224];
.shared .align 4 .b8 shared_cache_0448[4224];
.shared .align 4 .b8 shared_cache_0449[128];
.shared .align 4 .b8 shared_cache_0450[4224];
.shared .align 4 .b8 shared_cache_0451[4224];
.shared .align 4 .b8 shared_cache_0452[4224];
.shared .align 4 .b8 shared_cache_0453[4224];
.shared .align 4 .b8 shared_cache_0454[4224];
.shared .align 4 .b8 shared_cache_0455[128];
.shared .align 4 .b8 shared_cache_0456[4224];
.shared .align 4 .b8 shared_cache_0457[4224];
.shared .align 4 .b8 shared_cache_0458[4224];
.shared .align 4 .b8 shared_cache_0459[4224];
.shared .align 4 .b8 shared_cache_0460[4224];
.shared .align 4 .b8 shared_cache_0461[128];
.shared .align 4 .b8 shared_cache_0462[4224];
.shared .align 4 .b8 shared_cache_0463[4224];
.shared .align 4 .b8 shared_cache_0464[4224];
.shared .align 4 .b8 shared_cache_0465[4224];
.shared .align 4 .b8 shared_cache_0466[4224];
.shared .align 4 .b8 shared_cache_0467[128];
.shared .align 4 .b8 shared_cache_0468[4224];
.shared .align 4 .b8 shared_cache_0469[4224];
.shared .align 4 .b8 shared_cache_0470[4224];
.shared .align 4 .b8 shared_cache_0471[4224];
.shared .align 4 .b8 shared_cache_0472[4224];
.shared .align 4 .b8 shared_cache_0473[128];
.shared .align 4 .b8 shared_cache_0474[4224];
.shared .align 4 .b8 shared_cache_0475[4224];
.shared .align 4 .b8 shared_cache_0476[4224];
.shared .align 4 .b8 shared_cache_0477[4224];
.shared .align 4 .b8 shared_cache_0478[4224];
.shared .align 4 .b8 shared_cache_0479[128];
.shared .align 4 .b8 shared_cache_0480[4224];
.shared .align 4 .b8 shared_cache_0481[4224];
.shared .align 4 .b8 shared_cache_0482[4224];
.shared .align 4 .b8 shared_cache_0483[4224];
.shared .align 4 .b8 shared_cache_0484[4224];
.shared .align 4 .b8 shared_cache_0485[128];
.shared .align 4 .b8 shared_cache_0486[4224];
.shared .align 4 .b8 shared_cache_0487[4224];
.shared .align 4 .b8 shared_cache_0488[4224];
.shared .align 4 .b8 shared_cache_0489[4224];
.shared .align 4 .b8 shared_cache_0490[4224];
.shared .align 4 .b8 shared_cache_0491[128];
.shared .align 4 .b8 shared_cache_0492[4224];
.shared .align 4 .b8 shared_cache_0493[4224];
.shared .align 4 .b8 shared_cache_0494[4224];
.shared .align 4 .b8 shared_cache_0495[4224];
.shared .align 4 .b8 shared_cache_0496[4224];
.shared .align 4 .b8 shared_cache_0497[128];
.shared .align 4 .b8 shared_cache_0498[4224];
.shared .align 4 .b8 shared_cache_0499[4224];
.shared .align 4 .b8 shared_cache_0500[4224];
.shared .align 4 .b8 shared_cache_0501[4224];
.shared .align 4 .b8 shared_cache_0502[4224];
.shared .align 4 .b8 shared_cache_0503[128];
.shared .align 4 .b8 shared_cache_0504[4224];
.shared .align 4 .b8 shared_cache_0505[4224];
.shared .align 4 .b8 shared_cache_0506[4224];
.shared .align 4 .b8 shared_cache_0507[4224];
.shared .align 4 .b8 shared_cache_0508[4224];
.shared .align 4 .b8 shared_cache_0509[128];
.shared .align 4 .b8 shared_cache_0510[4224];
.shared .align 4 .b8 shared_cache_0511[4224];
.shared .align 4 .b8 shared_cache_0512[4224];
.shared .align 4 .b8 shared_cache_0513[4224];
.shared .align 4 .b8 shared_cache_0514[4224];
.shared .align 4 .b8 shared_cache_0515[128];
.shared .align 4 .b8 shared_cache_0516[4224];
.shared .align 4 .b8 shared_cache_0517[4224];
.shared .align 4 .b8 shared_cache_0518[4224];
.shared .align 4 .b8 shared_cache_0519[4224];
.shared .align 4 .b8 shared_cache_0520[4224];
.shared .align 4 .b8 shared_cache_0521[128];
.shared .align 4 .b8 shared_cache_0522[4224];
.shared .align 4 .b8 shared_cache_0523[4224];
.shared .align 4 .b8 shared_cache_0524[4224];
.shared .align 4 .b8 shared_cache_0525[4224];
.shared .align 4 .b8 shared_cache_0526[4224];
.shared .align 4 .b8 shared_cache_0527[128];
// fusion_2287
//
// Launch shape: exactly 256 threads per block (.reqntid 256,1,1).
// Each thread writes four consecutive 16-bit values to param_0, starting at
// element index e = (tid.x << 2) | (ctaid.x << 10):
//   out[e+0] = 0x3C00 if (tid.x is even and idx == 0) else 0x0000
//   out[e+1] = 0x3C00 if (tid.x is even and idx == 1) else 0x0000
//   out[e+2] = out[e+3] = 0x0000
// where idx is the 32-bit word param_1[e >> 3].
// 0x3C00 is the IEEE-754 binary16 bit pattern for 1.0 (presumably the output
// is f16 -- the stores themselves are untyped .b16).
// param_2 is never read.
.visible .entry fusion_2287(
    .param .u64 fusion_2287_param_0,
    .param .u64 fusion_2287_param_1,
    .param .u64 fusion_2287_param_2
)
.reqntid 256, 1, 1
{
    .reg .pred %p<8>;
    .reg .b16 %h<10>;
    .reg .b32 %r<12>;
    .reg .b64 %rd<9>;

    // ---- Thread coordinates and flat output element index -----------------
    mov.u32 %r4, %tid.x;
    mov.u32 %r3, %ctaid.x;
    shl.b32 %r6, %r4, 2;                    // tid.x * 4
    shl.b32 %r5, %r3, 10;                   // ctaid.x * 1024
    or.b32 %r1, %r6, %r5;                   // e: first element written by this thread

    // ---- Guards for the two conditional elements --------------------------
    // %p4: tid.x is even (the xor with constant false leaves %p1 unchanged).
    and.b32 %r9, %r4, 1;
    setp.eq.b32 %p1, %r9, 1;
    mov.pred %p2, 0;
    xor.pred %p3, %p1, %p2;
    not.pred %p4, %p3;
    // %p6: ((tid.x*4) | 1) & 5 == 1, i.e. bit 2 of tid.x*4 is clear.
    or.b32 %r8, %r6, 1;
    and.b32 %r2, %r8, 5;
    setp.eq.s32 %p6, %r2, 1;

    // ---- Addresses --------------------------------------------------------
    // %rd2 = &param_1[e >> 3] (4-byte words)
    ld.param.u64 %rd5, [fusion_2287_param_1];
    cvta.to.global.u64 %rd6, %rd5;
    shr.u32 %r7, %r1, 3;
    mul.wide.u32 %rd7, %r7, 4;
    add.s64 %rd2, %rd6, %rd7;
    // %rd3 = &param_0[e] (2-byte elements)
    ld.param.u64 %rd4, [fusion_2287_param_0];
    cvta.to.global.u64 %rd1, %rd4;
    mul.wide.u32 %rd8, %r1, 2;
    add.s64 %rd3, %rd1, %rd8;

    // ---- Element 0: 0x3C00 when the guard holds and idx == 0 --------------
    mov.b16 %h8, 0x0000;
    @!%p4 bra F2287_STORE_ELEM0;
    ld.global.nc.u32 %r10, [%rd2];
    setp.eq.s32 %p5, %r10, 0;
    selp.b16 %h8, 0x3C00, 0x0000, %p5;
F2287_STORE_ELEM0:
    st.global.b16 [%rd3], %h8;

    // ---- Element 1: 0x3C00 when the guard holds and idx == 1 --------------
    mov.b16 %h9, 0x0000;
    @!%p6 bra F2287_STORE_ELEM1;
    ld.global.nc.u32 %r11, [%rd2];
    setp.eq.s32 %p7, %r11, 1;
    selp.b16 %h9, 0x3C00, 0x0000, %p7;
F2287_STORE_ELEM1:
    st.global.b16 [%rd3+2], %h9;

    // ---- Elements 2 and 3 are always zero ---------------------------------
    mov.b16 %h7, 0x0000;
    st.global.v2.b16 [%rd3+4], {%h7, %h7};
    ret;
}
// .globl fusion_2285
// fusion_2285
//
// Launch shape: exactly 256 threads per block (.reqntid 256,1,1).
// For block b = ctaid.x and thread t = tid.x:
//   row = clamp((s32) param_3[b], 0, 30521)
//   src = param_0 + row * 4096 bytes          (rows of f32 values)
//   for k in 0..3:
//     out[b*1024 + 4t + k] = f16(src[4t + k]) + f16(param_2[b*1024 + 4t + k])
// Both f32 inputs are rounded to f16 first and the sum is an f16 add.
// The four results are written to param_1 with one 8-byte vector store.
// param_4 is never read.
.visible .entry fusion_2285(
    .param .u64 fusion_2285_param_0,
    .param .u64 fusion_2285_param_1,
    .param .u64 fusion_2285_param_2,
    .param .u64 fusion_2285_param_3,
    .param .u64 fusion_2285_param_4
)
.reqntid 256, 1, 1
{
    .reg .b16 %h<13>;
    .reg .f32 %f<9>;
    .reg .b32 %r<11>;
    .reg .b64 %rd<23>;

    // ---- Kernel arguments as global-space pointers ------------------------
    ld.param.u64 %rd1, [fusion_2285_param_0];
    ld.param.u64 %rd4, [fusion_2285_param_1];
    ld.param.u64 %rd5, [fusion_2285_param_2];
    ld.param.u64 %rd2, [fusion_2285_param_3];
    cvta.to.global.u64 %rd8, %rd1;          // row table (f32)
    cvta.to.global.u64 %rd7, %rd4;          // output (16-bit elements)
    cvta.to.global.u64 %rd6, %rd5;          // addend (f32)
    cvta.to.global.u64 %rd3, %rd2;          // per-block row indices (32-bit)

    // ---- Thread coordinates -----------------------------------------------
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r2, %tid.x;
    shl.b32 %r3, %r1, 10;                   // ctaid.x * 1024
    shl.b32 %r4, %r2, 2;                    // 4t: first column handled by this thread
    or.b32 %r5, %r4, %r3;                   // flat element index
    or.b32 %r6, %r4, 1;                     // 4t + 1
    or.b32 %r7, %r4, 2;                     // 4t + 2

    // ---- Row lookup, clamped to [0, 30521] --------------------------------
    mul.wide.u32 %rd9, %r1, 4;
    add.s64 %rd10, %rd3, %rd9;
    ld.global.nc.u32 %r8, [%rd10];
    max.s32 %r9, %r8, 0;
    min.s32 %r10, %r9, 30521;
    mul.wide.u32 %rd11, %r10, 4096;         // 4096 bytes per row
    add.s64 %rd12, %rd8, %rd11;             // start of the selected row

    // ---- Element addresses ------------------------------------------------
    mul.wide.u32 %rd13, %r4, 4;
    add.s64 %rd14, %rd12, %rd13;            // &row[4t]
    mul.wide.u32 %rd19, %r6, 4;
    add.s64 %rd20, %rd12, %rd19;            // &row[4t + 1]
    mul.wide.u32 %rd21, %r7, 4;
    add.s64 %rd22, %rd12, %rd21;            // &row[4t + 2]
    mul.wide.u32 %rd15, %r5, 4;
    add.s64 %rd16, %rd6, %rd15;             // &addend[flat]
    mul.wide.u32 %rd17, %r5, 2;
    add.s64 %rd18, %rd7, %rd17;             // &out[flat]

    // ---- Loads (all through the read-only .nc path) -----------------------
    ld.global.nc.f32 %f1, [%rd14];
    ld.global.nc.f32 %f6, [%rd20];
    ld.global.nc.f32 %f7, [%rd22];
    ld.global.nc.f32 %f8, [%rd14+12];       // row[4t + 3]
    ld.global.nc.v4.f32 {%f2, %f3, %f4, %f5}, [%rd16];

    // ---- Column 4t --------------------------------------------------------
    cvt.rn.f16.f32 %h1, %f1;
    cvt.rn.f16.f32 %h2, %f2;
    add.rn.f16 %h3, %h1, %h2;

    // ---- Column 4t + 1 ----------------------------------------------------
    cvt.rn.f16.f32 %h4, %f6;
    cvt.rn.f16.f32 %h5, %f3;
    add.rn.f16 %h6, %h4, %h5;

    // ---- Column 4t + 2 ----------------------------------------------------
    cvt.rn.f16.f32 %h7, %f7;
    cvt.rn.f16.f32 %h8, %f4;
    add.rn.f16 %h9, %h7, %h8;

    // ---- Column 4t + 3 ----------------------------------------------------
    cvt.rn.f16.f32 %h10, %f8;
    cvt.rn.f16.f32 %h11, %f5;
    add.rn.f16 %h12, %h10, %h11;

    // ---- Single vector store of the four results --------------------------
    st.global.v4.b16 [%rd18], {%h3, %h6, %h9, %h12};
    ret;
}
// .globl fusion_2286
// fusion_2286
//
// Launch shape: exactly 256 threads per block (.reqntid 256,1,1).
// For block b = ctaid.x and thread t = tid.x, writes four consecutive 16-bit
// values to param_0 at element index (4t | b*1024):
//   b <  2 : out[k] = f16(src[b*1024 + 4t + k])  with src = param_1 (f32,
//            4096 bytes per block-row), for k = 0..3
//   b >= 2 : out[k] = 0x0000
// param_2 is never read.
.visible .entry fusion_2286(
    .param .u64 fusion_2286_param_0,
    .param .u64 fusion_2286_param_1,
    .param .u64 fusion_2286_param_2
)
.reqntid 256, 1, 1
{
    .reg .pred %p<4>;
    .reg .b16 %h<14>;
    .reg .f32 %f<5>;
    .reg .b32 %r<10>;
    .reg .b64 %rd<18>;

    // ---- Kernel arguments as global-space pointers ------------------------
    ld.param.u64 %rd7, [fusion_2286_param_1];
    cvta.to.global.u64 %rd1, %rd7;          // source (f32)
    ld.param.u64 %rd6, [fusion_2286_param_0];
    cvta.to.global.u64 %rd2, %rd6;          // destination (16-bit elements)

    // ---- Thread coordinates -----------------------------------------------
    mov.u32 %r4, %ctaid.x;
    mov.u32 %r5, %tid.x;
    shl.b32 %r6, %r4, 10;                   // ctaid.x * 1024
    shl.b32 %r7, %r5, 2;                    // 4t
    or.b32 %r1, %r7, %r6;                   // flat destination element index
    cvt.u64.u32 %rd3, %r4;                  // ctaid.x widened to 64 bits

    // ---- "Block row is inside the source" tests (both are ctaid.x < 2) ----
    setp.lt.u32 %p1, %r4, 2;
    cvt.u32.u64 %r8, %rd3;
    setp.lt.u32 %p2, %r8, 2;

    // ---- Source address of column 4t: src + ctaid.x*4096 + 4t*4 -----------
    mul.wide.u32 %rd8, %r4, 4096;
    add.s64 %rd9, %rd1, %rd8;
    mul.wide.u32 %rd10, %r7, 4;
    add.s64 %rd4, %rd9, %rd10;

    // ---- Destination address: dst + flat*2 --------------------------------
    mul.wide.u32 %rd11, %r1, 2;
    add.s64 %rd5, %rd2, %rd11;

    // ---- Element 0 --------------------------------------------------------
    mov.b16 %h11, 0x0000;
    @!%p1 bra F2286_STORE_ELEM0;
    ld.global.nc.f32 %f1, [%rd4];
    cvt.rn.f16.f32 %h11, %f1;
F2286_STORE_ELEM0:
    st.global.b16 [%rd5], %h11;

    // ---- Elements 1 and 2 -------------------------------------------------
    @!%p2 bra F2286_ZERO_ELEM1_2;
    or.b32 %r2, %r7, 1;                     // 4t + 1
    or.b32 %r3, %r7, 2;                     // 4t + 2
    shl.b64 %rd12, %rd3, 12;                // ctaid.x * 4096
    add.s64 %rd13, %rd1, %rd12;
    mul.wide.u32 %rd14, %r2, 4;
    add.s64 %rd15, %rd13, %rd14;
    ld.global.nc.f32 %f2, [%rd15];
    cvt.rn.f16.f32 %h9, %f2;
    st.global.b16 [%rd5+2], %h9;
    mul.wide.u32 %rd16, %r3, 4;
    add.s64 %rd17, %rd13, %rd16;
    ld.global.nc.f32 %f3, [%rd17];
    cvt.rn.f16.f32 %h12, %f3;
    bra.uni F2286_STORE_ELEM2;
F2286_ZERO_ELEM1_2:
    mov.b16 %h12, 0x0000;
    st.global.b16 [%rd5+2], %h12;
F2286_STORE_ELEM2:
    st.global.b16 [%rd5+4], %h12;

    // ---- Element 3 (source column 4t + 3, i.e. 12 bytes past column 4t) ---
    mov.b16 %h13, 0x0000;
    @!%p2 bra F2286_STORE_ELEM3;
    ld.global.nc.f32 %f4, [%rd4+12];
    cvt.rn.f16.f32 %h13, %f4;
F2286_STORE_ELEM3:
    st.global.b16 [%rd5+6], %h13;
    ret;
}
// .globl fusion_2283
// fusion_2283
//
// Launch shape: exactly 64 threads per block (.reqntid 64,1,1), i.e. two warps.
// Each block sums the 1024 16-bit floating-point values stored in the 2048
// bytes starting at param_0 + ctaid.x * 2048, accumulating in f32, and
// atomically adds the block total to the f32 at param_1[ctaid.x].
// param_2 is never read.
//
// Stages:
//   1. Per thread: 8 packed 32-bit loads (two halves each), 256 bytes apart,
//      converted to f32 and added sequentially.
//   2. Per warp: shuffle-down reduction (offsets 16, 8, 4, 2, 1); lane 0
//      writes the warp total to shared_cache_0[warp index].
//   3. After the block barrier, warp 0 reduces the per-warp totals with a
//      second shuffle-down pass; thread 0 issues the global atomic add.
.visible .entry fusion_2283(
.param .u64 fusion_2283_param_0,
.param .u64 fusion_2283_param_1,
.param .u64 fusion_2283_param_2
)
.reqntid 64, 1, 1
{
// 4-byte per-thread local slot; stage 3 zeroes it and uses it as the input
// for lanes that have no per-warp partial to read.
.local .align 4 .b8 __local_depot3[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b16 %h<17>;
.reg .b32 %hh<9>;
.reg .f32 %f<56>;
.reg .b32 %r<7>;
.reg .b64 %rd<22>;
// %SPL = local-space address of the slot, %SP = the same slot as a generic address.
mov.u64 %SPL, __local_depot3;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd5, [fusion_2283_param_0];
cvta.to.global.u64 %rd8, %rd5;
mov.u32 %r1, %tid.x;
mov.u32 %r4, %ctaid.x;
// %rd13 = param_0 + ctaid.x*2048 + tid.x*4 : this thread's first packed pair.
shl.b32 %r5, %r1, 1;
mul.wide.u32 %rd10, %r4, 2048;
add.s64 %rd11, %rd8, %rd10;
mul.wide.u32 %rd12, %r5, 2;
add.s64 %rd13, %rd11, %rd12;
// Stage 1: load a 32-bit word, split it into two 16-bit halves, widen each to
// f32 and add to the running sum. The first add starts the sum from 0.0.
ld.global.nc.b32 %hh1, [%rd13];
mov.b32 {%h1, %h2}, %hh1;
cvt.f32.f16 %f2, %h1;
add.rn.f32 %f3, %f2, 0f00000000;
cvt.f32.f16 %f4, %h2;
add.rn.f32 %f5, %f3, %f4;
ld.global.nc.b32 %hh2, [%rd13+256];
mov.b32 {%h3, %h4}, %hh2;
cvt.f32.f16 %f6, %h3;
add.rn.f32 %f7, %f5, %f6;
cvt.f32.f16 %f8, %h4;
add.rn.f32 %f9, %f7, %f8;
ld.global.nc.b32 %hh3, [%rd13+512];
mov.b32 {%h5, %h6}, %hh3;
cvt.f32.f16 %f10, %h5;
add.rn.f32 %f11, %f9, %f10;
cvt.f32.f16 %f12, %h6;
add.rn.f32 %f13, %f11, %f12;
ld.global.nc.b32 %hh4, [%rd13+768];
mov.b32 {%h7, %h8}, %hh4;
cvt.f32.f16 %f14, %h7;
add.rn.f32 %f15, %f13, %f14;
cvt.f32.f16 %f16, %h8;
add.rn.f32 %f17, %f15, %f16;
ld.global.nc.b32 %hh5, [%rd13+1024];
mov.b32 {%h9, %h10}, %hh5;
cvt.f32.f16 %f18, %h9;
add.rn.f32 %f19, %f17, %f18;
cvt.f32.f16 %f20, %h10;
add.rn.f32 %f21, %f19, %f20;
ld.global.nc.b32 %hh6, [%rd13+1280];
mov.b32 {%h11, %h12}, %hh6;
cvt.f32.f16 %f22, %h11;
add.rn.f32 %f23, %f21, %f22;
cvt.f32.f16 %f24, %h12;
add.rn.f32 %f25, %f23, %f24;
ld.global.nc.b32 %hh7, [%rd13+1536];
mov.b32 {%h13, %h14}, %hh7;
cvt.f32.f16 %f26, %h13;
add.rn.f32 %f27, %f25, %f26;
cvt.f32.f16 %f28, %h14;
add.rn.f32 %f29, %f27, %f28;
ld.global.nc.b32 %hh8, [%rd13+1792];
mov.b32 {%h15, %h16}, %hh8;
cvt.f32.f16 %f30, %h15;
add.rn.f32 %f31, %f29, %f30;
cvt.f32.f16 %f32, %h16;
add.rn.f32 %f33, %f31, %f32;
// Stage 2: warp-level shuffle-down reduction of the per-thread sums.
// %r2 = lane index within the warp (tid.x & 31). All 32 lanes take part
// (member mask -1).
and.b32 %r2, %r1, 31;
shfl.sync.down.b32 %f34, %f33, 16, 31, -1;
add.rn.f32 %f35, %f34, %f33;
shfl.sync.down.b32 %f36, %f35, 8, 31, -1;
add.rn.f32 %f37, %f36, %f35;
shfl.sync.down.b32 %f38, %f37, 4, 31, -1;
add.rn.f32 %f39, %f38, %f37;
shfl.sync.down.b32 %f40, %f39, 2, 31, -1;
add.rn.f32 %f41, %f40, %f39;
shfl.sync.down.b32 %f42, %f41, 1, 31, -1;
// %r3 = warp index within the block (tid.x >> 5).
shr.u32 %r3, %r1, 5;
setp.eq.s32 %p1, %r2, 0;
mov.u64 %rd16, shared_cache_0;
// Only lane 0 of each warp performs the final add and publishes the warp total.
@%p1 bra LBB3_3;
bra.uni LBB3_1;
LBB3_3:
// shared_cache_0[warp index] = warp total.
mul.wide.u32 %rd15, %r3, 4;
add.s64 %rd3, %rd16, %rd15;
add.rn.f32 %f1, %f42, %f41;
st.shared.f32 [%rd3], %f1;
LBB3_1:
// Block-wide barrier: every warp's partial must be in shared memory before
// warp 0 reads them.
bar.sync 0;
// Stage 3 runs only in warp 0; every other thread returns.
setp.eq.s32 %p2, %r3, 0;
@%p2 bra LBB3_4;
bra.uni LBB3_2;
LBB3_4:
add.u64 %rd9, %SP, 0;
add.u64 %rd1, %SPL, 0;
// %rd4 = &shared_cache_0[lane]; %rd19 is that slot as a generic address.
mul.wide.u32 %rd17, %r2, 4;
add.s64 %rd4, %rd16, %rd17;
cvta.shared.u64 %rd19, %rd4;
// Zero the local slot so lanes without a partial contribute 0.
mov.u32 %r6, 0;
st.local.u32 [%rd1], %r6;
// Threads 0 and 1 (one per warp in a 64-thread block) read their shared
// partial; every other lane reads the zeroed local slot. The load goes
// through a generic address because the source space differs per lane.
setp.lt.u32 %p3, %r1, 2;
selp.b64 %rd21, %rd19, %rd9, %p3;
ld.f32 %f43, [%rd21];
shfl.sync.down.b32 %f44, %f43, 16, 31, -1;
add.rn.f32 %f45, %f43, %f44;
shfl.sync.down.b32 %f46, %f45, 8, 31, -1;
add.rn.f32 %f47, %f45, %f46;
shfl.sync.down.b32 %f48, %f47, 4, 31, -1;
add.rn.f32 %f49, %f47, %f48;
shfl.sync.down.b32 %f50, %f49, 2, 31, -1;
add.rn.f32 %f51, %f49, %f50;
shfl.sync.down.b32 %f52, %f51, 1, 31, -1;
add.rn.f32 %f53, %f51, %f52;
// Each lane writes its reduced value back to the slot it loaded from; for
// thread 0 that is shared_cache_0[0], which now holds the block total.
st.f32 [%rd21], %f53;
// Only thread 0 publishes the result.
setp.ne.s32 %p4, %r1, 0;
@%p4 bra LBB3_2;
ld.param.u64 %rd6, [fusion_2283_param_1];
cvta.to.global.u64 %rd7, %rd6;
// %rd2 = &param_1[ctaid.x] (f32).
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd2, %rd7, %rd14;
// Re-read the block total from shared memory and add it atomically to the
// output; the value returned by the atomic (%f55) is not used.
ld.shared.f32 %f54, [%rd4];
atom.global.add.f32 %f55, [%rd2], %f54;
LBB3_2:
ret;
}
// .globl fusion_2281
// fusion_2281: per-CTA sum of squared deviations, accumulated into global memory.
//
// Launch shape: exactly 64 threads per CTA (.reqntid 64,1,1), i.e. two warps.
// Parameters (as used by the code below):
//   param_0 : base of an f16 array; CTA c reads the 2048 bytes at offset c*2048
//             (64 threads x 2 halves x 8 steps = 1024 f16 values).
//   param_1 : base of an f32 array; thread 0 atomically adds the CTA result
//             to element [ctaid.x].
//   param_2 : base of an f32 array; element [ctaid.x] is scaled by 2^-10
//             (0f3A800000) and subtracted from every input value.
//             NOTE(review): presumably a row sum turned into a mean over
//             1024 elements -- confirm against the producer kernel.
//   param_3 : not referenced in this kernel.
// Scratch: shared_cache_01 (one f32 slot per warp) and a 4-byte local slot.
.visible .entry fusion_2281(
.param .u64 fusion_2281_param_0,
.param .u64 fusion_2281_param_1,
.param .u64 fusion_2281_param_2,
.param .u64 fusion_2281_param_3
)
.reqntid 64, 1, 1
{
.local .align 4 .b8 __local_depot4[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b16 %h<17>;
.reg .b32 %hh<9>;
.reg .f32 %f<90>;
.reg .b32 %r<7>;
.reg .b64 %rd<25>;
// %SPL = local-space address of the 4-byte depot, %SP = its generic address.
mov.u64 %SPL, __local_depot4;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd5, [fusion_2281_param_0];
ld.param.u64 %rd6, [fusion_2281_param_2];
cvta.to.global.u64 %rd7, %rd6;
cvta.to.global.u64 %rd10, %rd5;
// %r1 = tid.x, %r4 = ctaid.x, %r5 = 2*tid (first f16 element handled by this thread).
mov.u32 %r1, %tid.x;
mov.u32 %r4, %ctaid.x;
shl.b32 %r5, %r1, 1;
// %rd15 = param_0 + ctaid*2048 + tid*4 : this thread's first packed f16 pair.
mul.wide.u32 %rd12, %r4, 2048;
add.s64 %rd13, %rd10, %rd12;
mul.wide.u32 %rd14, %r5, 2;
add.s64 %rd15, %rd13, %rd14;
// Step 0: load two halves, widen to f32.
ld.global.nc.b32 %hh1, [%rd15];
mov.b32 {%h1, %h2}, %hh1;
cvt.f32.f16 %f2, %h1;
// %f4 = param_2[ctaid] * 2^-10; %rd16 (= ctaid*4) is reused for the output address at the end.
mul.wide.u32 %rd16, %r4, 4;
add.s64 %rd17, %rd7, %rd16;
ld.global.nc.f32 %f3, [%rd17];
mul.rn.f32 %f4, %f3, 0f3A800000;
// Accumulate (x - %f4)^2 in f32, starting from 0.0.
sub.rn.f32 %f5, %f2, %f4;
mul.rn.f32 %f6, %f5, %f5;
add.rn.f32 %f7, %f6, 0f00000000;
cvt.f32.f16 %f8, %h2;
sub.rn.f32 %f9, %f8, %f4;
mul.rn.f32 %f10, %f9, %f9;
add.rn.f32 %f11, %f7, %f10;
// Steps 1..7: same pattern, advancing 256 bytes (64 threads x 4 bytes) per step.
ld.global.nc.b32 %hh2, [%rd15+256];
mov.b32 {%h3, %h4}, %hh2;
cvt.f32.f16 %f12, %h3;
sub.rn.f32 %f13, %f12, %f4;
mul.rn.f32 %f14, %f13, %f13;
add.rn.f32 %f15, %f11, %f14;
cvt.f32.f16 %f16, %h4;
sub.rn.f32 %f17, %f16, %f4;
mul.rn.f32 %f18, %f17, %f17;
add.rn.f32 %f19, %f15, %f18;
ld.global.nc.b32 %hh3, [%rd15+512];
mov.b32 {%h5, %h6}, %hh3;
cvt.f32.f16 %f20, %h5;
sub.rn.f32 %f21, %f20, %f4;
mul.rn.f32 %f22, %f21, %f21;
add.rn.f32 %f23, %f19, %f22;
cvt.f32.f16 %f24, %h6;
sub.rn.f32 %f25, %f24, %f4;
mul.rn.f32 %f26, %f25, %f25;
add.rn.f32 %f27, %f23, %f26;
ld.global.nc.b32 %hh4, [%rd15+768];
mov.b32 {%h7, %h8}, %hh4;
cvt.f32.f16 %f28, %h7;
sub.rn.f32 %f29, %f28, %f4;
mul.rn.f32 %f30, %f29, %f29;
add.rn.f32 %f31, %f27, %f30;
cvt.f32.f16 %f32, %h8;
sub.rn.f32 %f33, %f32, %f4;
mul.rn.f32 %f34, %f33, %f33;
add.rn.f32 %f35, %f31, %f34;
ld.global.nc.b32 %hh5, [%rd15+1024];
mov.b32 {%h9, %h10}, %hh5;
cvt.f32.f16 %f36, %h9;
sub.rn.f32 %f37, %f36, %f4;
mul.rn.f32 %f38, %f37, %f37;
add.rn.f32 %f39, %f35, %f38;
cvt.f32.f16 %f40, %h10;
sub.rn.f32 %f41, %f40, %f4;
mul.rn.f32 %f42, %f41, %f41;
add.rn.f32 %f43, %f39, %f42;
ld.global.nc.b32 %hh6, [%rd15+1280];
mov.b32 {%h11, %h12}, %hh6;
cvt.f32.f16 %f44, %h11;
sub.rn.f32 %f45, %f44, %f4;
mul.rn.f32 %f46, %f45, %f45;
add.rn.f32 %f47, %f43, %f46;
cvt.f32.f16 %f48, %h12;
sub.rn.f32 %f49, %f48, %f4;
mul.rn.f32 %f50, %f49, %f49;
add.rn.f32 %f51, %f47, %f50;
ld.global.nc.b32 %hh7, [%rd15+1536];
mov.b32 {%h13, %h14}, %hh7;
cvt.f32.f16 %f52, %h13;
sub.rn.f32 %f53, %f52, %f4;
mul.rn.f32 %f54, %f53, %f53;
add.rn.f32 %f55, %f51, %f54;
cvt.f32.f16 %f56, %h14;
sub.rn.f32 %f57, %f56, %f4;
mul.rn.f32 %f58, %f57, %f57;
add.rn.f32 %f59, %f55, %f58;
ld.global.nc.b32 %hh8, [%rd15+1792];
mov.b32 {%h15, %h16}, %hh8;
cvt.f32.f16 %f60, %h15;
sub.rn.f32 %f61, %f60, %f4;
mul.rn.f32 %f62, %f61, %f61;
add.rn.f32 %f63, %f59, %f62;
cvt.f32.f16 %f64, %h16;
sub.rn.f32 %f65, %f64, %f4;
mul.rn.f32 %f66, %f65, %f65;
add.rn.f32 %f67, %f63, %f66;
// Stage 1: intra-warp tree reduction of %f67 using shuffle-down with
// offsets 16, 8, 4, 2, 1 (member mask -1 = all 32 lanes). %r2 = tid & 31.
and.b32 %r2, %r1, 31;
shfl.sync.down.b32 %f68, %f67, 16, 31, -1;
add.rn.f32 %f69, %f68, %f67;
shfl.sync.down.b32 %f70, %f69, 8, 31, -1;
add.rn.f32 %f71, %f70, %f69;
shfl.sync.down.b32 %f72, %f71, 4, 31, -1;
add.rn.f32 %f73, %f72, %f71;
shfl.sync.down.b32 %f74, %f73, 2, 31, -1;
add.rn.f32 %f75, %f74, %f73;
shfl.sync.down.b32 %f76, %f75, 1, 31, -1;
// %r3 = tid >> 5 (warp index within the CTA); %p1 is true for lane 0 of each warp.
shr.u32 %r3, %r1, 5;
setp.eq.s32 %p1, %r2, 0;
mov.u64 %rd19, shared_cache_01;
@%p1 bra LBB4_3;
bra.uni LBB4_1;
LBB4_3:
// Lane 0 only: finish the last add and publish the warp total to shared_cache_01[warp].
mul.wide.u32 %rd18, %r3, 4;
add.s64 %rd3, %rd19, %rd18;
add.rn.f32 %f1, %f76, %f75;
st.shared.f32 [%rd3], %f1;
LBB4_1:
// All 64 threads rendezvous so both warp totals are visible in shared memory.
bar.sync 0;
// Stage 2 is executed by warp 0 only (tid < 32); warp 1 exits.
setp.eq.s32 %p2, %r3, 0;
@%p2 bra LBB4_4;
bra.uni LBB4_2;
LBB4_4:
add.u64 %rd11, %SP, 0;
add.u64 %rd1, %SPL, 0;
// %rd4 = &shared_cache_01[lane]; %rd22 = the same slot as a generic address.
mul.wide.u32 %rd20, %r2, 4;
add.s64 %rd4, %rd19, %rd20;
cvta.shared.u64 %rd22, %rd4;
// The local slot is seeded with 0 (bit pattern of 0.0f, the additive identity).
mov.u32 %r6, 0;
st.local.u32 [%rd1], %r6;
// Threads 0 and 1 (one per warp of the 64-thread CTA) read their shared slot;
// every other lane reads the zero in its private local slot instead.
setp.lt.u32 %p3, %r1, 2;
selp.b64 %rd24, %rd22, %rd11, %p3;
ld.f32 %f77, [%rd24];
// Second shuffle-down tree over warp 0 combines the per-warp totals.
shfl.sync.down.b32 %f78, %f77, 16, 31, -1;
add.rn.f32 %f79, %f77, %f78;
shfl.sync.down.b32 %f80, %f79, 8, 31, -1;
add.rn.f32 %f81, %f79, %f80;
shfl.sync.down.b32 %f82, %f81, 4, 31, -1;
add.rn.f32 %f83, %f81, %f82;
shfl.sync.down.b32 %f84, %f83, 2, 31, -1;
add.rn.f32 %f85, %f83, %f84;
shfl.sync.down.b32 %f86, %f85, 1, 31, -1;
add.rn.f32 %f87, %f85, %f86;
// Each lane writes its result back through the address it loaded from;
// for thread 0 that is shared_cache_01[0], which is re-read below.
st.f32 [%rd24], %f87;
// Only thread 0 commits the CTA total.
setp.ne.s32 %p4, %r1, 0;
@%p4 bra LBB4_2;
ld.param.u64 %rd8, [fusion_2281_param_1];
cvta.to.global.u64 %rd9, %rd8;
// %rd2 = param_1 + ctaid*4 (%rd16 was computed near the top of the kernel).
add.s64 %rd2, %rd9, %rd16;
ld.shared.f32 %f88, [%rd4];
// Atomic f32 add into global memory; the returned old value (%f89) is unused.
atom.global.add.f32 %f89, [%rd2], %f88;
LBB4_2:
ret;
}
// .globl rng_get_and_update_state
// rng_get_and_update_state: snapshot the 128-bit value held in the global
// `rng_state` and advance it by 524288 (2^19).
//
// Parameters:
//   param_0 : pointer to a 16-byte output buffer; receives the value of
//             rng_state *before* the increment (low word at +0, high at +8).
//   param_1 : not referenced in this kernel.
//
// The update is a plain load / add / store sequence, not an atomic.
// NOTE(review): presumably launched with a single thread -- confirm at the
// launch site.
//
// The 128-bit add uses the carry chain (add.cc / addc) instead of deriving
// the carry from unsigned compares. The compare form tested the same carry
// twice (sum < old_low and sum < 524288 are equivalent for a wrapping add),
// so the stored values are unchanged.
.visible .entry rng_get_and_update_state(
.param .u64 rng_get_and_update_state_param_0,
.param .u64 rng_get_and_update_state_param_1
)
{
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [rng_get_and_update_state_param_0];
cvta.to.global.u64 %rd2, %rd1;
// %rd3 = old high word, %rd4 = old low word.
ld.global.u64 %rd3, [rng_state+8];
ld.global.u64 %rd4, [rng_state];
// {%rd6:%rd5} = {%rd3:%rd4} + 524288; the carry-out of the low add feeds the high add.
// The two instructions are kept adjacent so nothing clobbers the carry flag in between.
add.cc.u64 %rd5, %rd4, 524288;
addc.u64 %rd6, %rd3, 0;
// Publish the advanced state, then hand the previous state to the caller.
st.global.u64 [rng_state], %rd5;
st.global.u64 [rng_state+8], %rd6;
st.global.u64 [%rd2+8], %rd3;
st.global.u64 [%rd2], %rd4;
ret;
}
// .globl fusion_2278
// fusion_2278: per-element affine normalisation of f16 data followed by a
// random keep/zero mask, four consecutive elements per thread.
//
// Launch shape: 256 threads per CTA (.reqntid 256,1,1). Thread t of CTA c
// handles linear elements c*1024 + 4*t .. +3.
// Parameters (as used by the code below):
//   param_0 : f16 output, written as one v4.b16 store per thread.
//   param_1 : f16 input, read as one v4.b16 load per thread.
//   param_2 : two u64 words {lo, hi}, combined with the thread index to form
//             the RNG input.
//   param_3 : f32, indexed by ctaid; x*2^-10 + 0f2B8CBCCC (about 1e-12) is
//             fed to rsqrt. NOTE(review): presumably a variance sum plus
//             epsilon -- confirm.
//   param_4 : f32, indexed by ctaid; scaled by 2^-10.
//             NOTE(review): presumably a sum turned into a mean -- confirm.
//   param_5 : f32, indexed by 4*t+k; additive term.
//   param_6 : f32, indexed by 4*t+k; multiplicative term.
//   param_7 : not referenced in this kernel.
// For each element k:
//   s = rsqrt(...) * param_6[4t+k]
//   y = f16( s*x + (param_5[4t+k] - s * param_4[c]*2^-10) ) * 0x3C72 (f16, about 1.111)
//   out = (u_k >= 0x2E66 (f16, about 0.1)) ? y : 0
// where u_k is a value in [0,1) derived from 23 random bits.
// NOTE(review): the multipliers 3528531795 (0xD2511F53) and 3449720151
// (0xCD9E8D57) and the first two xor constants 2654435769 (0x9E3779B9) /
// 3144134277 (0xBB67AE85) match the published Philox4x32 constants, so the
// integer section is presumably an unrolled Philox generator -- confirm
// against the emitting compiler.
.visible .entry fusion_2278(
.param .u64 fusion_2278_param_0,
.param .u64 fusion_2278_param_1,
.param .u64 fusion_2278_param_2,
.param .u64 fusion_2278_param_3,
.param .u64 fusion_2278_param_4,
.param .u64 fusion_2278_param_5,
.param .u64 fusion_2278_param_6,
.param .u64 fusion_2278_param_7
)
.reqntid 256, 1, 1
{
.reg .pred %p<6>;
.reg .b16 %h<27>;
.reg .b32 %hh<3>;
.reg .f32 %f<47>;
.reg .b32 %r<31>;
.reg .b64 %rd<138>;
// Global base pointers: %rd14=param_0, %rd13=param_1, %rd12=param_2,
// %rd11=param_3, %rd9=param_4, %rd6=param_5, %rd3=param_6.
ld.param.u64 %rd1, [fusion_2278_param_0];
ld.param.u64 %rd2, [fusion_2278_param_6];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2278_param_1];
ld.param.u64 %rd5, [fusion_2278_param_5];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2278_param_2];
ld.param.u64 %rd8, [fusion_2278_param_4];
cvta.to.global.u64 %rd9, %rd8;
ld.param.u64 %rd10, [fusion_2278_param_3];
cvta.to.global.u64 %rd11, %rd10;
cvta.to.global.u64 %rd12, %rd7;
cvta.to.global.u64 %rd13, %rd4;
cvta.to.global.u64 %rd14, %rd1;
// %r4 = 4*tid, %r5 = ctaid*1024 + 4*tid (linear element index),
// %r6/%r7 = 4*tid+1 / 4*tid+2, %r8 = %r5 >> 2 (one RNG index per thread).
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
shr.u32 %r8, %r5, 2;
// 128-bit add: {%rd22:%rd18} = {hi:lo from param_2} + %r8, with carry into the high word.
ld.global.nc.v2.u64 {%rd15, %rd16}, [%rd12];
cvt.u64.u32 %rd17, %r8;
add.s64 %rd18, %rd15, %rd17;
setp.lt.u64 %p1, %rd18, %rd15;
and.b64 %rd19, %rd18, 4294967295;
mul.lo.s64 %rd20, %rd19, 3528531795;
selp.u64 %rd21, 1, 0, %p1;
add.s64 %rd22, %rd16, %rd21;
// Unrolled mixing rounds. Each round multiplies a 32-bit word by one of the
// two multipliers into a 64-bit product, then xors the product's high half
// (shr 32) / low half (and 0xFFFFFFFF) with another word and a round constant.
xor.b64 %rd23, %rd22, %rd20;
shr.u64 %rd24, %rd23, 32;
mul.lo.s64 %rd25, %rd24, 3449720151;
shr.u64 %rd26, %rd25, 32;
and.b64 %rd27, %rd22, 4294967295;
mul.lo.s64 %rd28, %rd27, 3449720151;
and.b64 %rd29, %rd28, 4294967295;
xor.b64 %rd30, %rd29, %rd26;
xor.b64 %rd31, %rd30, 2654435769;
mul.lo.s64 %rd32, %rd31, 3528531795;
shr.u64 %rd33, %rd32, 32;
xor.b64 %rd34, %rd28, %rd18;
shr.u64 %rd35, %rd34, 32;
mul.lo.s64 %rd36, %rd35, 3528531795;
and.b64 %rd37, %rd36, 4294967295;
xor.b64 %rd38, %rd37, %rd33;
xor.b64 %rd39, %rd38, 1993301258;
mul.lo.s64 %rd40, %rd39, 3449720151;
shr.u64 %rd41, %rd40, 32;
shr.u64 %rd42, %rd36, 32;
and.b64 %rd43, %rd20, 4294967295;
xor.b64 %rd44, %rd43, %rd42;
xor.b64 %rd45, %rd44, 3144134277;
mul.lo.s64 %rd46, %rd45, 3449720151;
and.b64 %rd47, %rd46, 4294967295;
xor.b64 %rd48, %rd47, %rd41;
xor.b64 %rd49, %rd48, 3668340011;
mul.lo.s64 %rd50, %rd49, 3528531795;
shr.u64 %rd51, %rd50, 32;
shr.u64 %rd52, %rd46, 32;
and.b64 %rd53, %rd25, 4294967295;
xor.b64 %rd54, %rd53, %rd52;
xor.b64 %rd55, %rd54, 1013904242;
mul.lo.s64 %rd56, %rd55, 3528531795;
and.b64 %rd57, %rd56, 4294967295;
xor.b64 %rd58, %rd57, %rd51;
xor.b64 %rd59, %rd58, 3986602516;
mul.lo.s64 %rd60, %rd59, 3449720151;
shr.u64 %rd61, %rd60, 32;
shr.u64 %rd62, %rd56, 32;
and.b64 %rd63, %rd32, 4294967295;
xor.b64 %rd64, %rd63, %rd62;
xor.b64 %rd65, %rd64, 842468239;
mul.lo.s64 %rd66, %rd65, 3449720151;
and.b64 %rd67, %rd66, 4294967295;
xor.b64 %rd68, %rd67, %rd61;
xor.b64 %rd69, %rd68, 387276957;
mul.lo.s64 %rd70, %rd69, 3528531795;
shr.u64 %rd71, %rd70, 32;
shr.u64 %rd72, %rd66, 32;
and.b64 %rd73, %rd40, 4294967295;
xor.b64 %rd74, %rd73, %rd72;
xor.b64 %rd75, %rd74, 2027808484;
mul.lo.s64 %rd76, %rd75, 3528531795;
and.b64 %rd77, %rd76, 4294967295;
shr.u64 %rd78, %rd76, 32;
and.b64 %rd79, %rd50, 4294967295;
xor.b64 %rd80, %rd79, %rd78;
xor.b64 %rd81, %rd80, 2835769497;
mul.lo.s64 %rd82, %rd81, 3449720151;
and.b64 %rd83, %rd82, 4294967295;
shr.u64 %rd84, %rd82, 32;
and.b64 %rd85, %rd60, 4294967295;
xor.b64 %rd86, %rd85, %rd84;
xor.b64 %rd87, %rd86, 3041712726;
mul.lo.s64 %rd88, %rd87, 3528531795;
and.b64 %rd89, %rd88, 4294967295;
xor.b64 %rd90, %rd77, %rd71;
xor.b64 %rd91, %rd90, 1684936478;
mul.lo.s64 %rd92, %rd91, 3449720151;
shr.u64 %rd93, %rd92, 32;
xor.b64 %rd94, %rd83, %rd93;
xor.b64 %rd95, %rd94, 1401181199;
mul.lo.s64 %rd96, %rd95, 3528531795;
shr.u64 %rd97, %rd96, 32;
xor.b64 %rd98, %rd89, %rd97;
xor.b64 %rd99, %rd98, 3678237736;
mul.lo.s64 %rd100, %rd99, 3449720151;
shr.u64 %rd101, %rd100, 32;
// Random word #0 -> keep predicate %p2.
// The top 23 bits (shr 9) are converted to f32 and scaled by 2^-23
// (0f34000000), giving a value in [0,1); it is rounded to f16 and compared
// against the f16 constant 0x2E66 (about 0.1).
cvt.u32.u64 %r9, %rd101;
shr.u64 %rd102, %rd88, 32;
xor.b64 %rd103, %rd102, %rd70;
cvt.u32.u64 %r10, %rd103;
xor.b32 %r11, %r10, 534103459;
mul.lo.s32 %r12, %r11, -845247145;
xor.b32 %r13, %r12, %r9;
shr.u32 %r14, %r13, 9;
xor.b32 %r15, %r14, 4716963;
cvt.rn.f32.u32 %f1, %r15;
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h1, %f2;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p2, %h1, %h2;
// Load this thread's four f16 inputs from param_1 at byte offset 2*%r5.
// The pack/unpack pairs below leave %h7..%h10 equal to %h3..%h6.
mul.wide.u32 %rd104, %r5, 2;
add.s64 %rd105, %rd13, %rd104;
ld.global.nc.v4.b16 {%h3, %h4, %h5, %h6}, [%rd105];
mov.b32 %hh1, {%h5, %h6};
mov.b32 %hh2, {%h3, %h4};
mov.b32 {%h7, %h8}, %hh2;
mov.b32 {%h9, %h10}, %hh1;
// Element 0.
// %f7  = rsqrt(param_3[ctaid]*2^-10 + 0f2B8CBCCC), shared by all four elements.
// %f13 = param_4[ctaid]*2^-10, shared by all four elements.
cvt.f32.f16 %f3, %h7;
mul.wide.u32 %rd106, %r1, 4;
add.s64 %rd107, %rd11, %rd106;
ld.global.nc.f32 %f4, [%rd107];
mul.rn.f32 %f5, %f4, 0f3A800000;
add.rn.f32 %f6, %f5, 0f2B8CBCCC;
rsqrt.approx.f32 %f7, %f6;
mul.wide.u32 %rd108, %r4, 4;
add.s64 %rd109, %rd3, %rd108;
ld.global.nc.f32 %f8, [%rd109];
mul.rn.f32 %f9, %f7, %f8;
mul.rn.f32 %f10, %f9, %f3;
add.s64 %rd110, %rd6, %rd108;
ld.global.nc.f32 %f11, [%rd110];
add.s64 %rd111, %rd9, %rd106;
ld.global.nc.f32 %f12, [%rd111];
mul.rn.f32 %f13, %f12, 0f3A800000;
mul.rn.f32 %f14, %f9, %f13;
sub.rn.f32 %f15, %f11, %f14;
add.rn.f32 %f16, %f10, %f15;
cvt.rn.f16.f32 %h11, %f16;
// Scale by the f16 constant 0x3C72 (about 1.111) and zero the value when %p2 is false.
mov.b16 %h12, 0x3C72;
mul.rn.f16 %h13, %h11, %h12;
selp.b16 %h14, %h13, 0x0000, %p2;
// %rd112 = output address: param_0 + 2*%r5.
add.s64 %rd112, %rd14, %rd104;
// Random word #1 -> keep predicate %p3.
xor.b64 %rd113, %rd60, %rd84;
xor.b64 %rd114, %rd113, 3041712726;
mul.lo.s64 %rd115, %rd114, 3528531795;
xor.b64 %rd116, %rd97, %rd115;
cvt.u32.u64 %r16, %rd116;
xor.b32 %r17, %r16, -616729560;
mul.lo.s32 %r18, %r17, -845247145;
shr.u32 %r19, %r18, 9;
cvt.rn.f32.u32 %f17, %r19;
mul.rn.f32 %f18, %f17, 0f34000000;
cvt.rn.f16.f32 %h15, %f18;
setp.ge.f16 %p3, %h15, %h2;
// Element 1 (per-column terms at index 4*tid+1).
cvt.f32.f16 %f19, %h8;
mul.wide.u32 %rd117, %r6, 4;
add.s64 %rd118, %rd3, %rd117;
ld.global.nc.f32 %f20, [%rd118];
mul.rn.f32 %f21, %f7, %f20;
mul.rn.f32 %f22, %f21, %f19;
add.s64 %rd119, %rd6, %rd117;
ld.global.nc.f32 %f23, [%rd119];
mul.rn.f32 %f24, %f13, %f21;
sub.rn.f32 %f25, %f23, %f24;
add.rn.f32 %f26, %f22, %f25;
cvt.rn.f16.f32 %h16, %f26;
mul.rn.f16 %h17, %h16, %h12;
selp.b16 %h18, %h17, 0x0000, %p3;
// Random word #2 -> keep predicate %p4.
and.b64 %rd120, %rd92, 4294967295;
and.b64 %rd121, %rd70, 4294967295;
xor.b64 %rd122, %rd121, %rd102;
xor.b64 %rd123, %rd122, 534103459;
mul.lo.s64 %rd124, %rd123, 3449720151;
shr.u64 %rd125, %rd124, 32;
xor.b64 %rd126, %rd120, %rd125;
xor.b64 %rd127, %rd126, 4055616968;
mul.lo.s64 %rd128, %rd127, 3528531795;
shr.u64 %rd129, %rd128, 32;
cvt.u32.u64 %r20, %rd129;
xor.b64 %rd130, %rd93, %rd82;
cvt.u32.u64 %r21, %rd130;
xor.b32 %r22, %r21, 1401181199;
mul.lo.s32 %r23, %r22, -766435501;
xor.b32 %r24, %r23, %r20;
shr.u32 %r25, %r24, 9;
xor.b32 %r26, %r25, 4936337;
cvt.rn.f32.u32 %f27, %r26;
mul.rn.f32 %f28, %f27, 0f34000000;
cvt.rn.f16.f32 %h19, %f28;
setp.ge.f16 %p4, %h19, %h2;
// Element 2 (per-column terms at index 4*tid+2).
cvt.f32.f16 %f29, %h9;
mul.wide.u32 %rd131, %r7, 4;
add.s64 %rd132, %rd3, %rd131;
ld.global.nc.f32 %f30, [%rd132];
mul.rn.f32 %f31, %f7, %f30;
mul.rn.f32 %f32, %f31, %f29;
add.s64 %rd133, %rd6, %rd131;
ld.global.nc.f32 %f33, [%rd133];
mul.rn.f32 %f34, %f13, %f31;
sub.rn.f32 %f35, %f33, %f34;
add.rn.f32 %f36, %f32, %f35;
cvt.rn.f16.f32 %h20, %f36;
mul.rn.f16 %h21, %h20, %h12;
selp.b16 %h22, %h21, 0x0000, %p4;
// Random word #3 -> keep predicate %p5.
xor.b64 %rd134, %rd71, %rd76;
xor.b64 %rd135, %rd134, 1684936478;
mul.lo.s64 %rd136, %rd135, 3449720151;
xor.b64 %rd137, %rd125, %rd136;
cvt.u32.u64 %r27, %rd137;
xor.b32 %r28, %r27, -239350328;
mul.lo.s32 %r29, %r28, -766435501;
shr.u32 %r30, %r29, 9;
cvt.rn.f32.u32 %f37, %r30;
mul.rn.f32 %f38, %f37, 0f34000000;
cvt.rn.f16.f32 %h23, %f38;
setp.ge.f16 %p5, %h23, %h2;
// Element 3: per-column terms are read at +12 bytes from the element-0
// addresses, i.e. index 4*tid+3.
cvt.f32.f16 %f39, %h10;
ld.global.nc.f32 %f40, [%rd109+12];
mul.rn.f32 %f41, %f7, %f40;
mul.rn.f32 %f42, %f41, %f39;
ld.global.nc.f32 %f43, [%rd110+12];
mul.rn.f32 %f44, %f13, %f41;
sub.rn.f32 %f45, %f43, %f44;
add.rn.f32 %f46, %f42, %f45;
cvt.rn.f16.f32 %h24, %f46;
mul.rn.f16 %h25, %h24, %h12;
selp.b16 %h26, %h25, 0x0000, %p5;
// One 8-byte vector store writes all four f16 results.
st.global.v4.b16 [%rd112], {%h14, %h18, %h22, %h26};
ret;
}
// .globl fusion_2710
// fusion_2710: element-wise f32 -> f16 conversion, four values per thread.
//
// Launch shape: 256 threads per CTA (.reqntid 256,1,1); thread t of CTA c
// converts elements c*1024 + 4*t .. +3.
//   param_0 : f32 input, read with one 16-byte vector load per thread.
//   param_1 : f16 output, written with one 8-byte vector store per thread.
//   param_2 : not referenced in this kernel.
.visible .entry fusion_2710(
.param .u64 fusion_2710_param_0,
.param .u64 fusion_2710_param_1,
.param .u64 fusion_2710_param_2
)
.reqntid 256, 1, 1
{
.reg .b16 %narrow<4>;
.reg .f32 %wide<4>;
.reg .b32 %cta_id;
.reg .b32 %thr_id;
.reg .b32 %cta_first;
.reg .b32 %thr_first;
.reg .b32 %elem;
.reg .b64 %src_generic;
.reg .b64 %dst_generic;
.reg .b64 %src_base;
.reg .b64 %dst_base;
.reg .b64 %src_bytes;
.reg .b64 %dst_bytes;
.reg .b64 %src_ptr;
.reg .b64 %dst_ptr;
// Linear index of this thread's first element: (ctaid << 10) | (tid << 2).
mov.u32 %thr_id, %tid.x;
shl.b32 %thr_first, %thr_id, 2;
mov.u32 %cta_id, %ctaid.x;
shl.b32 %cta_first, %cta_id, 10;
or.b32 %elem, %thr_first, %cta_first;
// Source address: param_0 + 4 bytes per element.
ld.param.u64 %src_generic, [fusion_2710_param_0];
cvta.to.global.u64 %src_base, %src_generic;
mul.wide.u32 %src_bytes, %elem, 4;
add.s64 %src_ptr, %src_base, %src_bytes;
// Destination address: param_1 + 2 bytes per element.
ld.param.u64 %dst_generic, [fusion_2710_param_1];
cvta.to.global.u64 %dst_base, %dst_generic;
mul.wide.u32 %dst_bytes, %elem, 2;
add.s64 %dst_ptr, %dst_base, %dst_bytes;
// Load four floats through the non-coherent path, round each to nearest-even f16.
ld.global.nc.v4.f32 {%wide0, %wide1, %wide2, %wide3}, [%src_ptr];
cvt.rn.f16.f32 %narrow0, %wide0;
cvt.rn.f16.f32 %narrow1, %wide1;
cvt.rn.f16.f32 %narrow2, %wide2;
cvt.rn.f16.f32 %narrow3, %wide3;
st.global.v4.b16 [%dst_ptr], {%narrow0, %narrow1, %narrow2, %narrow3};
ret;
}
// .globl fusion_2274
// fusion_2274: gather f16 values, add an f32 per-column term (rounded to
// f16 first), and write four contiguous f16 results per thread.
//
// Launch shape: 256 threads per CTA (.reqntid 256,1,1).
// With lin = (ctaid << 10) | (tid << 2), grp = ctaid >> 5 and
// col_k = (4*tid + k) & 63 for k = 0..3:
//   param_1 : f16 source, read at byte offset
//             ((lin >> 6) & 511) * 2048 + ((grp << 6) | col_k) * 2.
//   param_2 : f32 addend, read at byte offset grp * 256 + col_k * 4.
//   param_0 : f16 destination, written at byte offset lin * 2.
//   param_3 : not referenced in this kernel.
.visible .entry fusion_2274(
.param .u64 fusion_2274_param_0,
.param .u64 fusion_2274_param_1,
.param .u64 fusion_2274_param_2,
.param .u64 fusion_2274_param_3
)
.reqntid 256, 1, 1
{
.reg .b16 %x<4>;
.reg .b16 %bh<4>;
.reg .b16 %y<4>;
.reg .f32 %bf<4>;
.reg .b32 %cta_id;
.reg .b32 %thr_id;
.reg .b32 %cta_first;
.reg .b32 %lin;
.reg .b32 %grp;
.reg .b32 %grp_first;
.reg .b32 %row;
.reg .b32 %q<4>;
.reg .b32 %col<4>;
.reg .b32 %srccol<4>;
.reg .b64 %dst_generic;
.reg .b64 %src_generic;
.reg .b64 %add_generic;
.reg .b64 %dst_base;
.reg .b64 %src_base;
.reg .b64 %add_base;
.reg .b64 %row_bytes;
.reg .b64 %row_ptr;
.reg .b64 %grp_bytes;
.reg .b64 %add_row;
.reg .b64 %dst_bytes;
.reg .b64 %dst_ptr;
.reg .b64 %xoff<4>;
.reg .b64 %xptr<4>;
.reg .b64 %boff<4>;
.reg .b64 %bptr<4>;
// ---- index arithmetic -------------------------------------------------
mov.u32 %thr_id, %tid.x;
mov.u32 %cta_id, %ctaid.x;
shl.b32 %q0, %thr_id, 2;
or.b32 %q1, %q0, 1;
or.b32 %q2, %q0, 2;
or.b32 %q3, %q0, 3;
// Column of each of the four elements inside a 64-wide group.
and.b32 %col0, %q0, 60;
and.b32 %col1, %q1, 61;
and.b32 %col2, %q2, 62;
and.b32 %col3, %q3, 63;
shl.b32 %cta_first, %cta_id, 10;
or.b32 %lin, %q0, %cta_first;
shr.u32 %grp, %cta_id, 5;
shl.b32 %grp_first, %grp, 6;
// row = bits [6..14] of the linear index.
bfe.u32 %row, %lin, 6, 9;
or.b32 %srccol0, %col0, %grp_first;
or.b32 %srccol1, %col1, %grp_first;
or.b32 %srccol2, %col2, %grp_first;
or.b32 %srccol3, %col3, %grp_first;
// ---- base addresses ---------------------------------------------------
ld.param.u64 %dst_generic, [fusion_2274_param_0];
cvta.to.global.u64 %dst_base, %dst_generic;
ld.param.u64 %src_generic, [fusion_2274_param_1];
cvta.to.global.u64 %src_base, %src_generic;
ld.param.u64 %add_generic, [fusion_2274_param_2];
cvta.to.global.u64 %add_base, %add_generic;
mul.wide.u32 %row_bytes, %row, 2048;
add.s64 %row_ptr, %src_base, %row_bytes;
mul.wide.u32 %grp_bytes, %grp, 256;
add.s64 %add_row, %add_base, %grp_bytes;
mul.wide.u32 %dst_bytes, %lin, 2;
add.s64 %dst_ptr, %dst_base, %dst_bytes;
// ---- element 0 --------------------------------------------------------
mul.wide.u32 %xoff0, %srccol0, 2;
add.s64 %xptr0, %row_ptr, %xoff0;
ld.global.nc.b16 %x0, [%xptr0];
mul.wide.u32 %boff0, %col0, 4;
add.s64 %bptr0, %add_row, %boff0;
ld.global.nc.f32 %bf0, [%bptr0];
cvt.rn.f16.f32 %bh0, %bf0;
add.rn.f16 %y0, %x0, %bh0;
// ---- element 1 --------------------------------------------------------
mul.wide.u32 %xoff1, %srccol1, 2;
add.s64 %xptr1, %row_ptr, %xoff1;
ld.global.nc.b16 %x1, [%xptr1];
mul.wide.u32 %boff1, %col1, 4;
add.s64 %bptr1, %add_row, %boff1;
ld.global.nc.f32 %bf1, [%bptr1];
cvt.rn.f16.f32 %bh1, %bf1;
add.rn.f16 %y1, %x1, %bh1;
// ---- element 2 --------------------------------------------------------
mul.wide.u32 %xoff2, %srccol2, 2;
add.s64 %xptr2, %row_ptr, %xoff2;
ld.global.nc.b16 %x2, [%xptr2];
mul.wide.u32 %boff2, %col2, 4;
add.s64 %bptr2, %add_row, %boff2;
ld.global.nc.f32 %bf2, [%bptr2];
cvt.rn.f16.f32 %bh2, %bf2;
add.rn.f16 %y2, %x2, %bh2;
// ---- element 3 --------------------------------------------------------
mul.wide.u32 %xoff3, %srccol3, 2;
add.s64 %xptr3, %row_ptr, %xoff3;
ld.global.nc.b16 %x3, [%xptr3];
mul.wide.u32 %boff3, %col3, 4;
add.s64 %bptr3, %add_row, %boff3;
ld.global.nc.f32 %bf3, [%bptr3];
cvt.rn.f16.f32 %bh3, %bf3;
add.rn.f16 %y3, %x3, %bh3;
// ---- write back -------------------------------------------------------
st.global.v4.b16 [%dst_ptr], {%y0, %y1, %y2, %y3};
ret;
}
// .globl fusion_2711
// fusion_2711: element-wise f32 -> f16 conversion, four values per thread.
//
// Launch shape: 256 threads per CTA (.reqntid 256,1,1); thread t of CTA c
// converts elements c*1024 + 4*t .. +3.
//   param_0 : f32 input, read with one 16-byte vector load per thread.
//   param_1 : f16 output, written with one 8-byte vector store per thread.
//   param_2 : not referenced in this kernel.
.visible .entry fusion_2711(
.param .u64 fusion_2711_param_0,
.param .u64 fusion_2711_param_1,
.param .u64 fusion_2711_param_2
)
.reqntid 256, 1, 1
{
.reg .b16 %narrow<4>;
.reg .f32 %wide<4>;
.reg .b32 %cta_id;
.reg .b32 %thr_id;
.reg .b32 %cta_first;
.reg .b32 %thr_first;
.reg .b32 %elem;
.reg .b64 %src_generic;
.reg .b64 %dst_generic;
.reg .b64 %src_base;
.reg .b64 %dst_base;
.reg .b64 %src_bytes;
.reg .b64 %dst_bytes;
.reg .b64 %src_ptr;
.reg .b64 %dst_ptr;
// Linear index of this thread's first element: (ctaid << 10) | (tid << 2).
mov.u32 %thr_id, %tid.x;
shl.b32 %thr_first, %thr_id, 2;
mov.u32 %cta_id, %ctaid.x;
shl.b32 %cta_first, %cta_id, 10;
or.b32 %elem, %thr_first, %cta_first;
// Source address: param_0 + 4 bytes per element.
ld.param.u64 %src_generic, [fusion_2711_param_0];
cvta.to.global.u64 %src_base, %src_generic;
mul.wide.u32 %src_bytes, %elem, 4;
add.s64 %src_ptr, %src_base, %src_bytes;
// Destination address: param_1 + 2 bytes per element.
ld.param.u64 %dst_generic, [fusion_2711_param_1];
cvta.to.global.u64 %dst_base, %dst_generic;
mul.wide.u32 %dst_bytes, %elem, 2;
add.s64 %dst_ptr, %dst_base, %dst_bytes;
// Load four floats through the non-coherent path, round each to nearest-even f16.
ld.global.nc.v4.f32 {%wide0, %wide1, %wide2, %wide3}, [%src_ptr];
cvt.rn.f16.f32 %narrow0, %wide0;
cvt.rn.f16.f32 %narrow1, %wide1;
cvt.rn.f16.f32 %narrow2, %wide2;
cvt.rn.f16.f32 %narrow3, %wide3;
st.global.v4.b16 [%dst_ptr], {%narrow0, %narrow1, %narrow2, %narrow3};
ret;
}
// .globl fusion_2275
// fusion_2275: gather f16 values, add an f32 per-column term (rounded to
// f16 first), and write four contiguous f16 results per thread.
//
// Launch shape: 256 threads per CTA (.reqntid 256,1,1).
// With lin = (ctaid << 10) | (tid << 2), grp = ctaid >> 5 and
// col_k = (4*tid + k) & 63 for k = 0..3:
//   param_1 : f16 source, read at byte offset
//             ((lin >> 6) & 511) * 2048 + ((grp << 6) | col_k) * 2.
//   param_2 : f32 addend, read at byte offset grp * 256 + col_k * 4.
//   param_0 : f16 destination, written at byte offset lin * 2.
//   param_3 : not referenced in this kernel.
.visible .entry fusion_2275(
.param .u64 fusion_2275_param_0,
.param .u64 fusion_2275_param_1,
.param .u64 fusion_2275_param_2,
.param .u64 fusion_2275_param_3
)
.reqntid 256, 1, 1
{
.reg .b16 %x<4>;
.reg .b16 %bh<4>;
.reg .b16 %y<4>;
.reg .f32 %bf<4>;
.reg .b32 %cta_id;
.reg .b32 %thr_id;
.reg .b32 %cta_first;
.reg .b32 %lin;
.reg .b32 %grp;
.reg .b32 %grp_first;
.reg .b32 %row;
.reg .b32 %q<4>;
.reg .b32 %col<4>;
.reg .b32 %srccol<4>;
.reg .b64 %dst_generic;
.reg .b64 %src_generic;
.reg .b64 %add_generic;
.reg .b64 %dst_base;
.reg .b64 %src_base;
.reg .b64 %add_base;
.reg .b64 %row_bytes;
.reg .b64 %row_ptr;
.reg .b64 %grp_bytes;
.reg .b64 %add_row;
.reg .b64 %dst_bytes;
.reg .b64 %dst_ptr;
.reg .b64 %xoff<4>;
.reg .b64 %xptr<4>;
.reg .b64 %boff<4>;
.reg .b64 %bptr<4>;
// ---- index arithmetic -------------------------------------------------
mov.u32 %thr_id, %tid.x;
mov.u32 %cta_id, %ctaid.x;
shl.b32 %q0, %thr_id, 2;
or.b32 %q1, %q0, 1;
or.b32 %q2, %q0, 2;
or.b32 %q3, %q0, 3;
// Column of each of the four elements inside a 64-wide group.
and.b32 %col0, %q0, 60;
and.b32 %col1, %q1, 61;
and.b32 %col2, %q2, 62;
and.b32 %col3, %q3, 63;
shl.b32 %cta_first, %cta_id, 10;
or.b32 %lin, %q0, %cta_first;
shr.u32 %grp, %cta_id, 5;
shl.b32 %grp_first, %grp, 6;
// row = bits [6..14] of the linear index.
bfe.u32 %row, %lin, 6, 9;
or.b32 %srccol0, %col0, %grp_first;
or.b32 %srccol1, %col1, %grp_first;
or.b32 %srccol2, %col2, %grp_first;
or.b32 %srccol3, %col3, %grp_first;
// ---- base addresses ---------------------------------------------------
ld.param.u64 %dst_generic, [fusion_2275_param_0];
cvta.to.global.u64 %dst_base, %dst_generic;
ld.param.u64 %src_generic, [fusion_2275_param_1];
cvta.to.global.u64 %src_base, %src_generic;
ld.param.u64 %add_generic, [fusion_2275_param_2];
cvta.to.global.u64 %add_base, %add_generic;
mul.wide.u32 %row_bytes, %row, 2048;
add.s64 %row_ptr, %src_base, %row_bytes;
mul.wide.u32 %grp_bytes, %grp, 256;
add.s64 %add_row, %add_base, %grp_bytes;
mul.wide.u32 %dst_bytes, %lin, 2;
add.s64 %dst_ptr, %dst_base, %dst_bytes;
// ---- element 0 --------------------------------------------------------
mul.wide.u32 %xoff0, %srccol0, 2;
add.s64 %xptr0, %row_ptr, %xoff0;
ld.global.nc.b16 %x0, [%xptr0];
mul.wide.u32 %boff0, %col0, 4;
add.s64 %bptr0, %add_row, %boff0;
ld.global.nc.f32 %bf0, [%bptr0];
cvt.rn.f16.f32 %bh0, %bf0;
add.rn.f16 %y0, %x0, %bh0;
// ---- element 1 --------------------------------------------------------
mul.wide.u32 %xoff1, %srccol1, 2;
add.s64 %xptr1, %row_ptr, %xoff1;
ld.global.nc.b16 %x1, [%xptr1];
mul.wide.u32 %boff1, %col1, 4;
add.s64 %bptr1, %add_row, %boff1;
ld.global.nc.f32 %bf1, [%bptr1];
cvt.rn.f16.f32 %bh1, %bf1;
add.rn.f16 %y1, %x1, %bh1;
// ---- element 2 --------------------------------------------------------
mul.wide.u32 %xoff2, %srccol2, 2;
add.s64 %xptr2, %row_ptr, %xoff2;
ld.global.nc.b16 %x2, [%xptr2];
mul.wide.u32 %boff2, %col2, 4;
add.s64 %bptr2, %add_row, %boff2;
ld.global.nc.f32 %bf2, [%bptr2];
cvt.rn.f16.f32 %bh2, %bf2;
add.rn.f16 %y2, %x2, %bh2;
// ---- element 3 --------------------------------------------------------
mul.wide.u32 %xoff3, %srccol3, 2;
add.s64 %xptr3, %row_ptr, %xoff3;
ld.global.nc.b16 %x3, [%xptr3];
mul.wide.u32 %boff3, %col3, 4;
add.s64 %bptr3, %add_row, %boff3;
ld.global.nc.f32 %bf3, [%bptr3];
cvt.rn.f16.f32 %bh3, %bf3;
add.rn.f16 %y3, %x3, %bh3;
// ---- write back -------------------------------------------------------
st.global.v4.b16 [%dst_ptr], {%y0, %y1, %y2, %y3};
ret;
}
// .globl fusion_2271
// fusion_2271: per-CTA maximum of (x - (1 - m) * 10000), folded into global
// memory with an atomic max.
//
// Launch shape: 32 threads per CTA (.reqntid 32,1,1), i.e. a single warp.
// Parameters (as used by the code below):
//   param_0 : f16 input; CTA c reads the 512 values starting at element c*512
//             (32 threads x 2 halves x 8 steps, 128 bytes between steps).
//   param_1 : f32 accumulator; thread 0 applies max() to the word at byte
//             offset (ctaid >> 9) * 2048 + (ctaid & 511) * 4 via a CAS loop.
//   param_2 : s32 array of 512 entries indexed by position within the CTA's
//             span; each entry m is converted to f16.
//             NOTE(review): presumably a 0/1 mask -- confirm with the producer.
//   param_3 : not referenced in this kernel.
// f16 constants: 0x3C00 = 1.0, 0x70E2 = 10000.0.
// The running maximum is kept in f32 and seeded with -inf (0fFF800000).
.visible .entry fusion_2271(
.param .u64 fusion_2271_param_0,
.param .u64 fusion_2271_param_1,
.param .u64 fusion_2271_param_2,
.param .u64 fusion_2271_param_3
)
.reqntid 32, 1, 1
{
.local .align 4 .b8 __local_depot11[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<4>;
.reg .b16 %h<83>;
.reg .b32 %hh<9>;
.reg .f32 %f<57>;
.reg .b32 %r<37>;
.reg .b64 %rd<37>;
// %SPL = local-space address of the 4-byte depot, %SP = its generic address.
mov.u64 %SPL, __local_depot11;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd4, [fusion_2271_param_0];
ld.param.u64 %rd5, [fusion_2271_param_2];
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd9, %rd4;
add.u64 %rd10, %SP, 0;
add.u64 %rd1, %SPL, 0;
// %r1 = tid, %r5 = ctaid, %r6 = 2*tid (position of the first element in the
// 512-wide span), %r8 = ctaid*512 + 2*tid (linear f16 index).
mov.u32 %r1, %tid.x;
mov.u32 %r5, %ctaid.x;
shl.b32 %r6, %r1, 1;
shl.b32 %r7, %r5, 9;
or.b32 %r8, %r7, %r6;
// %rd12 = address of this thread's first packed f16 pair.
mul.wide.u32 %rd11, %r8, 2;
add.s64 %rd12, %rd9, %rd11;
ld.global.nc.b32 %hh1, [%rd12];
mov.b32 {%h1, %h2}, %hh1;
// %rd14 = &param_2[2*tid]; the first two entries are fetched as one v2 load.
mul.wide.u32 %rd13, %r6, 4;
add.s64 %rd14, %rd6, %rd13;
ld.global.nc.v2.u32 {%r9, %r10}, [%rd14];
// Step 0, first half: v = x - (1.0 - f16(m)) * 10000, all in f16;
// then widened to f32 and folded into the running max (seed = -inf).
cvt.rn.f16.s32 %h3, %r9;
mov.b16 %h4, 0x3C00;
sub.rn.f16 %h5, %h4, %h3;
mov.b16 %h6, 0x70E2;
mul.rn.f16 %h7, %h5, %h6;
sub.rn.f16 %h8, %h1, %h7;
cvt.f32.f16 %f2, %h8;
max.f32 %f3, %f2, 0fFF800000;
// Step 0, second half.
cvt.rn.f16.s32 %h9, %r10;
sub.rn.f16 %h10, %h4, %h9;
mul.rn.f16 %h11, %h10, %h6;
sub.rn.f16 %h12, %h2, %h11;
cvt.f32.f16 %f4, %h12;
max.f32 %f5, %f3, %f4;
// Steps 1..7: the f16 pair advances 128 bytes per step. The first entry of
// each pair is addressed as param_2[(2*tid) | 64*k]; the second as
// [%rd14 + 256*k + 4], i.e. the following s32.
or.b32 %r11, %r6, 64;
ld.global.nc.b32 %hh2, [%rd12+128];
mov.b32 {%h13, %h14}, %hh2;
mul.wide.u32 %rd15, %r11, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.u32 %r12, [%rd16];
cvt.rn.f16.s32 %h15, %r12;
sub.rn.f16 %h16, %h4, %h15;
mul.rn.f16 %h17, %h16, %h6;
sub.rn.f16 %h18, %h13, %h17;
cvt.f32.f16 %f6, %h18;
max.f32 %f7, %f5, %f6;
ld.global.nc.u32 %r13, [%rd14+260];
cvt.rn.f16.s32 %h19, %r13;
sub.rn.f16 %h20, %h4, %h19;
mul.rn.f16 %h21, %h20, %h6;
sub.rn.f16 %h22, %h14, %h21;
cvt.f32.f16 %f8, %h22;
max.f32 %f9, %f7, %f8;
or.b32 %r14, %r6, 128;
ld.global.nc.b32 %hh3, [%rd12+256];
mov.b32 {%h23, %h24}, %hh3;
mul.wide.u32 %rd17, %r14, 4;
add.s64 %rd18, %rd6, %rd17;
ld.global.nc.u32 %r15, [%rd18];
cvt.rn.f16.s32 %h25, %r15;
sub.rn.f16 %h26, %h4, %h25;
mul.rn.f16 %h27, %h26, %h6;
sub.rn.f16 %h28, %h23, %h27;
cvt.f32.f16 %f10, %h28;
max.f32 %f11, %f9, %f10;
ld.global.nc.u32 %r16, [%rd14+516];
cvt.rn.f16.s32 %h29, %r16;
sub.rn.f16 %h30, %h4, %h29;
mul.rn.f16 %h31, %h30, %h6;
sub.rn.f16 %h32, %h24, %h31;
cvt.f32.f16 %f12, %h32;
max.f32 %f13, %f11, %f12;
or.b32 %r17, %r6, 192;
ld.global.nc.b32 %hh4, [%rd12+384];
mov.b32 {%h33, %h34}, %hh4;
mul.wide.u32 %rd19, %r17, 4;
add.s64 %rd20, %rd6, %rd19;
ld.global.nc.u32 %r18, [%rd20];
cvt.rn.f16.s32 %h35, %r18;
sub.rn.f16 %h36, %h4, %h35;
mul.rn.f16 %h37, %h36, %h6;
sub.rn.f16 %h38, %h33, %h37;
cvt.f32.f16 %f14, %h38;
max.f32 %f15, %f13, %f14;
ld.global.nc.u32 %r19, [%rd14+772];
cvt.rn.f16.s32 %h39, %r19;
sub.rn.f16 %h40, %h4, %h39;
mul.rn.f16 %h41, %h40, %h6;
sub.rn.f16 %h42, %h34, %h41;
cvt.f32.f16 %f16, %h42;
max.f32 %f17, %f15, %f16;
or.b32 %r20, %r6, 256;
ld.global.nc.b32 %hh5, [%rd12+512];
mov.b32 {%h43, %h44}, %hh5;
mul.wide.u32 %rd21, %r20, 4;
add.s64 %rd22, %rd6, %rd21;
ld.global.nc.u32 %r21, [%rd22];
cvt.rn.f16.s32 %h45, %r21;
sub.rn.f16 %h46, %h4, %h45;
mul.rn.f16 %h47, %h46, %h6;
sub.rn.f16 %h48, %h43, %h47;
cvt.f32.f16 %f18, %h48;
max.f32 %f19, %f17, %f18;
ld.global.nc.u32 %r22, [%rd14+1028];
cvt.rn.f16.s32 %h49, %r22;
sub.rn.f16 %h50, %h4, %h49;
mul.rn.f16 %h51, %h50, %h6;
sub.rn.f16 %h52, %h44, %h51;
cvt.f32.f16 %f20, %h52;
max.f32 %f21, %f19, %f20;
or.b32 %r23, %r6, 320;
ld.global.nc.b32 %hh6, [%rd12+640];
mov.b32 {%h53, %h54}, %hh6;
mul.wide.u32 %rd23, %r23, 4;
add.s64 %rd24, %rd6, %rd23;
ld.global.nc.u32 %r24, [%rd24];
cvt.rn.f16.s32 %h55, %r24;
sub.rn.f16 %h56, %h4, %h55;
mul.rn.f16 %h57, %h56, %h6;
sub.rn.f16 %h58, %h53, %h57;
cvt.f32.f16 %f22, %h58;
max.f32 %f23, %f21, %f22;
ld.global.nc.u32 %r25, [%rd14+1284];
cvt.rn.f16.s32 %h59, %r25;
sub.rn.f16 %h60, %h4, %h59;
mul.rn.f16 %h61, %h60, %h6;
sub.rn.f16 %h62, %h54, %h61;
cvt.f32.f16 %f24, %h62;
max.f32 %f25, %f23, %f24;
or.b32 %r26, %r6, 384;
ld.global.nc.b32 %hh7, [%rd12+768];
mov.b32 {%h63, %h64}, %hh7;
mul.wide.u32 %rd25, %r26, 4;
add.s64 %rd26, %rd6, %rd25;
ld.global.nc.u32 %r27, [%rd26];
cvt.rn.f16.s32 %h65, %r27;
sub.rn.f16 %h66, %h4, %h65;
mul.rn.f16 %h67, %h66, %h6;
sub.rn.f16 %h68, %h63, %h67;
cvt.f32.f16 %f26, %h68;
max.f32 %f27, %f25, %f26;
ld.global.nc.u32 %r28, [%rd14+1540];
cvt.rn.f16.s32 %h69, %r28;
sub.rn.f16 %h70, %h4, %h69;
mul.rn.f16 %h71, %h70, %h6;
sub.rn.f16 %h72, %h64, %h71;
cvt.f32.f16 %f28, %h72;
max.f32 %f29, %f27, %f28;
or.b32 %r29, %r6, 448;
ld.global.nc.b32 %hh8, [%rd12+896];
mov.b32 {%h73, %h74}, %hh8;
mul.wide.u32 %rd27, %r29, 4;
add.s64 %rd28, %rd6, %rd27;
ld.global.nc.u32 %r30, [%rd28];
cvt.rn.f16.s32 %h75, %r30;
sub.rn.f16 %h76, %h4, %h75;
mul.rn.f16 %h77, %h76, %h6;
sub.rn.f16 %h78, %h73, %h77;
cvt.f32.f16 %f30, %h78;
max.f32 %f31, %f29, %f30;
ld.global.nc.u32 %r31, [%rd14+1796];
cvt.rn.f16.s32 %h79, %r31;
sub.rn.f16 %h80, %h4, %h79;
mul.rn.f16 %h81, %h80, %h6;
sub.rn.f16 %h82, %h74, %h81;
cvt.f32.f16 %f32, %h82;
max.f32 %f33, %f31, %f32;
// Stage 1: warp-wide max via shuffle-down with offsets 16, 8, 4, 2, 1
// (member mask -1 = all 32 lanes).
shfl.sync.down.b32 %f34, %f33, 16, 31, -1;
max.f32 %f35, %f33, %f34;
shfl.sync.down.b32 %f36, %f35, 8, 31, -1;
max.f32 %f37, %f35, %f36;
shfl.sync.down.b32 %f38, %f37, 4, 31, -1;
max.f32 %f39, %f37, %f38;
shfl.sync.down.b32 %f40, %f39, 2, 31, -1;
max.f32 %f41, %f39, %f40;
shfl.sync.down.b32 %f42, %f41, 1, 31, -1;
// %p1 is true only for thread 0, which finishes the last max and publishes
// the warp result to shared_cache_02[0].
setp.eq.s32 %p1, %r1, 0;
@%p1 bra LBB11_3;
bra.uni LBB11_1;
LBB11_3:
max.f32 %f1, %f41, %f42;
st.shared.f32 [shared_cache_02], %f1;
LBB11_1:
bar.sync 0;
// Stage 2: %rd3 = &shared_cache_02[tid]; the local slot is seeded with
// -8388608 (bit pattern 0xFF800000 = -inf, the identity for max).
mul.wide.u32 %rd32, %r1, 4;
mov.u64 %rd33, shared_cache_02;
add.s64 %rd3, %rd33, %rd32;
cvta.shared.u64 %rd34, %rd3;
mov.u32 %r34, -8388608;
st.local.u32 [%rd1], %r34;
// Thread 0 reads the shared slot; every other lane reads -inf from its local slot.
selp.b64 %rd36, %rd34, %rd10, %p1;
ld.f32 %f43, [%rd36];
shfl.sync.down.b32 %f44, %f43, 16, 31, -1;
max.f32 %f45, %f43, %f44;
shfl.sync.down.b32 %f46, %f45, 8, 31, -1;
max.f32 %f47, %f45, %f46;
shfl.sync.down.b32 %f48, %f47, 4, 31, -1;
max.f32 %f49, %f47, %f48;
shfl.sync.down.b32 %f50, %f49, 2, 31, -1;
max.f32 %f51, %f49, %f50;
shfl.sync.down.b32 %f52, %f51, 1, 31, -1;
max.f32 %f53, %f51, %f52;
// Each lane writes back through the address it loaded from; for thread 0
// that is shared_cache_02[0], which the CAS loop re-reads.
st.f32 [%rd36], %f53;
@%p1 bra LBB11_4;
bra.uni LBB11_2;
LBB11_4:
// Thread 0 only: %rd2 = param_1 + (ctaid >> 9)*2048 + (ctaid & 511)*4.
ld.param.u64 %rd7, [fusion_2271_param_1];
shr.u32 %r33, %r5, 9;
cvta.to.global.u64 %rd8, %rd7;
and.b32 %r32, %r5, 511;
mul.wide.u32 %rd29, %r33, 2048;
add.s64 %rd30, %rd8, %rd29;
mul.wide.u32 %rd31, %r32, 4;
add.s64 %rd2, %rd30, %rd31;
// %r36 = currently observed bits of the accumulator.
ld.global.u32 %r36, [%rd2];
LBB11_5:
// Atomic float max built from compare-and-swap: propose max(observed, local)
// and retry with the value returned by the CAS until it equals the value
// that was expected (i.e. the swap took effect).
mov.b32 %f54, %r36;
ld.shared.f32 %f55, [%rd3];
max.f32 %f56, %f54, %f55;
mov.b32 %r35, %f56;
atom.global.cas.b32 %r4, [%rd2], %r36, %r35;
setp.eq.s32 %p3, %r4, %r36;
mov.u32 %r36, %r4;
@%p3 bra LBB11_2;
bra.uni LBB11_5;
LBB11_2:
ret;
}
// .globl fusion_2269
// fusion_2269: one 32-thread CTA (a single warp) per %ctaid.x reduces 512
// consecutive f16 inputs to one f32 sum of exponentials and atomically adds
// that sum to an f32 accumulator.
//   param_0: f16 input x; thread reads pairs at element (ctaid.x << 9) | (tid.x << 1),
//            then +64 elements (+128 bytes) for each of 8 steps
//   param_1: f32 accumulator, updated with atom.global.add.f32 at element ctaid.x
//   param_2: f32, one value per ctaid.x, subtracted from every element before exp
//   param_3: s32 values m, indexed by position within the 512-wide row
//   param_4: not referenced in this kernel
// Per element: t = f16(x - (1.0 - f16(m)) * 10000.0); d = f32(t) - param_2[ctaid.x];
// sum += exp(d), where exp(d) is forced to 0 for d < -105 and to +inf for d > 105.
// NOTE(review): this looks like the denominator pass of a masked softmax
// (param_2 = row max, param_3 = mask) -- confirm against the producing graph.
.visible .entry fusion_2269(
.param .u64 fusion_2269_param_0,
.param .u64 fusion_2269_param_1,
.param .u64 fusion_2269_param_2,
.param .u64 fusion_2269_param_3,
.param .u64 fusion_2269_param_4
)
.reqntid 32, 1, 1
{
// 4-byte per-thread local slot; used below as a neutral (0.0) operand for the
// second shuffle reduction.
.local .align 4 .b8 __local_depot12[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<35>;
.reg .b16 %h<83>;
.reg .b32 %hh<9>;
.reg .f32 %f<249>;
.reg .b32 %r<32>;
.reg .b64 %rd<41>;
mov.u64 %SPL, __local_depot12;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd4, [fusion_2269_param_0];
ld.param.u64 %rd5, [fusion_2269_param_3];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd8, [fusion_2269_param_2];
cvta.to.global.u64 %rd9, %rd8;
cvta.to.global.u64 %rd11, %rd4;
// %rd12 = generic address of the local slot, %rd1 = its local-space address.
add.u64 %rd12, %SP, 0;
add.u64 %rd1, %SPL, 0;
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
// %r3 = tid*2 (position of this thread's first pair inside the row),
// %r5 = ctaid*512 + tid*2 (linear f16 element index).
shl.b32 %r3, %r1, 1;
shl.b32 %r4, %r2, 9;
or.b32 %r5, %r4, %r3;
mul.wide.u32 %rd13, %r5, 2;
add.s64 %rd14, %rd11, %rd13;
// ---- pair 0: two f16 inputs and two s32 mask words (positions tid*2, tid*2+1)
ld.global.nc.b32 %hh1, [%rd14];
mov.b32 {%h1, %h2}, %hh1;
mul.wide.u32 %rd15, %r3, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.v2.u32 {%r6, %r7}, [%rd16];
cvt.rn.f16.s32 %h3, %r6;
// %h4 = 0x3C00 = 1.0 (f16), %h6 = 0x70E2 = 10000.0 (f16); both reused below.
mov.b16 %h4, 0x3C00;
sub.rn.f16 %h5, %h4, %h3;
mov.b16 %h6, 0x70E2;
mul.rn.f16 %h7, %h5, %h6;
sub.rn.f16 %h8, %h1, %h7;
cvt.f32.f16 %f2, %h8;
// %f3 = param_2[ctaid.x], the per-CTA value subtracted from every element.
mul.wide.u32 %rd17, %r2, 4;
add.s64 %rd18, %rd9, %rd17;
ld.global.nc.f32 %f3, [%rd18];
sub.rn.f32 %f4, %f2, %f3;
// exp(d) expansion, repeated for every element below:
//   n = trunc(d * log2(e))              (0f3FB8AA3B = log2(e))
//   r = d - n*ln2, split in two fma's   (0fBF317200, 0fB5BFBE8E = -ln2 hi/lo)
//   exp(d) = 2^n * 2^(r * log2(e))
//   result = 0 if d < -105 (0fC2D20000), +inf if d > 105 (0f42D20000)
mul.rn.f32 %f5, %f4, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f6, %f5;
add.rn.f32 %f7, %f6, 0f00000000;
ex2.approx.f32 %f8, %f7;
fma.rn.f32 %f9, %f6, 0fBF317200, %f4;
fma.rn.f32 %f10, %f6, 0fB5BFBE8E, %f9;
mul.rn.f32 %f11, %f10, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f12, %f11;
mul.rn.f32 %f13, %f8, %f12;
setp.lt.f32 %p1, %f4, 0fC2D20000;
setp.gt.f32 %p2, %f4, 0f42D20000;
// Only the first element has this extra "+ 0.0"; the other elements feed the
// product straight into the selp.
add.rn.f32 %f14, %f13, 0f00000000;
selp.f32 %f15, 0f00000000, %f14, %p1;
selp.f32 %f16, 0f7F800000, %f15, %p2;
// ---- pair 0, second element (position tid*2 + 1)
cvt.rn.f16.s32 %h9, %r7;
sub.rn.f16 %h10, %h4, %h9;
mul.rn.f16 %h11, %h10, %h6;
sub.rn.f16 %h12, %h2, %h11;
cvt.f32.f16 %f17, %h12;
sub.rn.f32 %f18, %f17, %f3;
mul.rn.f32 %f19, %f18, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f20, %f19;
add.rn.f32 %f21, %f20, 0f00000000;
ex2.approx.f32 %f22, %f21;
fma.rn.f32 %f23, %f20, 0fBF317200, %f18;
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23;
mul.rn.f32 %f25, %f24, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f26, %f25;
mul.rn.f32 %f27, %f22, %f26;
setp.lt.f32 %p3, %f18, 0fC2D20000;
selp.f32 %f28, 0f00000000, %f27, %p3;
setp.gt.f32 %p4, %f18, 0f42D20000;
selp.f32 %f29, 0f7F800000, %f28, %p4;
// Running sum of the thread's elements starts here.
add.rn.f32 %f30, %f16, %f29;
// ---- pair 1, first element (position tid*2 + 64)
or.b32 %r8, %r3, 64;
ld.global.nc.b32 %hh2, [%rd14+128];
mov.b32 {%h13, %h14}, %hh2;
mul.wide.u32 %rd19, %r8, 4;
add.s64 %rd20, %rd6, %rd19;
ld.global.nc.u32 %r9, [%rd20];
cvt.rn.f16.s32 %h15, %r9;
sub.rn.f16 %h16, %h4, %h15;
mul.rn.f16 %h17, %h16, %h6;
sub.rn.f16 %h18, %h13, %h17;
cvt.f32.f16 %f31, %h18;
sub.rn.f32 %f32, %f31, %f3;
mul.rn.f32 %f33, %f32, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f34, %f33;
add.rn.f32 %f35, %f34, 0f00000000;
ex2.approx.f32 %f36, %f35;
fma.rn.f32 %f37, %f34, 0fBF317200, %f32;
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37;
mul.rn.f32 %f39, %f38, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f40, %f39;
mul.rn.f32 %f41, %f36, %f40;
setp.lt.f32 %p5, %f32, 0fC2D20000;
selp.f32 %f42, 0f00000000, %f41, %p5;
setp.gt.f32 %p6, %f32, 0f42D20000;
selp.f32 %f43, 0f7F800000, %f42, %p6;
add.rn.f32 %f44, %f30, %f43;
// ---- pair 1, second element (position tid*2 + 65; byte offset 260 from %rd16)
ld.global.nc.u32 %r10, [%rd16+260];
cvt.rn.f16.s32 %h19, %r10;
sub.rn.f16 %h20, %h4, %h19;
mul.rn.f16 %h21, %h20, %h6;
sub.rn.f16 %h22, %h14, %h21;
cvt.f32.f16 %f45, %h22;
sub.rn.f32 %f46, %f45, %f3;
mul.rn.f32 %f47, %f46, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f48, %f47;
add.rn.f32 %f49, %f48, 0f00000000;
ex2.approx.f32 %f50, %f49;
fma.rn.f32 %f51, %f48, 0fBF317200, %f46;
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51;
mul.rn.f32 %f53, %f52, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f54, %f53;
mul.rn.f32 %f55, %f50, %f54;
setp.lt.f32 %p7, %f46, 0fC2D20000;
selp.f32 %f56, 0f00000000, %f55, %p7;
setp.gt.f32 %p8, %f46, 0f42D20000;
selp.f32 %f57, 0f7F800000, %f56, %p8;
add.rn.f32 %f58, %f44, %f57;
// ---- pair 2, first element (position tid*2 + 128)
or.b32 %r11, %r3, 128;
ld.global.nc.b32 %hh3, [%rd14+256];
mov.b32 {%h23, %h24}, %hh3;
mul.wide.u32 %rd21, %r11, 4;
add.s64 %rd22, %rd6, %rd21;
ld.global.nc.u32 %r12, [%rd22];
cvt.rn.f16.s32 %h25, %r12;
sub.rn.f16 %h26, %h4, %h25;
mul.rn.f16 %h27, %h26, %h6;
sub.rn.f16 %h28, %h23, %h27;
cvt.f32.f16 %f59, %h28;
sub.rn.f32 %f60, %f59, %f3;
mul.rn.f32 %f61, %f60, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f62, %f61;
add.rn.f32 %f63, %f62, 0f00000000;
ex2.approx.f32 %f64, %f63;
fma.rn.f32 %f65, %f62, 0fBF317200, %f60;
fma.rn.f32 %f66, %f62, 0fB5BFBE8E, %f65;
mul.rn.f32 %f67, %f66, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f68, %f67;
mul.rn.f32 %f69, %f64, %f68;
setp.lt.f32 %p9, %f60, 0fC2D20000;
selp.f32 %f70, 0f00000000, %f69, %p9;
setp.gt.f32 %p10, %f60, 0f42D20000;
selp.f32 %f71, 0f7F800000, %f70, %p10;
add.rn.f32 %f72, %f58, %f71;
// ---- pair 2, second element (position tid*2 + 129)
ld.global.nc.u32 %r13, [%rd16+516];
cvt.rn.f16.s32 %h29, %r13;
sub.rn.f16 %h30, %h4, %h29;
mul.rn.f16 %h31, %h30, %h6;
sub.rn.f16 %h32, %h24, %h31;
cvt.f32.f16 %f73, %h32;
sub.rn.f32 %f74, %f73, %f3;
mul.rn.f32 %f75, %f74, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f76, %f75;
add.rn.f32 %f77, %f76, 0f00000000;
ex2.approx.f32 %f78, %f77;
fma.rn.f32 %f79, %f76, 0fBF317200, %f74;
fma.rn.f32 %f80, %f76, 0fB5BFBE8E, %f79;
mul.rn.f32 %f81, %f80, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f82, %f81;
mul.rn.f32 %f83, %f78, %f82;
setp.lt.f32 %p11, %f74, 0fC2D20000;
selp.f32 %f84, 0f00000000, %f83, %p11;
setp.gt.f32 %p12, %f74, 0f42D20000;
selp.f32 %f85, 0f7F800000, %f84, %p12;
add.rn.f32 %f86, %f72, %f85;
// ---- pair 3, first element (position tid*2 + 192)
or.b32 %r14, %r3, 192;
ld.global.nc.b32 %hh4, [%rd14+384];
mov.b32 {%h33, %h34}, %hh4;
mul.wide.u32 %rd23, %r14, 4;
add.s64 %rd24, %rd6, %rd23;
ld.global.nc.u32 %r15, [%rd24];
cvt.rn.f16.s32 %h35, %r15;
sub.rn.f16 %h36, %h4, %h35;
mul.rn.f16 %h37, %h36, %h6;
sub.rn.f16 %h38, %h33, %h37;
cvt.f32.f16 %f87, %h38;
sub.rn.f32 %f88, %f87, %f3;
mul.rn.f32 %f89, %f88, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f90, %f89;
add.rn.f32 %f91, %f90, 0f00000000;
ex2.approx.f32 %f92, %f91;
fma.rn.f32 %f93, %f90, 0fBF317200, %f88;
fma.rn.f32 %f94, %f90, 0fB5BFBE8E, %f93;
mul.rn.f32 %f95, %f94, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f96, %f95;
mul.rn.f32 %f97, %f92, %f96;
setp.lt.f32 %p13, %f88, 0fC2D20000;
selp.f32 %f98, 0f00000000, %f97, %p13;
setp.gt.f32 %p14, %f88, 0f42D20000;
selp.f32 %f99, 0f7F800000, %f98, %p14;
add.rn.f32 %f100, %f86, %f99;
// ---- pair 3, second element (position tid*2 + 193)
ld.global.nc.u32 %r16, [%rd16+772];
cvt.rn.f16.s32 %h39, %r16;
sub.rn.f16 %h40, %h4, %h39;
mul.rn.f16 %h41, %h40, %h6;
sub.rn.f16 %h42, %h34, %h41;
cvt.f32.f16 %f101, %h42;
sub.rn.f32 %f102, %f101, %f3;
mul.rn.f32 %f103, %f102, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f104, %f103;
add.rn.f32 %f105, %f104, 0f00000000;
ex2.approx.f32 %f106, %f105;
fma.rn.f32 %f107, %f104, 0fBF317200, %f102;
fma.rn.f32 %f108, %f104, 0fB5BFBE8E, %f107;
mul.rn.f32 %f109, %f108, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f110, %f109;
mul.rn.f32 %f111, %f106, %f110;
setp.lt.f32 %p15, %f102, 0fC2D20000;
selp.f32 %f112, 0f00000000, %f111, %p15;
setp.gt.f32 %p16, %f102, 0f42D20000;
selp.f32 %f113, 0f7F800000, %f112, %p16;
add.rn.f32 %f114, %f100, %f113;
// ---- pair 4, first element (position tid*2 + 256)
or.b32 %r17, %r3, 256;
ld.global.nc.b32 %hh5, [%rd14+512];
mov.b32 {%h43, %h44}, %hh5;
mul.wide.u32 %rd25, %r17, 4;
add.s64 %rd26, %rd6, %rd25;
ld.global.nc.u32 %r18, [%rd26];
cvt.rn.f16.s32 %h45, %r18;
sub.rn.f16 %h46, %h4, %h45;
mul.rn.f16 %h47, %h46, %h6;
sub.rn.f16 %h48, %h43, %h47;
cvt.f32.f16 %f115, %h48;
sub.rn.f32 %f116, %f115, %f3;
mul.rn.f32 %f117, %f116, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f118, %f117;
add.rn.f32 %f119, %f118, 0f00000000;
ex2.approx.f32 %f120, %f119;
fma.rn.f32 %f121, %f118, 0fBF317200, %f116;
fma.rn.f32 %f122, %f118, 0fB5BFBE8E, %f121;
mul.rn.f32 %f123, %f122, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f124, %f123;
mul.rn.f32 %f125, %f120, %f124;
setp.lt.f32 %p17, %f116, 0fC2D20000;
selp.f32 %f126, 0f00000000, %f125, %p17;
setp.gt.f32 %p18, %f116, 0f42D20000;
selp.f32 %f127, 0f7F800000, %f126, %p18;
add.rn.f32 %f128, %f114, %f127;
// ---- pair 4, second element (position tid*2 + 257)
ld.global.nc.u32 %r19, [%rd16+1028];
cvt.rn.f16.s32 %h49, %r19;
sub.rn.f16 %h50, %h4, %h49;
mul.rn.f16 %h51, %h50, %h6;
sub.rn.f16 %h52, %h44, %h51;
cvt.f32.f16 %f129, %h52;
sub.rn.f32 %f130, %f129, %f3;
mul.rn.f32 %f131, %f130, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f132, %f131;
add.rn.f32 %f133, %f132, 0f00000000;
ex2.approx.f32 %f134, %f133;
fma.rn.f32 %f135, %f132, 0fBF317200, %f130;
fma.rn.f32 %f136, %f132, 0fB5BFBE8E, %f135;
mul.rn.f32 %f137, %f136, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f138, %f137;
mul.rn.f32 %f139, %f134, %f138;
setp.lt.f32 %p19, %f130, 0fC2D20000;
selp.f32 %f140, 0f00000000, %f139, %p19;
setp.gt.f32 %p20, %f130, 0f42D20000;
selp.f32 %f141, 0f7F800000, %f140, %p20;
add.rn.f32 %f142, %f128, %f141;
// ---- pair 5, first element (position tid*2 + 320)
or.b32 %r20, %r3, 320;
ld.global.nc.b32 %hh6, [%rd14+640];
mov.b32 {%h53, %h54}, %hh6;
mul.wide.u32 %rd27, %r20, 4;
add.s64 %rd28, %rd6, %rd27;
ld.global.nc.u32 %r21, [%rd28];
cvt.rn.f16.s32 %h55, %r21;
sub.rn.f16 %h56, %h4, %h55;
mul.rn.f16 %h57, %h56, %h6;
sub.rn.f16 %h58, %h53, %h57;
cvt.f32.f16 %f143, %h58;
sub.rn.f32 %f144, %f143, %f3;
mul.rn.f32 %f145, %f144, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f146, %f145;
add.rn.f32 %f147, %f146, 0f00000000;
ex2.approx.f32 %f148, %f147;
fma.rn.f32 %f149, %f146, 0fBF317200, %f144;
fma.rn.f32 %f150, %f146, 0fB5BFBE8E, %f149;
mul.rn.f32 %f151, %f150, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f152, %f151;
mul.rn.f32 %f153, %f148, %f152;
setp.lt.f32 %p21, %f144, 0fC2D20000;
selp.f32 %f154, 0f00000000, %f153, %p21;
setp.gt.f32 %p22, %f144, 0f42D20000;
selp.f32 %f155, 0f7F800000, %f154, %p22;
add.rn.f32 %f156, %f142, %f155;
// ---- pair 5, second element (position tid*2 + 321)
ld.global.nc.u32 %r22, [%rd16+1284];
cvt.rn.f16.s32 %h59, %r22;
sub.rn.f16 %h60, %h4, %h59;
mul.rn.f16 %h61, %h60, %h6;
sub.rn.f16 %h62, %h54, %h61;
cvt.f32.f16 %f157, %h62;
sub.rn.f32 %f158, %f157, %f3;
mul.rn.f32 %f159, %f158, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f160, %f159;
add.rn.f32 %f161, %f160, 0f00000000;
ex2.approx.f32 %f162, %f161;
fma.rn.f32 %f163, %f160, 0fBF317200, %f158;
fma.rn.f32 %f164, %f160, 0fB5BFBE8E, %f163;
mul.rn.f32 %f165, %f164, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f166, %f165;
mul.rn.f32 %f167, %f162, %f166;
setp.lt.f32 %p23, %f158, 0fC2D20000;
selp.f32 %f168, 0f00000000, %f167, %p23;
setp.gt.f32 %p24, %f158, 0f42D20000;
selp.f32 %f169, 0f7F800000, %f168, %p24;
add.rn.f32 %f170, %f156, %f169;
// ---- pair 6, first element (position tid*2 + 384)
or.b32 %r23, %r3, 384;
ld.global.nc.b32 %hh7, [%rd14+768];
mov.b32 {%h63, %h64}, %hh7;
mul.wide.u32 %rd29, %r23, 4;
add.s64 %rd30, %rd6, %rd29;
ld.global.nc.u32 %r24, [%rd30];
cvt.rn.f16.s32 %h65, %r24;
sub.rn.f16 %h66, %h4, %h65;
mul.rn.f16 %h67, %h66, %h6;
sub.rn.f16 %h68, %h63, %h67;
cvt.f32.f16 %f171, %h68;
sub.rn.f32 %f172, %f171, %f3;
mul.rn.f32 %f173, %f172, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f174, %f173;
add.rn.f32 %f175, %f174, 0f00000000;
ex2.approx.f32 %f176, %f175;
fma.rn.f32 %f177, %f174, 0fBF317200, %f172;
fma.rn.f32 %f178, %f174, 0fB5BFBE8E, %f177;
mul.rn.f32 %f179, %f178, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f180, %f179;
mul.rn.f32 %f181, %f176, %f180;
setp.lt.f32 %p25, %f172, 0fC2D20000;
selp.f32 %f182, 0f00000000, %f181, %p25;
setp.gt.f32 %p26, %f172, 0f42D20000;
selp.f32 %f183, 0f7F800000, %f182, %p26;
add.rn.f32 %f184, %f170, %f183;
// ---- pair 6, second element (position tid*2 + 385)
ld.global.nc.u32 %r25, [%rd16+1540];
cvt.rn.f16.s32 %h69, %r25;
sub.rn.f16 %h70, %h4, %h69;
mul.rn.f16 %h71, %h70, %h6;
sub.rn.f16 %h72, %h64, %h71;
cvt.f32.f16 %f185, %h72;
sub.rn.f32 %f186, %f185, %f3;
mul.rn.f32 %f187, %f186, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f188, %f187;
add.rn.f32 %f189, %f188, 0f00000000;
ex2.approx.f32 %f190, %f189;
fma.rn.f32 %f191, %f188, 0fBF317200, %f186;
fma.rn.f32 %f192, %f188, 0fB5BFBE8E, %f191;
mul.rn.f32 %f193, %f192, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f194, %f193;
mul.rn.f32 %f195, %f190, %f194;
setp.lt.f32 %p27, %f186, 0fC2D20000;
selp.f32 %f196, 0f00000000, %f195, %p27;
setp.gt.f32 %p28, %f186, 0f42D20000;
selp.f32 %f197, 0f7F800000, %f196, %p28;
add.rn.f32 %f198, %f184, %f197;
// ---- pair 7, first element (position tid*2 + 448)
or.b32 %r26, %r3, 448;
ld.global.nc.b32 %hh8, [%rd14+896];
mov.b32 {%h73, %h74}, %hh8;
mul.wide.u32 %rd31, %r26, 4;
add.s64 %rd32, %rd6, %rd31;
ld.global.nc.u32 %r27, [%rd32];
cvt.rn.f16.s32 %h75, %r27;
sub.rn.f16 %h76, %h4, %h75;
mul.rn.f16 %h77, %h76, %h6;
sub.rn.f16 %h78, %h73, %h77;
cvt.f32.f16 %f199, %h78;
sub.rn.f32 %f200, %f199, %f3;
mul.rn.f32 %f201, %f200, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f202, %f201;
add.rn.f32 %f203, %f202, 0f00000000;
ex2.approx.f32 %f204, %f203;
fma.rn.f32 %f205, %f202, 0fBF317200, %f200;
fma.rn.f32 %f206, %f202, 0fB5BFBE8E, %f205;
mul.rn.f32 %f207, %f206, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f208, %f207;
mul.rn.f32 %f209, %f204, %f208;
setp.lt.f32 %p29, %f200, 0fC2D20000;
selp.f32 %f210, 0f00000000, %f209, %p29;
setp.gt.f32 %p30, %f200, 0f42D20000;
selp.f32 %f211, 0f7F800000, %f210, %p30;
add.rn.f32 %f212, %f198, %f211;
// ---- pair 7, second element (position tid*2 + 449)
ld.global.nc.u32 %r28, [%rd16+1796];
cvt.rn.f16.s32 %h79, %r28;
sub.rn.f16 %h80, %h4, %h79;
mul.rn.f16 %h81, %h80, %h6;
sub.rn.f16 %h82, %h74, %h81;
cvt.f32.f16 %f213, %h82;
sub.rn.f32 %f214, %f213, %f3;
mul.rn.f32 %f215, %f214, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f216, %f215;
add.rn.f32 %f217, %f216, 0f00000000;
ex2.approx.f32 %f218, %f217;
fma.rn.f32 %f219, %f216, 0fBF317200, %f214;
fma.rn.f32 %f220, %f216, 0fB5BFBE8E, %f219;
mul.rn.f32 %f221, %f220, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f222, %f221;
mul.rn.f32 %f223, %f218, %f222;
setp.lt.f32 %p31, %f214, 0fC2D20000;
selp.f32 %f224, 0f00000000, %f223, %p31;
setp.gt.f32 %p32, %f214, 0f42D20000;
selp.f32 %f225, 0f7F800000, %f224, %p32;
// %f226 = this thread's sum over its 16 elements.
add.rn.f32 %f226, %f212, %f225;
// First shuffle reduction across the warp: offsets 16, 8, 4, 2, 1 with
// member mask -1 (all lanes). The last addition is done only by thread 0.
shfl.sync.down.b32 %f227, %f226, 16, 31, -1;
add.rn.f32 %f228, %f227, %f226;
shfl.sync.down.b32 %f229, %f228, 8, 31, -1;
add.rn.f32 %f230, %f229, %f228;
shfl.sync.down.b32 %f231, %f230, 4, 31, -1;
add.rn.f32 %f232, %f231, %f230;
shfl.sync.down.b32 %f233, %f232, 2, 31, -1;
add.rn.f32 %f234, %f233, %f232;
shfl.sync.down.b32 %f235, %f234, 1, 31, -1;
setp.eq.s32 %p33, %r1, 0;
@%p33 bra LBB12_3;
bra.uni LBB12_1;
LBB12_3:
// Thread 0 publishes the warp total in shared_cache_03[0].
add.rn.f32 %f1, %f235, %f234;
st.shared.f32 [shared_cache_03], %f1;
LBB12_1:
bar.sync 0;
// Second reduction pass. Thread 0 reads shared_cache_03[tid.x]; every other
// thread reads its own local slot, which is set to 0 just below. With the
// 32-thread CTA required above, this pass therefore only adds zeros to the
// value thread 0 already holds.
mul.wide.u32 %rd36, %r1, 4;
mov.u64 %rd37, shared_cache_03;
add.s64 %rd3, %rd37, %rd36;
cvta.shared.u64 %rd38, %rd3;
mov.u32 %r31, 0;
st.local.u32 [%rd1], %r31;
// Generic pointer: shared slot for thread 0, local slot for the others.
selp.b64 %rd40, %rd38, %rd12, %p33;
ld.f32 %f236, [%rd40];
shfl.sync.down.b32 %f237, %f236, 16, 31, -1;
add.rn.f32 %f238, %f236, %f237;
shfl.sync.down.b32 %f239, %f238, 8, 31, -1;
add.rn.f32 %f240, %f238, %f239;
shfl.sync.down.b32 %f241, %f240, 4, 31, -1;
add.rn.f32 %f242, %f240, %f241;
shfl.sync.down.b32 %f243, %f242, 2, 31, -1;
add.rn.f32 %f244, %f242, %f243;
shfl.sync.down.b32 %f245, %f244, 1, 31, -1;
add.rn.f32 %f246, %f244, %f245;
st.f32 [%rd40], %f246;
@%p33 bra LBB12_4;
bra.uni LBB12_2;
LBB12_4:
// Thread 0 only: accumulate the CTA total into param_1. The byte offset
// (ctaid >> 9) * 2048 + (ctaid & 511) * 4 equals ctaid * 4.
ld.param.u64 %rd7, [fusion_2269_param_1];
shr.u32 %r30, %r2, 9;
cvta.to.global.u64 %rd10, %rd7;
and.b32 %r29, %r2, 511;
mul.wide.u32 %rd33, %r30, 2048;
add.s64 %rd34, %rd10, %rd33;
mul.wide.u32 %rd35, %r29, 4;
add.s64 %rd2, %rd34, %rd35;
ld.shared.f32 %f247, [%rd3];
// %f248 receives the previous accumulator value and is not used afterwards.
atom.global.add.f32 %f248, [%rd2], %f247;
LBB12_2:
ret;
}
// .globl fusion_2268
// fusion_2268: each of the 256 threads of a CTA produces 4 consecutive f32
// outputs
//   out[i] = exp(t_i - param_3[row]) / param_2[row]
// with i = ((ctaid.x << 10) | (tid.x << 2)) + j for j = 0..3, row = i >> 9 and
//   t_i = f16(x_i - (1.0 - f16(m[i & 511])) * 10000.0).
//   param_0: f16 input x (one v4.b16 load per thread)
//   param_1: f32 output (one v4.f32 store per thread)
//   param_2: f32 per-row divisor
//   param_3: f32 per-row value subtracted before exp
//   param_4: s32 values m indexed by i & 511
//   param_5: not referenced in this kernel
// exp() saturates to 0 below -105 and to +inf above 105.
// NOTE(review): this looks like the normalisation pass of a masked softmax
// (param_3 = row max, param_2 = row sum) -- confirm against the producing graph.
.visible .entry fusion_2268(
.param .u64 fusion_2268_param_0,
.param .u64 fusion_2268_param_1,
.param .u64 fusion_2268_param_2,
.param .u64 fusion_2268_param_3,
.param .u64 fusion_2268_param_4,
.param .u64 fusion_2268_param_5
)
.reqntid 256, 1, 1
{
.reg .pred %p<9>;
.reg .b16 %h<27>;
.reg .b32 %hh<3>;
.reg .f32 %f<59>;
.reg .b32 %r<18>;
.reg .b64 %rd<26>;
ld.param.u64 %rd1, [fusion_2268_param_0];
ld.param.u64 %rd2, [fusion_2268_param_4];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2268_param_1];
ld.param.u64 %rd5, [fusion_2268_param_3];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2268_param_2];
cvta.to.global.u64 %rd8, %rd7;
cvta.to.global.u64 %rd9, %rd4;
cvta.to.global.u64 %rd10, %rd1;
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
// %r5 = ctaid*1024 + tid*4: linear index of this thread's first element.
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
or.b32 %r8, %r4, 3;
// %r9 = row (512 elements per row); %r13, %r12, %r11, %r10 = positions of the
// four elements inside the row.
shr.u32 %r9, %r5, 9;
and.b32 %r10, %r8, 511;
and.b32 %r11, %r7, 510;
and.b32 %r12, %r6, 509;
and.b32 %r13, %r4, 508;
mul.wide.u32 %rd11, %r5, 2;
add.s64 %rd12, %rd10, %rd11;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12];
// Pack/unpack round trip: %h5..%h8 end up equal to %h1..%h4.
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// ---- element 0
mul.wide.u32 %rd13, %r13, 4;
add.s64 %rd14, %rd3, %rd13;
ld.global.nc.u32 %r14, [%rd14];
cvt.rn.f16.s32 %h9, %r14;
// %h10 = 0x3C00 = 1.0 (f16), %h12 = 0x70E2 = 10000.0 (f16); both reused below.
mov.b16 %h10, 0x3C00;
sub.rn.f16 %h11, %h10, %h9;
mov.b16 %h12, 0x70E2;
mul.rn.f16 %h13, %h11, %h12;
sub.rn.f16 %h14, %h5, %h13;
cvt.f32.f16 %f1, %h14;
// %f2 = param_3[row]
mul.wide.u32 %rd15, %r9, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.f32 %f2, [%rd16];
sub.rn.f32 %f3, %f1, %f2;
// exp(d) expansion, repeated for each element:
//   n = trunc(d * log2(e))              (0f3FB8AA3B = log2(e))
//   r = d - n*ln2, split in two fma's   (0fBF317200, 0fB5BFBE8E = -ln2 hi/lo)
//   exp(d) = 2^n * 2^(r * log2(e))
//   result = 0 if d < -105 (0fC2D20000), +inf if d > 105 (0f42D20000)
mul.rn.f32 %f4, %f3, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f5, %f4;
add.rn.f32 %f6, %f5, 0f00000000;
ex2.approx.f32 %f7, %f6;
fma.rn.f32 %f8, %f5, 0fBF317200, %f3;
fma.rn.f32 %f9, %f5, 0fB5BFBE8E, %f8;
mul.rn.f32 %f10, %f9, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f11, %f10;
mul.rn.f32 %f12, %f7, %f11;
setp.lt.f32 %p1, %f3, 0fC2D20000;
selp.f32 %f13, 0f00000000, %f12, %p1;
setp.gt.f32 %p2, %f3, 0f42D20000;
selp.f32 %f14, 0f7F800000, %f13, %p2;
// %f15 = param_2[row], the divisor shared by all four elements.
add.s64 %rd17, %rd8, %rd15;
ld.global.nc.f32 %f15, [%rd17];
div.full.f32 %f16, %f14, %f15;
// %rd19 = output address for the v4.f32 store at the end.
mul.wide.u32 %rd18, %r5, 4;
add.s64 %rd19, %rd9, %rd18;
// ---- element 1
mul.wide.u32 %rd20, %r12, 4;
add.s64 %rd21, %rd3, %rd20;
ld.global.nc.u32 %r15, [%rd21];
cvt.rn.f16.s32 %h15, %r15;
sub.rn.f16 %h16, %h10, %h15;
mul.rn.f16 %h17, %h16, %h12;
sub.rn.f16 %h18, %h6, %h17;
cvt.f32.f16 %f17, %h18;
sub.rn.f32 %f18, %f17, %f2;
mul.rn.f32 %f19, %f18, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f20, %f19;
add.rn.f32 %f21, %f20, 0f00000000;
ex2.approx.f32 %f22, %f21;
fma.rn.f32 %f23, %f20, 0fBF317200, %f18;
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23;
mul.rn.f32 %f25, %f24, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f26, %f25;
mul.rn.f32 %f27, %f22, %f26;
setp.lt.f32 %p3, %f18, 0fC2D20000;
selp.f32 %f28, 0f00000000, %f27, %p3;
setp.gt.f32 %p4, %f18, 0f42D20000;
selp.f32 %f29, 0f7F800000, %f28, %p4;
div.full.f32 %f30, %f29, %f15;
// ---- element 2
mul.wide.u32 %rd22, %r11, 4;
add.s64 %rd23, %rd3, %rd22;
ld.global.nc.u32 %r16, [%rd23];
cvt.rn.f16.s32 %h19, %r16;
sub.rn.f16 %h20, %h10, %h19;
mul.rn.f16 %h21, %h20, %h12;
sub.rn.f16 %h22, %h7, %h21;
cvt.f32.f16 %f31, %h22;
sub.rn.f32 %f32, %f31, %f2;
mul.rn.f32 %f33, %f32, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f34, %f33;
add.rn.f32 %f35, %f34, 0f00000000;
ex2.approx.f32 %f36, %f35;
fma.rn.f32 %f37, %f34, 0fBF317200, %f32;
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37;
mul.rn.f32 %f39, %f38, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f40, %f39;
mul.rn.f32 %f41, %f36, %f40;
setp.lt.f32 %p5, %f32, 0fC2D20000;
selp.f32 %f42, 0f00000000, %f41, %p5;
setp.gt.f32 %p6, %f32, 0f42D20000;
selp.f32 %f43, 0f7F800000, %f42, %p6;
div.full.f32 %f44, %f43, %f15;
// ---- element 3
mul.wide.u32 %rd24, %r10, 4;
add.s64 %rd25, %rd3, %rd24;
ld.global.nc.u32 %r17, [%rd25];
cvt.rn.f16.s32 %h23, %r17;
sub.rn.f16 %h24, %h10, %h23;
mul.rn.f16 %h25, %h24, %h12;
sub.rn.f16 %h26, %h8, %h25;
cvt.f32.f16 %f45, %h26;
sub.rn.f32 %f46, %f45, %f2;
mul.rn.f32 %f47, %f46, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f48, %f47;
add.rn.f32 %f49, %f48, 0f00000000;
ex2.approx.f32 %f50, %f49;
fma.rn.f32 %f51, %f48, 0fBF317200, %f46;
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51;
mul.rn.f32 %f53, %f52, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f54, %f53;
mul.rn.f32 %f55, %f50, %f54;
setp.lt.f32 %p7, %f46, 0fC2D20000;
selp.f32 %f56, 0f00000000, %f55, %p7;
setp.gt.f32 %p8, %f46, 0f42D20000;
selp.f32 %f57, 0f7F800000, %f56, %p8;
div.full.f32 %f58, %f57, %f15;
// Single vector store of the four results.
st.global.v4.f32 [%rd19], {%f16, %f30, %f44, %f58};
ret;
}
// .globl rng_get_and_update_state_3
// rng_get_and_update_state_3
// Writes the current 128-bit value of the global `rng_state` (low word at
// offset 0, high word at offset 8) to the buffer passed as param_0, then
// advances `rng_state` by 4194304 (2^22), carrying from the low 64-bit word
// into the high one. param_1 is not referenced.
// The update uses plain loads and stores (no atomics).
.visible .entry rng_get_and_update_state_3(
.param .u64 rng_get_and_update_state_3_param_0,
.param .u64 rng_get_and_update_state_3_param_1
)
{
    .reg .pred %p_wrapped, %p_below_step;
    .reg .b64 %rd_out_arg, %rd_out;
    .reg .b64 %rd_old_lo, %rd_old_hi;
    .reg .b64 %rd_new_lo, %rd_new_hi;
    .reg .b64 %rd_wrap_bit, %rd_carry;

    // Destination buffer for the pre-increment state.
    ld.param.u64 %rd_out_arg, [rng_get_and_update_state_3_param_0];
    cvta.to.global.u64 %rd_out, %rd_out_arg;

    // Snapshot of the current 128-bit state.
    ld.global.u64 %rd_old_lo, [rng_state];
    ld.global.u64 %rd_old_hi, [rng_state+8];

    // Low word += 2^22. Carry is 1 when the sum wrapped, detected by comparing
    // the sum against each addend.
    add.s64 %rd_new_lo, %rd_old_lo, 4194304;
    setp.lt.u64 %p_wrapped, %rd_new_lo, %rd_old_lo;
    setp.lt.u64 %p_below_step, %rd_new_lo, 4194304;
    selp.u64 %rd_wrap_bit, 1, 0, %p_wrapped;
    selp.b64 %rd_carry, 1, %rd_wrap_bit, %p_below_step;
    add.s64 %rd_new_hi, %rd_old_hi, %rd_carry;

    // Publish the advanced state, then hand the old state to the caller.
    st.global.u64 [rng_state], %rd_new_lo;
    st.global.u64 [rng_state+8], %rd_new_hi;
    st.global.u64 [%rd_out+8], %rd_old_hi;
    st.global.u64 [%rd_out], %rd_old_lo;
    ret;
}
// .globl fusion_2267
// fusion_2267: each of the 256 threads of a CTA converts 4 consecutive f32
// inputs to f16, multiplies each by 0x3C72 (~1.111 in f16) and replaces it by
// 0 when a per-element pseudo-random value in [0, 1) is below 0x2E66 (~0.1 in
// f16).
//   param_0: f16 output (one v4.b16 store at element (ctaid.x << 10) | (tid.x << 2))
//   param_1: f32 input (one v4.f32 load at the same element index)
//   param_2: two u64 words, read with one v2.u64 load, used as the base counter
//   param_3: not referenced in this kernel
// NOTE(review): the constants are consistent with dropout at rate 0.1 with a
// 1/0.9 rescale, and the multipliers 0xD2511F53 (3528531795) / 0xCD9E8D57
// (3449720151) match the Philox-4x32 generator -- confirm against the
// producing graph before relying on either statement.
.visible .entry fusion_2267(
.param .u64 fusion_2267_param_0,
.param .u64 fusion_2267_param_1,
.param .u64 fusion_2267_param_2,
.param .u64 fusion_2267_param_3
)
.reqntid 256, 1, 1
{
.reg .pred %p<6>;
.reg .b16 %h<19>;
.reg .f32 %f<13>;
.reg .b32 %r<29>;
.reg .b64 %rd<119>;
ld.param.u64 %rd1, [fusion_2267_param_0];
ld.param.u64 %rd2, [fusion_2267_param_2];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2267_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd1;
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
// %r5 = ctaid*1024 + tid*4: linear index of this thread's first element.
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
// 128-bit counter = {%rd7, %rd8} + (%r5 >> 2), i.e. one counter value per
// group of four outputs; the carry out of the low word goes into the high word.
ld.global.nc.v2.u64 {%rd7, %rd8}, [%rd3];
shr.u32 %r6, %r5, 2;
cvt.u64.u32 %rd9, %r6;
add.s64 %rd10, %rd7, %rd9;
setp.lt.u64 %p1, %rd10, %rd7;
and.b64 %rd11, %rd10, 4294967295;
mul.lo.s64 %rd12, %rd11, 3528531795;
selp.u64 %rd13, 1, 0, %p1;
add.s64 %rd14, %rd8, %rd13;
// Mixing rounds: 32x32 -> 64-bit products by the two multipliers, split into
// high (shr 32) and low (and 0xFFFFFFFF) halves and XORed with per-round
// constants. Intermediate products are reused by later rounds and by the
// output extraction below, so statement order matters here.
xor.b64 %rd15, %rd14, %rd12;
shr.u64 %rd16, %rd15, 32;
mul.lo.s64 %rd17, %rd16, 3449720151;
shr.u64 %rd18, %rd17, 32;
and.b64 %rd19, %rd14, 4294967295;
mul.lo.s64 %rd20, %rd19, 3449720151;
and.b64 %rd21, %rd20, 4294967295;
xor.b64 %rd22, %rd21, %rd18;
xor.b64 %rd23, %rd22, 2654435769;
mul.lo.s64 %rd24, %rd23, 3528531795;
shr.u64 %rd25, %rd24, 32;
xor.b64 %rd26, %rd20, %rd10;
shr.u64 %rd27, %rd26, 32;
mul.lo.s64 %rd28, %rd27, 3528531795;
and.b64 %rd29, %rd28, 4294967295;
xor.b64 %rd30, %rd29, %rd25;
xor.b64 %rd31, %rd30, 1993301258;
mul.lo.s64 %rd32, %rd31, 3449720151;
shr.u64 %rd33, %rd32, 32;
shr.u64 %rd34, %rd28, 32;
and.b64 %rd35, %rd12, 4294967295;
xor.b64 %rd36, %rd35, %rd34;
xor.b64 %rd37, %rd36, 3144134277;
mul.lo.s64 %rd38, %rd37, 3449720151;
and.b64 %rd39, %rd38, 4294967295;
xor.b64 %rd40, %rd39, %rd33;
xor.b64 %rd41, %rd40, 3668340011;
mul.lo.s64 %rd42, %rd41, 3528531795;
shr.u64 %rd43, %rd42, 32;
shr.u64 %rd44, %rd38, 32;
and.b64 %rd45, %rd17, 4294967295;
xor.b64 %rd46, %rd45, %rd44;
xor.b64 %rd47, %rd46, 1013904242;
mul.lo.s64 %rd48, %rd47, 3528531795;
and.b64 %rd49, %rd48, 4294967295;
xor.b64 %rd50, %rd49, %rd43;
xor.b64 %rd51, %rd50, 3986602516;
mul.lo.s64 %rd52, %rd51, 3449720151;
shr.u64 %rd53, %rd52, 32;
shr.u64 %rd54, %rd48, 32;
and.b64 %rd55, %rd24, 4294967295;
xor.b64 %rd56, %rd55, %rd54;
xor.b64 %rd57, %rd56, 842468239;
mul.lo.s64 %rd58, %rd57, 3449720151;
and.b64 %rd59, %rd58, 4294967295;
xor.b64 %rd60, %rd59, %rd53;
xor.b64 %rd61, %rd60, 387276957;
mul.lo.s64 %rd62, %rd61, 3528531795;
shr.u64 %rd63, %rd62, 32;
shr.u64 %rd64, %rd58, 32;
and.b64 %rd65, %rd32, 4294967295;
xor.b64 %rd66, %rd65, %rd64;
xor.b64 %rd67, %rd66, 2027808484;
mul.lo.s64 %rd68, %rd67, 3528531795;
and.b64 %rd69, %rd68, 4294967295;
xor.b64 %rd70, %rd69, %rd63;
xor.b64 %rd71, %rd70, 1684936478;
mul.lo.s64 %rd72, %rd71, 3449720151;
shr.u64 %rd73, %rd72, 32;
shr.u64 %rd74, %rd68, 32;
and.b64 %rd75, %rd42, 4294967295;
xor.b64 %rd76, %rd75, %rd74;
xor.b64 %rd77, %rd76, 2835769497;
mul.lo.s64 %rd78, %rd77, 3449720151;
and.b64 %rd79, %rd78, 4294967295;
xor.b64 %rd80, %rd79, %rd73;
xor.b64 %rd81, %rd80, 1401181199;
mul.lo.s64 %rd82, %rd81, 3528531795;
shr.u64 %rd83, %rd82, 32;
shr.u64 %rd84, %rd78, 32;
and.b64 %rd85, %rd52, 4294967295;
xor.b64 %rd86, %rd85, %rd84;
xor.b64 %rd87, %rd86, 3041712726;
mul.lo.s64 %rd88, %rd87, 3528531795;
and.b64 %rd89, %rd88, 4294967295;
xor.b64 %rd90, %rd89, %rd83;
xor.b64 %rd91, %rd90, 3678237736;
mul.lo.s64 %rd92, %rd91, 3449720151;
shr.u64 %rd93, %rd92, 32;
// ---- output 0: 32-bit random word -> top 23 bits -> f32 in [0, 1) -> f16.
// As 32-bit values, -845247145 is 0xCD9E8D57 and (below) -766435501 is
// 0xD2511F53, i.e. the same two multipliers used in the rounds above.
cvt.u32.u64 %r7, %rd93;
shr.u64 %rd94, %rd88, 32;
xor.b64 %rd95, %rd94, %rd62;
cvt.u32.u64 %r8, %rd95;
xor.b32 %r9, %r8, 534103459;
mul.lo.s32 %r10, %r9, -845247145;
xor.b32 %r11, %r10, %r7;
shr.u32 %r12, %r11, 9;
xor.b32 %r13, %r12, 4716963;
cvt.rn.f32.u32 %f1, %r13;
// 0f34000000 = 2^-23: scales the 23-bit integer into [0, 1).
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h1, %f2;
// %h2 = 0x2E66 (~0.1): keep threshold shared by all four outputs.
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p2, %h1, %h2;
mul.wide.u32 %rd96, %r5, 4;
add.s64 %rd97, %rd5, %rd96;
ld.global.nc.v4.f32 {%f3, %f4, %f5, %f6}, [%rd97];
cvt.rn.f16.f32 %h3, %f3;
// %h4 = 0x3C72 (~1.111): scale applied to every kept element.
mov.b16 %h4, 0x3C72;
mul.rn.f16 %h5, %h3, %h4;
// Keep the scaled value when the random draw is >= the threshold, else 0.
selp.b16 %h6, %h5, 0x0000, %p2;
// %rd99 = output address for the v4.b16 store at the end.
mul.wide.u32 %rd98, %r5, 2;
add.s64 %rd99, %rd6, %rd98;
// ---- output 1
xor.b64 %rd100, %rd84, %rd52;
xor.b64 %rd101, %rd100, 3041712726;
mul.lo.s64 %rd102, %rd101, 3528531795;
xor.b64 %rd103, %rd83, %rd102;
cvt.u32.u64 %r14, %rd103;
xor.b32 %r15, %r14, -616729560;
mul.lo.s32 %r16, %r15, -845247145;
shr.u32 %r17, %r16, 9;
cvt.rn.f32.u32 %f7, %r17;
mul.rn.f32 %f8, %f7, 0f34000000;
cvt.rn.f16.f32 %h7, %f8;
setp.ge.f16 %p3, %h7, %h2;
cvt.rn.f16.f32 %h8, %f4;
mul.rn.f16 %h9, %h8, %h4;
selp.b16 %h10, %h9, 0x0000, %p3;
// ---- output 2
and.b64 %rd104, %rd62, 4294967295;
xor.b64 %rd105, %rd104, %rd94;
xor.b64 %rd106, %rd105, 534103459;
mul.lo.s64 %rd107, %rd106, 3449720151;
shr.u64 %rd108, %rd107, 32;
and.b64 %rd109, %rd72, 4294967295;
xor.b64 %rd110, %rd109, %rd108;
xor.b64 %rd111, %rd110, 4055616968;
mul.lo.s64 %rd112, %rd111, 3528531795;
shr.u64 %rd113, %rd112, 32;
cvt.u32.u64 %r18, %rd113;
xor.b64 %rd114, %rd73, %rd78;
cvt.u32.u64 %r19, %rd114;
xor.b32 %r20, %r19, 1401181199;
mul.lo.s32 %r21, %r20, -766435501;
xor.b32 %r22, %r21, %r18;
shr.u32 %r23, %r22, 9;
xor.b32 %r24, %r23, 4936337;
cvt.rn.f32.u32 %f9, %r24;
mul.rn.f32 %f10, %f9, 0f34000000;
cvt.rn.f16.f32 %h11, %f10;
setp.ge.f16 %p4, %h11, %h2;
cvt.rn.f16.f32 %h12, %f5;
mul.rn.f16 %h13, %h12, %h4;
selp.b16 %h14, %h13, 0x0000, %p4;
// ---- output 3
xor.b64 %rd115, %rd63, %rd68;
xor.b64 %rd116, %rd115, 1684936478;
mul.lo.s64 %rd117, %rd116, 3449720151;
xor.b64 %rd118, %rd108, %rd117;
cvt.u32.u64 %r25, %rd118;
xor.b32 %r26, %r25, -239350328;
mul.lo.s32 %r27, %r26, -766435501;
shr.u32 %r28, %r27, 9;
cvt.rn.f32.u32 %f11, %r28;
mul.rn.f32 %f12, %f11, 0f34000000;
cvt.rn.f16.f32 %h15, %f12;
setp.ge.f16 %p5, %h15, %h2;
cvt.rn.f16.f32 %h16, %f6;
mul.rn.f16 %h17, %h16, %h4;
selp.b16 %h18, %h17, 0x0000, %p5;
// Single vector store of the four f16 results.
st.global.v4.b16 [%rd99], {%h6, %h10, %h14, %h18};
ret;
}
// .globl fusion_2709
// fusion_2709: element-wise f32 -> f16 conversion (round to nearest even).
// Each of the 256 threads of a CTA handles the 4 consecutive elements starting
// at index (ctaid.x << 10) | (tid.x << 2).
//   param_0: f32 source (one v4.f32 load per thread)
//   param_1: f16 destination (one v4.b16 store per thread)
//   param_2: not referenced in this kernel
.visible .entry fusion_2709(
.param .u64 fusion_2709_param_0,
.param .u64 fusion_2709_param_1,
.param .u64 fusion_2709_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %h_out<4>;
    .reg .f32 %f_in<4>;
    .reg .b32 %r_cta, %r_thread, %r_cta_base, %r_quad, %r_elem;
    .reg .b64 %rd_src_arg, %rd_dst_arg, %rd_src, %rd_dst;
    .reg .b64 %rd_src_off, %rd_dst_off, %rd_src_ptr, %rd_dst_ptr;

    // Kernel arguments as global-space pointers.
    ld.param.u64 %rd_src_arg, [fusion_2709_param_0];
    ld.param.u64 %rd_dst_arg, [fusion_2709_param_1];
    cvta.to.global.u64 %rd_src, %rd_src_arg;
    cvta.to.global.u64 %rd_dst, %rd_dst_arg;

    // Linear index of this thread's first element: ctaid*1024 + tid*4.
    mov.u32 %r_thread, %tid.x;
    mov.u32 %r_cta, %ctaid.x;
    shl.b32 %r_quad, %r_thread, 2;
    shl.b32 %r_cta_base, %r_cta, 10;
    or.b32 %r_elem, %r_quad, %r_cta_base;

    // 4 bytes per source element, 2 bytes per destination element.
    mul.wide.u32 %rd_src_off, %r_elem, 4;
    add.s64 %rd_src_ptr, %rd_src, %rd_src_off;
    mul.wide.u32 %rd_dst_off, %r_elem, 2;
    add.s64 %rd_dst_ptr, %rd_dst, %rd_dst_off;

    ld.global.nc.v4.f32 {%f_in0, %f_in1, %f_in2, %f_in3}, [%rd_src_ptr];
    cvt.rn.f16.f32 %h_out0, %f_in0;
    cvt.rn.f16.f32 %h_out1, %f_in1;
    cvt.rn.f16.f32 %h_out2, %f_in2;
    cvt.rn.f16.f32 %h_out3, %f_in3;
    st.global.v4.b16 [%rd_dst_ptr], {%h_out0, %h_out1, %h_out2, %h_out3};
    ret;
}
// .globl fusion_2265
// fusion_2265: gathers f16 values, adds an f32 term converted to f16, and
// stores 4 consecutive f16 results per thread.
// With i = (ctaid.x << 10) | (tid.x << 2), group = ctaid.x >> 5,
// row = (i >> 6) & 511 and col_j = (tid.x * 4 + j) & 63 for j = 0..3:
//   out[i + j] = lhs[row * 1024 + group * 64 + col_j]
//              + f16(rhs[group * 64 + col_j])
//   param_0: f16 lhs (rows are 2048 bytes apart)
//   param_1: f16 out (one v4.b16 store per thread)
//   param_2: f32 rhs (groups are 256 bytes apart)
//   param_3: not referenced in this kernel
.visible .entry fusion_2265(
.param .u64 fusion_2265_param_0,
.param .u64 fusion_2265_param_1,
.param .u64 fusion_2265_param_2,
.param .u64 fusion_2265_param_3
)
.reqntid 256, 1, 1
{
    .reg .b16 %h_lhs<4>;
    .reg .b16 %h_rhs<4>;
    .reg .b16 %h_sum<4>;
    .reg .f32 %f_rhs<4>;
    .reg .b32 %r_cta, %r_thread, %r_cta_base, %r_quad, %r_elem;
    .reg .b32 %r_group, %r_group_base, %r_row;
    .reg .b32 %r_lane1, %r_lane2, %r_lane3;
    .reg .b32 %r_col<4>;
    .reg .b32 %r_idx<4>;
    .reg .b64 %rd_lhs_arg, %rd_out_arg, %rd_rhs_arg;
    .reg .b64 %rd_lhs, %rd_out, %rd_rhs;
    .reg .b64 %rd_row_off, %rd_row_ptr;
    .reg .b64 %rd_group_off, %rd_group_ptr;
    .reg .b64 %rd_out_off, %rd_out_ptr;
    .reg .b64 %rd_lhs_off<4>;
    .reg .b64 %rd_lhs_ptr<4>;
    .reg .b64 %rd_rhs_off<4>;
    .reg .b64 %rd_rhs_ptr<4>;

    // Kernel arguments as global-space pointers.
    ld.param.u64 %rd_lhs_arg, [fusion_2265_param_0];
    ld.param.u64 %rd_out_arg, [fusion_2265_param_1];
    ld.param.u64 %rd_rhs_arg, [fusion_2265_param_2];
    cvta.to.global.u64 %rd_lhs, %rd_lhs_arg;
    cvta.to.global.u64 %rd_out, %rd_out_arg;
    cvta.to.global.u64 %rd_rhs, %rd_rhs_arg;

    // Linear output index of this thread's first element: ctaid*1024 + tid*4.
    mov.u32 %r_cta, %ctaid.x;
    mov.u32 %r_thread, %tid.x;
    shl.b32 %r_cta_base, %r_cta, 10;
    shl.b32 %r_quad, %r_thread, 2;
    or.b32 %r_elem, %r_quad, %r_cta_base;

    // group = ctaid / 32, row = bits [6, 15) of the linear index.
    shr.u32 %r_group, %r_cta, 5;
    shl.b32 %r_group_base, %r_group, 6;
    bfe.u32 %r_row, %r_elem, 6, 9;

    // Base pointers shared by the four elements.
    mul.wide.u32 %rd_row_off, %r_row, 2048;
    add.s64 %rd_row_ptr, %rd_lhs, %rd_row_off;
    mul.wide.u32 %rd_group_off, %r_group, 256;
    add.s64 %rd_group_ptr, %rd_rhs, %rd_group_off;
    mul.wide.u32 %rd_out_off, %r_elem, 2;
    add.s64 %rd_out_ptr, %rd_out, %rd_out_off;

    // ---- element 0
    and.b32 %r_col0, %r_quad, 60;
    or.b32 %r_idx0, %r_col0, %r_group_base;
    mul.wide.u32 %rd_lhs_off0, %r_idx0, 2;
    add.s64 %rd_lhs_ptr0, %rd_row_ptr, %rd_lhs_off0;
    ld.global.nc.b16 %h_lhs0, [%rd_lhs_ptr0];
    mul.wide.u32 %rd_rhs_off0, %r_col0, 4;
    add.s64 %rd_rhs_ptr0, %rd_group_ptr, %rd_rhs_off0;
    ld.global.nc.f32 %f_rhs0, [%rd_rhs_ptr0];
    cvt.rn.f16.f32 %h_rhs0, %f_rhs0;
    add.rn.f16 %h_sum0, %h_lhs0, %h_rhs0;

    // ---- element 1
    or.b32 %r_lane1, %r_quad, 1;
    and.b32 %r_col1, %r_lane1, 61;
    or.b32 %r_idx1, %r_col1, %r_group_base;
    mul.wide.u32 %rd_lhs_off1, %r_idx1, 2;
    add.s64 %rd_lhs_ptr1, %rd_row_ptr, %rd_lhs_off1;
    ld.global.nc.b16 %h_lhs1, [%rd_lhs_ptr1];
    mul.wide.u32 %rd_rhs_off1, %r_col1, 4;
    add.s64 %rd_rhs_ptr1, %rd_group_ptr, %rd_rhs_off1;
    ld.global.nc.f32 %f_rhs1, [%rd_rhs_ptr1];
    cvt.rn.f16.f32 %h_rhs1, %f_rhs1;
    add.rn.f16 %h_sum1, %h_lhs1, %h_rhs1;

    // ---- element 2
    or.b32 %r_lane2, %r_quad, 2;
    and.b32 %r_col2, %r_lane2, 62;
    or.b32 %r_idx2, %r_col2, %r_group_base;
    mul.wide.u32 %rd_lhs_off2, %r_idx2, 2;
    add.s64 %rd_lhs_ptr2, %rd_row_ptr, %rd_lhs_off2;
    ld.global.nc.b16 %h_lhs2, [%rd_lhs_ptr2];
    mul.wide.u32 %rd_rhs_off2, %r_col2, 4;
    add.s64 %rd_rhs_ptr2, %rd_group_ptr, %rd_rhs_off2;
    ld.global.nc.f32 %f_rhs2, [%rd_rhs_ptr2];
    cvt.rn.f16.f32 %h_rhs2, %f_rhs2;
    add.rn.f16 %h_sum2, %h_lhs2, %h_rhs2;

    // ---- element 3
    or.b32 %r_lane3, %r_quad, 3;
    and.b32 %r_col3, %r_lane3, 63;
    or.b32 %r_idx3, %r_col3, %r_group_base;
    mul.wide.u32 %rd_lhs_off3, %r_idx3, 2;
    add.s64 %rd_lhs_ptr3, %rd_row_ptr, %rd_lhs_off3;
    ld.global.nc.b16 %h_lhs3, [%rd_lhs_ptr3];
    mul.wide.u32 %rd_rhs_off3, %r_col3, 4;
    add.s64 %rd_rhs_ptr3, %rd_group_ptr, %rd_rhs_off3;
    ld.global.nc.f32 %f_rhs3, [%rd_rhs_ptr3];
    cvt.rn.f16.f32 %h_rhs3, %f_rhs3;
    add.rn.f16 %h_sum3, %h_lhs3, %h_rhs3;

    // Single vector store of the four sums.
    st.global.v4.b16 [%rd_out_ptr], {%h_sum0, %h_sum1, %h_sum2, %h_sum3};
    ret;
}
// .globl fusion_2264
// fusion_2264: f16 gather. Each of the 256 threads of a CTA copies four f16
// values into 4 consecutive output elements.
// With i = (ctaid.x << 10) | (tid.x << 2) and col_j = (tid.x * 4 + j) & 63:
//   out[i + j] = src[(tid.x >> 4) * 32768 + ctaid.x * 64 + col_j]
// (source byte offsets: (tid.x >> 4) * 65536 + ctaid.x * 128 + col_j * 2).
//   param_0: f16 destination (one v4.b16 store per thread)
//   param_1: f16 source
//   param_2: not referenced in this kernel
.visible .entry fusion_2264(
.param .u64 fusion_2264_param_0,
.param .u64 fusion_2264_param_1,
.param .u64 fusion_2264_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %h_val<4>;
    .reg .b32 %r_cta, %r_thread, %r_cta_base, %r_quad, %r_elem;
    .reg .b32 %r_slab, %r_lane1, %r_lane2;
    .reg .b32 %r_col0, %r_col1, %r_col2;
    .reg .b64 %rd_dst_arg, %rd_src_arg, %rd_dst, %rd_src;
    .reg .b64 %rd_slab_off, %rd_slab_ptr;
    .reg .b64 %rd_cta_off, %rd_base_ptr;
    .reg .b64 %rd_dst_off, %rd_dst_ptr;
    .reg .b64 %rd_off0, %rd_off1, %rd_off2;
    .reg .b64 %rd_ptr0, %rd_ptr1, %rd_ptr2;

    // Kernel arguments as global-space pointers.
    ld.param.u64 %rd_dst_arg, [fusion_2264_param_0];
    ld.param.u64 %rd_src_arg, [fusion_2264_param_1];
    cvta.to.global.u64 %rd_dst, %rd_dst_arg;
    cvta.to.global.u64 %rd_src, %rd_src_arg;

    // Linear output index of this thread's first element: ctaid*1024 + tid*4.
    mov.u32 %r_cta, %ctaid.x;
    mov.u32 %r_thread, %tid.x;
    shl.b32 %r_cta_base, %r_cta, 10;
    shl.b32 %r_quad, %r_thread, 2;
    or.b32 %r_elem, %r_quad, %r_cta_base;
    mul.wide.u32 %rd_dst_off, %r_elem, 2;
    add.s64 %rd_dst_ptr, %rd_dst, %rd_dst_off;

    // Source base for this thread: 65536-byte slab chosen by tid / 16, then a
    // 128-byte segment chosen by ctaid.
    shr.u32 %r_slab, %r_thread, 4;
    mul.wide.u32 %rd_slab_off, %r_slab, 65536;
    add.s64 %rd_slab_ptr, %rd_src, %rd_slab_off;
    mul.wide.u32 %rd_cta_off, %r_cta, 128;
    add.s64 %rd_base_ptr, %rd_slab_ptr, %rd_cta_off;

    // ---- element 0
    and.b32 %r_col0, %r_quad, 60;
    mul.wide.u32 %rd_off0, %r_col0, 2;
    add.s64 %rd_ptr0, %rd_base_ptr, %rd_off0;
    ld.global.nc.b16 %h_val0, [%rd_ptr0];

    // ---- element 1
    or.b32 %r_lane1, %r_quad, 1;
    and.b32 %r_col1, %r_lane1, 61;
    mul.wide.u32 %rd_off1, %r_col1, 2;
    add.s64 %rd_ptr1, %rd_base_ptr, %rd_off1;
    ld.global.nc.b16 %h_val1, [%rd_ptr1];

    // ---- element 2
    or.b32 %r_lane2, %r_quad, 2;
    and.b32 %r_col2, %r_lane2, 62;
    mul.wide.u32 %rd_off2, %r_col2, 2;
    add.s64 %rd_ptr2, %rd_base_ptr, %rd_off2;
    ld.global.nc.b16 %h_val2, [%rd_ptr2];

    // ---- element 3: three f16 slots (6 bytes) past element 0.
    ld.global.nc.b16 %h_val3, [%rd_ptr0+6];

    st.global.v4.b16 [%rd_dst_ptr], {%h_val0, %h_val1, %h_val2, %h_val3};
    ret;
}
// .globl fusion_2708
// fusion_2708: element-wise f32 -> f16 conversion (round to nearest even).
// Each of the 256 threads of a CTA handles the 4 consecutive elements starting
// at index (ctaid.x << 10) | (tid.x << 2).
//   param_0: f32 source (one v4.f32 load per thread)
//   param_1: f16 destination (one v4.b16 store per thread)
//   param_2: not referenced in this kernel
.visible .entry fusion_2708(
.param .u64 fusion_2708_param_0,
.param .u64 fusion_2708_param_1,
.param .u64 fusion_2708_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %h_out<4>;
    .reg .f32 %f_in<4>;
    .reg .b32 %r_cta, %r_thread, %r_cta_base, %r_quad, %r_elem;
    .reg .b64 %rd_src_arg, %rd_dst_arg, %rd_src, %rd_dst;
    .reg .b64 %rd_src_off, %rd_dst_off, %rd_src_ptr, %rd_dst_ptr;

    // Kernel arguments as global-space pointers.
    ld.param.u64 %rd_src_arg, [fusion_2708_param_0];
    ld.param.u64 %rd_dst_arg, [fusion_2708_param_1];
    cvta.to.global.u64 %rd_src, %rd_src_arg;
    cvta.to.global.u64 %rd_dst, %rd_dst_arg;

    // Linear index of this thread's first element: ctaid*1024 + tid*4.
    mov.u32 %r_thread, %tid.x;
    mov.u32 %r_cta, %ctaid.x;
    shl.b32 %r_quad, %r_thread, 2;
    shl.b32 %r_cta_base, %r_cta, 10;
    or.b32 %r_elem, %r_quad, %r_cta_base;

    // 4 bytes per source element, 2 bytes per destination element.
    mul.wide.u32 %rd_src_off, %r_elem, 4;
    add.s64 %rd_src_ptr, %rd_src, %rd_src_off;
    mul.wide.u32 %rd_dst_off, %r_elem, 2;
    add.s64 %rd_dst_ptr, %rd_dst, %rd_dst_off;

    ld.global.nc.v4.f32 {%f_in0, %f_in1, %f_in2, %f_in3}, [%rd_src_ptr];
    cvt.rn.f16.f32 %h_out0, %f_in0;
    cvt.rn.f16.f32 %h_out1, %f_in1;
    cvt.rn.f16.f32 %h_out2, %f_in2;
    cvt.rn.f16.f32 %h_out3, %f_in3;
    st.global.v4.b16 [%rd_dst_ptr], {%h_out0, %h_out1, %h_out2, %h_out3};
    ret;
}
// .globl rng_get_and_update_state_1
// rng_get_and_update_state_1
// Writes the current 128-bit value of the global `rng_state` (low word at
// offset 0, high word at offset 8) to the buffer passed as param_0, then
// advances `rng_state` by 524288 (2^19), carrying from the low 64-bit word
// into the high one. param_1 is not referenced.
// The update uses plain loads and stores (no atomics).
.visible .entry rng_get_and_update_state_1(
.param .u64 rng_get_and_update_state_1_param_0,
.param .u64 rng_get_and_update_state_1_param_1
)
{
    .reg .pred %p_wrapped, %p_below_step;
    .reg .b64 %rd_out_arg, %rd_out;
    .reg .b64 %rd_old_lo, %rd_old_hi;
    .reg .b64 %rd_new_lo, %rd_new_hi;
    .reg .b64 %rd_wrap_bit, %rd_carry;

    // Destination buffer for the pre-increment state.
    ld.param.u64 %rd_out_arg, [rng_get_and_update_state_1_param_0];
    cvta.to.global.u64 %rd_out, %rd_out_arg;

    // Snapshot of the current 128-bit state.
    ld.global.u64 %rd_old_lo, [rng_state];
    ld.global.u64 %rd_old_hi, [rng_state+8];

    // Low word += 2^19. Carry is 1 when the sum wrapped, detected by comparing
    // the sum against each addend.
    add.s64 %rd_new_lo, %rd_old_lo, 524288;
    setp.lt.u64 %p_wrapped, %rd_new_lo, %rd_old_lo;
    setp.lt.u64 %p_below_step, %rd_new_lo, 524288;
    selp.u64 %rd_wrap_bit, 1, 0, %p_wrapped;
    selp.b64 %rd_carry, 1, %rd_wrap_bit, %p_below_step;
    add.s64 %rd_new_hi, %rd_old_hi, %rd_carry;

    // Publish the advanced state, then hand the old state to the caller.
    st.global.u64 [rng_state], %rd_new_lo;
    st.global.u64 [rng_state+8], %rd_new_hi;
    st.global.u64 [%rd_out+8], %rd_old_hi;
    st.global.u64 [%rd_out], %rd_old_lo;
    ret;
}
// .globl fusion_2263
// fusion_2263: element-wise kernel, 256 threads per block, 4 consecutive f16
// elements per thread (1024 elements per block).
//   param_0 : f16 output
//   param_1 : f16 input "a" (added last)
//   param_2 : f16 input "b"
//   param_3 : two u64 words used as the random-number counter seed
//   param_4 : f32 vector "c", indexed by the element's position inside the block (0..1023)
//   param_5 : not read
// Per element, in f16 arithmetic:
//   out = a + (u >= 0x2E66 ? (b + f16(c[col])) * 0x3C72 : 0)
// where u is a per-element pseudo-random value in [0,1). 0x2E66 and 0x3C72 are f16 bit
// patterns for roughly 0.1 and roughly 1.111.
// NOTE(review): this looks like bias-add + dropout(p = 0.1) + residual add -- confirm
// against the HLO that produced this kernel.
.visible .entry fusion_2263(
.param .u64 fusion_2263_param_0,
.param .u64 fusion_2263_param_1,
.param .u64 fusion_2263_param_2,
.param .u64 fusion_2263_param_3,
.param .u64 fusion_2263_param_4,
.param .u64 fusion_2263_param_5
)
.reqntid 256, 1, 1
{
.reg .pred %p<6>;
.reg .b16 %h<43>;
.reg .b32 %hh<5>;
.reg .f32 %f<13>;
.reg .b32 %r<31>;
.reg .b64 %rd<129>;
// Buffer bases: %rd10 = param_0, %rd9 = param_1, %rd8 = param_2, %rd6 = param_3, %rd3 = param_4.
ld.param.u64 %rd1, [fusion_2263_param_0];
ld.param.u64 %rd2, [fusion_2263_param_4];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2263_param_1];
ld.param.u64 %rd5, [fusion_2263_param_3];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2263_param_2];
cvta.to.global.u64 %rd8, %rd7;
cvta.to.global.u64 %rd9, %rd4;
cvta.to.global.u64 %rd10, %rd1;
// %r5 = ctaid.x * 1024 + tid.x * 4 is the first element handled by this thread
// (OR behaves as ADD because tid.x * 4 < 1024). %r4, %r6, %r7 are the in-block
// positions of elements 0, 1 and 2 and index param_4.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
// %rd11 = byte offset of the thread's four f16 elements, shared by param_0/1/2.
mul.wide.u32 %rd11, %r5, 2;
add.s64 %rd12, %rd9, %rd11;
// Load "a". The pack/unpack round trip leaves %h5..%h8 equal to %h1..%h4.
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12];
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// 128-bit counter for this thread: low word = seed[0] + (element index / 4),
// with the carry out of that add propagated into seed[1] (%rd20).
shr.u32 %r8, %r5, 2;
ld.global.nc.v2.u64 {%rd13, %rd14}, [%rd6];
cvt.u64.u32 %rd15, %r8;
add.s64 %rd16, %rd13, %rd15;
setp.lt.u64 %p1, %rd16, %rd13;
// Unrolled mixing rounds. Every step multiplies a 32-bit lane by 3528531795 (0xD2511F53)
// or 3449720151 (0xCD9E8D57), splits the 64-bit product into high and low halves, and
// xors in a round constant. The multipliers and the first round constants
// (0x9E3779B9, 0xBB67AE85) match the Philox-4x32 generator.
// NOTE(review): presumably Philox4x32-10 -- confirm against the RNG lowering.
and.b64 %rd17, %rd16, 4294967295;
mul.lo.s64 %rd18, %rd17, 3528531795;
selp.u64 %rd19, 1, 0, %p1;
add.s64 %rd20, %rd14, %rd19;
xor.b64 %rd21, %rd20, %rd18;
shr.u64 %rd22, %rd21, 32;
mul.lo.s64 %rd23, %rd22, 3449720151;
shr.u64 %rd24, %rd23, 32;
and.b64 %rd25, %rd20, 4294967295;
mul.lo.s64 %rd26, %rd25, 3449720151;
and.b64 %rd27, %rd26, 4294967295;
xor.b64 %rd28, %rd27, %rd24;
xor.b64 %rd29, %rd28, 2654435769;
mul.lo.s64 %rd30, %rd29, 3528531795;
shr.u64 %rd31, %rd30, 32;
xor.b64 %rd32, %rd26, %rd16;
shr.u64 %rd33, %rd32, 32;
mul.lo.s64 %rd34, %rd33, 3528531795;
and.b64 %rd35, %rd34, 4294967295;
xor.b64 %rd36, %rd35, %rd31;
xor.b64 %rd37, %rd36, 1993301258;
mul.lo.s64 %rd38, %rd37, 3449720151;
shr.u64 %rd39, %rd38, 32;
shr.u64 %rd40, %rd34, 32;
and.b64 %rd41, %rd18, 4294967295;
xor.b64 %rd42, %rd41, %rd40;
xor.b64 %rd43, %rd42, 3144134277;
mul.lo.s64 %rd44, %rd43, 3449720151;
and.b64 %rd45, %rd44, 4294967295;
xor.b64 %rd46, %rd45, %rd39;
xor.b64 %rd47, %rd46, 3668340011;
mul.lo.s64 %rd48, %rd47, 3528531795;
shr.u64 %rd49, %rd48, 32;
shr.u64 %rd50, %rd44, 32;
and.b64 %rd51, %rd23, 4294967295;
xor.b64 %rd52, %rd51, %rd50;
xor.b64 %rd53, %rd52, 1013904242;
mul.lo.s64 %rd54, %rd53, 3528531795;
and.b64 %rd55, %rd54, 4294967295;
xor.b64 %rd56, %rd55, %rd49;
xor.b64 %rd57, %rd56, 3986602516;
mul.lo.s64 %rd58, %rd57, 3449720151;
shr.u64 %rd59, %rd58, 32;
shr.u64 %rd60, %rd54, 32;
and.b64 %rd61, %rd30, 4294967295;
xor.b64 %rd62, %rd61, %rd60;
xor.b64 %rd63, %rd62, 842468239;
mul.lo.s64 %rd64, %rd63, 3449720151;
and.b64 %rd65, %rd64, 4294967295;
xor.b64 %rd66, %rd65, %rd59;
xor.b64 %rd67, %rd66, 387276957;
mul.lo.s64 %rd68, %rd67, 3528531795;
shr.u64 %rd69, %rd68, 32;
shr.u64 %rd70, %rd64, 32;
and.b64 %rd71, %rd38, 4294967295;
xor.b64 %rd72, %rd71, %rd70;
xor.b64 %rd73, %rd72, 2027808484;
mul.lo.s64 %rd74, %rd73, 3528531795;
and.b64 %rd75, %rd74, 4294967295;
shr.u64 %rd76, %rd74, 32;
and.b64 %rd77, %rd48, 4294967295;
xor.b64 %rd78, %rd77, %rd76;
xor.b64 %rd79, %rd78, 2835769497;
mul.lo.s64 %rd80, %rd79, 3449720151;
and.b64 %rd81, %rd80, 4294967295;
shr.u64 %rd82, %rd80, 32;
and.b64 %rd83, %rd58, 4294967295;
xor.b64 %rd84, %rd83, %rd82;
xor.b64 %rd85, %rd84, 3041712726;
mul.lo.s64 %rd86, %rd85, 3528531795;
and.b64 %rd87, %rd86, 4294967295;
xor.b64 %rd88, %rd75, %rd69;
xor.b64 %rd89, %rd88, 1684936478;
mul.lo.s64 %rd90, %rd89, 3449720151;
shr.u64 %rd91, %rd90, 32;
xor.b64 %rd92, %rd81, %rd91;
xor.b64 %rd93, %rd92, 1401181199;
mul.lo.s64 %rd94, %rd93, 3528531795;
shr.u64 %rd95, %rd94, 32;
xor.b64 %rd96, %rd87, %rd95;
xor.b64 %rd97, %rd96, 3678237736;
mul.lo.s64 %rd98, %rd97, 3449720151;
shr.u64 %rd99, %rd98, 32;
// ---- element 0 ----
// Final 32-bit random word (%r13), keep the top 23 bits, scale by 2^-23 (0f34000000)
// to get a value in [0,1), narrow to f16. A constant is xor-ed in after the shift here.
cvt.u32.u64 %r9, %rd99;
shr.u64 %rd100, %rd86, 32;
xor.b64 %rd101, %rd100, %rd68;
cvt.u32.u64 %r10, %rd101;
xor.b32 %r11, %r10, 534103459;
mul.lo.s32 %r12, %r11, -845247145;
xor.b32 %r13, %r12, %r9;
shr.u32 %r14, %r13, 9;
xor.b32 %r15, %r14, 4716963;
cvt.rn.f32.u32 %f1, %r15;
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h9, %f2;
// %h10 = f16 0x2E66 (about 0.1): the element is kept when u >= this threshold.
mov.b16 %h10, 0x2E66;
setp.ge.f16 %p2, %h9, %h10;
// Load "b". As above, %h15..%h18 end up equal to %h11..%h14.
add.s64 %rd102, %rd8, %rd11;
ld.global.nc.v4.b16 {%h11, %h12, %h13, %h14}, [%rd102];
mov.b32 %hh3, {%h13, %h14};
mov.b32 %hh4, {%h11, %h12};
mov.b32 {%h15, %h16}, %hh4;
mov.b32 {%h17, %h18}, %hh3;
// c[col] for element 0, narrowed to f16 before the add.
mul.wide.u32 %rd103, %r4, 4;
add.s64 %rd104, %rd3, %rd103;
ld.global.nc.f32 %f3, [%rd104];
cvt.rn.f16.f32 %h19, %f3;
add.rn.f16 %h20, %h15, %h19;
// %h21 = f16 0x3C72 (about 1.111): scale applied to kept elements.
mov.b16 %h21, 0x3C72;
mul.rn.f16 %h22, %h20, %h21;
// Dropped elements become +0, then "a" is added.
selp.b16 %h23, %h22, 0x0000, %p2;
add.rn.f16 %h24, %h5, %h23;
// %rd105 = output address for this thread's four elements.
add.s64 %rd105, %rd10, %rd11;
// ---- element 1 ---- (second random word, same threshold / scale / add sequence)
xor.b64 %rd106, %rd58, %rd82;
xor.b64 %rd107, %rd106, 3041712726;
mul.lo.s64 %rd108, %rd107, 3528531795;
xor.b64 %rd109, %rd95, %rd108;
cvt.u32.u64 %r16, %rd109;
xor.b32 %r17, %r16, -616729560;
mul.lo.s32 %r18, %r17, -845247145;
shr.u32 %r19, %r18, 9;
cvt.rn.f32.u32 %f4, %r19;
mul.rn.f32 %f5, %f4, 0f34000000;
cvt.rn.f16.f32 %h25, %f5;
setp.ge.f16 %p3, %h25, %h10;
mul.wide.u32 %rd110, %r6, 4;
add.s64 %rd111, %rd3, %rd110;
ld.global.nc.f32 %f6, [%rd111];
cvt.rn.f16.f32 %h26, %f6;
add.rn.f16 %h27, %h16, %h26;
mul.rn.f16 %h28, %h27, %h21;
selp.b16 %h29, %h28, 0x0000, %p3;
add.rn.f16 %h30, %h6, %h29;
// ---- element 2 ---- (third random word)
and.b64 %rd112, %rd90, 4294967295;
and.b64 %rd113, %rd68, 4294967295;
xor.b64 %rd114, %rd113, %rd100;
xor.b64 %rd115, %rd114, 534103459;
mul.lo.s64 %rd116, %rd115, 3449720151;
shr.u64 %rd117, %rd116, 32;
xor.b64 %rd118, %rd112, %rd117;
xor.b64 %rd119, %rd118, 4055616968;
mul.lo.s64 %rd120, %rd119, 3528531795;
shr.u64 %rd121, %rd120, 32;
cvt.u32.u64 %r20, %rd121;
xor.b64 %rd122, %rd91, %rd80;
cvt.u32.u64 %r21, %rd122;
xor.b32 %r22, %r21, 1401181199;
mul.lo.s32 %r23, %r22, -766435501;
xor.b32 %r24, %r23, %r20;
shr.u32 %r25, %r24, 9;
xor.b32 %r26, %r25, 4936337;
cvt.rn.f32.u32 %f7, %r26;
mul.rn.f32 %f8, %f7, 0f34000000;
cvt.rn.f16.f32 %h31, %f8;
setp.ge.f16 %p4, %h31, %h10;
mul.wide.u32 %rd123, %r7, 4;
add.s64 %rd124, %rd3, %rd123;
ld.global.nc.f32 %f9, [%rd124];
cvt.rn.f16.f32 %h32, %f9;
add.rn.f16 %h33, %h17, %h32;
mul.rn.f16 %h34, %h33, %h21;
selp.b16 %h35, %h34, 0x0000, %p4;
add.rn.f16 %h36, %h7, %h35;
// ---- element 3 ---- (fourth random word, c is read at byte offset +12 from element 0's slot)
xor.b64 %rd125, %rd69, %rd74;
xor.b64 %rd126, %rd125, 1684936478;
mul.lo.s64 %rd127, %rd126, 3449720151;
xor.b64 %rd128, %rd117, %rd127;
cvt.u32.u64 %r27, %rd128;
xor.b32 %r28, %r27, -239350328;
mul.lo.s32 %r29, %r28, -766435501;
shr.u32 %r30, %r29, 9;
cvt.rn.f32.u32 %f10, %r30;
mul.rn.f32 %f11, %f10, 0f34000000;
cvt.rn.f16.f32 %h37, %f11;
setp.ge.f16 %p5, %h37, %h10;
ld.global.nc.f32 %f12, [%rd104+12];
cvt.rn.f16.f32 %h38, %f12;
add.rn.f16 %h39, %h18, %h38;
mul.rn.f16 %h40, %h39, %h21;
selp.b16 %h41, %h40, 0x0000, %p5;
add.rn.f16 %h42, %h8, %h41;
// Single 8-byte store of the four results.
st.global.v4.b16 [%rd105], {%h24, %h30, %h36, %h42};
ret;
}
// .globl fusion_2262
// fusion_2262: per-block sum reduction, 64 threads (2 warps) per block.
//   param_0 : f16 input, block b reads the 2048 bytes (1024 halves) starting at byte b * 2048
//   param_1 : f32 array, thread 0 atomically adds the block total into element b
//   param_2 : not read
// Each thread sums 16 halves in f32 (8 packed 32-bit loads, 256 bytes apart). Each warp
// then combines its lanes with shfl.sync.down, lane 0 of each warp parks the warp total
// in shared_cache_04, and after the barrier warp 0 combines the per-warp totals.
.visible .entry fusion_2262(
.param .u64 fusion_2262_param_0,
.param .u64 fusion_2262_param_1,
.param .u64 fusion_2262_param_2
)
.reqntid 64, 1, 1
{
// 4-byte local scratch slot. It holds a zero that lanes without a warp total read below.
.local .align 4 .b8 __local_depot22[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b16 %h<17>;
.reg .b32 %hh<9>;
.reg .f32 %f<56>;
.reg .b32 %r<7>;
.reg .b64 %rd<22>;
// %SPL = local-space address of the scratch slot, %SP = the same slot as a generic address.
mov.u64 %SPL, __local_depot22;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd5, [fusion_2262_param_0];
cvta.to.global.u64 %rd8, %rd5;
// %r1 = tid.x, %r4 = ctaid.x. %rd13 = input + ctaid.x * 2048 + tid.x * 4 bytes.
mov.u32 %r1, %tid.x;
mov.u32 %r4, %ctaid.x;
shl.b32 %r5, %r1, 1;
mul.wide.u32 %rd10, %r4, 2048;
add.s64 %rd11, %rd8, %rd10;
mul.wide.u32 %rd12, %r5, 2;
add.s64 %rd13, %rd11, %rd12;
// Per-thread serial accumulation: each 32-bit load carries two halves which are widened
// to f32 and added in order. The running sum starts from 0 + first value.
ld.global.nc.b32 %hh1, [%rd13];
mov.b32 {%h1, %h2}, %hh1;
cvt.f32.f16 %f2, %h1;
add.rn.f32 %f3, %f2, 0f00000000;
cvt.f32.f16 %f4, %h2;
add.rn.f32 %f5, %f3, %f4;
ld.global.nc.b32 %hh2, [%rd13+256];
mov.b32 {%h3, %h4}, %hh2;
cvt.f32.f16 %f6, %h3;
add.rn.f32 %f7, %f5, %f6;
cvt.f32.f16 %f8, %h4;
add.rn.f32 %f9, %f7, %f8;
ld.global.nc.b32 %hh3, [%rd13+512];
mov.b32 {%h5, %h6}, %hh3;
cvt.f32.f16 %f10, %h5;
add.rn.f32 %f11, %f9, %f10;
cvt.f32.f16 %f12, %h6;
add.rn.f32 %f13, %f11, %f12;
ld.global.nc.b32 %hh4, [%rd13+768];
mov.b32 {%h7, %h8}, %hh4;
cvt.f32.f16 %f14, %h7;
add.rn.f32 %f15, %f13, %f14;
cvt.f32.f16 %f16, %h8;
add.rn.f32 %f17, %f15, %f16;
ld.global.nc.b32 %hh5, [%rd13+1024];
mov.b32 {%h9, %h10}, %hh5;
cvt.f32.f16 %f18, %h9;
add.rn.f32 %f19, %f17, %f18;
cvt.f32.f16 %f20, %h10;
add.rn.f32 %f21, %f19, %f20;
ld.global.nc.b32 %hh6, [%rd13+1280];
mov.b32 {%h11, %h12}, %hh6;
cvt.f32.f16 %f22, %h11;
add.rn.f32 %f23, %f21, %f22;
cvt.f32.f16 %f24, %h12;
add.rn.f32 %f25, %f23, %f24;
ld.global.nc.b32 %hh7, [%rd13+1536];
mov.b32 {%h13, %h14}, %hh7;
cvt.f32.f16 %f26, %h13;
add.rn.f32 %f27, %f25, %f26;
cvt.f32.f16 %f28, %h14;
add.rn.f32 %f29, %f27, %f28;
ld.global.nc.b32 %hh8, [%rd13+1792];
mov.b32 {%h15, %h16}, %hh8;
cvt.f32.f16 %f30, %h15;
add.rn.f32 %f31, %f29, %f30;
cvt.f32.f16 %f32, %h16;
add.rn.f32 %f33, %f31, %f32;
// Intra-warp reduction: %r2 = lane id, shuffle-down by 16, 8, 4, 2, 1 with a full member mask.
// The last add (offset 1) is deferred to the lane-0 branch below.
and.b32 %r2, %r1, 31;
shfl.sync.down.b32 %f34, %f33, 16, 31, -1;
add.rn.f32 %f35, %f34, %f33;
shfl.sync.down.b32 %f36, %f35, 8, 31, -1;
add.rn.f32 %f37, %f36, %f35;
shfl.sync.down.b32 %f38, %f37, 4, 31, -1;
add.rn.f32 %f39, %f38, %f37;
shfl.sync.down.b32 %f40, %f39, 2, 31, -1;
add.rn.f32 %f41, %f40, %f39;
shfl.sync.down.b32 %f42, %f41, 1, 31, -1;
// %r3 = warp index within the block. Only lane 0 of each warp takes LBB22_3.
shr.u32 %r3, %r1, 5;
setp.eq.s32 %p1, %r2, 0;
mov.u64 %rd16, shared_cache_04;
@%p1 bra LBB22_3;
bra.uni LBB22_1;
LBB22_3:
// Lane 0: finish the warp sum and store it to shared_cache_04[warp index].
mul.wide.u32 %rd15, %r3, 4;
add.s64 %rd3, %rd16, %rd15;
add.rn.f32 %f1, %f42, %f41;
st.shared.f32 [%rd3], %f1;
LBB22_1:
// Block-wide barrier so both warp totals are visible, then only warp 0 continues.
bar.sync 0;
setp.eq.s32 %p2, %r3, 0;
@%p2 bra LBB22_4;
bra.uni LBB22_2;
LBB22_4:
// Second-level reduction in warp 0. Threads 0 and 1 read their warp-total slot through a
// generic pointer into shared memory. Every other lane reads the zero just written to
// the local scratch slot, so it contributes 0.
add.u64 %rd9, %SP, 0;
add.u64 %rd1, %SPL, 0;
mul.wide.u32 %rd17, %r2, 4;
add.s64 %rd4, %rd16, %rd17;
cvta.shared.u64 %rd19, %rd4;
mov.u32 %r6, 0;
st.local.u32 [%rd1], %r6;
setp.lt.u32 %p3, %r1, 2;
selp.b64 %rd21, %rd19, %rd9, %p3;
ld.f32 %f43, [%rd21];
shfl.sync.down.b32 %f44, %f43, 16, 31, -1;
add.rn.f32 %f45, %f43, %f44;
shfl.sync.down.b32 %f46, %f45, 8, 31, -1;
add.rn.f32 %f47, %f45, %f46;
shfl.sync.down.b32 %f48, %f47, 4, 31, -1;
add.rn.f32 %f49, %f47, %f48;
shfl.sync.down.b32 %f50, %f49, 2, 31, -1;
add.rn.f32 %f51, %f49, %f50;
shfl.sync.down.b32 %f52, %f51, 1, 31, -1;
add.rn.f32 %f53, %f51, %f52;
// Each lane writes its running value back through the pointer it read from. For thread 0
// that is shared slot 0, which therefore now holds the block total.
st.f32 [%rd21], %f53;
// Only thread 0 publishes the result.
setp.ne.s32 %p4, %r1, 0;
@%p4 bra LBB22_2;
ld.param.u64 %rd6, [fusion_2262_param_1];
cvta.to.global.u64 %rd7, %rd6;
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd2, %rd7, %rd14;
// Reload the block total from shared slot 0 and accumulate it into param_1[ctaid.x].
// The value returned by the atomic (%f55) is unused.
ld.shared.f32 %f54, [%rd4];
atom.global.add.f32 %f55, [%rd2], %f54;
LBB22_2:
ret;
}
// .globl fusion_2259
// fusion_2259: per-block sum of squared deviations, 64 threads (2 warps) per block.
//   param_0 : f16 input, block b reads the 2048 bytes (1024 halves) starting at byte b * 2048
//   param_1 : f32 array, thread 0 atomically adds the block total into element b
//   param_2 : f32 array, element b is scaled by 2^-10 (0f3A800000) and subtracted from every input
//   param_3 : not read
// Per block: total = sum over the 1024 inputs of (x - param_2[b] / 1024)^2, accumulated in f32.
// NOTE(review): param_2 presumably holds a per-row sum (so the subtrahend is the row mean)
// and this is the variance numerator -- confirm against the producer of param_2.
.visible .entry fusion_2259(
.param .u64 fusion_2259_param_0,
.param .u64 fusion_2259_param_1,
.param .u64 fusion_2259_param_2,
.param .u64 fusion_2259_param_3
)
.reqntid 64, 1, 1
{
// 4-byte local scratch slot. It holds a zero that lanes without a warp total read below.
.local .align 4 .b8 __local_depot23[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b16 %h<17>;
.reg .b32 %hh<9>;
.reg .f32 %f<90>;
.reg .b32 %r<7>;
.reg .b64 %rd<25>;
// %SPL = local-space address of the scratch slot, %SP = the same slot as a generic address.
mov.u64 %SPL, __local_depot23;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd5, [fusion_2259_param_0];
ld.param.u64 %rd6, [fusion_2259_param_2];
cvta.to.global.u64 %rd7, %rd6;
cvta.to.global.u64 %rd10, %rd5;
// %r1 = tid.x, %r4 = ctaid.x. %rd15 = input + ctaid.x * 2048 + tid.x * 4 bytes.
mov.u32 %r1, %tid.x;
mov.u32 %r4, %ctaid.x;
shl.b32 %r5, %r1, 1;
mul.wide.u32 %rd12, %r4, 2048;
add.s64 %rd13, %rd10, %rd12;
mul.wide.u32 %rd14, %r5, 2;
add.s64 %rd15, %rd13, %rd14;
ld.global.nc.b32 %hh1, [%rd15];
mov.b32 {%h1, %h2}, %hh1;
cvt.f32.f16 %f2, %h1;
// %rd16 = ctaid.x * 4 (byte offset of this block's f32 slot, reused for param_1 at the end).
// %f4 = param_2[ctaid.x] * 2^-10 is the value subtracted from every input.
mul.wide.u32 %rd16, %r4, 4;
add.s64 %rd17, %rd7, %rd16;
ld.global.nc.f32 %f3, [%rd17];
mul.rn.f32 %f4, %f3, 0f3A800000;
// Per-thread serial accumulation of (x - %f4)^2 over 16 halves (8 packed loads, 256 bytes apart).
sub.rn.f32 %f5, %f2, %f4;
mul.rn.f32 %f6, %f5, %f5;
add.rn.f32 %f7, %f6, 0f00000000;
cvt.f32.f16 %f8, %h2;
sub.rn.f32 %f9, %f8, %f4;
mul.rn.f32 %f10, %f9, %f9;
add.rn.f32 %f11, %f7, %f10;
ld.global.nc.b32 %hh2, [%rd15+256];
mov.b32 {%h3, %h4}, %hh2;
cvt.f32.f16 %f12, %h3;
sub.rn.f32 %f13, %f12, %f4;
mul.rn.f32 %f14, %f13, %f13;
add.rn.f32 %f15, %f11, %f14;
cvt.f32.f16 %f16, %h4;
sub.rn.f32 %f17, %f16, %f4;
mul.rn.f32 %f18, %f17, %f17;
add.rn.f32 %f19, %f15, %f18;
ld.global.nc.b32 %hh3, [%rd15+512];
mov.b32 {%h5, %h6}, %hh3;
cvt.f32.f16 %f20, %h5;
sub.rn.f32 %f21, %f20, %f4;
mul.rn.f32 %f22, %f21, %f21;
add.rn.f32 %f23, %f19, %f22;
cvt.f32.f16 %f24, %h6;
sub.rn.f32 %f25, %f24, %f4;
mul.rn.f32 %f26, %f25, %f25;
add.rn.f32 %f27, %f23, %f26;
ld.global.nc.b32 %hh4, [%rd15+768];
mov.b32 {%h7, %h8}, %hh4;
cvt.f32.f16 %f28, %h7;
sub.rn.f32 %f29, %f28, %f4;
mul.rn.f32 %f30, %f29, %f29;
add.rn.f32 %f31, %f27, %f30;
cvt.f32.f16 %f32, %h8;
sub.rn.f32 %f33, %f32, %f4;
mul.rn.f32 %f34, %f33, %f33;
add.rn.f32 %f35, %f31, %f34;
ld.global.nc.b32 %hh5, [%rd15+1024];
mov.b32 {%h9, %h10}, %hh5;
cvt.f32.f16 %f36, %h9;
sub.rn.f32 %f37, %f36, %f4;
mul.rn.f32 %f38, %f37, %f37;
add.rn.f32 %f39, %f35, %f38;
cvt.f32.f16 %f40, %h10;
sub.rn.f32 %f41, %f40, %f4;
mul.rn.f32 %f42, %f41, %f41;
add.rn.f32 %f43, %f39, %f42;
ld.global.nc.b32 %hh6, [%rd15+1280];
mov.b32 {%h11, %h12}, %hh6;
cvt.f32.f16 %f44, %h11;
sub.rn.f32 %f45, %f44, %f4;
mul.rn.f32 %f46, %f45, %f45;
add.rn.f32 %f47, %f43, %f46;
cvt.f32.f16 %f48, %h12;
sub.rn.f32 %f49, %f48, %f4;
mul.rn.f32 %f50, %f49, %f49;
add.rn.f32 %f51, %f47, %f50;
ld.global.nc.b32 %hh7, [%rd15+1536];
mov.b32 {%h13, %h14}, %hh7;
cvt.f32.f16 %f52, %h13;
sub.rn.f32 %f53, %f52, %f4;
mul.rn.f32 %f54, %f53, %f53;
add.rn.f32 %f55, %f51, %f54;
cvt.f32.f16 %f56, %h14;
sub.rn.f32 %f57, %f56, %f4;
mul.rn.f32 %f58, %f57, %f57;
add.rn.f32 %f59, %f55, %f58;
ld.global.nc.b32 %hh8, [%rd15+1792];
mov.b32 {%h15, %h16}, %hh8;
cvt.f32.f16 %f60, %h15;
sub.rn.f32 %f61, %f60, %f4;
mul.rn.f32 %f62, %f61, %f61;
add.rn.f32 %f63, %f59, %f62;
cvt.f32.f16 %f64, %h16;
sub.rn.f32 %f65, %f64, %f4;
mul.rn.f32 %f66, %f65, %f65;
add.rn.f32 %f67, %f63, %f66;
// Intra-warp reduction: %r2 = lane id, shuffle-down by 16, 8, 4, 2, 1 with a full member mask.
// The last add (offset 1) is deferred to the lane-0 branch below.
and.b32 %r2, %r1, 31;
shfl.sync.down.b32 %f68, %f67, 16, 31, -1;
add.rn.f32 %f69, %f68, %f67;
shfl.sync.down.b32 %f70, %f69, 8, 31, -1;
add.rn.f32 %f71, %f70, %f69;
shfl.sync.down.b32 %f72, %f71, 4, 31, -1;
add.rn.f32 %f73, %f72, %f71;
shfl.sync.down.b32 %f74, %f73, 2, 31, -1;
add.rn.f32 %f75, %f74, %f73;
shfl.sync.down.b32 %f76, %f75, 1, 31, -1;
// %r3 = warp index within the block. Only lane 0 of each warp takes LBB23_3.
shr.u32 %r3, %r1, 5;
setp.eq.s32 %p1, %r2, 0;
mov.u64 %rd19, shared_cache_05;
@%p1 bra LBB23_3;
bra.uni LBB23_1;
LBB23_3:
// Lane 0: finish the warp sum and store it to shared_cache_05[warp index].
mul.wide.u32 %rd18, %r3, 4;
add.s64 %rd3, %rd19, %rd18;
add.rn.f32 %f1, %f76, %f75;
st.shared.f32 [%rd3], %f1;
LBB23_1:
// Block-wide barrier so both warp totals are visible, then only warp 0 continues.
bar.sync 0;
setp.eq.s32 %p2, %r3, 0;
@%p2 bra LBB23_4;
bra.uni LBB23_2;
LBB23_4:
// Second-level reduction in warp 0. Threads 0 and 1 read their warp-total slot through a
// generic pointer into shared memory. Every other lane reads the zero just written to
// the local scratch slot, so it contributes 0.
add.u64 %rd11, %SP, 0;
add.u64 %rd1, %SPL, 0;
mul.wide.u32 %rd20, %r2, 4;
add.s64 %rd4, %rd19, %rd20;
cvta.shared.u64 %rd22, %rd4;
mov.u32 %r6, 0;
st.local.u32 [%rd1], %r6;
setp.lt.u32 %p3, %r1, 2;
selp.b64 %rd24, %rd22, %rd11, %p3;
ld.f32 %f77, [%rd24];
shfl.sync.down.b32 %f78, %f77, 16, 31, -1;
add.rn.f32 %f79, %f77, %f78;
shfl.sync.down.b32 %f80, %f79, 8, 31, -1;
add.rn.f32 %f81, %f79, %f80;
shfl.sync.down.b32 %f82, %f81, 4, 31, -1;
add.rn.f32 %f83, %f81, %f82;
shfl.sync.down.b32 %f84, %f83, 2, 31, -1;
add.rn.f32 %f85, %f83, %f84;
shfl.sync.down.b32 %f86, %f85, 1, 31, -1;
add.rn.f32 %f87, %f85, %f86;
// Each lane writes its running value back through the pointer it read from. For thread 0
// that is shared slot 0, which therefore now holds the block total.
st.f32 [%rd24], %f87;
// Only thread 0 publishes the result.
setp.ne.s32 %p4, %r1, 0;
@%p4 bra LBB23_2;
ld.param.u64 %rd8, [fusion_2259_param_1];
cvta.to.global.u64 %rd9, %rd8;
add.s64 %rd2, %rd9, %rd16;
// Reload the block total from shared slot 0 and accumulate it into param_1[ctaid.x].
// The value returned by the atomic (%f89) is unused.
ld.shared.f32 %f88, [%rd4];
atom.global.add.f32 %f89, [%rd2], %f88;
LBB23_2:
ret;
}
// .globl fusion_2255
// fusion_2255: element-wise kernel, 256 threads per block, 4 consecutive f16 elements per
// thread (1024 elements per block).
//   param_0 : f16 input x
//   param_1 : f16 output
//   param_2 : f32 per-block value v (indexed by ctaid.x)
//   param_3 : f32 per-block value s (indexed by ctaid.x)
//   param_4 : f32 additive vector beta, indexed by the element's position inside the block
//   param_5 : f32 multiplicative vector gamma, indexed the same way
//   param_6 : not read
// In f32, with r = rsqrt(v / 1024 + 0f2B8CBCCC) and m = s / 1024:
//   out = f16( (r * gamma) * x + (beta - (r * gamma) * m) )
// 0f3A800000 is 2^-10 and 0f2B8CBCCC is about 1e-12.
// NOTE(review): this has the shape of a layer-norm style normalization with s = row sum and
// v = row sum of squared deviations -- confirm against the producers of param_2 / param_3.
.visible .entry fusion_2255(
.param .u64 fusion_2255_param_0,
.param .u64 fusion_2255_param_1,
.param .u64 fusion_2255_param_2,
.param .u64 fusion_2255_param_3,
.param .u64 fusion_2255_param_4,
.param .u64 fusion_2255_param_5,
.param .u64 fusion_2255_param_6
)
.reqntid 256, 1, 1
{
.reg .b16 %h<13>;
.reg .b32 %hh<3>;
.reg .f32 %f<39>;
.reg .b32 %r<8>;
.reg .b64 %rd<28>;
// Buffer bases: %rd12 = param_0, %rd11 = param_1, %rd10 = param_2, %rd9 = param_3,
// %rd6 = param_4, %rd3 = param_5.
ld.param.u64 %rd1, [fusion_2255_param_0];
ld.param.u64 %rd2, [fusion_2255_param_5];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2255_param_1];
ld.param.u64 %rd5, [fusion_2255_param_4];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2255_param_2];
ld.param.u64 %rd8, [fusion_2255_param_3];
cvta.to.global.u64 %rd9, %rd8;
cvta.to.global.u64 %rd10, %rd7;
cvta.to.global.u64 %rd11, %rd4;
cvta.to.global.u64 %rd12, %rd1;
// %r5 = ctaid.x * 1024 + tid.x * 4 is the first element handled by this thread
// (OR behaves as ADD because tid.x * 4 < 1024). %r4, %r6, %r7 are the in-block positions
// of elements 0, 1 and 2.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
// Load the four inputs. The pack/unpack round trip leaves %h5..%h8 equal to %h1..%h4.
mul.wide.u32 %rd13, %r5, 2;
add.s64 %rd14, %rd12, %rd13;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd14];
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// ---- element 0 ----
cvt.f32.f16 %f1, %h5;
// %f5 = r = rsqrt(param_2[ctaid.x] * 2^-10 + eps), shared by all four elements.
mul.wide.u32 %rd15, %r1, 4;
add.s64 %rd16, %rd10, %rd15;
ld.global.nc.f32 %f2, [%rd16];
mul.rn.f32 %f3, %f2, 0f3A800000;
add.rn.f32 %f4, %f3, 0f2B8CBCCC;
rsqrt.approx.f32 %f5, %f4;
// %f7 = r * gamma[col], %f8 = (r * gamma) * x.
mul.wide.u32 %rd17, %r4, 4;
add.s64 %rd18, %rd3, %rd17;
ld.global.nc.f32 %f6, [%rd18];
mul.rn.f32 %f7, %f5, %f6;
mul.rn.f32 %f8, %f7, %f1;
// %f9 = beta[col].
add.s64 %rd19, %rd6, %rd17;
ld.global.nc.f32 %f9, [%rd19];
// %f11 = m = param_3[ctaid.x] * 2^-10, shared by all four elements.
add.s64 %rd20, %rd9, %rd15;
ld.global.nc.f32 %f10, [%rd20];
mul.rn.f32 %f11, %f10, 0f3A800000;
// out = (r * gamma) * x + (beta - (r * gamma) * m), narrowed to f16.
mul.rn.f32 %f12, %f7, %f11;
sub.rn.f32 %f13, %f9, %f12;
add.rn.f32 %f14, %f8, %f13;
cvt.rn.f16.f32 %h9, %f14;
// %rd21 = output address for this thread's four elements.
add.s64 %rd21, %rd11, %rd13;
// ---- element 1 ----
cvt.f32.f16 %f15, %h6;
mul.wide.u32 %rd22, %r6, 4;
add.s64 %rd23, %rd3, %rd22;
ld.global.nc.f32 %f16, [%rd23];
mul.rn.f32 %f17, %f5, %f16;
mul.rn.f32 %f18, %f17, %f15;
add.s64 %rd24, %rd6, %rd22;
ld.global.nc.f32 %f19, [%rd24];
mul.rn.f32 %f20, %f11, %f17;
sub.rn.f32 %f21, %f19, %f20;
add.rn.f32 %f22, %f18, %f21;
cvt.rn.f16.f32 %h10, %f22;
// ---- element 2 ----
cvt.f32.f16 %f23, %h7;
mul.wide.u32 %rd25, %r7, 4;
add.s64 %rd26, %rd3, %rd25;
ld.global.nc.f32 %f24, [%rd26];
mul.rn.f32 %f25, %f5, %f24;
mul.rn.f32 %f26, %f25, %f23;
add.s64 %rd27, %rd6, %rd25;
ld.global.nc.f32 %f27, [%rd27];
mul.rn.f32 %f28, %f11, %f25;
sub.rn.f32 %f29, %f27, %f28;
add.rn.f32 %f30, %f26, %f29;
cvt.rn.f16.f32 %h11, %f30;
// ---- element 3 ---- (gamma and beta are read at byte offset +12 from element 0's slots)
cvt.f32.f16 %f31, %h8;
ld.global.nc.f32 %f32, [%rd18+12];
mul.rn.f32 %f33, %f5, %f32;
mul.rn.f32 %f34, %f33, %f31;
ld.global.nc.f32 %f35, [%rd19+12];
mul.rn.f32 %f36, %f11, %f33;
sub.rn.f32 %f37, %f35, %f36;
add.rn.f32 %f38, %f34, %f37;
cvt.rn.f16.f32 %h12, %f38;
// Single 8-byte store of the four results.
st.global.v4.b16 [%rd21], {%h9, %h10, %h11, %h12};
ret;
}
// .globl convert_1393
// convert_1393 -- narrowing copy, f32 -> f16 (round-to-nearest-even), seven slabs per thread.
//   param_0 : destination buffer of f16
//   param_1 : source buffer of f32, read through the non-coherent (.nc) path
//   param_2 : declared but never read
// The launch shape is pinned to 128 threads per block. A thread owns four consecutive
// elements starting at elem = ctaid.x * 512 + tid.x * 4 and repeats the conversion at
// elem + k * 655360 for k = 0..6. Slabs 0..4 are unconditional. Slabs 5 and 6 are skipped
// when their first element index exceeds 4194303.
.visible .entry convert_1393(
.param .u64 convert_1393_param_0,
.param .u64 convert_1393_param_1,
.param .u64 convert_1393_param_2
)
.reqntid 128, 1, 1
{
	.reg .pred %skip_slab5, %skip_slab6;
	.reg .b16 %half<4>;
	.reg .f32 %val<4>;
	.reg .b32 %block_id, %thread_id, %block_base, %lane_base;
	.reg .b32 %elem, %elem_slab5, %base_slab6, %elem_slab6;
	.reg .b64 %dst_arg, %src_arg, %dst_base, %src_base;
	.reg .b64 %src_off, %dst_off, %src_ptr, %dst_ptr;

	// Resolve both buffers to global-space addresses.
	ld.param.u64 %dst_arg, [convert_1393_param_0];
	ld.param.u64 %src_arg, [convert_1393_param_1];
	cvta.to.global.u64 %src_base, %src_arg;
	cvta.to.global.u64 %dst_base, %dst_arg;

	// elem = ctaid.x * 512 + tid.x * 4 (the OR acts as an add because tid.x * 4 < 512).
	mov.u32 %block_id, %ctaid.x;
	mov.u32 %thread_id, %tid.x;
	shl.b32 %block_base, %block_id, 9;
	shl.b32 %lane_base, %thread_id, 2;
	or.b32 %elem, %block_base, %lane_base;

	// 4 bytes per source element, 2 bytes per destination element.
	mul.wide.u32 %src_off, %elem, 4;
	add.s64 %src_ptr, %src_base, %src_off;
	mul.wide.u32 %dst_off, %elem, 2;
	add.s64 %dst_ptr, %dst_base, %dst_off;

	// Slab 0.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr], {%half0, %half1, %half2, %half3};

	// Slab 1.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+2621440];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+1310720], {%half0, %half1, %half2, %half3};

	// Slab 2.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+5242880];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+2621440], {%half0, %half1, %half2, %half3};

	// Slab 3.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+7864320];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+3932160], {%half0, %half1, %half2, %half3};

	// Slab 4.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+10485760];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+5242880], {%half0, %half1, %half2, %half3};

	// Slab 5, bounds-checked on elem + 3276800.
	add.s32 %elem_slab5, %elem, 3276800;
	setp.gt.u32 %skip_slab5, %elem_slab5, 4194303;
	@%skip_slab5 bra CONVERT_1393_SLAB6;
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+13107200];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+6553600], {%half0, %half1, %half2, %half3};

CONVERT_1393_SLAB6:
	// Slab 6, bounds-checked on (ctaid.x * 512 + 3932160) | (tid.x * 4).
	add.s32 %base_slab6, %block_base, 3932160;
	or.b32 %elem_slab6, %base_slab6, %lane_base;
	setp.gt.u32 %skip_slab6, %elem_slab6, 4194303;
	@%skip_slab6 bra CONVERT_1393_DONE;
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+15728640];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+7864320], {%half0, %half1, %half2, %half3};

CONVERT_1393_DONE:
	ret;
}
// .globl fusion_2250
// fusion_2250: element-wise kernel, 256 threads per block, 4 consecutive f16 elements per
// thread (1024 elements per block).
//   param_0 : f16 output
//   param_1 : f16 input
//   param_2 : f32 vector indexed by (element index & 4095), narrowed to f16 before the add
//   param_3 : not read
// Per element: x = in + f16(vec[col]) as an f16 add, then in f32
//   u   = (x + 0f3D372713 * x^3) * 0f3F4C422A        (about 0.044715 and about 0.7978845)
//   t   = u                      when |u| < 0f39D1B717 (about 4e-4)
//       = P(c) / Q(c)            otherwise, with c = u clamped to [-9, 9]
//       = -1 or +1 by sign of u  when |u| >= 20 (NaN stays on the rational path)
//   out = f16( 0.5 * (1 + t) * x )
// NOTE(review): t looks like a rational approximation of tanh(u), which would make this the
// tanh form of GELU applied after a bias add -- confirm against the source HLO.
.visible .entry fusion_2250(
.param .u64 fusion_2250_param_0,
.param .u64 fusion_2250_param_1,
.param .u64 fusion_2250_param_2,
.param .u64 fusion_2250_param_3
)
.reqntid 256, 1, 1
{
.reg .pred %p<21>;
.reg .b16 %h<21>;
.reg .b32 %hh<3>;
.reg .f32 %f<150>;
.reg .b32 %r<25>;
.reg .b64 %rd<18>;
// Buffer bases: %rd6 = param_0 (output), %rd5 = param_1 (input), %rd3 = param_2 (vector).
ld.param.u64 %rd1, [fusion_2250_param_0];
ld.param.u64 %rd2, [fusion_2250_param_2];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2250_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd1;
// %r5 = ctaid.x * 1024 + tid.x * 4 is the first element handled by this thread
// (OR behaves as ADD because tid.x * 4 < 1024). %r12, %r11, %r10, %r9 are the indices of
// elements 0..3 reduced modulo 4096 and are used to index param_2.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r5, 1;
or.b32 %r7, %r5, 2;
or.b32 %r8, %r5, 3;
and.b32 %r9, %r8, 4095;
and.b32 %r10, %r7, 4094;
and.b32 %r11, %r6, 4093;
and.b32 %r12, %r5, 4092;
// Load the four inputs. The pack/unpack round trip leaves %h5..%h8 equal to %h1..%h4.
mul.wide.u32 %rd7, %r5, 2;
add.s64 %rd8, %rd5, %rd7;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd8];
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// ---- element 0 ----
// x = in + f16(vec[col]), widened to f32 as %f2.
mul.wide.u32 %rd9, %r12, 4;
add.s64 %rd10, %rd3, %rd9;
ld.global.nc.f32 %f1, [%rd10];
cvt.rn.f16.f32 %h9, %f1;
add.rn.f16 %h10, %h5, %h9;
cvt.f32.f16 %f2, %h10;
// u = (x + 0.044715 * x^3) * 0.7978845 -> %f7, |u| -> %f8.
mul.rn.f32 %f3, %f2, %f2;
mul.rn.f32 %f4, %f3, %f2;
mul.rn.f32 %f5, %f4, 0f3D372713;
add.rn.f32 %f6, %f5, %f2;
mul.rn.f32 %f7, %f6, 0f3F4C422A;
abs.f32 %f8, %f7;
// %p1: |u| is tiny, so u itself is used instead of the rational form.
setp.lt.f32 %p1, %f8, 0f39D1B717;
// c = u clamped to [-9, 9] (0fC1100000 / 0f41100000) -> %f10, c^2 -> %f11.
setp.lt.f32 %p2, %f7, 0fC1100000;
selp.f32 %f9, 0fC1100000, %f7, %p2;
setp.gt.f32 %p3, %f9, 0f41100000;
selp.f32 %f10, 0f41100000, %f9, %p3;
mul.rn.f32 %f11, %f10, %f10;
// Numerator: c times a degree-6 polynomial in c^2, evaluated by Horner's scheme -> %f25.
// %f13 holds the constant 0f2A61337E and is reused by the other three elements.
mul.rn.f32 %f12, %f11, 0f259F25C0;
mov.f32 %f13, 0f2A61337E;
sub.rn.f32 %f14, %f13, %f12;
mul.rn.f32 %f15, %f11, %f14;
add.rn.f32 %f16, %f15, 0fAEBD37FF;
mul.rn.f32 %f17, %f11, %f16;
add.rn.f32 %f18, %f17, 0f335C0041;
mul.rn.f32 %f19, %f11, %f18;
add.rn.f32 %f20, %f19, 0f3779434A;
mul.rn.f32 %f21, %f11, %f20;
add.rn.f32 %f22, %f21, 0f3A270DED;
mul.rn.f32 %f23, %f11, %f22;
add.rn.f32 %f24, %f23, 0f3BA059DC;
mul.rn.f32 %f25, %f10, %f24;
// Denominator: degree-3 polynomial in c^2 -> %f31.
mul.rn.f32 %f26, %f11, 0f35A0D3D8;
add.rn.f32 %f27, %f26, 0f38F895D6;
mul.rn.f32 %f28, %f11, %f27;
add.rn.f32 %f29, %f28, 0f3B14AA05;
mul.rn.f32 %f30, %f11, %f29;
add.rn.f32 %f31, %f30, 0f3BA059DD;
div.full.f32 %f32, %f25, %f31;
selp.f32 %f33, %f7, %f32, %p1;
// Saturation: %f34 = -1 if the sign bit of u is set, else +1. It replaces the rational
// value unless |u| < 20 or the compare is unordered (setp.ltu is true for NaN).
mov.b32 %r13, %f7;
shr.u32 %r14, %r13, 31;
and.b32 %r15, %r14, 1;
setp.eq.b32 %p4, %r15, 1;
selp.f32 %f34, 0fBF800000, 0f3F800000, %p4;
setp.ltu.f32 %p5, %f8, 0f41A00000;
selp.f32 %f35, %f33, %f34, %p5;
// out = 0.5 * (1 + t) * x, narrowed to f16.
add.rn.f32 %f36, %f35, 0f3F800000;
mul.rn.f32 %f37, %f36, 0f3F000000;
mul.rn.f32 %f38, %f37, %f2;
cvt.rn.f16.f32 %h11, %f38;
// %rd11 = output address for this thread's four elements.
add.s64 %rd11, %rd6, %rd7;
// ---- element 1 ---- (same sequence as element 0)
mul.wide.u32 %rd12, %r11, 4;
add.s64 %rd13, %rd3, %rd12;
ld.global.nc.f32 %f39, [%rd13];
cvt.rn.f16.f32 %h12, %f39;
add.rn.f16 %h13, %h6, %h12;
cvt.f32.f16 %f40, %h13;
mul.rn.f32 %f41, %f40, %f40;
mul.rn.f32 %f42, %f41, %f40;
mul.rn.f32 %f43, %f42, 0f3D372713;
add.rn.f32 %f44, %f43, %f40;
mul.rn.f32 %f45, %f44, 0f3F4C422A;
abs.f32 %f46, %f45;
setp.lt.f32 %p6, %f46, 0f39D1B717;
setp.lt.f32 %p7, %f45, 0fC1100000;
selp.f32 %f47, 0fC1100000, %f45, %p7;
setp.gt.f32 %p8, %f47, 0f41100000;
selp.f32 %f48, 0f41100000, %f47, %p8;
mul.rn.f32 %f49, %f48, %f48;
mul.rn.f32 %f50, %f49, 0f259F25C0;
sub.rn.f32 %f51, %f13, %f50;
mul.rn.f32 %f52, %f49, %f51;
add.rn.f32 %f53, %f52, 0fAEBD37FF;
mul.rn.f32 %f54, %f49, %f53;
add.rn.f32 %f55, %f54, 0f335C0041;
mul.rn.f32 %f56, %f49, %f55;
add.rn.f32 %f57, %f56, 0f3779434A;
mul.rn.f32 %f58, %f49, %f57;
add.rn.f32 %f59, %f58, 0f3A270DED;
mul.rn.f32 %f60, %f49, %f59;
add.rn.f32 %f61, %f60, 0f3BA059DC;
mul.rn.f32 %f62, %f48, %f61;
mul.rn.f32 %f63, %f49, 0f35A0D3D8;
add.rn.f32 %f64, %f63, 0f38F895D6;
mul.rn.f32 %f65, %f49, %f64;
add.rn.f32 %f66, %f65, 0f3B14AA05;
mul.rn.f32 %f67, %f49, %f66;
add.rn.f32 %f68, %f67, 0f3BA059DD;
div.full.f32 %f69, %f62, %f68;
selp.f32 %f70, %f45, %f69, %p6;
mov.b32 %r16, %f45;
shr.u32 %r17, %r16, 31;
and.b32 %r18, %r17, 1;
setp.eq.b32 %p9, %r18, 1;
selp.f32 %f71, 0fBF800000, 0f3F800000, %p9;
setp.ltu.f32 %p10, %f46, 0f41A00000;
selp.f32 %f72, %f70, %f71, %p10;
add.rn.f32 %f73, %f72, 0f3F800000;
mul.rn.f32 %f74, %f73, 0f3F000000;
mul.rn.f32 %f75, %f74, %f40;
cvt.rn.f16.f32 %h14, %f75;
// ---- element 2 ---- (same sequence as element 0)
mul.wide.u32 %rd14, %r10, 4;
add.s64 %rd15, %rd3, %rd14;
ld.global.nc.f32 %f76, [%rd15];
cvt.rn.f16.f32 %h15, %f76;
add.rn.f16 %h16, %h7, %h15;
cvt.f32.f16 %f77, %h16;
mul.rn.f32 %f78, %f77, %f77;
mul.rn.f32 %f79, %f78, %f77;
mul.rn.f32 %f80, %f79, 0f3D372713;
add.rn.f32 %f81, %f80, %f77;
mul.rn.f32 %f82, %f81, 0f3F4C422A;
abs.f32 %f83, %f82;
setp.lt.f32 %p11, %f83, 0f39D1B717;
setp.lt.f32 %p12, %f82, 0fC1100000;
selp.f32 %f84, 0fC1100000, %f82, %p12;
setp.gt.f32 %p13, %f84, 0f41100000;
selp.f32 %f85, 0f41100000, %f84, %p13;
mul.rn.f32 %f86, %f85, %f85;
mul.rn.f32 %f87, %f86, 0f259F25C0;
sub.rn.f32 %f88, %f13, %f87;
mul.rn.f32 %f89, %f86, %f88;
add.rn.f32 %f90, %f89, 0fAEBD37FF;
mul.rn.f32 %f91, %f86, %f90;
add.rn.f32 %f92, %f91, 0f335C0041;
mul.rn.f32 %f93, %f86, %f92;
add.rn.f32 %f94, %f93, 0f3779434A;
mul.rn.f32 %f95, %f86, %f94;
add.rn.f32 %f96, %f95, 0f3A270DED;
mul.rn.f32 %f97, %f86, %f96;
add.rn.f32 %f98, %f97, 0f3BA059DC;
mul.rn.f32 %f99, %f85, %f98;
mul.rn.f32 %f100, %f86, 0f35A0D3D8;
add.rn.f32 %f101, %f100, 0f38F895D6;
mul.rn.f32 %f102, %f86, %f101;
add.rn.f32 %f103, %f102, 0f3B14AA05;
mul.rn.f32 %f104, %f86, %f103;
add.rn.f32 %f105, %f104, 0f3BA059DD;
div.full.f32 %f106, %f99, %f105;
selp.f32 %f107, %f82, %f106, %p11;
mov.b32 %r19, %f82;
shr.u32 %r20, %r19, 31;
and.b32 %r21, %r20, 1;
setp.eq.b32 %p14, %r21, 1;
selp.f32 %f108, 0fBF800000, 0f3F800000, %p14;
setp.ltu.f32 %p15, %f83, 0f41A00000;
selp.f32 %f109, %f107, %f108, %p15;
add.rn.f32 %f110, %f109, 0f3F800000;
mul.rn.f32 %f111, %f110, 0f3F000000;
mul.rn.f32 %f112, %f111, %f77;
cvt.rn.f16.f32 %h17, %f112;
// ---- element 3 ---- (same sequence as element 0)
mul.wide.u32 %rd16, %r9, 4;
add.s64 %rd17, %rd3, %rd16;
ld.global.nc.f32 %f113, [%rd17];
cvt.rn.f16.f32 %h18, %f113;
add.rn.f16 %h19, %h8, %h18;
cvt.f32.f16 %f114, %h19;
mul.rn.f32 %f115, %f114, %f114;
mul.rn.f32 %f116, %f115, %f114;
mul.rn.f32 %f117, %f116, 0f3D372713;
add.rn.f32 %f118, %f117, %f114;
mul.rn.f32 %f119, %f118, 0f3F4C422A;
abs.f32 %f120, %f119;
setp.lt.f32 %p16, %f120, 0f39D1B717;
setp.lt.f32 %p17, %f119, 0fC1100000;
selp.f32 %f121, 0fC1100000, %f119, %p17;
setp.gt.f32 %p18, %f121, 0f41100000;
selp.f32 %f122, 0f41100000, %f121, %p18;
mul.rn.f32 %f123, %f122, %f122;
mul.rn.f32 %f124, %f123, 0f259F25C0;
sub.rn.f32 %f125, %f13, %f124;
mul.rn.f32 %f126, %f123, %f125;
add.rn.f32 %f127, %f126, 0fAEBD37FF;
mul.rn.f32 %f128, %f123, %f127;
add.rn.f32 %f129, %f128, 0f335C0041;
mul.rn.f32 %f130, %f123, %f129;
add.rn.f32 %f131, %f130, 0f3779434A;
mul.rn.f32 %f132, %f123, %f131;
add.rn.f32 %f133, %f132, 0f3A270DED;
mul.rn.f32 %f134, %f123, %f133;
add.rn.f32 %f135, %f134, 0f3BA059DC;
mul.rn.f32 %f136, %f122, %f135;
mul.rn.f32 %f137, %f123, 0f35A0D3D8;
add.rn.f32 %f138, %f137, 0f38F895D6;
mul.rn.f32 %f139, %f123, %f138;
add.rn.f32 %f140, %f139, 0f3B14AA05;
mul.rn.f32 %f141, %f123, %f140;
add.rn.f32 %f142, %f141, 0f3BA059DD;
div.full.f32 %f143, %f136, %f142;
selp.f32 %f144, %f119, %f143, %p16;
mov.b32 %r22, %f119;
shr.u32 %r23, %r22, 31;
and.b32 %r24, %r23, 1;
setp.eq.b32 %p19, %r24, 1;
selp.f32 %f145, 0fBF800000, 0f3F800000, %p19;
setp.ltu.f32 %p20, %f120, 0f41A00000;
selp.f32 %f146, %f144, %f145, %p20;
add.rn.f32 %f147, %f146, 0f3F800000;
mul.rn.f32 %f148, %f147, 0f3F000000;
mul.rn.f32 %f149, %f148, %f114;
cvt.rn.f16.f32 %h20, %f149;
// Single 8-byte store of the four results.
st.global.v4.b16 [%rd11], {%h11, %h14, %h17, %h20};
ret;
}
// .globl convert_1395
// convert_1395 -- narrowing copy, f32 -> f16 (round-to-nearest-even), seven slabs per thread.
//   param_0 : destination buffer of f16
//   param_1 : source buffer of f32, read through the non-coherent (.nc) path
//   param_2 : declared but never read
// The launch shape is pinned to 128 threads per block. A thread owns four consecutive
// elements starting at elem = ctaid.x * 512 + tid.x * 4 and repeats the conversion at
// elem + k * 655360 for k = 0..6. Slabs 0..4 are unconditional. Slabs 5 and 6 are skipped
// when their first element index exceeds 4194303.
.visible .entry convert_1395(
.param .u64 convert_1395_param_0,
.param .u64 convert_1395_param_1,
.param .u64 convert_1395_param_2
)
.reqntid 128, 1, 1
{
	.reg .pred %skip_slab5, %skip_slab6;
	.reg .b16 %half<4>;
	.reg .f32 %val<4>;
	.reg .b32 %block_id, %thread_id, %block_base, %lane_base;
	.reg .b32 %elem, %elem_slab5, %base_slab6, %elem_slab6;
	.reg .b64 %dst_arg, %src_arg, %dst_base, %src_base;
	.reg .b64 %src_off, %dst_off, %src_ptr, %dst_ptr;

	// Resolve both buffers to global-space addresses.
	ld.param.u64 %dst_arg, [convert_1395_param_0];
	ld.param.u64 %src_arg, [convert_1395_param_1];
	cvta.to.global.u64 %src_base, %src_arg;
	cvta.to.global.u64 %dst_base, %dst_arg;

	// elem = ctaid.x * 512 + tid.x * 4 (the OR acts as an add because tid.x * 4 < 512).
	mov.u32 %block_id, %ctaid.x;
	mov.u32 %thread_id, %tid.x;
	shl.b32 %block_base, %block_id, 9;
	shl.b32 %lane_base, %thread_id, 2;
	or.b32 %elem, %block_base, %lane_base;

	// 4 bytes per source element, 2 bytes per destination element.
	mul.wide.u32 %src_off, %elem, 4;
	add.s64 %src_ptr, %src_base, %src_off;
	mul.wide.u32 %dst_off, %elem, 2;
	add.s64 %dst_ptr, %dst_base, %dst_off;

	// Slab 0.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr], {%half0, %half1, %half2, %half3};

	// Slab 1.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+2621440];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+1310720], {%half0, %half1, %half2, %half3};

	// Slab 2.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+5242880];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+2621440], {%half0, %half1, %half2, %half3};

	// Slab 3.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+7864320];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+3932160], {%half0, %half1, %half2, %half3};

	// Slab 4.
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+10485760];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+5242880], {%half0, %half1, %half2, %half3};

	// Slab 5, bounds-checked on elem + 3276800.
	add.s32 %elem_slab5, %elem, 3276800;
	setp.gt.u32 %skip_slab5, %elem_slab5, 4194303;
	@%skip_slab5 bra CONVERT_1395_SLAB6;
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+13107200];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+6553600], {%half0, %half1, %half2, %half3};

CONVERT_1395_SLAB6:
	// Slab 6, bounds-checked on (ctaid.x * 512 + 3932160) | (tid.x * 4).
	add.s32 %base_slab6, %block_base, 3932160;
	or.b32 %elem_slab6, %base_slab6, %lane_base;
	setp.gt.u32 %skip_slab6, %elem_slab6, 4194303;
	@%skip_slab6 bra CONVERT_1395_DONE;
	ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr+15728640];
	cvt.rn.f16.f32 %half0, %val0;
	cvt.rn.f16.f32 %half1, %val1;
	cvt.rn.f16.f32 %half2, %val2;
	cvt.rn.f16.f32 %half3, %val3;
	st.global.v4.b16 [%dst_ptr+7864320], {%half0, %half1, %half2, %half3};

CONVERT_1395_DONE:
	ret;
}
// .globl rng_get_and_update_state_2
// rng_get_and_update_state_2 -- copy out the 128-bit counter held in the
// module-level `rng_state` global and advance that counter by 524288.
//   param_0 : destination for the pre-increment state (two u64 words, low word first)
//   param_1 : declared but never read
// The update is a plain load / add / store sequence with no atomics.
// NOTE(review): presumably launched with a single thread -- confirm at the launch site.
.visible .entry rng_get_and_update_state_2(
.param .u64 rng_get_and_update_state_2_param_0,
.param .u64 rng_get_and_update_state_2_param_1
)
{
	.reg .pred %wrapped, %below_step;
	.reg .b64 %out_arg, %out_ptr;
	.reg .b64 %old_lo, %old_hi, %new_lo, %new_hi, %carry_bit, %carry;

	ld.param.u64 %out_arg, [rng_get_and_update_state_2_param_0];
	cvta.to.global.u64 %out_ptr, %out_arg;

	// Current state: high word first, then low word.
	ld.global.u64 %old_hi, [rng_state+8];
	ld.global.u64 %old_lo, [rng_state];

	// 128-bit add of 524288: bump the low word, then carry into the high word.
	// Both predicates detect wrap-around of the low word.
	add.s64 %new_lo, %old_lo, 524288;
	setp.lt.u64 %wrapped, %new_lo, %old_lo;
	setp.lt.u64 %below_step, %new_lo, 524288;
	selp.u64 %carry_bit, 1, 0, %wrapped;
	selp.b64 %carry, 1, %carry_bit, %below_step;
	add.s64 %new_hi, %old_hi, %carry;

	// Publish the advanced state, then hand the old state to the caller.
	st.global.u64 [rng_state], %new_lo;
	st.global.u64 [rng_state+8], %new_hi;
	st.global.u64 [%out_ptr+8], %old_hi;
	st.global.u64 [%out_ptr], %old_lo;
	ret;
}
// .globl fusion_2248
.visible .entry fusion_2248(
.param .u64 fusion_2248_param_0,
.param .u64 fusion_2248_param_1,
.param .u64 fusion_2248_param_2,
.param .u64 fusion_2248_param_3,
.param .u64 fusion_2248_param_4,
.param .u64 fusion_2248_param_5,
.param .u64 fusion_2248_param_6,
.param .u64 fusion_2248_param_7,
.param .u64 fusion_2248_param_8,
.param .u64 fusion_2248_param_9
)
.reqntid 64, 1, 1
{
.local .align 4 .b8 __local_depot29[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<75>;
.reg .b16 %h<145>;
.reg .f32 %f<254>;
.reg .b32 %r<350>;
.reg .b64 %rd<2739>;
mov.u64 %SPL, __local_depot29;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd463, [fusion_2248_param_0];
ld.param.u64 %rd464, [fusion_2248_param_8];
cvta.to.global.u64 %rd1, %rd464;
ld.param.u64 %rd465, [fusion_2248_param_1];
ld.param.u64 %rd466, [fusion_2248_param_7];
cvta.to.global.u64 %rd2, %rd466;
ld.param.u64 %rd467, [fusion_2248_param_2];
ld.param.u64 %rd468, [fusion_2248_param_6];
cvta.to.global.u64 %rd3, %rd468;
ld.param.u64 %rd470, [fusion_2248_param_5];
cvta.to.global.u64 %rd4, %rd470;
ld.param.u64 %rd471, [fusion_2248_param_4];
cvta.to.global.u64 %rd5, %rd471;
cvta.to.global.u64 %rd7, %rd467;
cvta.to.global.u64 %rd8, %rd465;
cvta.to.global.u64 %rd9, %rd463;
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
shl.b32 %r3, %r1, 1;
shl.b32 %r4, %r2, 10;
or.b32 %r48, %r4, %r3;
shr.u32 %r49, %r48, 2;
and.b32 %r5, %r1, 1;
setp.eq.s32 %p1, %r5, 0;
ld.global.nc.u64 %rd11, [%rd7];
cvt.u64.u32 %rd473, %r49;
add.s64 %rd12, %rd11, %rd473;
setp.lt.u64 %p69, %rd12, %rd11;
and.b64 %rd2384, %rd12, 4294967295;
@%p1 bra LBB29_1;
bra.uni LBB29_4;
LBB29_1:
mul.lo.s64 %rd2446, %rd2384, 3528531795;
ld.global.nc.u64 %rd2461, [%rd7+8];
selp.u64 %rd516, 1, 0, %p69;
add.s64 %rd517, %rd2461, %rd516;
xor.b64 %rd518, %rd517, %rd2446;
shr.u64 %rd519, %rd518, 32;
mul.lo.s64 %rd2449, %rd519, 3449720151;
shr.u64 %rd520, %rd2449, 32;
and.b64 %rd521, %rd517, 4294967295;
mul.lo.s64 %rd522, %rd521, 3449720151;
and.b64 %rd523, %rd522, 4294967295;
xor.b64 %rd524, %rd523, %rd520;
xor.b64 %rd525, %rd524, 2654435769;
mul.lo.s64 %rd2452, %rd525, 3528531795;
xor.b64 %rd2442, %rd522, %rd12;
mov.u32 %r312, -1879881855;
mov.u32 %r311, -845247145;
mov.u32 %r310, 534103459;
mov.u64 %rd2460, 3678237736;
mov.u64 %rd2459, 3041712726;
mov.u64 %rd2458, 1401181199;
mov.u64 %rd2457, 2835769497;
mov.u64 %rd2456, 1684936478;
mov.u64 %rd2455, 2027808484;
mov.u64 %rd2454, 387276957;
mov.u64 %rd2453, 842468239;
mov.u64 %rd2451, 3986602516;
mov.u64 %rd2450, 1013904242;
mov.u64 %rd2448, 3668340011;
mov.u64 %rd2447, 3144134277;
mov.u64 %rd2445, 3449720151;
mov.u64 %rd2444, 1993301258;
mov.u64 %rd2443, 3528531795;
bra.uni LBB29_5;
LBB29_4:
mov.u32 %r311, -766435501;
mov.u64 %rd2459, 1684936478;
mov.u64 %rd2458, 534103459;
mov.u64 %rd2457, 387276957;
mov.u64 %rd2456, 3041712726;
mov.u64 %rd2455, 3986602516;
mov.u64 %rd2454, 2835769497;
mov.u64 %rd2453, 3668340011;
mov.u64 %rd2451, 2027808484;
mov.u64 %rd2450, 1993301258;
mov.u64 %rd2448, 842468239;
mov.u64 %rd2447, 2654435769;
mov.u64 %rd2445, 3528531795;
mov.u64 %rd2444, 1013904242;
mov.u64 %rd2443, 3449720151;
mov.u32 %r312, -1767562579;
mov.u32 %r310, 1401181199;
mov.u64 %rd2460, 4055616968;
ld.global.nc.u64 %rd2461, [%rd7+8];
selp.u64 %rd489, 1, 0, %p69;
add.s64 %rd490, %rd2461, %rd489;
and.b64 %rd491, %rd490, 4294967295;
mul.lo.s64 %rd2446, %rd491, 3449720151;
xor.b64 %rd492, %rd2446, %rd12;
shr.u64 %rd493, %rd492, 32;
mul.lo.s64 %rd2449, %rd493, 3528531795;
shr.u64 %rd494, %rd2449, 32;
mul.lo.s64 %rd496, %rd2384, 3528531795;
and.b64 %rd497, %rd496, 4294967295;
xor.b64 %rd498, %rd497, %rd494;
xor.b64 %rd499, %rd498, 3144134277;
mul.lo.s64 %rd2452, %rd499, 3449720151;
xor.b64 %rd2442, %rd490, %rd496;
LBB29_5:
shr.u64 %rd526, %rd2452, 32;
shr.u64 %rd527, %rd2442, 32;
mul.lo.s64 %rd528, %rd527, %rd2443;
and.b64 %rd529, %rd528, 4294967295;
xor.b64 %rd530, %rd529, %rd526;
xor.b64 %rd531, %rd530, %rd2444;
mul.lo.s64 %rd532, %rd531, %rd2445;
shr.u64 %rd533, %rd532, 32;
shr.u64 %rd534, %rd528, 32;
and.b64 %rd535, %rd2446, 4294967295;
xor.b64 %rd536, %rd535, %rd534;
xor.b64 %rd537, %rd536, %rd2447;
mul.lo.s64 %rd538, %rd537, %rd2445;
and.b64 %rd539, %rd538, 4294967295;
xor.b64 %rd540, %rd539, %rd533;
xor.b64 %rd541, %rd540, %rd2448;
mul.lo.s64 %rd542, %rd541, %rd2443;
shr.u64 %rd543, %rd542, 32;
shr.u64 %rd544, %rd538, 32;
and.b64 %rd545, %rd2449, 4294967295;
xor.b64 %rd546, %rd545, %rd544;
xor.b64 %rd547, %rd546, %rd2450;
mul.lo.s64 %rd548, %rd547, %rd2443;
and.b64 %rd549, %rd548, 4294967295;
xor.b64 %rd550, %rd549, %rd543;
xor.b64 %rd551, %rd550, %rd2451;
mul.lo.s64 %rd552, %rd551, %rd2445;
shr.u64 %rd553, %rd552, 32;
shr.u64 %rd554, %rd548, 32;
and.b64 %rd555, %rd2452, 4294967295;
xor.b64 %rd556, %rd555, %rd554;
xor.b64 %rd557, %rd556, %rd2453;
mul.lo.s64 %rd558, %rd557, %rd2445;
and.b64 %rd559, %rd558, 4294967295;
xor.b64 %rd560, %rd559, %rd553;
xor.b64 %rd561, %rd560, %rd2454;
mul.lo.s64 %rd562, %rd561, %rd2443;
shr.u64 %rd563, %rd562, 32;
shr.u64 %rd564, %rd558, 32;
and.b64 %rd565, %rd532, 4294967295;
xor.b64 %rd566, %rd565, %rd564;
xor.b64 %rd567, %rd566, %rd2455;
mul.lo.s64 %rd568, %rd567, %rd2443;
and.b64 %rd569, %rd568, 4294967295;
xor.b64 %rd570, %rd569, %rd563;
xor.b64 %rd571, %rd570, %rd2456;
mul.lo.s64 %rd572, %rd571, %rd2445;
shr.u64 %rd573, %rd572, 32;
shr.u64 %rd574, %rd568, 32;
and.b64 %rd575, %rd542, 4294967295;
xor.b64 %rd576, %rd575, %rd574;
xor.b64 %rd577, %rd576, %rd2457;
mul.lo.s64 %rd578, %rd577, %rd2445;
and.b64 %rd579, %rd578, 4294967295;
xor.b64 %rd580, %rd579, %rd573;
xor.b64 %rd581, %rd580, %rd2458;
mul.lo.s64 %rd582, %rd581, %rd2443;
shr.u64 %rd583, %rd582, 32;
shr.u64 %rd584, %rd578, 32;
and.b64 %rd585, %rd552, 4294967295;
xor.b64 %rd586, %rd585, %rd584;
xor.b64 %rd587, %rd586, %rd2459;
mul.lo.s64 %rd588, %rd587, %rd2443;
and.b64 %rd589, %rd588, 4294967295;
xor.b64 %rd590, %rd589, %rd583;
xor.b64 %rd591, %rd590, %rd2460;
mul.lo.s64 %rd592, %rd591, %rd2445;
shr.u64 %rd593, %rd592, 32;
cvt.u32.u64 %r56, %rd593;
shr.u64 %rd594, %rd588, 32;
xor.b64 %rd595, %rd594, %rd562;
cvt.u32.u64 %r57, %rd595;
xor.b32 %r58, %r310, %r57;
mul.lo.s32 %r59, %r58, %r311;
xor.b32 %r60, %r59, %r56;
xor.b32 %r61, %r60, %r312;
shr.u32 %r62, %r61, 9;
cvt.rn.f32.u32 %f19, %r62;
mul.rn.f32 %f20, %f19, 0f34000000;
cvt.rn.f16.f32 %h1, %f20;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p4, %h1, %h2;
mul.wide.u32 %rd596, %r2, 2048;
add.s64 %rd597, %rd9, %rd596;
mul.wide.u32 %rd598, %r3, 2;
add.s64 %rd44, %rd597, %rd598;
ld.global.nc.b16 %h3, [%rd44];
mul.wide.u32 %rd599, %r3, 4;
add.s64 %rd45, %rd1, %rd599;
ld.global.nc.f32 %f21, [%rd45];
cvt.rn.f16.f32 %h4, %f21;
add.rn.f16 %h5, %h3, %h4;
mov.b16 %h6, 0x3C72;
mul.rn.f16 %h7, %h5, %h6;
selp.b16 %h8, %h7, 0x0000, %p4;
cvt.f32.f16 %f22, %h8;
add.s64 %rd600, %rd8, %rd596;
add.s64 %rd46, %rd600, %rd598;
ld.global.nc.b16 %h9, [%rd46];
cvt.f32.f16 %f23, %h9;
mul.wide.u32 %rd601, %r2, 4;
add.s64 %rd602, %rd5, %rd601;
ld.global.nc.f32 %f24, [%rd602];
mul.rn.f32 %f25, %f24, 0f3A800000;
add.rn.f32 %f26, %f25, 0f2B8CBCCC;
rsqrt.approx.f32 %f1, %f26;
add.s64 %rd47, %rd2, %rd599;
ld.global.nc.f32 %f27, [%rd47];
mul.rn.f32 %f28, %f1, %f27;
mul.rn.f32 %f29, %f28, %f23;
add.s64 %rd48, %rd3, %rd599;
ld.global.nc.f32 %f30, [%rd48];
add.s64 %rd603, %rd4, %rd601;
ld.global.nc.f32 %f31, [%rd603];
mul.rn.f32 %f2, %f31, 0f3A800000;
mul.rn.f32 %f32, %f28, %f2;
sub.rn.f32 %f33, %f30, %f32;
add.rn.f32 %f34, %f29, %f33;
add.rn.f32 %f35, %f34, %f22;
add.rn.f32 %f3, %f35, 0f00000000;
or.b32 %r63, %r3, 1;
and.b32 %r64, %r63, 3;
setp.ne.s32 %p5, %r64, 1;
@%p5 bra LBB29_7;
mul.lo.s64 %rd2466, %rd2384, 3528531795;
selp.u64 %rd644, 1, 0, %p69;
add.s64 %rd645, %rd2461, %rd644;
xor.b64 %rd646, %rd645, %rd2466;
shr.u64 %rd647, %rd646, 32;
mul.lo.s64 %rd2469, %rd647, 3449720151;
shr.u64 %rd648, %rd2469, 32;
and.b64 %rd649, %rd645, 4294967295;
mul.lo.s64 %rd650, %rd649, 3449720151;
and.b64 %rd651, %rd650, 4294967295;
xor.b64 %rd652, %rd651, %rd648;
xor.b64 %rd653, %rd652, 2654435769;
mul.lo.s64 %rd2472, %rd653, 3528531795;
xor.b64 %rd2462, %rd650, %rd12;
mov.u32 %r314, -845247145;
mov.u32 %r313, -616729560;
mov.u64 %rd2479, 3041712726;
mov.u64 %rd2478, 1401181199;
mov.u64 %rd2477, 2835769497;
mov.u64 %rd2476, 1684936478;
mov.u64 %rd2475, 2027808484;
mov.u64 %rd2474, 387276957;
mov.u64 %rd2473, 842468239;
mov.u64 %rd2471, 3986602516;
mov.u64 %rd2470, 1013904242;
mov.u64 %rd2468, 3668340011;
mov.u64 %rd2467, 3144134277;
mov.u64 %rd2465, 3449720151;
mov.u64 %rd2464, 1993301258;
mov.u64 %rd2463, 3528531795;
bra.uni LBB29_8;
LBB29_7:
mov.u32 %r313, -239350328;
selp.u64 %rd618, 1, 0, %p69;
add.s64 %rd619, %rd2461, %rd618;
and.b64 %rd620, %rd619, 4294967295;
mul.lo.s64 %rd2466, %rd620, 3449720151;
xor.b64 %rd621, %rd2466, %rd12;
shr.u64 %rd622, %rd621, 32;
mul.lo.s64 %rd2469, %rd622, 3528531795;
shr.u64 %rd623, %rd2469, 32;
mul.lo.s64 %rd625, %rd2384, 3528531795;
and.b64 %rd626, %rd625, 4294967295;
xor.b64 %rd627, %rd626, %rd623;
xor.b64 %rd628, %rd627, 3144134277;
mul.lo.s64 %rd2472, %rd628, 3449720151;
xor.b64 %rd2462, %rd619, %rd625;
mov.u32 %r314, -766435501;
mov.u64 %rd2479, 1684936478;
mov.u64 %rd2478, 534103459;
mov.u64 %rd2477, 387276957;
mov.u64 %rd2476, 3041712726;
mov.u64 %rd2475, 3986602516;
mov.u64 %rd2474, 2835769497;
mov.u64 %rd2473, 3668340011;
mov.u64 %rd2471, 2027808484;
mov.u64 %rd2470, 1993301258;
mov.u64 %rd2468, 842468239;
mov.u64 %rd2467, 2654435769;
mov.u64 %rd2465, 3528531795;
mov.u64 %rd2464, 1013904242;
mov.u64 %rd2463, 3449720151;
LBB29_8:
setp.ne.s32 %p8, %r5, 0;
shr.u64 %rd654, %rd2472, 32;
shr.u64 %rd655, %rd2462, 32;
mul.lo.s64 %rd656, %rd655, %rd2463;
and.b64 %rd657, %rd656, 4294967295;
xor.b64 %rd658, %rd657, %rd654;
xor.b64 %rd659, %rd658, %rd2464;
mul.lo.s64 %rd660, %rd659, %rd2465;
shr.u64 %rd661, %rd660, 32;
shr.u64 %rd662, %rd656, 32;
and.b64 %rd663, %rd2466, 4294967295;
xor.b64 %rd664, %rd663, %rd662;
xor.b64 %rd665, %rd664, %rd2467;
mul.lo.s64 %rd666, %rd665, %rd2465;
and.b64 %rd667, %rd666, 4294967295;
xor.b64 %rd668, %rd667, %rd661;
xor.b64 %rd669, %rd668, %rd2468;
mul.lo.s64 %rd670, %rd669, %rd2463;
shr.u64 %rd671, %rd670, 32;
shr.u64 %rd672, %rd666, 32;
and.b64 %rd673, %rd2469, 4294967295;
xor.b64 %rd674, %rd673, %rd672;
xor.b64 %rd675, %rd674, %rd2470;
mul.lo.s64 %rd676, %rd675, %rd2463;
and.b64 %rd677, %rd676, 4294967295;
xor.b64 %rd678, %rd677, %rd671;
xor.b64 %rd679, %rd678, %rd2471;
mul.lo.s64 %rd680, %rd679, %rd2465;
shr.u64 %rd681, %rd680, 32;
shr.u64 %rd682, %rd676, 32;
and.b64 %rd683, %rd2472, 4294967295;
xor.b64 %rd684, %rd683, %rd682;
xor.b64 %rd685, %rd684, %rd2473;
mul.lo.s64 %rd686, %rd685, %rd2465;
and.b64 %rd687, %rd686, 4294967295;
xor.b64 %rd688, %rd687, %rd681;
xor.b64 %rd689, %rd688, %rd2474;
mul.lo.s64 %rd690, %rd689, %rd2463;
shr.u64 %rd691, %rd690, 32;
shr.u64 %rd692, %rd686, 32;
and.b64 %rd693, %rd660, 4294967295;
xor.b64 %rd694, %rd693, %rd692;
xor.b64 %rd695, %rd694, %rd2475;
mul.lo.s64 %rd696, %rd695, %rd2463;
and.b64 %rd697, %rd696, 4294967295;
xor.b64 %rd698, %rd697, %rd691;
xor.b64 %rd699, %rd698, %rd2476;
mul.lo.s64 %rd700, %rd699, %rd2465;
shr.u64 %rd701, %rd700, 32;
shr.u64 %rd702, %rd696, 32;
and.b64 %rd703, %rd670, 4294967295;
xor.b64 %rd704, %rd703, %rd702;
xor.b64 %rd705, %rd704, %rd2477;
mul.lo.s64 %rd706, %rd705, %rd2465;
and.b64 %rd707, %rd706, 4294967295;
xor.b64 %rd708, %rd707, %rd701;
xor.b64 %rd709, %rd708, %rd2478;
mul.lo.s64 %rd710, %rd709, %rd2463;
shr.u64 %rd711, %rd710, 32;
shr.u64 %rd712, %rd706, 32;
xor.b64 %rd713, %rd680, %rd712;
xor.b64 %rd714, %rd713, %rd2479;
mul.lo.s64 %rd715, %rd714, %rd2463;
xor.b64 %rd716, %rd711, %rd715;
cvt.u32.u64 %r69, %rd716;
xor.b32 %r70, %r313, %r69;
mul.lo.s32 %r71, %r70, %r314;
shr.u32 %r72, %r71, 9;
cvt.rn.f32.u32 %f36, %r72;
mul.rn.f32 %f37, %f36, 0f34000000;
cvt.rn.f16.f32 %h10, %f37;
mov.b16 %h11, 0x2E66;
setp.ge.f16 %p9, %h10, %h11;
ld.global.nc.b16 %h12, [%rd44+2];
ld.global.nc.f32 %f38, [%rd45+4];
cvt.rn.f16.f32 %h13, %f38;
add.rn.f16 %h14, %h12, %h13;
mov.b16 %h15, 0x3C72;
mul.rn.f16 %h16, %h14, %h15;
selp.b16 %h17, %h16, 0x0000, %p9;
cvt.f32.f16 %f39, %h17;
ld.global.nc.b16 %h18, [%rd46+2];
cvt.f32.f16 %f40, %h18;
ld.global.nc.f32 %f41, [%rd47+4];
mul.rn.f32 %f42, %f1, %f41;
mul.rn.f32 %f43, %f42, %f40;
ld.global.nc.f32 %f44, [%rd48+4];
mul.rn.f32 %f45, %f2, %f42;
sub.rn.f32 %f46, %f44, %f45;
add.rn.f32 %f47, %f43, %f46;
add.rn.f32 %f48, %f47, %f39;
add.rn.f32 %f4, %f3, %f48;
or.b32 %r73, %r3, %r4;
or.b32 %r74, %r73, 128;
shr.u32 %r75, %r74, 2;
cvt.u64.u32 %rd717, %r75;
add.s64 %rd75, %rd11, %rd717;
and.b64 %rd2433, %rd75, 4294967295;
setp.lt.u64 %p74, %rd75, %rd11;
@%p8 bra LBB29_10;
mul.lo.s64 %rd2484, %rd2433, 3528531795;
selp.u64 %rd760, 1, 0, %p74;
add.s64 %rd761, %rd2461, %rd760;
xor.b64 %rd762, %rd761, %rd2484;
shr.u64 %rd763, %rd762, 32;
mul.lo.s64 %rd2487, %rd763, 3449720151;
shr.u64 %rd764, %rd2487, 32;
and.b64 %rd765, %rd761, 4294967295;
mul.lo.s64 %rd766, %rd765, 3449720151;
and.b64 %rd767, %rd766, 4294967295;
xor.b64 %rd768, %rd767, %rd764;
xor.b64 %rd769, %rd768, 2654435769;
mul.lo.s64 %rd2490, %rd769, 3528531795;
xor.b64 %rd2480, %rd766, %rd75;
mov.u32 %r317, -1879881855;
mov.u32 %r316, -845247145;
mov.u32 %r315, 534103459;
mov.u64 %rd2498, 3678237736;
mov.u64 %rd2497, 3041712726;
mov.u64 %rd2496, 1401181199;
mov.u64 %rd2495, 2835769497;
mov.u64 %rd2494, 1684936478;
mov.u64 %rd2493, 2027808484;
mov.u64 %rd2492, 387276957;
mov.u64 %rd2491, 842468239;
mov.u64 %rd2489, 3986602516;
mov.u64 %rd2488, 1013904242;
mov.u64 %rd2486, 3668340011;
mov.u64 %rd2485, 3144134277;
mov.u64 %rd2483, 3449720151;
mov.u64 %rd2482, 1993301258;
mov.u64 %rd2481, 3528531795;
bra.uni LBB29_11;
LBB29_10:
selp.u64 %rd733, 1, 0, %p74;
add.s64 %rd734, %rd2461, %rd733;
and.b64 %rd735, %rd734, 4294967295;
mul.lo.s64 %rd2484, %rd735, 3449720151;
xor.b64 %rd736, %rd2484, %rd75;
shr.u64 %rd737, %rd736, 32;
mul.lo.s64 %rd2487, %rd737, 3528531795;
shr.u64 %rd738, %rd2487, 32;
mul.lo.s64 %rd740, %rd2433, 3528531795;
and.b64 %rd741, %rd740, 4294967295;
xor.b64 %rd742, %rd741, %rd738;
xor.b64 %rd743, %rd742, 3144134277;
mul.lo.s64 %rd2490, %rd743, 3449720151;
xor.b64 %rd2480, %rd734, %rd740;
mov.u32 %r317, -1767562579;
mov.u32 %r316, -766435501;
mov.u32 %r315, 1401181199;
mov.u64 %rd2498, 4055616968;
mov.u64 %rd2497, 1684936478;
mov.u64 %rd2496, 534103459;
mov.u64 %rd2495, 387276957;
mov.u64 %rd2494, 3041712726;
mov.u64 %rd2493, 3986602516;
mov.u64 %rd2492, 2835769497;
mov.u64 %rd2491, 3668340011;
mov.u64 %rd2489, 2027808484;
mov.u64 %rd2488, 1993301258;
mov.u64 %rd2486, 842468239;
mov.u64 %rd2485, 2654435769;
mov.u64 %rd2483, 3528531795;
mov.u64 %rd2482, 1013904242;
mov.u64 %rd2481, 3449720151;
LBB29_11:
shr.u64 %rd770, %rd2490, 32;
shr.u64 %rd771, %rd2480, 32;
mul.lo.s64 %rd772, %rd771, %rd2481;
and.b64 %rd773, %rd772, 4294967295;
xor.b64 %rd774, %rd773, %rd770;
xor.b64 %rd775, %rd774, %rd2482;
mul.lo.s64 %rd776, %rd775, %rd2483;
shr.u64 %rd777, %rd776, 32;
shr.u64 %rd778, %rd772, 32;
and.b64 %rd779, %rd2484, 4294967295;
xor.b64 %rd780, %rd779, %rd778;
xor.b64 %rd781, %rd780, %rd2485;
mul.lo.s64 %rd782, %rd781, %rd2483;
and.b64 %rd783, %rd782, 4294967295;
xor.b64 %rd784, %rd783, %rd777;
xor.b64 %rd785, %rd784, %rd2486;
mul.lo.s64 %rd786, %rd785, %rd2481;
shr.u64 %rd787, %rd786, 32;
shr.u64 %rd788, %rd782, 32;
and.b64 %rd789, %rd2487, 4294967295;
xor.b64 %rd790, %rd789, %rd788;
xor.b64 %rd791, %rd790, %rd2488;
mul.lo.s64 %rd792, %rd791, %rd2481;
and.b64 %rd793, %rd792, 4294967295;
xor.b64 %rd794, %rd793, %rd787;
xor.b64 %rd795, %rd794, %rd2489;
mul.lo.s64 %rd796, %rd795, %rd2483;
shr.u64 %rd797, %rd796, 32;
shr.u64 %rd798, %rd792, 32;
and.b64 %rd799, %rd2490, 4294967295;
xor.b64 %rd800, %rd799, %rd798;
xor.b64 %rd801, %rd800, %rd2491;
mul.lo.s64 %rd802, %rd801, %rd2483;
and.b64 %rd803, %rd802, 4294967295;
xor.b64 %rd804, %rd803, %rd797;
xor.b64 %rd805, %rd804, %rd2492;
mul.lo.s64 %rd806, %rd805, %rd2481;
shr.u64 %rd807, %rd806, 32;
shr.u64 %rd808, %rd802, 32;
and.b64 %rd809, %rd776, 4294967295;
xor.b64 %rd810, %rd809, %rd808;
xor.b64 %rd811, %rd810, %rd2493;
mul.lo.s64 %rd812, %rd811, %rd2481;
and.b64 %rd813, %rd812, 4294967295;
xor.b64 %rd814, %rd813, %rd807;
xor.b64 %rd815, %rd814, %rd2494;
mul.lo.s64 %rd816, %rd815, %rd2483;
shr.u64 %rd817, %rd816, 32;
shr.u64 %rd818, %rd812, 32;
and.b64 %rd819, %rd786, 4294967295;
xor.b64 %rd820, %rd819, %rd818;
xor.b64 %rd821, %rd820, %rd2495;
mul.lo.s64 %rd822, %rd821, %rd2483;
and.b64 %rd823, %rd822, 4294967295;
xor.b64 %rd824, %rd823, %rd817;
xor.b64 %rd825, %rd824, %rd2496;
mul.lo.s64 %rd826, %rd825, %rd2481;
shr.u64 %rd827, %rd826, 32;
shr.u64 %rd828, %rd822, 32;
and.b64 %rd829, %rd796, 4294967295;
xor.b64 %rd830, %rd829, %rd828;
xor.b64 %rd831, %rd830, %rd2497;
mul.lo.s64 %rd832, %rd831, %rd2481;
and.b64 %rd833, %rd832, 4294967295;
xor.b64 %rd834, %rd833, %rd827;
xor.b64 %rd835, %rd834, %rd2498;
mul.lo.s64 %rd836, %rd835, %rd2483;
shr.u64 %rd837, %rd836, 32;
cvt.u32.u64 %r82, %rd837;
shr.u64 %rd838, %rd832, 32;
xor.b64 %rd839, %rd838, %rd806;
cvt.u32.u64 %r83, %rd839;
xor.b32 %r84, %r315, %r83;
mul.lo.s32 %r85, %r84, %r316;
xor.b32 %r86, %r85, %r82;
xor.b32 %r87, %r86, %r317;
shr.u32 %r88, %r87, 9;
cvt.rn.f32.u32 %f49, %r88;
mul.rn.f32 %f50, %f49, 0f34000000;
cvt.rn.f16.f32 %h19, %f50;
mov.b16 %h20, 0x2E66;
setp.ge.f16 %p12, %h19, %h20;
ld.global.nc.b16 %h21, [%rd44+256];
ld.global.nc.f32 %f51, [%rd45+512];
cvt.rn.f16.f32 %h22, %f51;
add.rn.f16 %h23, %h21, %h22;
mov.b16 %h24, 0x3C72;
mul.rn.f16 %h25, %h23, %h24;
selp.b16 %h26, %h25, 0x0000, %p12;
cvt.f32.f16 %f52, %h26;
ld.global.nc.b16 %h27, [%rd46+256];
cvt.f32.f16 %f53, %h27;
ld.global.nc.f32 %f54, [%rd47+512];
mul.rn.f32 %f55, %f1, %f54;
mul.rn.f32 %f56, %f55, %f53;
ld.global.nc.f32 %f57, [%rd48+512];
mul.rn.f32 %f58, %f2, %f55;
sub.rn.f32 %f59, %f57, %f58;
add.rn.f32 %f60, %f56, %f59;
add.rn.f32 %f61, %f60, %f52;
add.rn.f32 %f5, %f4, %f61;
or.b32 %r89, %r3, 129;
or.b32 %r90, %r89, %r4;
and.b32 %r91, %r89, 3;
shr.u32 %r92, %r90, 2;
setp.ne.s32 %p13, %r91, 1;
cvt.u64.u32 %rd840, %r92;
add.s64 %rd103, %rd11, %rd840;
and.b64 %rd2430, %rd103, 4294967295;
setp.lt.u64 %p73, %rd103, %rd11;
@%p13 bra LBB29_13;
mul.lo.s64 %rd2503, %rd2430, 3528531795;
selp.u64 %rd881, 1, 0, %p73;
add.s64 %rd882, %rd2461, %rd881;
xor.b64 %rd883, %rd882, %rd2503;
shr.u64 %rd884, %rd883, 32;
mul.lo.s64 %rd2506, %rd884, 3449720151;
shr.u64 %rd885, %rd2506, 32;
and.b64 %rd886, %rd882, 4294967295;
mul.lo.s64 %rd887, %rd886, 3449720151;
and.b64 %rd888, %rd887, 4294967295;
xor.b64 %rd889, %rd888, %rd885;
xor.b64 %rd890, %rd889, 2654435769;
mul.lo.s64 %rd2509, %rd890, 3528531795;
xor.b64 %rd2499, %rd887, %rd103;
mov.u32 %r319, -845247145;
mov.u32 %r318, -616729560;
mov.u64 %rd2516, 3041712726;
mov.u64 %rd2515, 1401181199;
mov.u64 %rd2514, 2835769497;
mov.u64 %rd2513, 1684936478;
mov.u64 %rd2512, 2027808484;
mov.u64 %rd2511, 387276957;
mov.u64 %rd2510, 842468239;
mov.u64 %rd2508, 3986602516;
mov.u64 %rd2507, 1013904242;
mov.u64 %rd2505, 3668340011;
mov.u64 %rd2504, 3144134277;
mov.u64 %rd2502, 3449720151;
mov.u64 %rd2501, 1993301258;
mov.u64 %rd2500, 3528531795;
bra.uni LBB29_14;
LBB29_13:
selp.u64 %rd855, 1, 0, %p73;
add.s64 %rd856, %rd2461, %rd855;
and.b64 %rd857, %rd856, 4294967295;
mul.lo.s64 %rd2503, %rd857, 3449720151;
xor.b64 %rd858, %rd2503, %rd103;
shr.u64 %rd859, %rd858, 32;
mul.lo.s64 %rd2506, %rd859, 3528531795;
shr.u64 %rd860, %rd2506, 32;
mul.lo.s64 %rd862, %rd2430, 3528531795;
and.b64 %rd863, %rd862, 4294967295;
xor.b64 %rd864, %rd863, %rd860;
xor.b64 %rd865, %rd864, 3144134277;
mul.lo.s64 %rd2509, %rd865, 3449720151;
xor.b64 %rd2499, %rd856, %rd862;
mov.u32 %r319, -766435501;
mov.u32 %r318, -239350328;
mov.u64 %rd2516, 1684936478;
mov.u64 %rd2515, 534103459;
mov.u64 %rd2514, 387276957;
mov.u64 %rd2513, 3041712726;
mov.u64 %rd2512, 3986602516;
mov.u64 %rd2511, 2835769497;
mov.u64 %rd2510, 3668340011;
mov.u64 %rd2508, 2027808484;
mov.u64 %rd2507, 1993301258;
mov.u64 %rd2505, 842468239;
mov.u64 %rd2504, 2654435769;
mov.u64 %rd2502, 3528531795;
mov.u64 %rd2501, 1013904242;
mov.u64 %rd2500, 3449720151;
LBB29_14:
shr.u64 %rd891, %rd2509, 32;
shr.u64 %rd892, %rd2499, 32;
mul.lo.s64 %rd893, %rd892, %rd2500;
and.b64 %rd894, %rd893, 4294967295;
xor.b64 %rd895, %rd894, %rd891;
xor.b64 %rd896, %rd895, %rd2501;
mul.lo.s64 %rd897, %rd896, %rd2502;
shr.u64 %rd898, %rd897, 32;
shr.u64 %rd899, %rd893, 32;
and.b64 %rd900, %rd2503, 4294967295;
xor.b64 %rd901, %rd900, %rd899;
xor.b64 %rd902, %rd901, %rd2504;
mul.lo.s64 %rd903, %rd902, %rd2502;
and.b64 %rd904, %rd903, 4294967295;
xor.b64 %rd905, %rd904, %rd898;
xor.b64 %rd906, %rd905, %rd2505;
mul.lo.s64 %rd907, %rd906, %rd2500;
shr.u64 %rd908, %rd907, 32;
shr.u64 %rd909, %rd903, 32;
and.b64 %rd910, %rd2506, 4294967295;
xor.b64 %rd911, %rd910, %rd909;
xor.b64 %rd912, %rd911, %rd2507;
mul.lo.s64 %rd913, %rd912, %rd2500;
and.b64 %rd914, %rd913, 4294967295;
xor.b64 %rd915, %rd914, %rd908;
xor.b64 %rd916, %rd915, %rd2508;
mul.lo.s64 %rd917, %rd916, %rd2502;
shr.u64 %rd918, %rd917, 32;
shr.u64 %rd919, %rd913, 32;
and.b64 %rd920, %rd2509, 4294967295;
xor.b64 %rd921, %rd920, %rd919;
xor.b64 %rd922, %rd921, %rd2510;
mul.lo.s64 %rd923, %rd922, %rd2502;
and.b64 %rd924, %rd923, 4294967295;
xor.b64 %rd925, %rd924, %rd918;
xor.b64 %rd926, %rd925, %rd2511;
mul.lo.s64 %rd927, %rd926, %rd2500;
shr.u64 %rd928, %rd927, 32;
shr.u64 %rd929, %rd923, 32;
and.b64 %rd930, %rd897, 4294967295;
xor.b64 %rd931, %rd930, %rd929;
xor.b64 %rd932, %rd931, %rd2512;
mul.lo.s64 %rd933, %rd932, %rd2500;
and.b64 %rd934, %rd933, 4294967295;
xor.b64 %rd935, %rd934, %rd928;
xor.b64 %rd936, %rd935, %rd2513;
mul.lo.s64 %rd937, %rd936, %rd2502;
shr.u64 %rd938, %rd937, 32;
shr.u64 %rd939, %rd933, 32;
and.b64 %rd940, %rd907, 4294967295;
xor.b64 %rd941, %rd940, %rd939;
xor.b64 %rd942, %rd941, %rd2514;
mul.lo.s64 %rd943, %rd942, %rd2502;
and.b64 %rd944, %rd943, 4294967295;
xor.b64 %rd945, %rd944, %rd938;
xor.b64 %rd946, %rd945, %rd2515;
mul.lo.s64 %rd947, %rd946, %rd2500;
shr.u64 %rd948, %rd947, 32;
shr.u64 %rd949, %rd943, 32;
xor.b64 %rd950, %rd917, %rd949;
xor.b64 %rd951, %rd950, %rd2516;
mul.lo.s64 %rd952, %rd951, %rd2500;
xor.b64 %rd953, %rd948, %rd952;
cvt.u32.u64 %r97, %rd953;
xor.b32 %r98, %r318, %r97;
mul.lo.s32 %r99, %r98, %r319;
shr.u32 %r100, %r99, 9;
cvt.rn.f32.u32 %f62, %r100;
mul.rn.f32 %f63, %f62, 0f34000000;
cvt.rn.f16.f32 %h28, %f63;
mov.b16 %h29, 0x2E66;
setp.ge.f16 %p17, %h28, %h29;
ld.global.nc.b16 %h30, [%rd44+258];
ld.global.nc.f32 %f64, [%rd45+516];
cvt.rn.f16.f32 %h31, %f64;
add.rn.f16 %h32, %h30, %h31;
mov.b16 %h33, 0x3C72;
mul.rn.f16 %h34, %h32, %h33;
selp.b16 %h35, %h34, 0x0000, %p17;
cvt.f32.f16 %f65, %h35;
ld.global.nc.b16 %h36, [%rd46+258];
cvt.f32.f16 %f66, %h36;
ld.global.nc.f32 %f67, [%rd47+516];
mul.rn.f32 %f68, %f1, %f67;
mul.rn.f32 %f69, %f68, %f66;
ld.global.nc.f32 %f70, [%rd48+516];
mul.rn.f32 %f71, %f2, %f68;
sub.rn.f32 %f72, %f70, %f71;
add.rn.f32 %f73, %f69, %f72;
add.rn.f32 %f74, %f73, %f65;
add.rn.f32 %f6, %f5, %f74;
or.b32 %r102, %r73, 256;
shr.u32 %r103, %r102, 2;
cvt.u64.u32 %rd954, %r103;
add.s64 %rd130, %rd11, %rd954;
and.b64 %rd2426, %rd130, 4294967295;
setp.lt.u64 %p72, %rd130, %rd11;
@%p8 bra LBB29_16;
mul.lo.s64 %rd2521, %rd2426, 3528531795;
selp.u64 %rd997, 1, 0, %p72;
add.s64 %rd998, %rd2461, %rd997;
xor.b64 %rd999, %rd998, %rd2521;
shr.u64 %rd1000, %rd999, 32;
mul.lo.s64 %rd2524, %rd1000, 3449720151;
shr.u64 %rd1001, %rd2524, 32;
and.b64 %rd1002, %rd998, 4294967295;
mul.lo.s64 %rd1003, %rd1002, 3449720151;
and.b64 %rd1004, %rd1003, 4294967295;
xor.b64 %rd1005, %rd1004, %rd1001;
xor.b64 %rd1006, %rd1005, 2654435769;
mul.lo.s64 %rd2527, %rd1006, 3528531795;
xor.b64 %rd2517, %rd1003, %rd130;
mov.u32 %r322, -1879881855;
mov.u32 %r321, -845247145;
mov.u32 %r320, 534103459;
mov.u64 %rd2535, 3678237736;
mov.u64 %rd2534, 3041712726;
mov.u64 %rd2533, 1401181199;
mov.u64 %rd2532, 2835769497;
mov.u64 %rd2531, 1684936478;
mov.u64 %rd2530, 2027808484;
mov.u64 %rd2529, 387276957;
mov.u64 %rd2528, 842468239;
mov.u64 %rd2526, 3986602516;
mov.u64 %rd2525, 1013904242;
mov.u64 %rd2523, 3668340011;
mov.u64 %rd2522, 3144134277;
mov.u64 %rd2520, 3449720151;
mov.u64 %rd2519, 1993301258;
mov.u64 %rd2518, 3528531795;
bra.uni LBB29_17;
LBB29_16:
selp.u64 %rd970, 1, 0, %p72;
add.s64 %rd971, %rd2461, %rd970;
and.b64 %rd972, %rd971, 4294967295;
mul.lo.s64 %rd2521, %rd972, 3449720151;
xor.b64 %rd973, %rd2521, %rd130;
shr.u64 %rd974, %rd973, 32;
mul.lo.s64 %rd2524, %rd974, 3528531795;
shr.u64 %rd975, %rd2524, 32;
mul.lo.s64 %rd977, %rd2426, 3528531795;
and.b64 %rd978, %rd977, 4294967295;
xor.b64 %rd979, %rd978, %rd975;
xor.b64 %rd980, %rd979, 3144134277;
mul.lo.s64 %rd2527, %rd980, 3449720151;
xor.b64 %rd2517, %rd971, %rd977;
mov.u32 %r322, -1767562579;
mov.u32 %r321, -766435501;
mov.u32 %r320, 1401181199;
mov.u64 %rd2535, 4055616968;
mov.u64 %rd2534, 1684936478;
mov.u64 %rd2533, 534103459;
mov.u64 %rd2532, 387276957;
mov.u64 %rd2531, 3041712726;
mov.u64 %rd2530, 3986602516;
mov.u64 %rd2529, 2835769497;
mov.u64 %rd2528, 3668340011;
mov.u64 %rd2526, 2027808484;
mov.u64 %rd2525, 1993301258;
mov.u64 %rd2523, 842468239;
mov.u64 %rd2522, 2654435769;
mov.u64 %rd2520, 3528531795;
mov.u64 %rd2519, 1013904242;
mov.u64 %rd2518, 3449720151;
LBB29_17:
shr.u64 %rd1007, %rd2527, 32;
shr.u64 %rd1008, %rd2517, 32;
mul.lo.s64 %rd1009, %rd1008, %rd2518;
and.b64 %rd1010, %rd1009, 4294967295;
xor.b64 %rd1011, %rd1010, %rd1007;
xor.b64 %rd1012, %rd1011, %rd2519;
mul.lo.s64 %rd1013, %rd1012, %rd2520;
shr.u64 %rd1014, %rd1013, 32;
shr.u64 %rd1015, %rd1009, 32;
and.b64 %rd1016, %rd2521, 4294967295;
xor.b64 %rd1017, %rd1016, %rd1015;
xor.b64 %rd1018, %rd1017, %rd2522;
mul.lo.s64 %rd1019, %rd1018, %rd2520;
and.b64 %rd1020, %rd1019, 4294967295;
xor.b64 %rd1021, %rd1020, %rd1014;
xor.b64 %rd1022, %rd1021, %rd2523;
mul.lo.s64 %rd1023, %rd1022, %rd2518;
shr.u64 %rd1024, %rd1023, 32;
shr.u64 %rd1025, %rd1019, 32;
and.b64 %rd1026, %rd2524, 4294967295;
xor.b64 %rd1027, %rd1026, %rd1025;
xor.b64 %rd1028, %rd1027, %rd2525;
mul.lo.s64 %rd1029, %rd1028, %rd2518;
and.b64 %rd1030, %rd1029, 4294967295;
xor.b64 %rd1031, %rd1030, %rd1024;
xor.b64 %rd1032, %rd1031, %rd2526;
mul.lo.s64 %rd1033, %rd1032, %rd2520;
shr.u64 %rd1034, %rd1033, 32;
shr.u64 %rd1035, %rd1029, 32;
and.b64 %rd1036, %rd2527, 4294967295;
xor.b64 %rd1037, %rd1036, %rd1035;
xor.b64 %rd1038, %rd1037, %rd2528;
mul.lo.s64 %rd1039, %rd1038, %rd2520;
and.b64 %rd1040, %rd1039, 4294967295;
xor.b64 %rd1041, %rd1040, %rd1034;
xor.b64 %rd1042, %rd1041, %rd2529;
mul.lo.s64 %rd1043, %rd1042, %rd2518;
shr.u64 %rd1044, %rd1043, 32;
shr.u64 %rd1045, %rd1039, 32;
and.b64 %rd1046, %rd1013, 4294967295;
xor.b64 %rd1047, %rd1046, %rd1045;
xor.b64 %rd1048, %rd1047, %rd2530;
mul.lo.s64 %rd1049, %rd1048, %rd2518;
and.b64 %rd1050, %rd1049, 4294967295;
xor.b64 %rd1051, %rd1050, %rd1044;
xor.b64 %rd1052, %rd1051, %rd2531;
mul.lo.s64 %rd1053, %rd1052, %rd2520;
shr.u64 %rd1054, %rd1053, 32;
shr.u64 %rd1055, %rd1049, 32;
and.b64 %rd1056, %rd1023, 4294967295;
xor.b64 %rd1057, %rd1056, %rd1055;
xor.b64 %rd1058, %rd1057, %rd2532;
mul.lo.s64 %rd1059, %rd1058, %rd2520;
and.b64 %rd1060, %rd1059, 4294967295;
xor.b64 %rd1061, %rd1060, %rd1054;
xor.b64 %rd1062, %rd1061, %rd2533;
mul.lo.s64 %rd1063, %rd1062, %rd2518;
shr.u64 %rd1064, %rd1063, 32;
shr.u64 %rd1065, %rd1059, 32;
and.b64 %rd1066, %rd1033, 4294967295;
xor.b64 %rd1067, %rd1066, %rd1065;
xor.b64 %rd1068, %rd1067, %rd2534;
mul.lo.s64 %rd1069, %rd1068, %rd2518;
and.b64 %rd1070, %rd1069, 4294967295;
xor.b64 %rd1071, %rd1070, %rd1064;
xor.b64 %rd1072, %rd1071, %rd2535;
mul.lo.s64 %rd1073, %rd1072, %rd2520;
shr.u64 %rd1074, %rd1073, 32;
cvt.u32.u64 %r110, %rd1074;
shr.u64 %rd1075, %rd1069, 32;
xor.b64 %rd1076, %rd1075, %rd1043;
cvt.u32.u64 %r111, %rd1076;
xor.b32 %r112, %r320, %r111;
mul.lo.s32 %r113, %r112, %r321;
xor.b32 %r114, %r113, %r110;
xor.b32 %r115, %r114, %r322;
shr.u32 %r116, %r115, 9;
cvt.rn.f32.u32 %f75, %r116;
mul.rn.f32 %f76, %f75, 0f34000000;
cvt.rn.f16.f32 %h37, %f76;
mov.b16 %h38, 0x2E66;
setp.ge.f16 %p20, %h37, %h38;
ld.global.nc.b16 %h39, [%rd44+512];
ld.global.nc.f32 %f77, [%rd45+1024];
cvt.rn.f16.f32 %h40, %f77;
add.rn.f16 %h41, %h39, %h40;
mov.b16 %h42, 0x3C72;
mul.rn.f16 %h43, %h41, %h42;
selp.b16 %h44, %h43, 0x0000, %p20;
cvt.f32.f16 %f78, %h44;
ld.global.nc.b16 %h45, [%rd46+512];
cvt.f32.f16 %f79, %h45;
ld.global.nc.f32 %f80, [%rd47+1024];
mul.rn.f32 %f81, %f1, %f80;
mul.rn.f32 %f82, %f81, %f79;
ld.global.nc.f32 %f83, [%rd48+1024];
mul.rn.f32 %f84, %f2, %f81;
sub.rn.f32 %f85, %f83, %f84;
add.rn.f32 %f86, %f82, %f85;
add.rn.f32 %f87, %f86, %f78;
add.rn.f32 %f7, %f6, %f87;
or.b32 %r117, %r3, 257;
or.b32 %r118, %r117, %r4;
and.b32 %r119, %r117, 3;
shr.u32 %r120, %r118, 2;
setp.ne.s32 %p21, %r119, 1;
cvt.u64.u32 %rd1077, %r120;
add.s64 %rd158, %rd11, %rd1077;
and.b64 %rd2423, %rd158, 4294967295;
setp.lt.u64 %p71, %rd158, %rd11;
@%p21 bra LBB29_19;
mul.lo.s64 %rd2540, %rd2423, 3528531795;
selp.u64 %rd1118, 1, 0, %p71;
add.s64 %rd1119, %rd2461, %rd1118;
xor.b64 %rd1120, %rd1119, %rd2540;
shr.u64 %rd1121, %rd1120, 32;
mul.lo.s64 %rd2543, %rd1121, 3449720151;
shr.u64 %rd1122, %rd2543, 32;
and.b64 %rd1123, %rd1119, 4294967295;
mul.lo.s64 %rd1124, %rd1123, 3449720151;
and.b64 %rd1125, %rd1124, 4294967295;
xor.b64 %rd1126, %rd1125, %rd1122;
xor.b64 %rd1127, %rd1126, 2654435769;
mul.lo.s64 %rd2546, %rd1127, 3528531795;
xor.b64 %rd2536, %rd1124, %rd158;
mov.u32 %r324, -845247145;
mov.u32 %r323, -616729560;
mov.u64 %rd2553, 3041712726;
mov.u64 %rd2552, 1401181199;
mov.u64 %rd2551, 2835769497;
mov.u64 %rd2550, 1684936478;
mov.u64 %rd2549, 2027808484;
mov.u64 %rd2548, 387276957;
mov.u64 %rd2547, 842468239;
mov.u64 %rd2545, 3986602516;
mov.u64 %rd2544, 1013904242;
mov.u64 %rd2542, 3668340011;
mov.u64 %rd2541, 3144134277;
mov.u64 %rd2539, 3449720151;
mov.u64 %rd2538, 1993301258;
mov.u64 %rd2537, 3528531795;
bra.uni LBB29_20;
LBB29_19:
selp.u64 %rd1092, 1, 0, %p71;
add.s64 %rd1093, %rd2461, %rd1092;
and.b64 %rd1094, %rd1093, 4294967295;
mul.lo.s64 %rd2540, %rd1094, 3449720151;
xor.b64 %rd1095, %rd2540, %rd158;
shr.u64 %rd1096, %rd1095, 32;
mul.lo.s64 %rd2543, %rd1096, 3528531795;
shr.u64 %rd1097, %rd2543, 32;
mul.lo.s64 %rd1099, %rd2423, 3528531795;
and.b64 %rd1100, %rd1099, 4294967295;
xor.b64 %rd1101, %rd1100, %rd1097;
xor.b64 %rd1102, %rd1101, 3144134277;
mul.lo.s64 %rd2546, %rd1102, 3449720151;
xor.b64 %rd2536, %rd1093, %rd1099;
mov.u32 %r324, -766435501;
mov.u32 %r323, -239350328;
mov.u64 %rd2553, 1684936478;
mov.u64 %rd2552, 534103459;
mov.u64 %rd2551, 387276957;
mov.u64 %rd2550, 3041712726;
mov.u64 %rd2549, 3986602516;
mov.u64 %rd2548, 2835769497;
mov.u64 %rd2547, 3668340011;
mov.u64 %rd2545, 2027808484;
mov.u64 %rd2544, 1993301258;
mov.u64 %rd2542, 842468239;
mov.u64 %rd2541, 2654435769;
mov.u64 %rd2539, 3528531795;
mov.u64 %rd2538, 1013904242;
mov.u64 %rd2537, 3449720151;
LBB29_20:
shr.u64 %rd1128, %rd2546, 32;
shr.u64 %rd1129, %rd2536, 32;
mul.lo.s64 %rd1130, %rd1129, %rd2537;
and.b64 %rd1131, %rd1130, 4294967295;
xor.b64 %rd1132, %rd1131, %rd1128;
xor.b64 %rd1133, %rd1132, %rd2538;
mul.lo.s64 %rd1134, %rd1133, %rd2539;
shr.u64 %rd1135, %rd1134, 32;
shr.u64 %rd1136, %rd1130, 32;
and.b64 %rd1137, %rd2540, 4294967295;
xor.b64 %rd1138, %rd1137, %rd1136;
xor.b64 %rd1139, %rd1138, %rd2541;
mul.lo.s64 %rd1140, %rd1139, %rd2539;
and.b64 %rd1141, %rd1140, 4294967295;
xor.b64 %rd1142, %rd1141, %rd1135;
xor.b64 %rd1143, %rd1142, %rd2542;
mul.lo.s64 %rd1144, %rd1143, %rd2537;
shr.u64 %rd1145, %rd1144, 32;
shr.u64 %rd1146, %rd1140, 32;
and.b64 %rd1147, %rd2543, 4294967295;
xor.b64 %rd1148, %rd1147, %rd1146;
xor.b64 %rd1149, %rd1148, %rd2544;
mul.lo.s64 %rd1150, %rd1149, %rd2537;
and.b64 %rd1151, %rd1150, 4294967295;
xor.b64 %rd1152, %rd1151, %rd1145;
xor.b64 %rd1153, %rd1152, %rd2545;
mul.lo.s64 %rd1154, %rd1153, %rd2539;
shr.u64 %rd1155, %rd1154, 32;
shr.u64 %rd1156, %rd1150, 32;
and.b64 %rd1157, %rd2546, 4294967295;
xor.b64 %rd1158, %rd1157, %rd1156;
xor.b64 %rd1159, %rd1158, %rd2547;
mul.lo.s64 %rd1160, %rd1159, %rd2539;
and.b64 %rd1161, %rd1160, 4294967295;
xor.b64 %rd1162, %rd1161, %rd1155;
xor.b64 %rd1163, %rd1162, %rd2548;
mul.lo.s64 %rd1164, %rd1163, %rd2537;
shr.u64 %rd1165, %rd1164, 32;
shr.u64 %rd1166, %rd1160, 32;
and.b64 %rd1167, %rd1134, 4294967295;
xor.b64 %rd1168, %rd1167, %rd1166;
xor.b64 %rd1169, %rd1168, %rd2549;
mul.lo.s64 %rd1170, %rd1169, %rd2537;
and.b64 %rd1171, %rd1170, 4294967295;
xor.b64 %rd1172, %rd1171, %rd1165;
xor.b64 %rd1173, %rd1172, %rd2550;
mul.lo.s64 %rd1174, %rd1173, %rd2539;
shr.u64 %rd1175, %rd1174, 32;
shr.u64 %rd1176, %rd1170, 32;
and.b64 %rd1177, %rd1144, 4294967295;
xor.b64 %rd1178, %rd1177, %rd1176;
xor.b64 %rd1179, %rd1178, %rd2551;
mul.lo.s64 %rd1180, %rd1179, %rd2539;
and.b64 %rd1181, %rd1180, 4294967295;
xor.b64 %rd1182, %rd1181, %rd1175;
xor.b64 %rd1183, %rd1182, %rd2552;
mul.lo.s64 %rd1184, %rd1183, %rd2537;
shr.u64 %rd1185, %rd1184, 32;
shr.u64 %rd1186, %rd1180, 32;
xor.b64 %rd1187, %rd1154, %rd1186;
xor.b64 %rd1188, %rd1187, %rd2553;
mul.lo.s64 %rd1189, %rd1188, %rd2537;
xor.b64 %rd1190, %rd1185, %rd1189;
cvt.u32.u64 %r125, %rd1190;
xor.b32 %r126, %r323, %r125;
mul.lo.s32 %r127, %r126, %r324;
shr.u32 %r128, %r127, 9;
cvt.rn.f32.u32 %f88, %r128;
mul.rn.f32 %f89, %f88, 0f34000000;
cvt.rn.f16.f32 %h46, %f89;
mov.b16 %h47, 0x2E66;
setp.ge.f16 %p25, %h46, %h47;
ld.global.nc.b16 %h48, [%rd44+514];
ld.global.nc.f32 %f90, [%rd45+1028];
cvt.rn.f16.f32 %h49, %f90;
add.rn.f16 %h50, %h48, %h49;
mov.b16 %h51, 0x3C72;
mul.rn.f16 %h52, %h50, %h51;
selp.b16 %h53, %h52, 0x0000, %p25;
cvt.f32.f16 %f91, %h53;
ld.global.nc.b16 %h54, [%rd46+514];
cvt.f32.f16 %f92, %h54;
ld.global.nc.f32 %f93, [%rd47+1028];
mul.rn.f32 %f94, %f1, %f93;
mul.rn.f32 %f95, %f94, %f92;
ld.global.nc.f32 %f96, [%rd48+1028];
mul.rn.f32 %f97, %f2, %f94;
sub.rn.f32 %f98, %f96, %f97;
add.rn.f32 %f99, %f95, %f98;
add.rn.f32 %f100, %f99, %f91;
add.rn.f32 %f8, %f7, %f100;
or.b32 %r130, %r73, 384;
shr.u32 %r131, %r130, 2;
cvt.u64.u32 %rd1191, %r131;
add.s64 %rd185, %rd11, %rd1191;
and.b64 %rd2419, %rd185, 4294967295;
setp.lt.u64 %p70, %rd185, %rd11;
@%p8 bra LBB29_22;
// Arm taken when %p8 is false (fall-through of `@%p8 bra LBB29_22`).
// Computes the four starting words %rd2554, %rd2558, %rd2561, %rd2564 from
// the index %rd185 (low half already in %rd2419) and %rd2461, then picks the
// constant set consumed by the shared code at LBB29_23.
// 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57.
// NOTE(review): these match the Philox4x32 multipliers -- presumably an
// inlined Philox-style RNG; confirm against the generating compiler.
mul.lo.s64 %rd2558, %rd2419, 3528531795;
// %p70 was set by `setp.lt.u64 %p70, %rd185, %rd11`, i.e. the add that formed
// %rd185 wrapped; propagate that as a carry into %rd2461.
selp.u64 %rd1234, 1, 0, %p70;
add.s64 %rd1235, %rd2461, %rd1234;
xor.b64 %rd1236, %rd1235, %rd2558;
shr.u64 %rd1237, %rd1236, 32;
mul.lo.s64 %rd2561, %rd1237, 3449720151;
shr.u64 %rd1238, %rd2561, 32;
and.b64 %rd1239, %rd1235, 4294967295;
mul.lo.s64 %rd1240, %rd1239, 3449720151;
and.b64 %rd1241, %rd1240, 4294967295;
xor.b64 %rd1242, %rd1241, %rd1238;
xor.b64 %rd1243, %rd1242, 2654435769;
mul.lo.s64 %rd2564, %rd1243, 3528531795;
xor.b64 %rd2554, %rd1240, %rd185;
// Constants for the output fold at the end of LBB29_23.
mov.u32 %r327, -1879881855;
mov.u32 %r326, -845247145;
mov.u32 %r325, 534103459;
// Per-step xor constants (%rd2556, %rd2559..%rd2572) and the two multipliers
// (%rd2555, %rd2557). LBB29_22 assigns the same registers with the two
// constant families exchanged.
mov.u64 %rd2572, 3678237736;
mov.u64 %rd2571, 3041712726;
mov.u64 %rd2570, 1401181199;
mov.u64 %rd2569, 2835769497;
mov.u64 %rd2568, 1684936478;
mov.u64 %rd2567, 2027808484;
mov.u64 %rd2566, 387276957;
mov.u64 %rd2565, 842468239;
mov.u64 %rd2563, 3986602516;
mov.u64 %rd2562, 1013904242;
mov.u64 %rd2560, 3668340011;
mov.u64 %rd2559, 3144134277;
mov.u64 %rd2557, 3449720151;
mov.u64 %rd2556, 1993301258;
mov.u64 %rd2555, 3528531795;
bra.uni LBB29_23;
// Arm taken when %p8 is true. Produces the same set of registers as the
// fall-through arm before it (%rd2554..%rd2572, %r325..%r327), but with the
// roles of 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53) exchanged and
// the other family of xor constants. Falls through into LBB29_23.
// NOTE(review): constants match Philox4x32 -- presumably an inlined
// Philox-style RNG; confirm against the generating compiler.
LBB29_22:
// %p70 flags unsigned wrap of %rd185 = %rd11 + offset; add it to %rd2461 as a carry.
selp.u64 %rd1207, 1, 0, %p70;
add.s64 %rd1208, %rd2461, %rd1207;
and.b64 %rd1209, %rd1208, 4294967295;
mul.lo.s64 %rd2558, %rd1209, 3449720151;
xor.b64 %rd1210, %rd2558, %rd185;
shr.u64 %rd1211, %rd1210, 32;
mul.lo.s64 %rd2561, %rd1211, 3528531795;
shr.u64 %rd1212, %rd2561, 32;
mul.lo.s64 %rd1214, %rd2419, 3528531795;
and.b64 %rd1215, %rd1214, 4294967295;
xor.b64 %rd1216, %rd1215, %rd1212;
xor.b64 %rd1217, %rd1216, 3144134277;
mul.lo.s64 %rd2564, %rd1217, 3449720151;
xor.b64 %rd2554, %rd1208, %rd1214;
// Constants for the output fold at the end of LBB29_23.
mov.u32 %r327, -1767562579;
mov.u32 %r326, -766435501;
mov.u32 %r325, 1401181199;
// Per-step xor constants and the two multipliers (%rd2555, %rd2557).
mov.u64 %rd2572, 4055616968;
mov.u64 %rd2571, 1684936478;
mov.u64 %rd2570, 534103459;
mov.u64 %rd2569, 387276957;
mov.u64 %rd2568, 3041712726;
mov.u64 %rd2567, 3986602516;
mov.u64 %rd2566, 2835769497;
mov.u64 %rd2565, 3668340011;
mov.u64 %rd2563, 2027808484;
mov.u64 %rd2562, 1993301258;
mov.u64 %rd2560, 842468239;
mov.u64 %rd2559, 2654435769;
mov.u64 %rd2557, 3528531795;
mov.u64 %rd2556, 1013904242;
mov.u64 %rd2555, 3449720151;
// Join point of the two arms above. Using the multipliers %rd2555/%rd2557 and
// the xor constants %rd2556, %rd2559..%rd2572 chosen by the arm, this block:
//   1. runs a fixed chain of multiply / split-at-bit-32 / xor mixing steps,
//   2. folds the result to 32 bits and turns it into an f16 sample in [0, 1),
//   3. adds one element's contribution to the running f32 sum (%f8 -> %f9),
//   4. computes the next element index and branches to its arm.
// NOTE(review): step 1 looks like the remaining rounds of a Philox-style RNG.
LBB29_23:
shr.u64 %rd1244, %rd2564, 32;
shr.u64 %rd1245, %rd2554, 32;
mul.lo.s64 %rd1246, %rd1245, %rd2555;
and.b64 %rd1247, %rd1246, 4294967295;
xor.b64 %rd1248, %rd1247, %rd1244;
xor.b64 %rd1249, %rd1248, %rd2556;
mul.lo.s64 %rd1250, %rd1249, %rd2557;
shr.u64 %rd1251, %rd1250, 32;
shr.u64 %rd1252, %rd1246, 32;
and.b64 %rd1253, %rd2558, 4294967295;
xor.b64 %rd1254, %rd1253, %rd1252;
xor.b64 %rd1255, %rd1254, %rd2559;
mul.lo.s64 %rd1256, %rd1255, %rd2557;
and.b64 %rd1257, %rd1256, 4294967295;
xor.b64 %rd1258, %rd1257, %rd1251;
xor.b64 %rd1259, %rd1258, %rd2560;
mul.lo.s64 %rd1260, %rd1259, %rd2555;
shr.u64 %rd1261, %rd1260, 32;
shr.u64 %rd1262, %rd1256, 32;
and.b64 %rd1263, %rd2561, 4294967295;
xor.b64 %rd1264, %rd1263, %rd1262;
xor.b64 %rd1265, %rd1264, %rd2562;
mul.lo.s64 %rd1266, %rd1265, %rd2555;
and.b64 %rd1267, %rd1266, 4294967295;
xor.b64 %rd1268, %rd1267, %rd1261;
xor.b64 %rd1269, %rd1268, %rd2563;
mul.lo.s64 %rd1270, %rd1269, %rd2557;
shr.u64 %rd1271, %rd1270, 32;
shr.u64 %rd1272, %rd1266, 32;
and.b64 %rd1273, %rd2564, 4294967295;
xor.b64 %rd1274, %rd1273, %rd1272;
xor.b64 %rd1275, %rd1274, %rd2565;
mul.lo.s64 %rd1276, %rd1275, %rd2557;
and.b64 %rd1277, %rd1276, 4294967295;
xor.b64 %rd1278, %rd1277, %rd1271;
xor.b64 %rd1279, %rd1278, %rd2566;
mul.lo.s64 %rd1280, %rd1279, %rd2555;
shr.u64 %rd1281, %rd1280, 32;
shr.u64 %rd1282, %rd1276, 32;
and.b64 %rd1283, %rd1250, 4294967295;
xor.b64 %rd1284, %rd1283, %rd1282;
xor.b64 %rd1285, %rd1284, %rd2567;
mul.lo.s64 %rd1286, %rd1285, %rd2555;
and.b64 %rd1287, %rd1286, 4294967295;
xor.b64 %rd1288, %rd1287, %rd1281;
xor.b64 %rd1289, %rd1288, %rd2568;
mul.lo.s64 %rd1290, %rd1289, %rd2557;
shr.u64 %rd1291, %rd1290, 32;
shr.u64 %rd1292, %rd1286, 32;
and.b64 %rd1293, %rd1260, 4294967295;
xor.b64 %rd1294, %rd1293, %rd1292;
xor.b64 %rd1295, %rd1294, %rd2569;
mul.lo.s64 %rd1296, %rd1295, %rd2557;
and.b64 %rd1297, %rd1296, 4294967295;
xor.b64 %rd1298, %rd1297, %rd1291;
xor.b64 %rd1299, %rd1298, %rd2570;
mul.lo.s64 %rd1300, %rd1299, %rd2555;
shr.u64 %rd1301, %rd1300, 32;
shr.u64 %rd1302, %rd1296, 32;
and.b64 %rd1303, %rd1270, 4294967295;
xor.b64 %rd1304, %rd1303, %rd1302;
xor.b64 %rd1305, %rd1304, %rd2571;
mul.lo.s64 %rd1306, %rd1305, %rd2555;
and.b64 %rd1307, %rd1306, 4294967295;
xor.b64 %rd1308, %rd1307, %rd1301;
xor.b64 %rd1309, %rd1308, %rd2572;
mul.lo.s64 %rd1310, %rd1309, %rd2557;
// Fold the final words and the arm's 32-bit constants into one word, %r143.
shr.u64 %rd1311, %rd1310, 32;
cvt.u32.u64 %r138, %rd1311;
shr.u64 %rd1312, %rd1306, 32;
xor.b64 %rd1313, %rd1312, %rd1280;
cvt.u32.u64 %r139, %rd1313;
xor.b32 %r140, %r325, %r139;
mul.lo.s32 %r141, %r140, %r326;
xor.b32 %r142, %r141, %r138;
xor.b32 %r143, %r142, %r327;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): an f32 in [0, 1),
// then narrow it to f16.
shr.u32 %r144, %r143, 9;
cvt.rn.f32.u32 %f101, %r144;
mul.rn.f32 %f102, %f101, 0f34000000;
cvt.rn.f16.f32 %h55, %f102;
// %p28 = sample >= 0x2E66 (f16, ~0.1).
mov.b16 %h56, 0x2E66;
setp.ge.f16 %p28, %h55, %h56;
// (f16 [%rd44+768] + f16(f32 [%rd45+1536])) * 0x3C72 (f16, ~1.111), replaced
// by 0 when %p28 is false. Loads are from global memory via the .nc path.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) -- confirm.
ld.global.nc.b16 %h57, [%rd44+768];
ld.global.nc.f32 %f103, [%rd45+1536];
cvt.rn.f16.f32 %h58, %f103;
add.rn.f16 %h59, %h57, %h58;
mov.b16 %h60, 0x3C72;
mul.rn.f16 %h61, %h59, %h60;
selp.b16 %h62, %h61, 0x0000, %p28;
cvt.f32.f16 %f104, %h62;
// f32 term (%f1*x)*y + (z - %f2*(%f1*x)) with y = f16 [%rd46+768],
// x = f32 [%rd47+1536], z = f32 [%rd48+1536]; then add the masked value.
ld.global.nc.b16 %h63, [%rd46+768];
cvt.f32.f16 %f105, %h63;
ld.global.nc.f32 %f106, [%rd47+1536];
mul.rn.f32 %f107, %f1, %f106;
mul.rn.f32 %f108, %f107, %f105;
ld.global.nc.f32 %f109, [%rd48+1536];
mul.rn.f32 %f110, %f2, %f107;
sub.rn.f32 %f111, %f109, %f110;
add.rn.f32 %f112, %f108, %f111;
add.rn.f32 %f113, %f112, %f104;
// Running sum: %f9 = %f8 + this element's value.
add.rn.f32 %f9, %f8, %f113;
// Next element: %rd213 = %rd11 + ((%r3 | 385 | %r4) >> 2); %p29 is true when
// the low two bits of (%r3 | 385) are not 1, which selects LBB29_25.
or.b32 %r145, %r3, 385;
or.b32 %r146, %r145, %r4;
and.b32 %r147, %r145, 3;
shr.u32 %r148, %r146, 2;
setp.ne.s32 %p29, %r147, 1;
cvt.u64.u32 %rd1314, %r148;
add.s64 %rd213, %rd11, %rd1314;
@%p29 bra LBB29_25;
// Arm taken when %p29 is false (fall-through of `@%p29 bra LBB29_25`).
// Computes the four starting words %rd2573, %rd2577, %rd2580, %rd2583 from
// the index %rd213 and %rd2461, then picks the constant set consumed by the
// shared code at LBB29_26.
// 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57.
// NOTE(review): these match the Philox4x32 multipliers -- presumably an
// inlined Philox-style RNG; confirm against the generating compiler.
and.b64 %rd1354, %rd213, 4294967295;
mul.lo.s64 %rd2577, %rd1354, 3528531795;
// %rd213 < %rd11 means the add that formed %rd213 wrapped; carry into %rd2461.
setp.lt.u64 %p31, %rd213, %rd11;
selp.u64 %rd1355, 1, 0, %p31;
add.s64 %rd1356, %rd2461, %rd1355;
xor.b64 %rd1357, %rd1356, %rd2577;
shr.u64 %rd1358, %rd1357, 32;
mul.lo.s64 %rd2580, %rd1358, 3449720151;
shr.u64 %rd1359, %rd2580, 32;
and.b64 %rd1360, %rd1356, 4294967295;
mul.lo.s64 %rd1361, %rd1360, 3449720151;
and.b64 %rd1362, %rd1361, 4294967295;
xor.b64 %rd1363, %rd1362, %rd1359;
xor.b64 %rd1364, %rd1363, 2654435769;
mul.lo.s64 %rd2583, %rd1364, 3528531795;
xor.b64 %rd2573, %rd1361, %rd213;
// Constants for the output fold at the end of LBB29_26.
mov.u32 %r329, -845247145;
mov.u32 %r328, -616729560;
// Per-step xor constants (%rd2575, %rd2578..%rd2590) and the two multipliers
// (%rd2574, %rd2576). LBB29_25 assigns the same registers with the two
// constant families exchanged.
mov.u64 %rd2590, 3041712726;
mov.u64 %rd2589, 1401181199;
mov.u64 %rd2588, 2835769497;
mov.u64 %rd2587, 1684936478;
mov.u64 %rd2586, 2027808484;
mov.u64 %rd2585, 387276957;
mov.u64 %rd2584, 842468239;
mov.u64 %rd2582, 3986602516;
mov.u64 %rd2581, 1013904242;
mov.u64 %rd2579, 3668340011;
mov.u64 %rd2578, 3144134277;
mov.u64 %rd2576, 3449720151;
mov.u64 %rd2575, 1993301258;
mov.u64 %rd2574, 3528531795;
bra.uni LBB29_26;
// Arm taken when %p29 is true. Produces the same set of registers as the
// fall-through arm before it (%rd2573..%rd2590, %r328, %r329), but with the
// roles of 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53) exchanged and
// the other family of xor constants. Falls through into LBB29_26.
// NOTE(review): constants match Philox4x32 -- presumably an inlined
// Philox-style RNG; confirm against the generating compiler.
LBB29_25:
// %rd213 < %rd11 means the add that formed %rd213 wrapped; carry into %rd2461.
setp.lt.u64 %p30, %rd213, %rd11;
selp.u64 %rd1329, 1, 0, %p30;
add.s64 %rd1330, %rd2461, %rd1329;
and.b64 %rd1331, %rd1330, 4294967295;
mul.lo.s64 %rd2577, %rd1331, 3449720151;
xor.b64 %rd1332, %rd2577, %rd213;
shr.u64 %rd1333, %rd1332, 32;
mul.lo.s64 %rd2580, %rd1333, 3528531795;
shr.u64 %rd1334, %rd2580, 32;
and.b64 %rd1335, %rd213, 4294967295;
mul.lo.s64 %rd1336, %rd1335, 3528531795;
and.b64 %rd1337, %rd1336, 4294967295;
xor.b64 %rd1338, %rd1337, %rd1334;
xor.b64 %rd1339, %rd1338, 3144134277;
mul.lo.s64 %rd2583, %rd1339, 3449720151;
xor.b64 %rd2573, %rd1330, %rd1336;
// Constants for the output fold at the end of LBB29_26.
mov.u32 %r329, -766435501;
mov.u32 %r328, -239350328;
// Per-step xor constants and the two multipliers (%rd2574, %rd2576).
mov.u64 %rd2590, 1684936478;
mov.u64 %rd2589, 534103459;
mov.u64 %rd2588, 387276957;
mov.u64 %rd2587, 3041712726;
mov.u64 %rd2586, 3986602516;
mov.u64 %rd2585, 2835769497;
mov.u64 %rd2584, 3668340011;
mov.u64 %rd2582, 2027808484;
mov.u64 %rd2581, 1993301258;
mov.u64 %rd2579, 842468239;
mov.u64 %rd2578, 2654435769;
mov.u64 %rd2576, 3528531795;
mov.u64 %rd2575, 1013904242;
mov.u64 %rd2574, 3449720151;
// Join point of the two arms above. Using the multipliers %rd2574/%rd2576 and
// the xor constants %rd2575, %rd2578..%rd2590 chosen by the arm, this block:
//   1. runs a fixed chain of multiply / split-at-bit-32 / xor mixing steps,
//   2. folds the result to 32 bits and turns it into an f16 sample in [0, 1),
//   3. adds one element's contribution to the running f32 sum (%f9 -> %f10),
//   4. computes the next element index and branches to its arm.
// NOTE(review): step 1 looks like the remaining rounds of a Philox-style RNG.
LBB29_26:
shr.u64 %rd1365, %rd2583, 32;
shr.u64 %rd1366, %rd2573, 32;
mul.lo.s64 %rd1367, %rd1366, %rd2574;
and.b64 %rd1368, %rd1367, 4294967295;
xor.b64 %rd1369, %rd1368, %rd1365;
xor.b64 %rd1370, %rd1369, %rd2575;
mul.lo.s64 %rd1371, %rd1370, %rd2576;
shr.u64 %rd1372, %rd1371, 32;
shr.u64 %rd1373, %rd1367, 32;
and.b64 %rd1374, %rd2577, 4294967295;
xor.b64 %rd1375, %rd1374, %rd1373;
xor.b64 %rd1376, %rd1375, %rd2578;
mul.lo.s64 %rd1377, %rd1376, %rd2576;
and.b64 %rd1378, %rd1377, 4294967295;
xor.b64 %rd1379, %rd1378, %rd1372;
xor.b64 %rd1380, %rd1379, %rd2579;
mul.lo.s64 %rd1381, %rd1380, %rd2574;
shr.u64 %rd1382, %rd1381, 32;
shr.u64 %rd1383, %rd1377, 32;
and.b64 %rd1384, %rd2580, 4294967295;
xor.b64 %rd1385, %rd1384, %rd1383;
xor.b64 %rd1386, %rd1385, %rd2581;
mul.lo.s64 %rd1387, %rd1386, %rd2574;
and.b64 %rd1388, %rd1387, 4294967295;
xor.b64 %rd1389, %rd1388, %rd1382;
xor.b64 %rd1390, %rd1389, %rd2582;
mul.lo.s64 %rd1391, %rd1390, %rd2576;
shr.u64 %rd1392, %rd1391, 32;
shr.u64 %rd1393, %rd1387, 32;
and.b64 %rd1394, %rd2583, 4294967295;
xor.b64 %rd1395, %rd1394, %rd1393;
xor.b64 %rd1396, %rd1395, %rd2584;
mul.lo.s64 %rd1397, %rd1396, %rd2576;
and.b64 %rd1398, %rd1397, 4294967295;
xor.b64 %rd1399, %rd1398, %rd1392;
xor.b64 %rd1400, %rd1399, %rd2585;
mul.lo.s64 %rd1401, %rd1400, %rd2574;
shr.u64 %rd1402, %rd1401, 32;
shr.u64 %rd1403, %rd1397, 32;
and.b64 %rd1404, %rd1371, 4294967295;
xor.b64 %rd1405, %rd1404, %rd1403;
xor.b64 %rd1406, %rd1405, %rd2586;
mul.lo.s64 %rd1407, %rd1406, %rd2574;
and.b64 %rd1408, %rd1407, 4294967295;
xor.b64 %rd1409, %rd1408, %rd1402;
xor.b64 %rd1410, %rd1409, %rd2587;
mul.lo.s64 %rd1411, %rd1410, %rd2576;
shr.u64 %rd1412, %rd1411, 32;
shr.u64 %rd1413, %rd1407, 32;
and.b64 %rd1414, %rd1381, 4294967295;
xor.b64 %rd1415, %rd1414, %rd1413;
xor.b64 %rd1416, %rd1415, %rd2588;
mul.lo.s64 %rd1417, %rd1416, %rd2576;
and.b64 %rd1418, %rd1417, 4294967295;
xor.b64 %rd1419, %rd1418, %rd1412;
xor.b64 %rd1420, %rd1419, %rd2589;
mul.lo.s64 %rd1421, %rd1420, %rd2574;
// Fold to one 32-bit word; only the low 32 bits survive the cvt, so the
// unmasked upper halves of %rd1391/%rd1426 do not affect %r153.
shr.u64 %rd1422, %rd1421, 32;
shr.u64 %rd1423, %rd1417, 32;
xor.b64 %rd1424, %rd1391, %rd1423;
xor.b64 %rd1425, %rd1424, %rd2590;
mul.lo.s64 %rd1426, %rd1425, %rd2574;
xor.b64 %rd1427, %rd1422, %rd1426;
cvt.u32.u64 %r153, %rd1427;
xor.b32 %r154, %r328, %r153;
mul.lo.s32 %r155, %r154, %r329;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): an f32 in [0, 1),
// then narrow it to f16.
shr.u32 %r156, %r155, 9;
cvt.rn.f32.u32 %f114, %r156;
mul.rn.f32 %f115, %f114, 0f34000000;
cvt.rn.f16.f32 %h64, %f115;
// %p33 = sample >= 0x2E66 (f16, ~0.1).
mov.b16 %h65, 0x2E66;
setp.ge.f16 %p33, %h64, %h65;
// (f16 [%rd44+770] + f16(f32 [%rd45+1540])) * 0x3C72 (f16, ~1.111), replaced
// by 0 when %p33 is false. Loads are from global memory via the .nc path.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) -- confirm.
ld.global.nc.b16 %h66, [%rd44+770];
ld.global.nc.f32 %f116, [%rd45+1540];
cvt.rn.f16.f32 %h67, %f116;
add.rn.f16 %h68, %h66, %h67;
mov.b16 %h69, 0x3C72;
mul.rn.f16 %h70, %h68, %h69;
selp.b16 %h71, %h70, 0x0000, %p33;
cvt.f32.f16 %f117, %h71;
// f32 term (%f1*x)*y + (z - %f2*(%f1*x)) with y = f16 [%rd46+770],
// x = f32 [%rd47+1540], z = f32 [%rd48+1540]; then add the masked value.
ld.global.nc.b16 %h72, [%rd46+770];
cvt.f32.f16 %f118, %h72;
ld.global.nc.f32 %f119, [%rd47+1540];
mul.rn.f32 %f120, %f1, %f119;
mul.rn.f32 %f121, %f120, %f118;
ld.global.nc.f32 %f122, [%rd48+1540];
mul.rn.f32 %f123, %f2, %f120;
sub.rn.f32 %f124, %f122, %f123;
add.rn.f32 %f125, %f121, %f124;
add.rn.f32 %f126, %f125, %f117;
// Running sum: %f10 = %f9 + this element's value.
add.rn.f32 %f10, %f9, %f126;
// Next element: %rd240 = %rd11 + ((%r73 | 512) >> 2); %p8 selects LBB29_28.
or.b32 %r158, %r73, 512;
shr.u32 %r159, %r158, 2;
cvt.u64.u32 %rd1428, %r159;
add.s64 %rd240, %rd11, %rd1428;
@%p8 bra LBB29_28;
// Arm taken when %p8 is false (fall-through of `@%p8 bra LBB29_28`).
// Computes the four starting words %rd2591, %rd2595, %rd2598, %rd2601 from
// the index %rd240 and %rd2461, then picks the constant set consumed by the
// shared code at LBB29_29.
// 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57.
// NOTE(review): these match the Philox4x32 multipliers -- presumably an
// inlined Philox-style RNG; confirm against the generating compiler.
and.b64 %rd1470, %rd240, 4294967295;
mul.lo.s64 %rd2595, %rd1470, 3528531795;
// %rd240 < %rd11 means the add that formed %rd240 wrapped; carry into %rd2461.
setp.lt.u64 %p35, %rd240, %rd11;
selp.u64 %rd1471, 1, 0, %p35;
add.s64 %rd1472, %rd2461, %rd1471;
xor.b64 %rd1473, %rd1472, %rd2595;
shr.u64 %rd1474, %rd1473, 32;
mul.lo.s64 %rd2598, %rd1474, 3449720151;
shr.u64 %rd1475, %rd2598, 32;
and.b64 %rd1476, %rd1472, 4294967295;
mul.lo.s64 %rd1477, %rd1476, 3449720151;
and.b64 %rd1478, %rd1477, 4294967295;
xor.b64 %rd1479, %rd1478, %rd1475;
xor.b64 %rd1480, %rd1479, 2654435769;
mul.lo.s64 %rd2601, %rd1480, 3528531795;
xor.b64 %rd2591, %rd1477, %rd240;
// Constants for the output fold at the end of LBB29_29.
mov.u32 %r332, -1879881855;
mov.u32 %r331, -845247145;
mov.u32 %r330, 534103459;
// Per-step xor constants (%rd2593, %rd2596..%rd2609) and the two multipliers
// (%rd2592, %rd2594). LBB29_28 assigns the same registers with the two
// constant families exchanged.
mov.u64 %rd2609, 3678237736;
mov.u64 %rd2608, 3041712726;
mov.u64 %rd2607, 1401181199;
mov.u64 %rd2606, 2835769497;
mov.u64 %rd2605, 1684936478;
mov.u64 %rd2604, 2027808484;
mov.u64 %rd2603, 387276957;
mov.u64 %rd2602, 842468239;
mov.u64 %rd2600, 3986602516;
mov.u64 %rd2599, 1013904242;
mov.u64 %rd2597, 3668340011;
mov.u64 %rd2596, 3144134277;
mov.u64 %rd2594, 3449720151;
mov.u64 %rd2593, 1993301258;
mov.u64 %rd2592, 3528531795;
bra.uni LBB29_29;
// Arm taken when %p8 is true. Produces the same set of registers as the
// fall-through arm before it (%rd2591..%rd2609, %r330..%r332), but with the
// roles of 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53) exchanged and
// the other family of xor constants. Falls through into LBB29_29.
// NOTE(review): constants match Philox4x32 -- presumably an inlined
// Philox-style RNG; confirm against the generating compiler.
LBB29_28:
// %rd240 < %rd11 means the add that formed %rd240 wrapped; carry into %rd2461.
setp.lt.u64 %p34, %rd240, %rd11;
selp.u64 %rd1444, 1, 0, %p34;
add.s64 %rd1445, %rd2461, %rd1444;
and.b64 %rd1446, %rd1445, 4294967295;
mul.lo.s64 %rd2595, %rd1446, 3449720151;
xor.b64 %rd1447, %rd2595, %rd240;
shr.u64 %rd1448, %rd1447, 32;
mul.lo.s64 %rd2598, %rd1448, 3528531795;
shr.u64 %rd1449, %rd2598, 32;
and.b64 %rd1450, %rd240, 4294967295;
mul.lo.s64 %rd1451, %rd1450, 3528531795;
and.b64 %rd1452, %rd1451, 4294967295;
xor.b64 %rd1453, %rd1452, %rd1449;
xor.b64 %rd1454, %rd1453, 3144134277;
mul.lo.s64 %rd2601, %rd1454, 3449720151;
xor.b64 %rd2591, %rd1445, %rd1451;
// Constants for the output fold at the end of LBB29_29.
mov.u32 %r332, -1767562579;
mov.u32 %r331, -766435501;
mov.u32 %r330, 1401181199;
// Per-step xor constants and the two multipliers (%rd2592, %rd2594).
mov.u64 %rd2609, 4055616968;
mov.u64 %rd2608, 1684936478;
mov.u64 %rd2607, 534103459;
mov.u64 %rd2606, 387276957;
mov.u64 %rd2605, 3041712726;
mov.u64 %rd2604, 3986602516;
mov.u64 %rd2603, 2835769497;
mov.u64 %rd2602, 3668340011;
mov.u64 %rd2600, 2027808484;
mov.u64 %rd2599, 1993301258;
mov.u64 %rd2597, 842468239;
mov.u64 %rd2596, 2654435769;
mov.u64 %rd2594, 3528531795;
mov.u64 %rd2593, 1013904242;
mov.u64 %rd2592, 3449720151;
// Join point of the two arms above. Using the multipliers %rd2592/%rd2594 and
// the xor constants %rd2593, %rd2596..%rd2609 chosen by the arm, this block:
//   1. runs a fixed chain of multiply / split-at-bit-32 / xor mixing steps,
//   2. folds the result to 32 bits and turns it into an f16 sample in [0, 1),
//   3. adds one element's contribution to the running f32 sum (%f10 -> %f11),
//   4. computes the next element index and branches to its arm.
// NOTE(review): step 1 looks like the remaining rounds of a Philox-style RNG.
LBB29_29:
shr.u64 %rd1481, %rd2601, 32;
shr.u64 %rd1482, %rd2591, 32;
mul.lo.s64 %rd1483, %rd1482, %rd2592;
and.b64 %rd1484, %rd1483, 4294967295;
xor.b64 %rd1485, %rd1484, %rd1481;
xor.b64 %rd1486, %rd1485, %rd2593;
mul.lo.s64 %rd1487, %rd1486, %rd2594;
shr.u64 %rd1488, %rd1487, 32;
shr.u64 %rd1489, %rd1483, 32;
and.b64 %rd1490, %rd2595, 4294967295;
xor.b64 %rd1491, %rd1490, %rd1489;
xor.b64 %rd1492, %rd1491, %rd2596;
mul.lo.s64 %rd1493, %rd1492, %rd2594;
and.b64 %rd1494, %rd1493, 4294967295;
xor.b64 %rd1495, %rd1494, %rd1488;
xor.b64 %rd1496, %rd1495, %rd2597;
mul.lo.s64 %rd1497, %rd1496, %rd2592;
shr.u64 %rd1498, %rd1497, 32;
shr.u64 %rd1499, %rd1493, 32;
and.b64 %rd1500, %rd2598, 4294967295;
xor.b64 %rd1501, %rd1500, %rd1499;
xor.b64 %rd1502, %rd1501, %rd2599;
mul.lo.s64 %rd1503, %rd1502, %rd2592;
and.b64 %rd1504, %rd1503, 4294967295;
xor.b64 %rd1505, %rd1504, %rd1498;
xor.b64 %rd1506, %rd1505, %rd2600;
mul.lo.s64 %rd1507, %rd1506, %rd2594;
shr.u64 %rd1508, %rd1507, 32;
shr.u64 %rd1509, %rd1503, 32;
and.b64 %rd1510, %rd2601, 4294967295;
xor.b64 %rd1511, %rd1510, %rd1509;
xor.b64 %rd1512, %rd1511, %rd2602;
mul.lo.s64 %rd1513, %rd1512, %rd2594;
and.b64 %rd1514, %rd1513, 4294967295;
xor.b64 %rd1515, %rd1514, %rd1508;
xor.b64 %rd1516, %rd1515, %rd2603;
mul.lo.s64 %rd1517, %rd1516, %rd2592;
shr.u64 %rd1518, %rd1517, 32;
shr.u64 %rd1519, %rd1513, 32;
and.b64 %rd1520, %rd1487, 4294967295;
xor.b64 %rd1521, %rd1520, %rd1519;
xor.b64 %rd1522, %rd1521, %rd2604;
mul.lo.s64 %rd1523, %rd1522, %rd2592;
and.b64 %rd1524, %rd1523, 4294967295;
xor.b64 %rd1525, %rd1524, %rd1518;
xor.b64 %rd1526, %rd1525, %rd2605;
mul.lo.s64 %rd1527, %rd1526, %rd2594;
shr.u64 %rd1528, %rd1527, 32;
shr.u64 %rd1529, %rd1523, 32;
and.b64 %rd1530, %rd1497, 4294967295;
xor.b64 %rd1531, %rd1530, %rd1529;
xor.b64 %rd1532, %rd1531, %rd2606;
mul.lo.s64 %rd1533, %rd1532, %rd2594;
and.b64 %rd1534, %rd1533, 4294967295;
xor.b64 %rd1535, %rd1534, %rd1528;
xor.b64 %rd1536, %rd1535, %rd2607;
mul.lo.s64 %rd1537, %rd1536, %rd2592;
shr.u64 %rd1538, %rd1537, 32;
shr.u64 %rd1539, %rd1533, 32;
and.b64 %rd1540, %rd1507, 4294967295;
xor.b64 %rd1541, %rd1540, %rd1539;
xor.b64 %rd1542, %rd1541, %rd2608;
mul.lo.s64 %rd1543, %rd1542, %rd2592;
and.b64 %rd1544, %rd1543, 4294967295;
xor.b64 %rd1545, %rd1544, %rd1538;
xor.b64 %rd1546, %rd1545, %rd2609;
mul.lo.s64 %rd1547, %rd1546, %rd2594;
// Fold the final words and the arm's 32-bit constants into one word, %r171.
shr.u64 %rd1548, %rd1547, 32;
cvt.u32.u64 %r166, %rd1548;
shr.u64 %rd1549, %rd1543, 32;
xor.b64 %rd1550, %rd1549, %rd1517;
cvt.u32.u64 %r167, %rd1550;
xor.b32 %r168, %r330, %r167;
mul.lo.s32 %r169, %r168, %r331;
xor.b32 %r170, %r169, %r166;
xor.b32 %r171, %r170, %r332;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): an f32 in [0, 1),
// then narrow it to f16.
shr.u32 %r172, %r171, 9;
cvt.rn.f32.u32 %f127, %r172;
mul.rn.f32 %f128, %f127, 0f34000000;
cvt.rn.f16.f32 %h73, %f128;
// %p36 = sample >= 0x2E66 (f16, ~0.1).
mov.b16 %h74, 0x2E66;
setp.ge.f16 %p36, %h73, %h74;
// (f16 [%rd44+1024] + f16(f32 [%rd45+2048])) * 0x3C72 (f16, ~1.111), replaced
// by 0 when %p36 is false. Loads are from global memory via the .nc path.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) -- confirm.
ld.global.nc.b16 %h75, [%rd44+1024];
ld.global.nc.f32 %f129, [%rd45+2048];
cvt.rn.f16.f32 %h76, %f129;
add.rn.f16 %h77, %h75, %h76;
mov.b16 %h78, 0x3C72;
mul.rn.f16 %h79, %h77, %h78;
selp.b16 %h80, %h79, 0x0000, %p36;
cvt.f32.f16 %f130, %h80;
// f32 term (%f1*x)*y + (z - %f2*(%f1*x)) with y = f16 [%rd46+1024],
// x = f32 [%rd47+2048], z = f32 [%rd48+2048]; then add the masked value.
ld.global.nc.b16 %h81, [%rd46+1024];
cvt.f32.f16 %f131, %h81;
ld.global.nc.f32 %f132, [%rd47+2048];
mul.rn.f32 %f133, %f1, %f132;
mul.rn.f32 %f134, %f133, %f131;
ld.global.nc.f32 %f135, [%rd48+2048];
mul.rn.f32 %f136, %f2, %f133;
sub.rn.f32 %f137, %f135, %f136;
add.rn.f32 %f138, %f134, %f137;
add.rn.f32 %f139, %f138, %f130;
// Running sum: %f11 = %f10 + this element's value.
add.rn.f32 %f11, %f10, %f139;
// Next element: %rd268 = %rd11 + ((%r3 | 513 | %r4) >> 2); %p37 is true when
// the low two bits of (%r3 | 513) are not 1, which selects LBB29_31.
or.b32 %r173, %r3, 513;
or.b32 %r174, %r173, %r4;
and.b32 %r175, %r173, 3;
shr.u32 %r176, %r174, 2;
setp.ne.s32 %p37, %r175, 1;
cvt.u64.u32 %rd1551, %r176;
add.s64 %rd268, %rd11, %rd1551;
@%p37 bra LBB29_31;
// Arm taken when %p37 is false (fall-through of `@%p37 bra LBB29_31`).
// Computes the four starting words %rd2610, %rd2614, %rd2617, %rd2620 from
// the index %rd268 and %rd2461, then picks the constant set consumed by the
// shared code at LBB29_32.
// 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57.
// NOTE(review): these match the Philox4x32 multipliers -- presumably an
// inlined Philox-style RNG; confirm against the generating compiler.
and.b64 %rd1591, %rd268, 4294967295;
mul.lo.s64 %rd2614, %rd1591, 3528531795;
// %rd268 < %rd11 means the add that formed %rd268 wrapped; carry into %rd2461.
setp.lt.u64 %p39, %rd268, %rd11;
selp.u64 %rd1592, 1, 0, %p39;
add.s64 %rd1593, %rd2461, %rd1592;
xor.b64 %rd1594, %rd1593, %rd2614;
shr.u64 %rd1595, %rd1594, 32;
mul.lo.s64 %rd2617, %rd1595, 3449720151;
shr.u64 %rd1596, %rd2617, 32;
and.b64 %rd1597, %rd1593, 4294967295;
mul.lo.s64 %rd1598, %rd1597, 3449720151;
and.b64 %rd1599, %rd1598, 4294967295;
xor.b64 %rd1600, %rd1599, %rd1596;
xor.b64 %rd1601, %rd1600, 2654435769;
mul.lo.s64 %rd2620, %rd1601, 3528531795;
xor.b64 %rd2610, %rd1598, %rd268;
// Constants for the output fold at the end of LBB29_32.
mov.u32 %r334, -845247145;
mov.u32 %r333, -616729560;
// Per-step xor constants (%rd2612, %rd2615..%rd2627) and the two multipliers
// (%rd2611, %rd2613). LBB29_31 assigns the same registers with the two
// constant families exchanged.
mov.u64 %rd2627, 3041712726;
mov.u64 %rd2626, 1401181199;
mov.u64 %rd2625, 2835769497;
mov.u64 %rd2624, 1684936478;
mov.u64 %rd2623, 2027808484;
mov.u64 %rd2622, 387276957;
mov.u64 %rd2621, 842468239;
mov.u64 %rd2619, 3986602516;
mov.u64 %rd2618, 1013904242;
mov.u64 %rd2616, 3668340011;
mov.u64 %rd2615, 3144134277;
mov.u64 %rd2613, 3449720151;
mov.u64 %rd2612, 1993301258;
mov.u64 %rd2611, 3528531795;
bra.uni LBB29_32;
// Arm taken when %p37 is true. Produces the same set of registers as the
// fall-through arm before it (%rd2610..%rd2627, %r333, %r334), but with the
// roles of 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53) exchanged and
// the other family of xor constants. Falls through into LBB29_32.
// NOTE(review): constants match Philox4x32 -- presumably an inlined
// Philox-style RNG; confirm against the generating compiler.
LBB29_31:
// %rd268 < %rd11 means the add that formed %rd268 wrapped; carry into %rd2461.
setp.lt.u64 %p38, %rd268, %rd11;
selp.u64 %rd1566, 1, 0, %p38;
add.s64 %rd1567, %rd2461, %rd1566;
and.b64 %rd1568, %rd1567, 4294967295;
mul.lo.s64 %rd2614, %rd1568, 3449720151;
xor.b64 %rd1569, %rd2614, %rd268;
shr.u64 %rd1570, %rd1569, 32;
mul.lo.s64 %rd2617, %rd1570, 3528531795;
shr.u64 %rd1571, %rd2617, 32;
and.b64 %rd1572, %rd268, 4294967295;
mul.lo.s64 %rd1573, %rd1572, 3528531795;
and.b64 %rd1574, %rd1573, 4294967295;
xor.b64 %rd1575, %rd1574, %rd1571;
xor.b64 %rd1576, %rd1575, 3144134277;
mul.lo.s64 %rd2620, %rd1576, 3449720151;
xor.b64 %rd2610, %rd1567, %rd1573;
// Constants for the output fold at the end of LBB29_32.
mov.u32 %r334, -766435501;
mov.u32 %r333, -239350328;
// Per-step xor constants and the two multipliers (%rd2611, %rd2613).
mov.u64 %rd2627, 1684936478;
mov.u64 %rd2626, 534103459;
mov.u64 %rd2625, 387276957;
mov.u64 %rd2624, 3041712726;
mov.u64 %rd2623, 3986602516;
mov.u64 %rd2622, 2835769497;
mov.u64 %rd2621, 3668340011;
mov.u64 %rd2619, 2027808484;
mov.u64 %rd2618, 1993301258;
mov.u64 %rd2616, 842468239;
mov.u64 %rd2615, 2654435769;
mov.u64 %rd2613, 3528531795;
mov.u64 %rd2612, 1013904242;
mov.u64 %rd2611, 3449720151;
// Join point of the two arms above. Using the multipliers %rd2611/%rd2613 and
// the xor constants %rd2612, %rd2615..%rd2627 chosen by the arm, this block:
//   1. runs a fixed chain of multiply / split-at-bit-32 / xor mixing steps,
//   2. folds the result to 32 bits and turns it into an f16 sample in [0, 1),
//   3. adds one element's contribution to the running f32 sum (%f11 -> %f12),
//   4. computes the next element index and branches to its arm.
// NOTE(review): step 1 looks like the remaining rounds of a Philox-style RNG.
LBB29_32:
shr.u64 %rd1602, %rd2620, 32;
shr.u64 %rd1603, %rd2610, 32;
mul.lo.s64 %rd1604, %rd1603, %rd2611;
and.b64 %rd1605, %rd1604, 4294967295;
xor.b64 %rd1606, %rd1605, %rd1602;
xor.b64 %rd1607, %rd1606, %rd2612;
mul.lo.s64 %rd1608, %rd1607, %rd2613;
shr.u64 %rd1609, %rd1608, 32;
shr.u64 %rd1610, %rd1604, 32;
and.b64 %rd1611, %rd2614, 4294967295;
xor.b64 %rd1612, %rd1611, %rd1610;
xor.b64 %rd1613, %rd1612, %rd2615;
mul.lo.s64 %rd1614, %rd1613, %rd2613;
and.b64 %rd1615, %rd1614, 4294967295;
xor.b64 %rd1616, %rd1615, %rd1609;
xor.b64 %rd1617, %rd1616, %rd2616;
mul.lo.s64 %rd1618, %rd1617, %rd2611;
shr.u64 %rd1619, %rd1618, 32;
shr.u64 %rd1620, %rd1614, 32;
and.b64 %rd1621, %rd2617, 4294967295;
xor.b64 %rd1622, %rd1621, %rd1620;
xor.b64 %rd1623, %rd1622, %rd2618;
mul.lo.s64 %rd1624, %rd1623, %rd2611;
and.b64 %rd1625, %rd1624, 4294967295;
xor.b64 %rd1626, %rd1625, %rd1619;
xor.b64 %rd1627, %rd1626, %rd2619;
mul.lo.s64 %rd1628, %rd1627, %rd2613;
shr.u64 %rd1629, %rd1628, 32;
shr.u64 %rd1630, %rd1624, 32;
and.b64 %rd1631, %rd2620, 4294967295;
xor.b64 %rd1632, %rd1631, %rd1630;
xor.b64 %rd1633, %rd1632, %rd2621;
mul.lo.s64 %rd1634, %rd1633, %rd2613;
and.b64 %rd1635, %rd1634, 4294967295;
xor.b64 %rd1636, %rd1635, %rd1629;
xor.b64 %rd1637, %rd1636, %rd2622;
mul.lo.s64 %rd1638, %rd1637, %rd2611;
shr.u64 %rd1639, %rd1638, 32;
shr.u64 %rd1640, %rd1634, 32;
and.b64 %rd1641, %rd1608, 4294967295;
xor.b64 %rd1642, %rd1641, %rd1640;
xor.b64 %rd1643, %rd1642, %rd2623;
mul.lo.s64 %rd1644, %rd1643, %rd2611;
and.b64 %rd1645, %rd1644, 4294967295;
xor.b64 %rd1646, %rd1645, %rd1639;
xor.b64 %rd1647, %rd1646, %rd2624;
mul.lo.s64 %rd1648, %rd1647, %rd2613;
shr.u64 %rd1649, %rd1648, 32;
shr.u64 %rd1650, %rd1644, 32;
and.b64 %rd1651, %rd1618, 4294967295;
xor.b64 %rd1652, %rd1651, %rd1650;
xor.b64 %rd1653, %rd1652, %rd2625;
mul.lo.s64 %rd1654, %rd1653, %rd2613;
and.b64 %rd1655, %rd1654, 4294967295;
xor.b64 %rd1656, %rd1655, %rd1649;
xor.b64 %rd1657, %rd1656, %rd2626;
mul.lo.s64 %rd1658, %rd1657, %rd2611;
// Fold to one 32-bit word; only the low 32 bits survive the cvt, so the
// unmasked upper halves of %rd1628/%rd1663 do not affect %r181.
shr.u64 %rd1659, %rd1658, 32;
shr.u64 %rd1660, %rd1654, 32;
xor.b64 %rd1661, %rd1628, %rd1660;
xor.b64 %rd1662, %rd1661, %rd2627;
mul.lo.s64 %rd1663, %rd1662, %rd2611;
xor.b64 %rd1664, %rd1659, %rd1663;
cvt.u32.u64 %r181, %rd1664;
xor.b32 %r182, %r333, %r181;
mul.lo.s32 %r183, %r182, %r334;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): an f32 in [0, 1),
// then narrow it to f16.
shr.u32 %r184, %r183, 9;
cvt.rn.f32.u32 %f140, %r184;
mul.rn.f32 %f141, %f140, 0f34000000;
cvt.rn.f16.f32 %h82, %f141;
// %p41 = sample >= 0x2E66 (f16, ~0.1).
mov.b16 %h83, 0x2E66;
setp.ge.f16 %p41, %h82, %h83;
// (f16 [%rd44+1026] + f16(f32 [%rd45+2052])) * 0x3C72 (f16, ~1.111), replaced
// by 0 when %p41 is false. Loads are from global memory via the .nc path.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) -- confirm.
ld.global.nc.b16 %h84, [%rd44+1026];
ld.global.nc.f32 %f142, [%rd45+2052];
cvt.rn.f16.f32 %h85, %f142;
add.rn.f16 %h86, %h84, %h85;
mov.b16 %h87, 0x3C72;
mul.rn.f16 %h88, %h86, %h87;
selp.b16 %h89, %h88, 0x0000, %p41;
cvt.f32.f16 %f143, %h89;
// f32 term (%f1*x)*y + (z - %f2*(%f1*x)) with y = f16 [%rd46+1026],
// x = f32 [%rd47+2052], z = f32 [%rd48+2052]; then add the masked value.
ld.global.nc.b16 %h90, [%rd46+1026];
cvt.f32.f16 %f144, %h90;
ld.global.nc.f32 %f145, [%rd47+2052];
mul.rn.f32 %f146, %f1, %f145;
mul.rn.f32 %f147, %f146, %f144;
ld.global.nc.f32 %f148, [%rd48+2052];
mul.rn.f32 %f149, %f2, %f146;
sub.rn.f32 %f150, %f148, %f149;
add.rn.f32 %f151, %f147, %f150;
add.rn.f32 %f152, %f151, %f143;
// Running sum: %f12 = %f11 + this element's value.
add.rn.f32 %f12, %f11, %f152;
// Next element: %rd295 = %rd11 + ((%r73 | 640) >> 2); %p8 selects LBB29_34.
or.b32 %r186, %r73, 640;
shr.u32 %r187, %r186, 2;
cvt.u64.u32 %rd1665, %r187;
add.s64 %rd295, %rd11, %rd1665;
@%p8 bra LBB29_34;
// Arm taken when %p8 is false (fall-through of `@%p8 bra LBB29_34`).
// Computes the four starting words %rd2628, %rd2632, %rd2635, %rd2638 from
// the index %rd295 and %rd2461, then picks the constant set consumed by the
// shared code at LBB29_35.
// 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57.
// NOTE(review): these match the Philox4x32 multipliers -- presumably an
// inlined Philox-style RNG; confirm against the generating compiler.
and.b64 %rd1707, %rd295, 4294967295;
mul.lo.s64 %rd2632, %rd1707, 3528531795;
// %rd295 < %rd11 means the add that formed %rd295 wrapped; carry into %rd2461.
setp.lt.u64 %p43, %rd295, %rd11;
selp.u64 %rd1708, 1, 0, %p43;
add.s64 %rd1709, %rd2461, %rd1708;
xor.b64 %rd1710, %rd1709, %rd2632;
shr.u64 %rd1711, %rd1710, 32;
mul.lo.s64 %rd2635, %rd1711, 3449720151;
shr.u64 %rd1712, %rd2635, 32;
and.b64 %rd1713, %rd1709, 4294967295;
mul.lo.s64 %rd1714, %rd1713, 3449720151;
and.b64 %rd1715, %rd1714, 4294967295;
xor.b64 %rd1716, %rd1715, %rd1712;
xor.b64 %rd1717, %rd1716, 2654435769;
mul.lo.s64 %rd2638, %rd1717, 3528531795;
xor.b64 %rd2628, %rd1714, %rd295;
// Constants for the output fold at the end of LBB29_35.
mov.u32 %r337, -1879881855;
mov.u32 %r336, -845247145;
mov.u32 %r335, 534103459;
// Per-step xor constants (%rd2630, %rd2633..%rd2646) and the two multipliers
// (%rd2629, %rd2631). LBB29_34 assigns the same registers with the two
// constant families exchanged.
mov.u64 %rd2646, 3678237736;
mov.u64 %rd2645, 3041712726;
mov.u64 %rd2644, 1401181199;
mov.u64 %rd2643, 2835769497;
mov.u64 %rd2642, 1684936478;
mov.u64 %rd2641, 2027808484;
mov.u64 %rd2640, 387276957;
mov.u64 %rd2639, 842468239;
mov.u64 %rd2637, 3986602516;
mov.u64 %rd2636, 1013904242;
mov.u64 %rd2634, 3668340011;
mov.u64 %rd2633, 3144134277;
mov.u64 %rd2631, 3449720151;
mov.u64 %rd2630, 1993301258;
mov.u64 %rd2629, 3528531795;
bra.uni LBB29_35;
// Arm taken when %p8 is true. Produces the same set of registers as the
// fall-through arm before it (%rd2628..%rd2646, %r335..%r337), but with the
// roles of 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53) exchanged and
// the other family of xor constants. Falls through into LBB29_35.
// NOTE(review): constants match Philox4x32 -- presumably an inlined
// Philox-style RNG; confirm against the generating compiler.
LBB29_34:
// %rd295 < %rd11 means the add that formed %rd295 wrapped; carry into %rd2461.
setp.lt.u64 %p42, %rd295, %rd11;
selp.u64 %rd1681, 1, 0, %p42;
add.s64 %rd1682, %rd2461, %rd1681;
and.b64 %rd1683, %rd1682, 4294967295;
mul.lo.s64 %rd2632, %rd1683, 3449720151;
xor.b64 %rd1684, %rd2632, %rd295;
shr.u64 %rd1685, %rd1684, 32;
mul.lo.s64 %rd2635, %rd1685, 3528531795;
shr.u64 %rd1686, %rd2635, 32;
and.b64 %rd1687, %rd295, 4294967295;
mul.lo.s64 %rd1688, %rd1687, 3528531795;
and.b64 %rd1689, %rd1688, 4294967295;
xor.b64 %rd1690, %rd1689, %rd1686;
xor.b64 %rd1691, %rd1690, 3144134277;
mul.lo.s64 %rd2638, %rd1691, 3449720151;
xor.b64 %rd2628, %rd1682, %rd1688;
// Constants for the output fold at the end of LBB29_35.
mov.u32 %r337, -1767562579;
mov.u32 %r336, -766435501;
mov.u32 %r335, 1401181199;
// Per-step xor constants and the two multipliers (%rd2629, %rd2631).
mov.u64 %rd2646, 4055616968;
mov.u64 %rd2645, 1684936478;
mov.u64 %rd2644, 534103459;
mov.u64 %rd2643, 387276957;
mov.u64 %rd2642, 3041712726;
mov.u64 %rd2641, 3986602516;
mov.u64 %rd2640, 2835769497;
mov.u64 %rd2639, 3668340011;
mov.u64 %rd2637, 2027808484;
mov.u64 %rd2636, 1993301258;
mov.u64 %rd2634, 842468239;
mov.u64 %rd2633, 2654435769;
mov.u64 %rd2631, 3528531795;
mov.u64 %rd2630, 1013904242;
mov.u64 %rd2629, 3449720151;
// Join point of the two arms above. Using the multipliers %rd2629/%rd2631 and
// the xor constants %rd2630, %rd2633..%rd2646 chosen by the arm, this block:
//   1. runs a fixed chain of multiply / split-at-bit-32 / xor mixing steps,
//   2. folds the result to 32 bits and turns it into an f16 sample in [0, 1),
//   3. adds one element's contribution to the running f32 sum (%f12 -> %f13),
//   4. computes the next element index and branches to its arm.
// NOTE(review): step 1 looks like the remaining rounds of a Philox-style RNG.
LBB29_35:
shr.u64 %rd1718, %rd2638, 32;
shr.u64 %rd1719, %rd2628, 32;
mul.lo.s64 %rd1720, %rd1719, %rd2629;
and.b64 %rd1721, %rd1720, 4294967295;
xor.b64 %rd1722, %rd1721, %rd1718;
xor.b64 %rd1723, %rd1722, %rd2630;
mul.lo.s64 %rd1724, %rd1723, %rd2631;
shr.u64 %rd1725, %rd1724, 32;
shr.u64 %rd1726, %rd1720, 32;
and.b64 %rd1727, %rd2632, 4294967295;
xor.b64 %rd1728, %rd1727, %rd1726;
xor.b64 %rd1729, %rd1728, %rd2633;
mul.lo.s64 %rd1730, %rd1729, %rd2631;
and.b64 %rd1731, %rd1730, 4294967295;
xor.b64 %rd1732, %rd1731, %rd1725;
xor.b64 %rd1733, %rd1732, %rd2634;
mul.lo.s64 %rd1734, %rd1733, %rd2629;
shr.u64 %rd1735, %rd1734, 32;
shr.u64 %rd1736, %rd1730, 32;
and.b64 %rd1737, %rd2635, 4294967295;
xor.b64 %rd1738, %rd1737, %rd1736;
xor.b64 %rd1739, %rd1738, %rd2636;
mul.lo.s64 %rd1740, %rd1739, %rd2629;
and.b64 %rd1741, %rd1740, 4294967295;
xor.b64 %rd1742, %rd1741, %rd1735;
xor.b64 %rd1743, %rd1742, %rd2637;
mul.lo.s64 %rd1744, %rd1743, %rd2631;
shr.u64 %rd1745, %rd1744, 32;
shr.u64 %rd1746, %rd1740, 32;
and.b64 %rd1747, %rd2638, 4294967295;
xor.b64 %rd1748, %rd1747, %rd1746;
xor.b64 %rd1749, %rd1748, %rd2639;
mul.lo.s64 %rd1750, %rd1749, %rd2631;
and.b64 %rd1751, %rd1750, 4294967295;
xor.b64 %rd1752, %rd1751, %rd1745;
xor.b64 %rd1753, %rd1752, %rd2640;
mul.lo.s64 %rd1754, %rd1753, %rd2629;
shr.u64 %rd1755, %rd1754, 32;
shr.u64 %rd1756, %rd1750, 32;
and.b64 %rd1757, %rd1724, 4294967295;
xor.b64 %rd1758, %rd1757, %rd1756;
xor.b64 %rd1759, %rd1758, %rd2641;
mul.lo.s64 %rd1760, %rd1759, %rd2629;
and.b64 %rd1761, %rd1760, 4294967295;
xor.b64 %rd1762, %rd1761, %rd1755;
xor.b64 %rd1763, %rd1762, %rd2642;
mul.lo.s64 %rd1764, %rd1763, %rd2631;
shr.u64 %rd1765, %rd1764, 32;
shr.u64 %rd1766, %rd1760, 32;
and.b64 %rd1767, %rd1734, 4294967295;
xor.b64 %rd1768, %rd1767, %rd1766;
xor.b64 %rd1769, %rd1768, %rd2643;
mul.lo.s64 %rd1770, %rd1769, %rd2631;
and.b64 %rd1771, %rd1770, 4294967295;
xor.b64 %rd1772, %rd1771, %rd1765;
xor.b64 %rd1773, %rd1772, %rd2644;
mul.lo.s64 %rd1774, %rd1773, %rd2629;
shr.u64 %rd1775, %rd1774, 32;
shr.u64 %rd1776, %rd1770, 32;
and.b64 %rd1777, %rd1744, 4294967295;
xor.b64 %rd1778, %rd1777, %rd1776;
xor.b64 %rd1779, %rd1778, %rd2645;
mul.lo.s64 %rd1780, %rd1779, %rd2629;
and.b64 %rd1781, %rd1780, 4294967295;
xor.b64 %rd1782, %rd1781, %rd1775;
xor.b64 %rd1783, %rd1782, %rd2646;
mul.lo.s64 %rd1784, %rd1783, %rd2631;
// Fold the final words and the arm's 32-bit constants into one word, %r199.
shr.u64 %rd1785, %rd1784, 32;
cvt.u32.u64 %r194, %rd1785;
shr.u64 %rd1786, %rd1780, 32;
xor.b64 %rd1787, %rd1786, %rd1754;
cvt.u32.u64 %r195, %rd1787;
xor.b32 %r196, %r335, %r195;
mul.lo.s32 %r197, %r196, %r336;
xor.b32 %r198, %r197, %r194;
xor.b32 %r199, %r198, %r337;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): an f32 in [0, 1),
// then narrow it to f16.
shr.u32 %r200, %r199, 9;
cvt.rn.f32.u32 %f153, %r200;
mul.rn.f32 %f154, %f153, 0f34000000;
cvt.rn.f16.f32 %h91, %f154;
// %p44 = sample >= 0x2E66 (f16, ~0.1).
mov.b16 %h92, 0x2E66;
setp.ge.f16 %p44, %h91, %h92;
// (f16 [%rd44+1280] + f16(f32 [%rd45+2560])) * 0x3C72 (f16, ~1.111), replaced
// by 0 when %p44 is false. Loads are from global memory via the .nc path.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) -- confirm.
ld.global.nc.b16 %h93, [%rd44+1280];
ld.global.nc.f32 %f155, [%rd45+2560];
cvt.rn.f16.f32 %h94, %f155;
add.rn.f16 %h95, %h93, %h94;
mov.b16 %h96, 0x3C72;
mul.rn.f16 %h97, %h95, %h96;
selp.b16 %h98, %h97, 0x0000, %p44;
cvt.f32.f16 %f156, %h98;
// f32 term (%f1*x)*y + (z - %f2*(%f1*x)) with y = f16 [%rd46+1280],
// x = f32 [%rd47+2560], z = f32 [%rd48+2560]; then add the masked value.
ld.global.nc.b16 %h99, [%rd46+1280];
cvt.f32.f16 %f157, %h99;
ld.global.nc.f32 %f158, [%rd47+2560];
mul.rn.f32 %f159, %f1, %f158;
mul.rn.f32 %f160, %f159, %f157;
ld.global.nc.f32 %f161, [%rd48+2560];
mul.rn.f32 %f162, %f2, %f159;
sub.rn.f32 %f163, %f161, %f162;
add.rn.f32 %f164, %f160, %f163;
add.rn.f32 %f165, %f164, %f156;
// Running sum: %f13 = %f12 + this element's value.
add.rn.f32 %f13, %f12, %f165;
// Next element: %rd323 = %rd11 + ((%r3 | 641 | %r4) >> 2); %p45 is true when
// the low two bits of (%r3 | 641) are not 1, which selects LBB29_37.
or.b32 %r201, %r3, 641;
or.b32 %r202, %r201, %r4;
and.b32 %r203, %r201, 3;
shr.u32 %r204, %r202, 2;
setp.ne.s32 %p45, %r203, 1;
cvt.u64.u32 %rd1788, %r204;
add.s64 %rd323, %rd11, %rd1788;
@%p45 bra LBB29_37;
// Arm taken when %p45 is false (fall-through of `@%p45 bra LBB29_37`).
// Computes the four starting words %rd2647, %rd2651, %rd2654, %rd2657 from
// the index %rd323 and %rd2461, then picks the constant set consumed by the
// shared code at LBB29_38.
// 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57.
// NOTE(review): these match the Philox4x32 multipliers -- presumably an
// inlined Philox-style RNG; confirm against the generating compiler.
and.b64 %rd1828, %rd323, 4294967295;
mul.lo.s64 %rd2651, %rd1828, 3528531795;
// %rd323 < %rd11 means the add that formed %rd323 wrapped; carry into %rd2461.
setp.lt.u64 %p47, %rd323, %rd11;
selp.u64 %rd1829, 1, 0, %p47;
add.s64 %rd1830, %rd2461, %rd1829;
xor.b64 %rd1831, %rd1830, %rd2651;
shr.u64 %rd1832, %rd1831, 32;
mul.lo.s64 %rd2654, %rd1832, 3449720151;
shr.u64 %rd1833, %rd2654, 32;
and.b64 %rd1834, %rd1830, 4294967295;
mul.lo.s64 %rd1835, %rd1834, 3449720151;
and.b64 %rd1836, %rd1835, 4294967295;
xor.b64 %rd1837, %rd1836, %rd1833;
xor.b64 %rd1838, %rd1837, 2654435769;
mul.lo.s64 %rd2657, %rd1838, 3528531795;
xor.b64 %rd2647, %rd1835, %rd323;
// Constants for the output fold at the end of LBB29_38.
mov.u32 %r339, -845247145;
mov.u32 %r338, -616729560;
// Per-step xor constants (%rd2649, %rd2652..%rd2664) and the two multipliers
// (%rd2648, %rd2650). LBB29_37 assigns the same registers with the two
// constant families exchanged.
mov.u64 %rd2664, 3041712726;
mov.u64 %rd2663, 1401181199;
mov.u64 %rd2662, 2835769497;
mov.u64 %rd2661, 1684936478;
mov.u64 %rd2660, 2027808484;
mov.u64 %rd2659, 387276957;
mov.u64 %rd2658, 842468239;
mov.u64 %rd2656, 3986602516;
mov.u64 %rd2655, 1013904242;
mov.u64 %rd2653, 3668340011;
mov.u64 %rd2652, 3144134277;
mov.u64 %rd2650, 3449720151;
mov.u64 %rd2649, 1993301258;
mov.u64 %rd2648, 3528531795;
bra.uni LBB29_38;
// Arm taken when %p45 is true. Produces the same set of registers as the
// fall-through arm before it (%rd2647..%rd2664, %r338, %r339), but with the
// roles of 3449720151 (0xCD9E8D57) and 3528531795 (0xD2511F53) exchanged and
// the other family of xor constants. Falls through into LBB29_38.
// NOTE(review): constants match Philox4x32 -- presumably an inlined
// Philox-style RNG; confirm against the generating compiler.
LBB29_37:
// %rd323 < %rd11 means the add that formed %rd323 wrapped; carry into %rd2461.
setp.lt.u64 %p46, %rd323, %rd11;
selp.u64 %rd1803, 1, 0, %p46;
add.s64 %rd1804, %rd2461, %rd1803;
and.b64 %rd1805, %rd1804, 4294967295;
mul.lo.s64 %rd2651, %rd1805, 3449720151;
xor.b64 %rd1806, %rd2651, %rd323;
shr.u64 %rd1807, %rd1806, 32;
mul.lo.s64 %rd2654, %rd1807, 3528531795;
shr.u64 %rd1808, %rd2654, 32;
and.b64 %rd1809, %rd323, 4294967295;
mul.lo.s64 %rd1810, %rd1809, 3528531795;
and.b64 %rd1811, %rd1810, 4294967295;
xor.b64 %rd1812, %rd1811, %rd1808;
xor.b64 %rd1813, %rd1812, 3144134277;
mul.lo.s64 %rd2657, %rd1813, 3449720151;
xor.b64 %rd2647, %rd1804, %rd1810;
// Constants for the output fold at the end of LBB29_38.
mov.u32 %r339, -766435501;
mov.u32 %r338, -239350328;
// Per-step xor constants and the two multipliers (%rd2648, %rd2650).
mov.u64 %rd2664, 1684936478;
mov.u64 %rd2663, 534103459;
mov.u64 %rd2662, 387276957;
mov.u64 %rd2661, 3041712726;
mov.u64 %rd2660, 3986602516;
mov.u64 %rd2659, 2835769497;
mov.u64 %rd2658, 3668340011;
mov.u64 %rd2656, 2027808484;
mov.u64 %rd2655, 1993301258;
mov.u64 %rd2653, 842468239;
mov.u64 %rd2652, 2654435769;
mov.u64 %rd2650, 3528531795;
mov.u64 %rd2649, 1013904242;
mov.u64 %rd2648, 3449720151;
// Join point of the two arms above. Using the multipliers %rd2648/%rd2650 and
// the xor constants %rd2649, %rd2652..%rd2664 chosen by the arm, this block:
//   1. runs a fixed chain of multiply / split-at-bit-32 / xor mixing steps,
//   2. folds the result to 32 bits and turns it into an f16 sample in [0, 1),
//   3. adds one element's contribution to the running f32 sum (%f13 -> %f14),
//   4. computes the next element index and branches to its arm.
// NOTE(review): step 1 looks like the remaining rounds of a Philox-style RNG.
LBB29_38:
shr.u64 %rd1839, %rd2657, 32;
shr.u64 %rd1840, %rd2647, 32;
mul.lo.s64 %rd1841, %rd1840, %rd2648;
and.b64 %rd1842, %rd1841, 4294967295;
xor.b64 %rd1843, %rd1842, %rd1839;
xor.b64 %rd1844, %rd1843, %rd2649;
mul.lo.s64 %rd1845, %rd1844, %rd2650;
shr.u64 %rd1846, %rd1845, 32;
shr.u64 %rd1847, %rd1841, 32;
and.b64 %rd1848, %rd2651, 4294967295;
xor.b64 %rd1849, %rd1848, %rd1847;
xor.b64 %rd1850, %rd1849, %rd2652;
mul.lo.s64 %rd1851, %rd1850, %rd2650;
and.b64 %rd1852, %rd1851, 4294967295;
xor.b64 %rd1853, %rd1852, %rd1846;
xor.b64 %rd1854, %rd1853, %rd2653;
mul.lo.s64 %rd1855, %rd1854, %rd2648;
shr.u64 %rd1856, %rd1855, 32;
shr.u64 %rd1857, %rd1851, 32;
and.b64 %rd1858, %rd2654, 4294967295;
xor.b64 %rd1859, %rd1858, %rd1857;
xor.b64 %rd1860, %rd1859, %rd2655;
mul.lo.s64 %rd1861, %rd1860, %rd2648;
and.b64 %rd1862, %rd1861, 4294967295;
xor.b64 %rd1863, %rd1862, %rd1856;
xor.b64 %rd1864, %rd1863, %rd2656;
mul.lo.s64 %rd1865, %rd1864, %rd2650;
shr.u64 %rd1866, %rd1865, 32;
shr.u64 %rd1867, %rd1861, 32;
and.b64 %rd1868, %rd2657, 4294967295;
xor.b64 %rd1869, %rd1868, %rd1867;
xor.b64 %rd1870, %rd1869, %rd2658;
mul.lo.s64 %rd1871, %rd1870, %rd2650;
and.b64 %rd1872, %rd1871, 4294967295;
xor.b64 %rd1873, %rd1872, %rd1866;
xor.b64 %rd1874, %rd1873, %rd2659;
mul.lo.s64 %rd1875, %rd1874, %rd2648;
shr.u64 %rd1876, %rd1875, 32;
shr.u64 %rd1877, %rd1871, 32;
and.b64 %rd1878, %rd1845, 4294967295;
xor.b64 %rd1879, %rd1878, %rd1877;
xor.b64 %rd1880, %rd1879, %rd2660;
mul.lo.s64 %rd1881, %rd1880, %rd2648;
and.b64 %rd1882, %rd1881, 4294967295;
xor.b64 %rd1883, %rd1882, %rd1876;
xor.b64 %rd1884, %rd1883, %rd2661;
mul.lo.s64 %rd1885, %rd1884, %rd2650;
shr.u64 %rd1886, %rd1885, 32;
shr.u64 %rd1887, %rd1881, 32;
and.b64 %rd1888, %rd1855, 4294967295;
xor.b64 %rd1889, %rd1888, %rd1887;
xor.b64 %rd1890, %rd1889, %rd2662;
mul.lo.s64 %rd1891, %rd1890, %rd2650;
and.b64 %rd1892, %rd1891, 4294967295;
xor.b64 %rd1893, %rd1892, %rd1886;
xor.b64 %rd1894, %rd1893, %rd2663;
mul.lo.s64 %rd1895, %rd1894, %rd2648;
// Fold to one 32-bit word; only the low 32 bits survive the cvt, so the
// unmasked upper halves of %rd1865/%rd1900 do not affect %r209.
shr.u64 %rd1896, %rd1895, 32;
shr.u64 %rd1897, %rd1891, 32;
xor.b64 %rd1898, %rd1865, %rd1897;
xor.b64 %rd1899, %rd1898, %rd2664;
mul.lo.s64 %rd1900, %rd1899, %rd2648;
xor.b64 %rd1901, %rd1896, %rd1900;
cvt.u32.u64 %r209, %rd1901;
xor.b32 %r210, %r338, %r209;
mul.lo.s32 %r211, %r210, %r339;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): an f32 in [0, 1),
// then narrow it to f16.
shr.u32 %r212, %r211, 9;
cvt.rn.f32.u32 %f166, %r212;
mul.rn.f32 %f167, %f166, 0f34000000;
cvt.rn.f16.f32 %h100, %f167;
// %p49 = sample >= 0x2E66 (f16, ~0.1).
mov.b16 %h101, 0x2E66;
setp.ge.f16 %p49, %h100, %h101;
// (f16 [%rd44+1282] + f16(f32 [%rd45+2564])) * 0x3C72 (f16, ~1.111), replaced
// by 0 when %p49 is false. Loads are from global memory via the .nc path.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) -- confirm.
ld.global.nc.b16 %h102, [%rd44+1282];
ld.global.nc.f32 %f168, [%rd45+2564];
cvt.rn.f16.f32 %h103, %f168;
add.rn.f16 %h104, %h102, %h103;
mov.b16 %h105, 0x3C72;
mul.rn.f16 %h106, %h104, %h105;
selp.b16 %h107, %h106, 0x0000, %p49;
cvt.f32.f16 %f169, %h107;
// f32 term (%f1*x)*y + (z - %f2*(%f1*x)) with y = f16 [%rd46+1282],
// x = f32 [%rd47+2564], z = f32 [%rd48+2564]; then add the masked value.
ld.global.nc.b16 %h108, [%rd46+1282];
cvt.f32.f16 %f170, %h108;
ld.global.nc.f32 %f171, [%rd47+2564];
mul.rn.f32 %f172, %f1, %f171;
mul.rn.f32 %f173, %f172, %f170;
ld.global.nc.f32 %f174, [%rd48+2564];
mul.rn.f32 %f175, %f2, %f172;
sub.rn.f32 %f176, %f174, %f175;
add.rn.f32 %f177, %f173, %f176;
add.rn.f32 %f178, %f177, %f169;
// Running sum: %f14 = %f13 + this element's value.
add.rn.f32 %f14, %f13, %f178;
// Next element: %rd350 = %rd11 + ((%r73 | 768) >> 2); %p8 selects LBB29_40.
or.b32 %r214, %r73, 768;
shr.u32 %r215, %r214, 2;
cvt.u64.u32 %rd1902, %r215;
add.s64 %rd350, %rd11, %rd1902;
@%p8 bra LBB29_40;
and.b64 %rd1944, %rd350, 4294967295;
mul.lo.s64 %rd2669, %rd1944, 3528531795;
setp.lt.u64 %p51, %rd350, %rd11;
selp.u64 %rd1945, 1, 0, %p51;
add.s64 %rd1946, %rd2461, %rd1945;
xor.b64 %rd1947, %rd1946, %rd2669;
shr.u64 %rd1948, %rd1947, 32;
mul.lo.s64 %rd2672, %rd1948, 3449720151;
shr.u64 %rd1949, %rd2672, 32;
and.b64 %rd1950, %rd1946, 4294967295;
mul.lo.s64 %rd1951, %rd1950, 3449720151;
and.b64 %rd1952, %rd1951, 4294967295;
xor.b64 %rd1953, %rd1952, %rd1949;
xor.b64 %rd1954, %rd1953, 2654435769;
mul.lo.s64 %rd2675, %rd1954, 3528531795;
xor.b64 %rd2665, %rd1951, %rd350;
mov.u32 %r342, -1879881855;
mov.u32 %r341, -845247145;
mov.u32 %r340, 534103459;
mov.u64 %rd2683, 3678237736;
mov.u64 %rd2682, 3041712726;
mov.u64 %rd2681, 1401181199;
mov.u64 %rd2680, 2835769497;
mov.u64 %rd2679, 1684936478;
mov.u64 %rd2678, 2027808484;
mov.u64 %rd2677, 387276957;
mov.u64 %rd2676, 842468239;
mov.u64 %rd2674, 3986602516;
mov.u64 %rd2673, 1013904242;
mov.u64 %rd2671, 3668340011;
mov.u64 %rd2670, 3144134277;
mov.u64 %rd2668, 3449720151;
mov.u64 %rd2667, 1993301258;
mov.u64 %rd2666, 3528531795;
bra.uni LBB29_41;
LBB29_40:
setp.lt.u64 %p50, %rd350, %rd11;
selp.u64 %rd1918, 1, 0, %p50;
add.s64 %rd1919, %rd2461, %rd1918;
and.b64 %rd1920, %rd1919, 4294967295;
mul.lo.s64 %rd2669, %rd1920, 3449720151;
xor.b64 %rd1921, %rd2669, %rd350;
shr.u64 %rd1922, %rd1921, 32;
mul.lo.s64 %rd2672, %rd1922, 3528531795;
shr.u64 %rd1923, %rd2672, 32;
and.b64 %rd1924, %rd350, 4294967295;
mul.lo.s64 %rd1925, %rd1924, 3528531795;
and.b64 %rd1926, %rd1925, 4294967295;
xor.b64 %rd1927, %rd1926, %rd1923;
xor.b64 %rd1928, %rd1927, 3144134277;
mul.lo.s64 %rd2675, %rd1928, 3449720151;
xor.b64 %rd2665, %rd1919, %rd1925;
mov.u32 %r342, -1767562579;
mov.u32 %r341, -766435501;
mov.u32 %r340, 1401181199;
mov.u64 %rd2683, 4055616968;
mov.u64 %rd2682, 1684936478;
mov.u64 %rd2681, 534103459;
mov.u64 %rd2680, 387276957;
mov.u64 %rd2679, 3041712726;
mov.u64 %rd2678, 3986602516;
mov.u64 %rd2677, 2835769497;
mov.u64 %rd2676, 3668340011;
mov.u64 %rd2674, 2027808484;
mov.u64 %rd2673, 1993301258;
mov.u64 %rd2671, 842468239;
mov.u64 %rd2670, 2654435769;
mov.u64 %rd2668, 3528531795;
mov.u64 %rd2667, 1013904242;
mov.u64 %rd2666, 3449720151;
LBB29_41:
shr.u64 %rd1955, %rd2675, 32;
shr.u64 %rd1956, %rd2665, 32;
mul.lo.s64 %rd1957, %rd1956, %rd2666;
and.b64 %rd1958, %rd1957, 4294967295;
xor.b64 %rd1959, %rd1958, %rd1955;
xor.b64 %rd1960, %rd1959, %rd2667;
mul.lo.s64 %rd1961, %rd1960, %rd2668;
shr.u64 %rd1962, %rd1961, 32;
shr.u64 %rd1963, %rd1957, 32;
and.b64 %rd1964, %rd2669, 4294967295;
xor.b64 %rd1965, %rd1964, %rd1963;
xor.b64 %rd1966, %rd1965, %rd2670;
mul.lo.s64 %rd1967, %rd1966, %rd2668;
and.b64 %rd1968, %rd1967, 4294967295;
xor.b64 %rd1969, %rd1968, %rd1962;
xor.b64 %rd1970, %rd1969, %rd2671;
mul.lo.s64 %rd1971, %rd1970, %rd2666;
shr.u64 %rd1972, %rd1971, 32;
shr.u64 %rd1973, %rd1967, 32;
and.b64 %rd1974, %rd2672, 4294967295;
xor.b64 %rd1975, %rd1974, %rd1973;
xor.b64 %rd1976, %rd1975, %rd2673;
mul.lo.s64 %rd1977, %rd1976, %rd2666;
and.b64 %rd1978, %rd1977, 4294967295;
xor.b64 %rd1979, %rd1978, %rd1972;
xor.b64 %rd1980, %rd1979, %rd2674;
mul.lo.s64 %rd1981, %rd1980, %rd2668;
shr.u64 %rd1982, %rd1981, 32;
shr.u64 %rd1983, %rd1977, 32;
and.b64 %rd1984, %rd2675, 4294967295;
xor.b64 %rd1985, %rd1984, %rd1983;
xor.b64 %rd1986, %rd1985, %rd2676;
mul.lo.s64 %rd1987, %rd1986, %rd2668;
and.b64 %rd1988, %rd1987, 4294967295;
xor.b64 %rd1989, %rd1988, %rd1982;
xor.b64 %rd1990, %rd1989, %rd2677;
mul.lo.s64 %rd1991, %rd1990, %rd2666;
shr.u64 %rd1992, %rd1991, 32;
shr.u64 %rd1993, %rd1987, 32;
and.b64 %rd1994, %rd1961, 4294967295;
xor.b64 %rd1995, %rd1994, %rd1993;
xor.b64 %rd1996, %rd1995, %rd2678;
mul.lo.s64 %rd1997, %rd1996, %rd2666;
and.b64 %rd1998, %rd1997, 4294967295;
xor.b64 %rd1999, %rd1998, %rd1992;
xor.b64 %rd2000, %rd1999, %rd2679;
mul.lo.s64 %rd2001, %rd2000, %rd2668;
shr.u64 %rd2002, %rd2001, 32;
shr.u64 %rd2003, %rd1997, 32;
and.b64 %rd2004, %rd1971, 4294967295;
xor.b64 %rd2005, %rd2004, %rd2003;
xor.b64 %rd2006, %rd2005, %rd2680;
mul.lo.s64 %rd2007, %rd2006, %rd2668;
and.b64 %rd2008, %rd2007, 4294967295;
xor.b64 %rd2009, %rd2008, %rd2002;
xor.b64 %rd2010, %rd2009, %rd2681;
mul.lo.s64 %rd2011, %rd2010, %rd2666;
shr.u64 %rd2012, %rd2011, 32;
shr.u64 %rd2013, %rd2007, 32;
and.b64 %rd2014, %rd1981, 4294967295;
xor.b64 %rd2015, %rd2014, %rd2013;
xor.b64 %rd2016, %rd2015, %rd2682;
mul.lo.s64 %rd2017, %rd2016, %rd2666;
and.b64 %rd2018, %rd2017, 4294967295;
xor.b64 %rd2019, %rd2018, %rd2012;
xor.b64 %rd2020, %rd2019, %rd2683;
mul.lo.s64 %rd2021, %rd2020, %rd2668;
shr.u64 %rd2022, %rd2021, 32;
cvt.u32.u64 %r222, %rd2022;
shr.u64 %rd2023, %rd2017, 32;
xor.b64 %rd2024, %rd2023, %rd1991;
cvt.u32.u64 %r223, %rd2024;
xor.b32 %r224, %r340, %r223;
mul.lo.s32 %r225, %r224, %r341;
xor.b32 %r226, %r225, %r222;
xor.b32 %r227, %r226, %r342;
shr.u32 %r228, %r227, 9;
cvt.rn.f32.u32 %f179, %r228;
mul.rn.f32 %f180, %f179, 0f34000000;
cvt.rn.f16.f32 %h109, %f180;
mov.b16 %h110, 0x2E66;
setp.ge.f16 %p52, %h109, %h110;
ld.global.nc.b16 %h111, [%rd44+1536];
ld.global.nc.f32 %f181, [%rd45+3072];
cvt.rn.f16.f32 %h112, %f181;
add.rn.f16 %h113, %h111, %h112;
mov.b16 %h114, 0x3C72;
mul.rn.f16 %h115, %h113, %h114;
selp.b16 %h116, %h115, 0x0000, %p52;
cvt.f32.f16 %f182, %h116;
ld.global.nc.b16 %h117, [%rd46+1536];
cvt.f32.f16 %f183, %h117;
ld.global.nc.f32 %f184, [%rd47+3072];
mul.rn.f32 %f185, %f1, %f184;
mul.rn.f32 %f186, %f185, %f183;
ld.global.nc.f32 %f187, [%rd48+3072];
mul.rn.f32 %f188, %f2, %f185;
sub.rn.f32 %f189, %f187, %f188;
add.rn.f32 %f190, %f186, %f189;
add.rn.f32 %f191, %f190, %f182;
add.rn.f32 %f15, %f14, %f191;
or.b32 %r229, %r3, 769;
or.b32 %r230, %r229, %r4;
and.b32 %r231, %r229, 3;
shr.u32 %r232, %r230, 2;
setp.ne.s32 %p53, %r231, 1;
cvt.u64.u32 %rd2025, %r232;
add.s64 %rd378, %rd11, %rd2025;
@%p53 bra LBB29_43;
and.b64 %rd2065, %rd378, 4294967295;
mul.lo.s64 %rd2688, %rd2065, 3528531795;
setp.lt.u64 %p55, %rd378, %rd11;
selp.u64 %rd2066, 1, 0, %p55;
add.s64 %rd2067, %rd2461, %rd2066;
xor.b64 %rd2068, %rd2067, %rd2688;
shr.u64 %rd2069, %rd2068, 32;
mul.lo.s64 %rd2691, %rd2069, 3449720151;
shr.u64 %rd2070, %rd2691, 32;
and.b64 %rd2071, %rd2067, 4294967295;
mul.lo.s64 %rd2072, %rd2071, 3449720151;
and.b64 %rd2073, %rd2072, 4294967295;
xor.b64 %rd2074, %rd2073, %rd2070;
xor.b64 %rd2075, %rd2074, 2654435769;
mul.lo.s64 %rd2694, %rd2075, 3528531795;
xor.b64 %rd2684, %rd2072, %rd378;
mov.u32 %r344, -845247145;
mov.u32 %r343, -616729560;
mov.u64 %rd2701, 3041712726;
mov.u64 %rd2700, 1401181199;
mov.u64 %rd2699, 2835769497;
mov.u64 %rd2698, 1684936478;
mov.u64 %rd2697, 2027808484;
mov.u64 %rd2696, 387276957;
mov.u64 %rd2695, 842468239;
mov.u64 %rd2693, 3986602516;
mov.u64 %rd2692, 1013904242;
mov.u64 %rd2690, 3668340011;
mov.u64 %rd2689, 3144134277;
mov.u64 %rd2687, 3449720151;
mov.u64 %rd2686, 1993301258;
mov.u64 %rd2685, 3528531795;
bra.uni LBB29_44;
LBB29_43:
setp.lt.u64 %p54, %rd378, %rd11;
selp.u64 %rd2040, 1, 0, %p54;
add.s64 %rd2041, %rd2461, %rd2040;
and.b64 %rd2042, %rd2041, 4294967295;
mul.lo.s64 %rd2688, %rd2042, 3449720151;
xor.b64 %rd2043, %rd2688, %rd378;
shr.u64 %rd2044, %rd2043, 32;
mul.lo.s64 %rd2691, %rd2044, 3528531795;
shr.u64 %rd2045, %rd2691, 32;
and.b64 %rd2046, %rd378, 4294967295;
mul.lo.s64 %rd2047, %rd2046, 3528531795;
and.b64 %rd2048, %rd2047, 4294967295;
xor.b64 %rd2049, %rd2048, %rd2045;
xor.b64 %rd2050, %rd2049, 3144134277;
mul.lo.s64 %rd2694, %rd2050, 3449720151;
xor.b64 %rd2684, %rd2041, %rd2047;
mov.u32 %r344, -766435501;
mov.u32 %r343, -239350328;
mov.u64 %rd2701, 1684936478;
mov.u64 %rd2700, 534103459;
mov.u64 %rd2699, 387276957;
mov.u64 %rd2698, 3041712726;
mov.u64 %rd2697, 3986602516;
mov.u64 %rd2696, 2835769497;
mov.u64 %rd2695, 3668340011;
mov.u64 %rd2693, 2027808484;
mov.u64 %rd2692, 1993301258;
mov.u64 %rd2690, 842468239;
mov.u64 %rd2689, 2654435769;
mov.u64 %rd2687, 3528531795;
mov.u64 %rd2686, 1013904242;
mov.u64 %rd2685, 3449720151;
LBB29_44:
shr.u64 %rd2076, %rd2694, 32;
shr.u64 %rd2077, %rd2684, 32;
mul.lo.s64 %rd2078, %rd2077, %rd2685;
and.b64 %rd2079, %rd2078, 4294967295;
xor.b64 %rd2080, %rd2079, %rd2076;
xor.b64 %rd2081, %rd2080, %rd2686;
mul.lo.s64 %rd2082, %rd2081, %rd2687;
shr.u64 %rd2083, %rd2082, 32;
shr.u64 %rd2084, %rd2078, 32;
and.b64 %rd2085, %rd2688, 4294967295;
xor.b64 %rd2086, %rd2085, %rd2084;
xor.b64 %rd2087, %rd2086, %rd2689;
mul.lo.s64 %rd2088, %rd2087, %rd2687;
and.b64 %rd2089, %rd2088, 4294967295;
xor.b64 %rd2090, %rd2089, %rd2083;
xor.b64 %rd2091, %rd2090, %rd2690;
mul.lo.s64 %rd2092, %rd2091, %rd2685;
shr.u64 %rd2093, %rd2092, 32;
shr.u64 %rd2094, %rd2088, 32;
and.b64 %rd2095, %rd2691, 4294967295;
xor.b64 %rd2096, %rd2095, %rd2094;
xor.b64 %rd2097, %rd2096, %rd2692;
mul.lo.s64 %rd2098, %rd2097, %rd2685;
and.b64 %rd2099, %rd2098, 4294967295;
xor.b64 %rd2100, %rd2099, %rd2093;
xor.b64 %rd2101, %rd2100, %rd2693;
mul.lo.s64 %rd2102, %rd2101, %rd2687;
shr.u64 %rd2103, %rd2102, 32;
shr.u64 %rd2104, %rd2098, 32;
and.b64 %rd2105, %rd2694, 4294967295;
xor.b64 %rd2106, %rd2105, %rd2104;
xor.b64 %rd2107, %rd2106, %rd2695;
mul.lo.s64 %rd2108, %rd2107, %rd2687;
and.b64 %rd2109, %rd2108, 4294967295;
xor.b64 %rd2110, %rd2109, %rd2103;
xor.b64 %rd2111, %rd2110, %rd2696;
mul.lo.s64 %rd2112, %rd2111, %rd2685;
shr.u64 %rd2113, %rd2112, 32;
shr.u64 %rd2114, %rd2108, 32;
and.b64 %rd2115, %rd2082, 4294967295;
xor.b64 %rd2116, %rd2115, %rd2114;
xor.b64 %rd2117, %rd2116, %rd2697;
mul.lo.s64 %rd2118, %rd2117, %rd2685;
and.b64 %rd2119, %rd2118, 4294967295;
xor.b64 %rd2120, %rd2119, %rd2113;
xor.b64 %rd2121, %rd2120, %rd2698;
mul.lo.s64 %rd2122, %rd2121, %rd2687;
shr.u64 %rd2123, %rd2122, 32;
shr.u64 %rd2124, %rd2118, 32;
and.b64 %rd2125, %rd2092, 4294967295;
xor.b64 %rd2126, %rd2125, %rd2124;
xor.b64 %rd2127, %rd2126, %rd2699;
mul.lo.s64 %rd2128, %rd2127, %rd2687;
and.b64 %rd2129, %rd2128, 4294967295;
xor.b64 %rd2130, %rd2129, %rd2123;
xor.b64 %rd2131, %rd2130, %rd2700;
mul.lo.s64 %rd2132, %rd2131, %rd2685;
shr.u64 %rd2133, %rd2132, 32;
shr.u64 %rd2134, %rd2128, 32;
xor.b64 %rd2135, %rd2102, %rd2134;
xor.b64 %rd2136, %rd2135, %rd2701;
mul.lo.s64 %rd2137, %rd2136, %rd2685;
xor.b64 %rd2138, %rd2133, %rd2137;
cvt.u32.u64 %r237, %rd2138;
xor.b32 %r238, %r343, %r237;
mul.lo.s32 %r239, %r238, %r344;
shr.u32 %r240, %r239, 9;
cvt.rn.f32.u32 %f192, %r240;
mul.rn.f32 %f193, %f192, 0f34000000;
cvt.rn.f16.f32 %h118, %f193;
mov.b16 %h119, 0x2E66;
setp.ge.f16 %p57, %h118, %h119;
ld.global.nc.b16 %h120, [%rd44+1538];
ld.global.nc.f32 %f194, [%rd45+3076];
cvt.rn.f16.f32 %h121, %f194;
add.rn.f16 %h122, %h120, %h121;
mov.b16 %h123, 0x3C72;
mul.rn.f16 %h124, %h122, %h123;
selp.b16 %h125, %h124, 0x0000, %p57;
cvt.f32.f16 %f195, %h125;
ld.global.nc.b16 %h126, [%rd46+1538];
cvt.f32.f16 %f196, %h126;
ld.global.nc.f32 %f197, [%rd47+3076];
mul.rn.f32 %f198, %f1, %f197;
mul.rn.f32 %f199, %f198, %f196;
ld.global.nc.f32 %f200, [%rd48+3076];
mul.rn.f32 %f201, %f2, %f198;
sub.rn.f32 %f202, %f200, %f201;
add.rn.f32 %f203, %f199, %f202;
add.rn.f32 %f204, %f203, %f195;
add.rn.f32 %f16, %f15, %f204;
or.b32 %r242, %r73, 896;
shr.u32 %r243, %r242, 2;
cvt.u64.u32 %rd2139, %r243;
add.s64 %rd405, %rd11, %rd2139;
@%p8 bra LBB29_46;
mov.u32 %r347, -1879881855;
mov.u32 %r345, 534103459;
mov.u64 %rd2720, 3678237736;
and.b64 %rd2181, %rd405, 4294967295;
mul.lo.s64 %rd2706, %rd2181, 3528531795;
setp.lt.u64 %p59, %rd405, %rd11;
selp.u64 %rd2182, 1, 0, %p59;
add.s64 %rd2183, %rd2461, %rd2182;
xor.b64 %rd2184, %rd2183, %rd2706;
shr.u64 %rd2185, %rd2184, 32;
mul.lo.s64 %rd2709, %rd2185, 3449720151;
shr.u64 %rd2186, %rd2709, 32;
and.b64 %rd2187, %rd2183, 4294967295;
mul.lo.s64 %rd2188, %rd2187, 3449720151;
and.b64 %rd2189, %rd2188, 4294967295;
xor.b64 %rd2190, %rd2189, %rd2186;
xor.b64 %rd2191, %rd2190, 2654435769;
mul.lo.s64 %rd2712, %rd2191, 3528531795;
xor.b64 %rd2702, %rd2188, %rd405;
mov.u32 %r346, -845247145;
mov.u64 %rd2719, 3041712726;
mov.u64 %rd2718, 1401181199;
mov.u64 %rd2717, 2835769497;
mov.u64 %rd2716, 1684936478;
mov.u64 %rd2715, 2027808484;
mov.u64 %rd2714, 387276957;
mov.u64 %rd2713, 842468239;
mov.u64 %rd2711, 3986602516;
mov.u64 %rd2710, 1013904242;
mov.u64 %rd2708, 3668340011;
mov.u64 %rd2707, 3144134277;
mov.u64 %rd2705, 3449720151;
mov.u64 %rd2704, 1993301258;
mov.u64 %rd2703, 3528531795;
bra.uni LBB29_47;
LBB29_46:
setp.lt.u64 %p58, %rd405, %rd11;
selp.u64 %rd2155, 1, 0, %p58;
add.s64 %rd2156, %rd2461, %rd2155;
and.b64 %rd2157, %rd2156, 4294967295;
mul.lo.s64 %rd2706, %rd2157, 3449720151;
xor.b64 %rd2158, %rd2706, %rd405;
shr.u64 %rd2159, %rd2158, 32;
mul.lo.s64 %rd2709, %rd2159, 3528531795;
shr.u64 %rd2160, %rd2709, 32;
and.b64 %rd2161, %rd405, 4294967295;
mul.lo.s64 %rd2162, %rd2161, 3528531795;
and.b64 %rd2163, %rd2162, 4294967295;
xor.b64 %rd2164, %rd2163, %rd2160;
xor.b64 %rd2165, %rd2164, 3144134277;
mul.lo.s64 %rd2712, %rd2165, 3449720151;
xor.b64 %rd2702, %rd2156, %rd2162;
mov.u32 %r347, -1767562579;
mov.u32 %r346, -766435501;
mov.u32 %r345, 1401181199;
mov.u64 %rd2720, 4055616968;
mov.u64 %rd2719, 1684936478;
mov.u64 %rd2718, 534103459;
mov.u64 %rd2717, 387276957;
mov.u64 %rd2716, 3041712726;
mov.u64 %rd2715, 3986602516;
mov.u64 %rd2714, 2835769497;
mov.u64 %rd2713, 3668340011;
mov.u64 %rd2711, 2027808484;
mov.u64 %rd2710, 1993301258;
mov.u64 %rd2708, 842468239;
mov.u64 %rd2707, 2654435769;
mov.u64 %rd2705, 3528531795;
mov.u64 %rd2704, 1013904242;
mov.u64 %rd2703, 3449720151;
LBB29_47:
shr.u64 %rd2192, %rd2712, 32;
shr.u64 %rd2193, %rd2702, 32;
mul.lo.s64 %rd2194, %rd2193, %rd2703;
and.b64 %rd2195, %rd2194, 4294967295;
xor.b64 %rd2196, %rd2195, %rd2192;
xor.b64 %rd2197, %rd2196, %rd2704;
mul.lo.s64 %rd2198, %rd2197, %rd2705;
shr.u64 %rd2199, %rd2198, 32;
shr.u64 %rd2200, %rd2194, 32;
and.b64 %rd2201, %rd2706, 4294967295;
xor.b64 %rd2202, %rd2201, %rd2200;
xor.b64 %rd2203, %rd2202, %rd2707;
mul.lo.s64 %rd2204, %rd2203, %rd2705;
and.b64 %rd2205, %rd2204, 4294967295;
xor.b64 %rd2206, %rd2205, %rd2199;
xor.b64 %rd2207, %rd2206, %rd2708;
mul.lo.s64 %rd2208, %rd2207, %rd2703;
shr.u64 %rd2209, %rd2208, 32;
shr.u64 %rd2210, %rd2204, 32;
and.b64 %rd2211, %rd2709, 4294967295;
xor.b64 %rd2212, %rd2211, %rd2210;
xor.b64 %rd2213, %rd2212, %rd2710;
mul.lo.s64 %rd2214, %rd2213, %rd2703;
and.b64 %rd2215, %rd2214, 4294967295;
xor.b64 %rd2216, %rd2215, %rd2209;
xor.b64 %rd2217, %rd2216, %rd2711;
mul.lo.s64 %rd2218, %rd2217, %rd2705;
shr.u64 %rd2219, %rd2218, 32;
shr.u64 %rd2220, %rd2214, 32;
and.b64 %rd2221, %rd2712, 4294967295;
xor.b64 %rd2222, %rd2221, %rd2220;
xor.b64 %rd2223, %rd2222, %rd2713;
mul.lo.s64 %rd2224, %rd2223, %rd2705;
and.b64 %rd2225, %rd2224, 4294967295;
xor.b64 %rd2226, %rd2225, %rd2219;
xor.b64 %rd2227, %rd2226, %rd2714;
mul.lo.s64 %rd2228, %rd2227, %rd2703;
shr.u64 %rd2229, %rd2228, 32;
shr.u64 %rd2230, %rd2224, 32;
and.b64 %rd2231, %rd2198, 4294967295;
xor.b64 %rd2232, %rd2231, %rd2230;
xor.b64 %rd2233, %rd2232, %rd2715;
mul.lo.s64 %rd2234, %rd2233, %rd2703;
and.b64 %rd2235, %rd2234, 4294967295;
xor.b64 %rd2236, %rd2235, %rd2229;
xor.b64 %rd2237, %rd2236, %rd2716;
mul.lo.s64 %rd2238, %rd2237, %rd2705;
shr.u64 %rd2239, %rd2238, 32;
shr.u64 %rd2240, %rd2234, 32;
and.b64 %rd2241, %rd2208, 4294967295;
xor.b64 %rd2242, %rd2241, %rd2240;
xor.b64 %rd2243, %rd2242, %rd2717;
mul.lo.s64 %rd2244, %rd2243, %rd2705;
and.b64 %rd2245, %rd2244, 4294967295;
xor.b64 %rd2246, %rd2245, %rd2239;
xor.b64 %rd2247, %rd2246, %rd2718;
mul.lo.s64 %rd2248, %rd2247, %rd2703;
shr.u64 %rd2249, %rd2248, 32;
shr.u64 %rd2250, %rd2244, 32;
and.b64 %rd2251, %rd2218, 4294967295;
xor.b64 %rd2252, %rd2251, %rd2250;
xor.b64 %rd2253, %rd2252, %rd2719;
mul.lo.s64 %rd2254, %rd2253, %rd2703;
and.b64 %rd2255, %rd2254, 4294967295;
xor.b64 %rd2256, %rd2255, %rd2249;
xor.b64 %rd2257, %rd2256, %rd2720;
mul.lo.s64 %rd2258, %rd2257, %rd2705;
shr.u64 %rd2259, %rd2258, 32;
cvt.u32.u64 %r250, %rd2259;
shr.u64 %rd2260, %rd2254, 32;
xor.b64 %rd2261, %rd2260, %rd2228;
cvt.u32.u64 %r251, %rd2261;
xor.b32 %r252, %r345, %r251;
mul.lo.s32 %r253, %r252, %r346;
xor.b32 %r254, %r253, %r250;
xor.b32 %r255, %r254, %r347;
shr.u32 %r256, %r255, 9;
cvt.rn.f32.u32 %f205, %r256;
mul.rn.f32 %f206, %f205, 0f34000000;
cvt.rn.f16.f32 %h127, %f206;
mov.b16 %h128, 0x2E66;
setp.ge.f16 %p60, %h127, %h128;
ld.global.nc.b16 %h129, [%rd44+1792];
ld.global.nc.f32 %f207, [%rd45+3584];
cvt.rn.f16.f32 %h130, %f207;
add.rn.f16 %h131, %h129, %h130;
mov.b16 %h132, 0x3C72;
mul.rn.f16 %h133, %h131, %h132;
selp.b16 %h134, %h133, 0x0000, %p60;
cvt.f32.f16 %f208, %h134;
ld.global.nc.b16 %h135, [%rd46+1792];
cvt.f32.f16 %f209, %h135;
ld.global.nc.f32 %f210, [%rd47+3584];
mul.rn.f32 %f211, %f1, %f210;
mul.rn.f32 %f212, %f211, %f209;
ld.global.nc.f32 %f213, [%rd48+3584];
mul.rn.f32 %f214, %f2, %f211;
sub.rn.f32 %f215, %f213, %f214;
add.rn.f32 %f216, %f212, %f215;
add.rn.f32 %f217, %f216, %f208;
add.rn.f32 %f17, %f16, %f217;
or.b32 %r257, %r3, 897;
or.b32 %r258, %r257, %r4;
and.b32 %r259, %r257, 3;
shr.u32 %r260, %r258, 2;
setp.ne.s32 %p61, %r259, 1;
cvt.u64.u32 %rd2262, %r260;
add.s64 %rd433, %rd11, %rd2262;
@%p61 bra LBB29_49;
mov.u32 %r349, -845247145;
mov.u64 %rd2737, 1401181199;
mov.u64 %rd2726, 3144134277;
mov.u32 %r348, -616729560;
and.b64 %rd2302, %rd433, 4294967295;
mul.lo.s64 %rd2725, %rd2302, 3528531795;
setp.lt.u64 %p63, %rd433, %rd11;
selp.u64 %rd2303, 1, 0, %p63;
add.s64 %rd2304, %rd2461, %rd2303;
xor.b64 %rd2305, %rd2304, %rd2725;
shr.u64 %rd2306, %rd2305, 32;
mul.lo.s64 %rd2728, %rd2306, 3449720151;
shr.u64 %rd2307, %rd2728, 32;
and.b64 %rd2308, %rd2304, 4294967295;
mul.lo.s64 %rd2309, %rd2308, 3449720151;
and.b64 %rd2310, %rd2309, 4294967295;
xor.b64 %rd2311, %rd2310, %rd2307;
xor.b64 %rd2312, %rd2311, 2654435769;
mul.lo.s64 %rd2731, %rd2312, 3528531795;
xor.b64 %rd2721, %rd2309, %rd433;
mov.u64 %rd2738, 3041712726;
mov.u64 %rd2736, 2835769497;
mov.u64 %rd2735, 1684936478;
mov.u64 %rd2734, 2027808484;
mov.u64 %rd2733, 387276957;
mov.u64 %rd2732, 842468239;
mov.u64 %rd2730, 3986602516;
mov.u64 %rd2729, 1013904242;
mov.u64 %rd2727, 3668340011;
mov.u64 %rd2724, 3449720151;
mov.u64 %rd2723, 1993301258;
mov.u64 %rd2722, 3528531795;
bra.uni LBB29_50;
LBB29_49:
setp.lt.u64 %p62, %rd433, %rd11;
selp.u64 %rd2277, 1, 0, %p62;
add.s64 %rd2278, %rd2461, %rd2277;
and.b64 %rd2279, %rd2278, 4294967295;
mul.lo.s64 %rd2725, %rd2279, 3449720151;
xor.b64 %rd2280, %rd2725, %rd433;
shr.u64 %rd2281, %rd2280, 32;
mul.lo.s64 %rd2728, %rd2281, 3528531795;
shr.u64 %rd2282, %rd2728, 32;
and.b64 %rd2283, %rd433, 4294967295;
mul.lo.s64 %rd2284, %rd2283, 3528531795;
and.b64 %rd2285, %rd2284, 4294967295;
xor.b64 %rd2286, %rd2285, %rd2282;
xor.b64 %rd2287, %rd2286, 3144134277;
mul.lo.s64 %rd2731, %rd2287, 3449720151;
xor.b64 %rd2721, %rd2278, %rd2284;
mov.u32 %r349, -766435501;
mov.u32 %r348, -239350328;
mov.u64 %rd2738, 1684936478;
mov.u64 %rd2737, 534103459;
mov.u64 %rd2736, 387276957;
mov.u64 %rd2735, 3041712726;
mov.u64 %rd2734, 3986602516;
mov.u64 %rd2733, 2835769497;
mov.u64 %rd2732, 3668340011;
mov.u64 %rd2730, 2027808484;
mov.u64 %rd2729, 1993301258;
mov.u64 %rd2727, 842468239;
mov.u64 %rd2726, 2654435769;
mov.u64 %rd2724, 3528531795;
mov.u64 %rd2723, 1013904242;
mov.u64 %rd2722, 3449720151;
LBB29_50:
shr.u64 %rd2313, %rd2731, 32;
shr.u64 %rd2314, %rd2721, 32;
mul.lo.s64 %rd2315, %rd2314, %rd2722;
and.b64 %rd2316, %rd2315, 4294967295;
xor.b64 %rd2317, %rd2316, %rd2313;
xor.b64 %rd2318, %rd2317, %rd2723;
mul.lo.s64 %rd2319, %rd2318, %rd2724;
shr.u64 %rd2320, %rd2319, 32;
shr.u64 %rd2321, %rd2315, 32;
and.b64 %rd2322, %rd2725, 4294967295;
xor.b64 %rd2323, %rd2322, %rd2321;
xor.b64 %rd2324, %rd2323, %rd2726;
mul.lo.s64 %rd2325, %rd2324, %rd2724;
and.b64 %rd2326, %rd2325, 4294967295;
xor.b64 %rd2327, %rd2326, %rd2320;
xor.b64 %rd2328, %rd2327, %rd2727;
mul.lo.s64 %rd2329, %rd2328, %rd2722;
shr.u64 %rd2330, %rd2329, 32;
shr.u64 %rd2331, %rd2325, 32;
and.b64 %rd2332, %rd2728, 4294967295;
xor.b64 %rd2333, %rd2332, %rd2331;
xor.b64 %rd2334, %rd2333, %rd2729;
mul.lo.s64 %rd2335, %rd2334, %rd2722;
and.b64 %rd2336, %rd2335, 4294967295;
xor.b64 %rd2337, %rd2336, %rd2330;
xor.b64 %rd2338, %rd2337, %rd2730;
mul.lo.s64 %rd2339, %rd2338, %rd2724;
shr.u64 %rd2340, %rd2339, 32;
shr.u64 %rd2341, %rd2335, 32;
and.b64 %rd2342, %rd2731, 4294967295;
xor.b64 %rd2343, %rd2342, %rd2341;
xor.b64 %rd2344, %rd2343, %rd2732;
mul.lo.s64 %rd2345, %rd2344, %rd2724;
and.b64 %rd2346, %rd2345, 4294967295;
xor.b64 %rd2347, %rd2346, %rd2340;
xor.b64 %rd2348, %rd2347, %rd2733;
mul.lo.s64 %rd2349, %rd2348, %rd2722;
shr.u64 %rd2350, %rd2349, 32;
shr.u64 %rd2351, %rd2345, 32;
and.b64 %rd2352, %rd2319, 4294967295;
xor.b64 %rd2353, %rd2352, %rd2351;
xor.b64 %rd2354, %rd2353, %rd2734;
mul.lo.s64 %rd2355, %rd2354, %rd2722;
and.b64 %rd2356, %rd2355, 4294967295;
xor.b64 %rd2357, %rd2356, %rd2350;
xor.b64 %rd2358, %rd2357, %rd2735;
mul.lo.s64 %rd2359, %rd2358, %rd2724;
shr.u64 %rd2360, %rd2359, 32;
shr.u64 %rd2361, %rd2355, 32;
and.b64 %rd2362, %rd2329, 4294967295;
xor.b64 %rd2363, %rd2362, %rd2361;
xor.b64 %rd2364, %rd2363, %rd2736;
mul.lo.s64 %rd2365, %rd2364, %rd2724;
and.b64 %rd2366, %rd2365, 4294967295;
xor.b64 %rd2367, %rd2366, %rd2360;
xor.b64 %rd2368, %rd2367, %rd2737;
mul.lo.s64 %rd2369, %rd2368, %rd2722;
shr.u64 %rd2370, %rd2369, 32;
shr.u64 %rd2371, %rd2365, 32;
xor.b64 %rd2372, %rd2339, %rd2371;
xor.b64 %rd2373, %rd2372, %rd2738;
mul.lo.s64 %rd2374, %rd2373, %rd2722;
xor.b64 %rd2375, %rd2370, %rd2374;
cvt.u32.u64 %r265, %rd2375;
xor.b32 %r266, %r348, %r265;
mul.lo.s32 %r267, %r266, %r349;
shr.u32 %r268, %r267, 9;
cvt.rn.f32.u32 %f218, %r268;
mul.rn.f32 %f219, %f218, 0f34000000;
cvt.rn.f16.f32 %h136, %f219;
mov.b16 %h137, 0x2E66;
setp.ge.f16 %p64, %h136, %h137;
ld.global.nc.b16 %h138, [%rd44+1794];
ld.global.nc.f32 %f220, [%rd45+3588];
cvt.rn.f16.f32 %h139, %f220;
add.rn.f16 %h140, %h138, %h139;
mov.b16 %h141, 0x3C72;
mul.rn.f16 %h142, %h140, %h141;
selp.b16 %h143, %h142, 0x0000, %p64;
cvt.f32.f16 %f221, %h143;
ld.global.nc.b16 %h144, [%rd46+1794];
cvt.f32.f16 %f222, %h144;
ld.global.nc.f32 %f223, [%rd47+3588];
mul.rn.f32 %f224, %f1, %f223;
mul.rn.f32 %f225, %f224, %f222;
ld.global.nc.f32 %f226, [%rd48+3588];
mul.rn.f32 %f227, %f2, %f224;
sub.rn.f32 %f228, %f226, %f227;
add.rn.f32 %f229, %f225, %f228;
add.rn.f32 %f230, %f229, %f221;
add.rn.f32 %f231, %f17, %f230;
and.b32 %r46, %r1, 31;
shfl.sync.down.b32 %f232, %f231, 16, 31, -1;
add.rn.f32 %f233, %f232, %f231;
shfl.sync.down.b32 %f234, %f233, 8, 31, -1;
add.rn.f32 %f235, %f234, %f233;
shfl.sync.down.b32 %f236, %f235, 4, 31, -1;
add.rn.f32 %f237, %f236, %f235;
shfl.sync.down.b32 %f238, %f237, 2, 31, -1;
add.rn.f32 %f239, %f238, %f237;
shfl.sync.down.b32 %f240, %f239, 1, 31, -1;
shr.u32 %r47, %r1, 5;
setp.ne.s32 %p65, %r46, 0;
mov.u64 %rd2378, shared_cache_06;
@%p65 bra LBB29_2;
mul.wide.u32 %rd2377, %r47, 4;
add.s64 %rd461, %rd2378, %rd2377;
add.rn.f32 %f18, %f240, %f239;
st.shared.f32 [%rd461], %f18;
LBB29_2:
bar.sync 0;
setp.eq.s32 %p66, %r47, 0;
@%p66 bra LBB29_52;
bra.uni LBB29_3;
LBB29_52:
add.u64 %rd472, %SP, 0;
add.u64 %rd10, %SPL, 0;
mul.wide.u32 %rd2379, %r46, 4;
add.s64 %rd462, %rd2378, %rd2379;
cvta.shared.u64 %rd2381, %rd462;
mov.u32 %r269, 0;
st.local.u32 [%rd10], %r269;
setp.lt.u32 %p67, %r1, 2;
selp.b64 %rd2383, %rd2381, %rd472, %p67;
ld.f32 %f241, [%rd2383];
shfl.sync.down.b32 %f242, %f241, 16, 31, -1;
add.rn.f32 %f243, %f241, %f242;
shfl.sync.down.b32 %f244, %f243, 8, 31, -1;
add.rn.f32 %f245, %f243, %f244;
shfl.sync.down.b32 %f246, %f245, 4, 31, -1;
add.rn.f32 %f247, %f245, %f246;
shfl.sync.down.b32 %f248, %f247, 2, 31, -1;
add.rn.f32 %f249, %f247, %f248;
shfl.sync.down.b32 %f250, %f249, 1, 31, -1;
add.rn.f32 %f251, %f249, %f250;
st.f32 [%rd2383], %f251;
setp.ne.s32 %p68, %r1, 0;
@%p68 bra LBB29_3;
ld.param.u64 %rd469, [fusion_2248_param_3];
cvt.u64.u32 %rd43, %r2;
cvta.to.global.u64 %rd6, %rd469;
shl.b64 %rd2376, %rd43, 2;
add.s64 %rd460, %rd6, %rd2376;
ld.shared.f32 %f252, [%rd462];
atom.global.add.f32 %f253, [%rd460], %f252;
LBB29_3:
ret;
}
// .globl fusion_2246
.visible .entry fusion_2246(
.param .u64 fusion_2246_param_0,
.param .u64 fusion_2246_param_1,
.param .u64 fusion_2246_param_2,
.param .u64 fusion_2246_param_3,
.param .u64 fusion_2246_param_4,
.param .u64 fusion_2246_param_5,
.param .u64 fusion_2246_param_6,
.param .u64 fusion_2246_param_7,
.param .u64 fusion_2246_param_8,
.param .u64 fusion_2246_param_9,
.param .u64 fusion_2246_param_10
)
.reqntid 64, 1, 1
{
.local .align 4 .b8 __local_depot30[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<75>;
.reg .b16 %h<145>;
.reg .f32 %f<288>;
.reg .b32 %r<350>;
.reg .b64 %rd<2742>;
mov.u64 %SPL, __local_depot30;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd464, [fusion_2246_param_0];
ld.param.u64 %rd465, [fusion_2246_param_9];
cvta.to.global.u64 %rd1, %rd465;
ld.param.u64 %rd466, [fusion_2246_param_1];
ld.param.u64 %rd467, [fusion_2246_param_8];
cvta.to.global.u64 %rd2, %rd467;
ld.param.u64 %rd468, [fusion_2246_param_2];
ld.param.u64 %rd469, [fusion_2246_param_7];
cvta.to.global.u64 %rd3, %rd469;
ld.param.u64 %rd471, [fusion_2246_param_6];
cvta.to.global.u64 %rd4, %rd471;
ld.param.u64 %rd472, [fusion_2246_param_4];
ld.param.u64 %rd473, [fusion_2246_param_5];
cvta.to.global.u64 %rd5, %rd473;
cvta.to.global.u64 %rd6, %rd472;
cvta.to.global.u64 %rd8, %rd468;
cvta.to.global.u64 %rd9, %rd466;
cvta.to.global.u64 %rd10, %rd464;
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
shl.b32 %r3, %r1, 1;
shl.b32 %r4, %r2, 10;
or.b32 %r48, %r4, %r3;
shr.u32 %r49, %r48, 2;
and.b32 %r5, %r1, 1;
setp.eq.s32 %p1, %r5, 0;
ld.global.nc.u64 %rd12, [%rd8];
cvt.u64.u32 %rd475, %r49;
add.s64 %rd13, %rd12, %rd475;
setp.lt.u64 %p69, %rd13, %rd12;
and.b64 %rd2387, %rd13, 4294967295;
@%p1 bra LBB30_1;
bra.uni LBB30_4;
LBB30_1:
mul.lo.s64 %rd2449, %rd2387, 3528531795;
ld.global.nc.u64 %rd2464, [%rd8+8];
selp.u64 %rd518, 1, 0, %p69;
add.s64 %rd519, %rd2464, %rd518;
xor.b64 %rd520, %rd519, %rd2449;
shr.u64 %rd521, %rd520, 32;
mul.lo.s64 %rd2452, %rd521, 3449720151;
shr.u64 %rd522, %rd2452, 32;
and.b64 %rd523, %rd519, 4294967295;
mul.lo.s64 %rd524, %rd523, 3449720151;
and.b64 %rd525, %rd524, 4294967295;
xor.b64 %rd526, %rd525, %rd522;
xor.b64 %rd527, %rd526, 2654435769;
mul.lo.s64 %rd2455, %rd527, 3528531795;
xor.b64 %rd2445, %rd524, %rd13;
mov.u32 %r312, -1879881855;
mov.u32 %r311, -845247145;
mov.u32 %r310, 534103459;
mov.u64 %rd2463, 3678237736;
mov.u64 %rd2462, 3041712726;
mov.u64 %rd2461, 1401181199;
mov.u64 %rd2460, 2835769497;
mov.u64 %rd2459, 1684936478;
mov.u64 %rd2458, 2027808484;
mov.u64 %rd2457, 387276957;
mov.u64 %rd2456, 842468239;
mov.u64 %rd2454, 3986602516;
mov.u64 %rd2453, 1013904242;
mov.u64 %rd2451, 3668340011;
mov.u64 %rd2450, 3144134277;
mov.u64 %rd2448, 3449720151;
mov.u64 %rd2447, 1993301258;
mov.u64 %rd2446, 3528531795;
bra.uni LBB30_5;
LBB30_4:
mov.u32 %r311, -766435501;
mov.u64 %rd2462, 1684936478;
mov.u64 %rd2461, 534103459;
mov.u64 %rd2460, 387276957;
mov.u64 %rd2459, 3041712726;
mov.u64 %rd2458, 3986602516;
mov.u64 %rd2457, 2835769497;
mov.u64 %rd2456, 3668340011;
mov.u64 %rd2454, 2027808484;
mov.u64 %rd2453, 1993301258;
mov.u64 %rd2451, 842468239;
mov.u64 %rd2450, 2654435769;
mov.u64 %rd2448, 3528531795;
mov.u64 %rd2447, 1013904242;
mov.u64 %rd2446, 3449720151;
mov.u32 %r312, -1767562579;
mov.u32 %r310, 1401181199;
mov.u64 %rd2463, 4055616968;
ld.global.nc.u64 %rd2464, [%rd8+8];
selp.u64 %rd491, 1, 0, %p69;
add.s64 %rd492, %rd2464, %rd491;
and.b64 %rd493, %rd492, 4294967295;
mul.lo.s64 %rd2449, %rd493, 3449720151;
xor.b64 %rd494, %rd2449, %rd13;
shr.u64 %rd495, %rd494, 32;
mul.lo.s64 %rd2452, %rd495, 3528531795;
shr.u64 %rd496, %rd2452, 32;
mul.lo.s64 %rd498, %rd2387, 3528531795;
and.b64 %rd499, %rd498, 4294967295;
xor.b64 %rd500, %rd499, %rd496;
xor.b64 %rd501, %rd500, 3144134277;
mul.lo.s64 %rd2455, %rd501, 3449720151;
xor.b64 %rd2445, %rd492, %rd498;
LBB30_5:
shr.u64 %rd528, %rd2455, 32;
shr.u64 %rd529, %rd2445, 32;
mul.lo.s64 %rd530, %rd529, %rd2446;
and.b64 %rd531, %rd530, 4294967295;
xor.b64 %rd532, %rd531, %rd528;
xor.b64 %rd533, %rd532, %rd2447;
mul.lo.s64 %rd534, %rd533, %rd2448;
shr.u64 %rd535, %rd534, 32;
shr.u64 %rd536, %rd530, 32;
and.b64 %rd537, %rd2449, 4294967295;
xor.b64 %rd538, %rd537, %rd536;
xor.b64 %rd539, %rd538, %rd2450;
mul.lo.s64 %rd540, %rd539, %rd2448;
and.b64 %rd541, %rd540, 4294967295;
xor.b64 %rd542, %rd541, %rd535;
xor.b64 %rd543, %rd542, %rd2451;
mul.lo.s64 %rd544, %rd543, %rd2446;
shr.u64 %rd545, %rd544, 32;
shr.u64 %rd546, %rd540, 32;
and.b64 %rd547, %rd2452, 4294967295;
xor.b64 %rd548, %rd547, %rd546;
xor.b64 %rd549, %rd548, %rd2453;
mul.lo.s64 %rd550, %rd549, %rd2446;
and.b64 %rd551, %rd550, 4294967295;
xor.b64 %rd552, %rd551, %rd545;
xor.b64 %rd553, %rd552, %rd2454;
mul.lo.s64 %rd554, %rd553, %rd2448;
shr.u64 %rd555, %rd554, 32;
shr.u64 %rd556, %rd550, 32;
and.b64 %rd557, %rd2455, 4294967295;
xor.b64 %rd558, %rd557, %rd556;
xor.b64 %rd559, %rd558, %rd2456;
mul.lo.s64 %rd560, %rd559, %rd2448;
and.b64 %rd561, %rd560, 4294967295;
xor.b64 %rd562, %rd561, %rd555;
xor.b64 %rd563, %rd562, %rd2457;
mul.lo.s64 %rd564, %rd563, %rd2446;
shr.u64 %rd565, %rd564, 32;
shr.u64 %rd566, %rd560, 32;
and.b64 %rd567, %rd534, 4294967295;
xor.b64 %rd568, %rd567, %rd566;
xor.b64 %rd569, %rd568, %rd2458;
mul.lo.s64 %rd570, %rd569, %rd2446;
and.b64 %rd571, %rd570, 4294967295;
xor.b64 %rd572, %rd571, %rd565;
xor.b64 %rd573, %rd572, %rd2459;
mul.lo.s64 %rd574, %rd573, %rd2448;
shr.u64 %rd575, %rd574, 32;
shr.u64 %rd576, %rd570, 32;
and.b64 %rd577, %rd544, 4294967295;
xor.b64 %rd578, %rd577, %rd576;
xor.b64 %rd579, %rd578, %rd2460;
mul.lo.s64 %rd580, %rd579, %rd2448;
and.b64 %rd581, %rd580, 4294967295;
xor.b64 %rd582, %rd581, %rd575;
xor.b64 %rd583, %rd582, %rd2461;
mul.lo.s64 %rd584, %rd583, %rd2446;
shr.u64 %rd585, %rd584, 32;
shr.u64 %rd586, %rd580, 32;
and.b64 %rd587, %rd554, 4294967295;
xor.b64 %rd588, %rd587, %rd586;
xor.b64 %rd589, %rd588, %rd2462;
mul.lo.s64 %rd590, %rd589, %rd2446;
and.b64 %rd591, %rd590, 4294967295;
xor.b64 %rd592, %rd591, %rd585;
xor.b64 %rd593, %rd592, %rd2463;
mul.lo.s64 %rd594, %rd593, %rd2448;
shr.u64 %rd595, %rd594, 32;
cvt.u32.u64 %r56, %rd595;
shr.u64 %rd596, %rd590, 32;
xor.b64 %rd597, %rd596, %rd564;
cvt.u32.u64 %r57, %rd597;
xor.b32 %r58, %r310, %r57;
mul.lo.s32 %r59, %r58, %r311;
xor.b32 %r60, %r59, %r56;
xor.b32 %r61, %r60, %r312;
shr.u32 %r62, %r61, 9;
cvt.rn.f32.u32 %f20, %r62;
mul.rn.f32 %f21, %f20, 0f34000000;
cvt.rn.f16.f32 %h1, %f21;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p4, %h1, %h2;
mul.wide.u32 %rd598, %r2, 2048;
add.s64 %rd599, %rd10, %rd598;
mul.wide.u32 %rd600, %r3, 2;
add.s64 %rd45, %rd599, %rd600;
ld.global.nc.b16 %h3, [%rd45];
mul.wide.u32 %rd601, %r3, 4;
add.s64 %rd46, %rd1, %rd601;
ld.global.nc.f32 %f22, [%rd46];
cvt.rn.f16.f32 %h4, %f22;
add.rn.f16 %h5, %h3, %h4;
mov.b16 %h6, 0x3C72;
mul.rn.f16 %h7, %h5, %h6;
selp.b16 %h8, %h7, 0x0000, %p4;
cvt.f32.f16 %f23, %h8;
add.s64 %rd602, %rd9, %rd598;
add.s64 %rd47, %rd602, %rd600;
ld.global.nc.b16 %h9, [%rd47];
cvt.f32.f16 %f24, %h9;
mul.wide.u32 %rd603, %r2, 4;
add.s64 %rd604, %rd5, %rd603;
ld.global.nc.f32 %f25, [%rd604];
mul.rn.f32 %f26, %f25, 0f3A800000;
add.rn.f32 %f27, %f26, 0f2B8CBCCC;
rsqrt.approx.f32 %f1, %f27;
add.s64 %rd48, %rd2, %rd601;
ld.global.nc.f32 %f28, [%rd48];
mul.rn.f32 %f29, %f1, %f28;
mul.rn.f32 %f30, %f29, %f24;
add.s64 %rd49, %rd3, %rd601;
ld.global.nc.f32 %f31, [%rd49];
add.s64 %rd605, %rd4, %rd603;
ld.global.nc.f32 %f32, [%rd605];
mul.rn.f32 %f2, %f32, 0f3A800000;
mul.rn.f32 %f33, %f29, %f2;
sub.rn.f32 %f34, %f31, %f33;
add.rn.f32 %f35, %f30, %f34;
add.rn.f32 %f36, %f35, %f23;
add.s64 %rd606, %rd6, %rd603;
ld.global.nc.f32 %f37, [%rd606];
mul.rn.f32 %f3, %f37, 0f3A800000;
sub.rn.f32 %f38, %f36, %f3;
mul.rn.f32 %f39, %f38, %f38;
add.rn.f32 %f4, %f39, 0f00000000;
or.b32 %r63, %r3, 1;
and.b32 %r64, %r63, 3;
setp.ne.s32 %p5, %r64, 1;
@%p5 bra LBB30_7;
// Fall-through arm of `@%p5 bra LBB30_7` (runs when %p5 is false).
// Seeds the inputs of LBB30_8: the 64-bit products %rd2469, %rd2472, %rd2475,
// the mixed word %rd2465, and the per-step constants %rd2466..%rd2482,
// %r313 and %r314.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9,
// 3144134277 = 0xBB67AE85: these equal the Philox4x32 multipliers and key
// increments; the other 64-bit immediates below are multiples (mod 2^32) of
// the two increments.
// NOTE(review): presumably an inlined Philox RNG expansion -- confirm.
mul.lo.s64 %rd2469, %rd2387, 3528531795;
// Add the 0/1 value of %p69 to %rd2464. %p69 is set outside this block;
// presumably a carry out of the low word -- TODO confirm.
selp.u64 %rd647, 1, 0, %p69;
add.s64 %rd648, %rd2464, %rd647;
xor.b64 %rd649, %rd648, %rd2469;
shr.u64 %rd650, %rd649, 32;
mul.lo.s64 %rd2472, %rd650, 3449720151;
shr.u64 %rd651, %rd2472, 32;
and.b64 %rd652, %rd648, 4294967295;
mul.lo.s64 %rd653, %rd652, 3449720151;
and.b64 %rd654, %rd653, 4294967295;
xor.b64 %rd655, %rd654, %rd651;
xor.b64 %rd656, %rd655, 2654435769;
mul.lo.s64 %rd2475, %rd656, 3528531795;
xor.b64 %rd2465, %rd653, %rd13;
// Constants for LBB30_8; LBB30_7 loads a different set into the same registers.
mov.u32 %r314, -845247145;
mov.u32 %r313, -616729560;
mov.u64 %rd2482, 3041712726;
mov.u64 %rd2481, 1401181199;
mov.u64 %rd2480, 2835769497;
mov.u64 %rd2479, 1684936478;
mov.u64 %rd2478, 2027808484;
mov.u64 %rd2477, 387276957;
mov.u64 %rd2476, 842468239;
mov.u64 %rd2474, 3986602516;
mov.u64 %rd2473, 1013904242;
mov.u64 %rd2471, 3668340011;
mov.u64 %rd2470, 3144134277;
mov.u64 %rd2468, 3449720151;
mov.u64 %rd2467, 1993301258;
mov.u64 %rd2466, 3528531795;
bra.uni LBB30_8;
LBB30_7:
// Taken arm of `@%p5 bra LBB30_7` (runs when %p5 is true).
// Produces the same set of registers as the fall-through arm (%rd2465..%rd2482,
// %r313, %r314) but with the two multipliers 3528531795 (0xD2511F53) and
// 3449720151 (0xCD9E8D57) exchanged and a different set of XOR constants.
// Falls through into LBB30_8.
mov.u32 %r313, -239350328;
// Add the 0/1 value of %p69 to %rd2464. %p69 is set outside this block;
// presumably a carry out of the low word -- TODO confirm.
selp.u64 %rd621, 1, 0, %p69;
add.s64 %rd622, %rd2464, %rd621;
and.b64 %rd623, %rd622, 4294967295;
mul.lo.s64 %rd2469, %rd623, 3449720151;
xor.b64 %rd624, %rd2469, %rd13;
shr.u64 %rd625, %rd624, 32;
mul.lo.s64 %rd2472, %rd625, 3528531795;
shr.u64 %rd626, %rd2472, 32;
mul.lo.s64 %rd628, %rd2387, 3528531795;
and.b64 %rd629, %rd628, 4294967295;
xor.b64 %rd630, %rd629, %rd626;
xor.b64 %rd631, %rd630, 3144134277;
mul.lo.s64 %rd2475, %rd631, 3449720151;
xor.b64 %rd2465, %rd622, %rd628;
// Constants for LBB30_8 (this arm's variant).
mov.u32 %r314, -766435501;
mov.u64 %rd2482, 1684936478;
mov.u64 %rd2481, 534103459;
mov.u64 %rd2480, 387276957;
mov.u64 %rd2479, 3041712726;
mov.u64 %rd2478, 3986602516;
mov.u64 %rd2477, 2835769497;
mov.u64 %rd2476, 3668340011;
mov.u64 %rd2474, 2027808484;
mov.u64 %rd2473, 1993301258;
mov.u64 %rd2471, 842468239;
mov.u64 %rd2470, 2654435769;
mov.u64 %rd2468, 3528531795;
mov.u64 %rd2467, 1013904242;
mov.u64 %rd2466, 3449720151;
LBB30_8:
// Join of the two arms selected by %p5. Runs a fixed chain of 64-bit
// multiply / shift / mask / XOR steps over the values seeded by either arm;
// every multiply uses %rd2466 or %rd2468, the multipliers the arm chose.
// The result is reduced to one 32-bit word (%r71).
// %p8 = (%r5 != 0) steers the branch at the end of this block and is reused
// by the later `@%p8` branches.
setp.ne.s32 %p8, %r5, 0;
shr.u64 %rd657, %rd2475, 32;
shr.u64 %rd658, %rd2465, 32;
mul.lo.s64 %rd659, %rd658, %rd2466;
and.b64 %rd660, %rd659, 4294967295;
xor.b64 %rd661, %rd660, %rd657;
xor.b64 %rd662, %rd661, %rd2467;
mul.lo.s64 %rd663, %rd662, %rd2468;
shr.u64 %rd664, %rd663, 32;
shr.u64 %rd665, %rd659, 32;
and.b64 %rd666, %rd2469, 4294967295;
xor.b64 %rd667, %rd666, %rd665;
xor.b64 %rd668, %rd667, %rd2470;
mul.lo.s64 %rd669, %rd668, %rd2468;
and.b64 %rd670, %rd669, 4294967295;
xor.b64 %rd671, %rd670, %rd664;
xor.b64 %rd672, %rd671, %rd2471;
mul.lo.s64 %rd673, %rd672, %rd2466;
shr.u64 %rd674, %rd673, 32;
shr.u64 %rd675, %rd669, 32;
and.b64 %rd676, %rd2472, 4294967295;
xor.b64 %rd677, %rd676, %rd675;
xor.b64 %rd678, %rd677, %rd2473;
mul.lo.s64 %rd679, %rd678, %rd2466;
and.b64 %rd680, %rd679, 4294967295;
xor.b64 %rd681, %rd680, %rd674;
xor.b64 %rd682, %rd681, %rd2474;
mul.lo.s64 %rd683, %rd682, %rd2468;
shr.u64 %rd684, %rd683, 32;
shr.u64 %rd685, %rd679, 32;
and.b64 %rd686, %rd2475, 4294967295;
xor.b64 %rd687, %rd686, %rd685;
xor.b64 %rd688, %rd687, %rd2476;
mul.lo.s64 %rd689, %rd688, %rd2468;
and.b64 %rd690, %rd689, 4294967295;
xor.b64 %rd691, %rd690, %rd684;
xor.b64 %rd692, %rd691, %rd2477;
mul.lo.s64 %rd693, %rd692, %rd2466;
shr.u64 %rd694, %rd693, 32;
shr.u64 %rd695, %rd689, 32;
and.b64 %rd696, %rd663, 4294967295;
xor.b64 %rd697, %rd696, %rd695;
xor.b64 %rd698, %rd697, %rd2478;
mul.lo.s64 %rd699, %rd698, %rd2466;
and.b64 %rd700, %rd699, 4294967295;
xor.b64 %rd701, %rd700, %rd694;
xor.b64 %rd702, %rd701, %rd2479;
mul.lo.s64 %rd703, %rd702, %rd2468;
shr.u64 %rd704, %rd703, 32;
shr.u64 %rd705, %rd699, 32;
and.b64 %rd706, %rd673, 4294967295;
xor.b64 %rd707, %rd706, %rd705;
xor.b64 %rd708, %rd707, %rd2480;
mul.lo.s64 %rd709, %rd708, %rd2468;
and.b64 %rd710, %rd709, 4294967295;
xor.b64 %rd711, %rd710, %rd704;
xor.b64 %rd712, %rd711, %rd2481;
mul.lo.s64 %rd713, %rd712, %rd2466;
shr.u64 %rd714, %rd713, 32;
shr.u64 %rd715, %rd709, 32;
xor.b64 %rd716, %rd683, %rd715;
xor.b64 %rd717, %rd716, %rd2482;
mul.lo.s64 %rd718, %rd717, %rd2466;
xor.b64 %rd719, %rd714, %rd718;
cvt.u32.u64 %r69, %rd719;
xor.b32 %r70, %r313, %r69;
mul.lo.s32 %r71, %r70, %r314;
// Keep the upper 23 bits and scale by 0f34000000 (2^-23): %f41 is in [0, 1).
shr.u32 %r72, %r71, 9;
cvt.rn.f32.u32 %f40, %r72;
mul.rn.f32 %f41, %f40, 0f34000000;
// %p9 = f16(%f41) >= 0x2E66 (about 0.1 as f16).
cvt.rn.f16.f32 %h10, %f41;
mov.b16 %h11, 0x2E66;
setp.ge.f16 %p9, %h10, %h11;
// %h17 = ([%rd45+2] + f16([%rd46+4])) * 0x3C72 (about 1.111 as f16) when %p9
// holds, otherwise 0.
// NOTE(review): looks like dropout (rate 0.1, 1/0.9 rescale) -- confirm.
ld.global.nc.b16 %h12, [%rd45+2];
ld.global.nc.f32 %f42, [%rd46+4];
cvt.rn.f16.f32 %h13, %f42;
add.rn.f16 %h14, %h12, %h13;
mov.b16 %h15, 0x3C72;
mul.rn.f16 %h16, %h14, %h15;
selp.b16 %h17, %h16, 0x0000, %p9;
cvt.f32.f16 %f43, %h17;
// With s = %f1 * [%rd48+4] (%f1, %f2, %f3 are set earlier in the kernel):
// %f52 = s * f32([%rd47+2]) + ([%rd49+4] - %f2 * s) + %f43.
ld.global.nc.b16 %h18, [%rd47+2];
cvt.f32.f16 %f44, %h18;
ld.global.nc.f32 %f45, [%rd48+4];
mul.rn.f32 %f46, %f1, %f45;
mul.rn.f32 %f47, %f46, %f44;
ld.global.nc.f32 %f48, [%rd49+4];
mul.rn.f32 %f49, %f2, %f46;
sub.rn.f32 %f50, %f48, %f49;
add.rn.f32 %f51, %f47, %f50;
add.rn.f32 %f52, %f51, %f43;
// Running sum of squared deviations from %f3: %f5 = %f4 + (%f52 - %f3)^2.
sub.rn.f32 %f53, %f52, %f3;
mul.rn.f32 %f54, %f53, %f53;
add.rn.f32 %f5, %f4, %f54;
// Next element: %rd76 = %rd12 + ((%r3 | %r4 | 128) >> 2). %rd2436 is its low
// 32 bits; %p74 is true when the 64-bit add wrapped around.
or.b32 %r73, %r3, %r4;
or.b32 %r74, %r73, 128;
shr.u32 %r75, %r74, 2;
cvt.u64.u32 %rd720, %r75;
add.s64 %rd76, %rd12, %rd720;
and.b64 %rd2436, %rd76, 4294967295;
setp.lt.u64 %p74, %rd76, %rd12;
@%p8 bra LBB30_10;
// Fall-through arm of `@%p8 bra LBB30_10` (runs when %p8 is false).
// Seeds the inputs of LBB30_11: the products %rd2487, %rd2490, %rd2493, the
// mixed word %rd2483, and the per-step constants %rd2484..%rd2501 and
// %r315..%r317.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9,
// 3144134277 = 0xBB67AE85 (the Philox4x32 multipliers and key increments).
// NOTE(review): presumably an inlined Philox RNG expansion -- confirm.
mul.lo.s64 %rd2487, %rd2436, 3528531795;
// %p74 flags wrap-around of the add that produced %rd76, so this adds that
// carry to %rd2464.
selp.u64 %rd763, 1, 0, %p74;
add.s64 %rd764, %rd2464, %rd763;
xor.b64 %rd765, %rd764, %rd2487;
shr.u64 %rd766, %rd765, 32;
mul.lo.s64 %rd2490, %rd766, 3449720151;
shr.u64 %rd767, %rd2490, 32;
and.b64 %rd768, %rd764, 4294967295;
mul.lo.s64 %rd769, %rd768, 3449720151;
and.b64 %rd770, %rd769, 4294967295;
xor.b64 %rd771, %rd770, %rd767;
xor.b64 %rd772, %rd771, 2654435769;
mul.lo.s64 %rd2493, %rd772, 3528531795;
xor.b64 %rd2483, %rd769, %rd76;
// Constants for LBB30_11; LBB30_10 loads a different set into the same registers.
mov.u32 %r317, -1879881855;
mov.u32 %r316, -845247145;
mov.u32 %r315, 534103459;
mov.u64 %rd2501, 3678237736;
mov.u64 %rd2500, 3041712726;
mov.u64 %rd2499, 1401181199;
mov.u64 %rd2498, 2835769497;
mov.u64 %rd2497, 1684936478;
mov.u64 %rd2496, 2027808484;
mov.u64 %rd2495, 387276957;
mov.u64 %rd2494, 842468239;
mov.u64 %rd2492, 3986602516;
mov.u64 %rd2491, 1013904242;
mov.u64 %rd2489, 3668340011;
mov.u64 %rd2488, 3144134277;
mov.u64 %rd2486, 3449720151;
mov.u64 %rd2485, 1993301258;
mov.u64 %rd2484, 3528531795;
bra.uni LBB30_11;
LBB30_10:
// Taken arm of `@%p8 bra LBB30_10` (runs when %p8 is true, i.e. %r5 != 0).
// Produces the same registers as the fall-through arm (%rd2483..%rd2501,
// %r315..%r317) with the multipliers 3528531795 / 3449720151 exchanged and a
// different set of XOR constants. Falls through into LBB30_11.
// %p74 flags wrap-around of the add that produced %rd76, so this adds that
// carry to %rd2464.
selp.u64 %rd736, 1, 0, %p74;
add.s64 %rd737, %rd2464, %rd736;
and.b64 %rd738, %rd737, 4294967295;
mul.lo.s64 %rd2487, %rd738, 3449720151;
xor.b64 %rd739, %rd2487, %rd76;
shr.u64 %rd740, %rd739, 32;
mul.lo.s64 %rd2490, %rd740, 3528531795;
shr.u64 %rd741, %rd2490, 32;
mul.lo.s64 %rd743, %rd2436, 3528531795;
and.b64 %rd744, %rd743, 4294967295;
xor.b64 %rd745, %rd744, %rd741;
xor.b64 %rd746, %rd745, 3144134277;
mul.lo.s64 %rd2493, %rd746, 3449720151;
xor.b64 %rd2483, %rd737, %rd743;
// Constants for LBB30_11 (this arm's variant).
mov.u32 %r317, -1767562579;
mov.u32 %r316, -766435501;
mov.u32 %r315, 1401181199;
mov.u64 %rd2501, 4055616968;
mov.u64 %rd2500, 1684936478;
mov.u64 %rd2499, 534103459;
mov.u64 %rd2498, 387276957;
mov.u64 %rd2497, 3041712726;
mov.u64 %rd2496, 3986602516;
mov.u64 %rd2495, 2835769497;
mov.u64 %rd2494, 3668340011;
mov.u64 %rd2492, 2027808484;
mov.u64 %rd2491, 1993301258;
mov.u64 %rd2489, 842468239;
mov.u64 %rd2488, 2654435769;
mov.u64 %rd2486, 3528531795;
mov.u64 %rd2485, 1013904242;
mov.u64 %rd2484, 3449720151;
LBB30_11:
// Join of the two arms selected by %p8. Runs a fixed chain of 64-bit
// multiply / shift / mask / XOR steps over the values seeded by either arm;
// every multiply uses %rd2484 or %rd2486, the multipliers the arm chose.
// The result is reduced to one 32-bit word (%r87).
shr.u64 %rd773, %rd2493, 32;
shr.u64 %rd774, %rd2483, 32;
mul.lo.s64 %rd775, %rd774, %rd2484;
and.b64 %rd776, %rd775, 4294967295;
xor.b64 %rd777, %rd776, %rd773;
xor.b64 %rd778, %rd777, %rd2485;
mul.lo.s64 %rd779, %rd778, %rd2486;
shr.u64 %rd780, %rd779, 32;
shr.u64 %rd781, %rd775, 32;
and.b64 %rd782, %rd2487, 4294967295;
xor.b64 %rd783, %rd782, %rd781;
xor.b64 %rd784, %rd783, %rd2488;
mul.lo.s64 %rd785, %rd784, %rd2486;
and.b64 %rd786, %rd785, 4294967295;
xor.b64 %rd787, %rd786, %rd780;
xor.b64 %rd788, %rd787, %rd2489;
mul.lo.s64 %rd789, %rd788, %rd2484;
shr.u64 %rd790, %rd789, 32;
shr.u64 %rd791, %rd785, 32;
and.b64 %rd792, %rd2490, 4294967295;
xor.b64 %rd793, %rd792, %rd791;
xor.b64 %rd794, %rd793, %rd2491;
mul.lo.s64 %rd795, %rd794, %rd2484;
and.b64 %rd796, %rd795, 4294967295;
xor.b64 %rd797, %rd796, %rd790;
xor.b64 %rd798, %rd797, %rd2492;
mul.lo.s64 %rd799, %rd798, %rd2486;
shr.u64 %rd800, %rd799, 32;
shr.u64 %rd801, %rd795, 32;
and.b64 %rd802, %rd2493, 4294967295;
xor.b64 %rd803, %rd802, %rd801;
xor.b64 %rd804, %rd803, %rd2494;
mul.lo.s64 %rd805, %rd804, %rd2486;
and.b64 %rd806, %rd805, 4294967295;
xor.b64 %rd807, %rd806, %rd800;
xor.b64 %rd808, %rd807, %rd2495;
mul.lo.s64 %rd809, %rd808, %rd2484;
shr.u64 %rd810, %rd809, 32;
shr.u64 %rd811, %rd805, 32;
and.b64 %rd812, %rd779, 4294967295;
xor.b64 %rd813, %rd812, %rd811;
xor.b64 %rd814, %rd813, %rd2496;
mul.lo.s64 %rd815, %rd814, %rd2484;
and.b64 %rd816, %rd815, 4294967295;
xor.b64 %rd817, %rd816, %rd810;
xor.b64 %rd818, %rd817, %rd2497;
mul.lo.s64 %rd819, %rd818, %rd2486;
shr.u64 %rd820, %rd819, 32;
shr.u64 %rd821, %rd815, 32;
and.b64 %rd822, %rd789, 4294967295;
xor.b64 %rd823, %rd822, %rd821;
xor.b64 %rd824, %rd823, %rd2498;
mul.lo.s64 %rd825, %rd824, %rd2486;
and.b64 %rd826, %rd825, 4294967295;
xor.b64 %rd827, %rd826, %rd820;
xor.b64 %rd828, %rd827, %rd2499;
mul.lo.s64 %rd829, %rd828, %rd2484;
shr.u64 %rd830, %rd829, 32;
shr.u64 %rd831, %rd825, 32;
and.b64 %rd832, %rd799, 4294967295;
xor.b64 %rd833, %rd832, %rd831;
xor.b64 %rd834, %rd833, %rd2500;
mul.lo.s64 %rd835, %rd834, %rd2484;
and.b64 %rd836, %rd835, 4294967295;
xor.b64 %rd837, %rd836, %rd830;
xor.b64 %rd838, %rd837, %rd2501;
mul.lo.s64 %rd839, %rd838, %rd2486;
shr.u64 %rd840, %rd839, 32;
cvt.u32.u64 %r82, %rd840;
shr.u64 %rd841, %rd835, 32;
xor.b64 %rd842, %rd841, %rd809;
cvt.u32.u64 %r83, %rd842;
xor.b32 %r84, %r315, %r83;
mul.lo.s32 %r85, %r84, %r316;
xor.b32 %r86, %r85, %r82;
xor.b32 %r87, %r86, %r317;
// Keep the upper 23 bits and scale by 0f34000000 (2^-23): %f56 is in [0, 1).
shr.u32 %r88, %r87, 9;
cvt.rn.f32.u32 %f55, %r88;
mul.rn.f32 %f56, %f55, 0f34000000;
// %p12 = f16(%f56) >= 0x2E66 (about 0.1 as f16).
cvt.rn.f16.f32 %h19, %f56;
mov.b16 %h20, 0x2E66;
setp.ge.f16 %p12, %h19, %h20;
// %h26 = ([%rd45+256] + f16([%rd46+512])) * 0x3C72 (about 1.111 as f16) when
// %p12 holds, otherwise 0.
// NOTE(review): looks like dropout (rate 0.1, 1/0.9 rescale) -- confirm.
ld.global.nc.b16 %h21, [%rd45+256];
ld.global.nc.f32 %f57, [%rd46+512];
cvt.rn.f16.f32 %h22, %f57;
add.rn.f16 %h23, %h21, %h22;
mov.b16 %h24, 0x3C72;
mul.rn.f16 %h25, %h23, %h24;
selp.b16 %h26, %h25, 0x0000, %p12;
cvt.f32.f16 %f58, %h26;
// With s = %f1 * [%rd48+512] (%f1, %f2, %f3 are set earlier in the kernel):
// %f67 = s * f32([%rd47+256]) + ([%rd49+512] - %f2 * s) + %f58.
ld.global.nc.b16 %h27, [%rd47+256];
cvt.f32.f16 %f59, %h27;
ld.global.nc.f32 %f60, [%rd48+512];
mul.rn.f32 %f61, %f1, %f60;
mul.rn.f32 %f62, %f61, %f59;
ld.global.nc.f32 %f63, [%rd49+512];
mul.rn.f32 %f64, %f2, %f61;
sub.rn.f32 %f65, %f63, %f64;
add.rn.f32 %f66, %f62, %f65;
add.rn.f32 %f67, %f66, %f58;
// Running sum of squared deviations from %f3: %f6 = %f5 + (%f67 - %f3)^2.
sub.rn.f32 %f68, %f67, %f3;
mul.rn.f32 %f69, %f68, %f68;
add.rn.f32 %f6, %f5, %f69;
// Next element: %rd104 = %rd12 + ((%r3 | 129 | %r4) >> 2). %rd2433 is its low
// 32 bits; %p73 is true when the 64-bit add wrapped around.
// %p13 = ((%r3 | 129) & 3) != 1 selects the arm.
or.b32 %r89, %r3, 129;
or.b32 %r90, %r89, %r4;
and.b32 %r91, %r89, 3;
shr.u32 %r92, %r90, 2;
setp.ne.s32 %p13, %r91, 1;
cvt.u64.u32 %rd843, %r92;
add.s64 %rd104, %rd12, %rd843;
and.b64 %rd2433, %rd104, 4294967295;
setp.lt.u64 %p73, %rd104, %rd12;
@%p13 bra LBB30_13;
// Fall-through arm of `@%p13 bra LBB30_13` (runs when %p13 is false).
// Seeds the inputs of LBB30_14: the products %rd2506, %rd2509, %rd2512, the
// mixed word %rd2502, and the per-step constants %rd2503..%rd2519, %r318 and
// %r319.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9,
// 3144134277 = 0xBB67AE85 (the Philox4x32 multipliers and key increments).
// NOTE(review): presumably an inlined Philox RNG expansion -- confirm.
mul.lo.s64 %rd2506, %rd2433, 3528531795;
// %p73 flags wrap-around of the add that produced %rd104, so this adds that
// carry to %rd2464.
selp.u64 %rd884, 1, 0, %p73;
add.s64 %rd885, %rd2464, %rd884;
xor.b64 %rd886, %rd885, %rd2506;
shr.u64 %rd887, %rd886, 32;
mul.lo.s64 %rd2509, %rd887, 3449720151;
shr.u64 %rd888, %rd2509, 32;
and.b64 %rd889, %rd885, 4294967295;
mul.lo.s64 %rd890, %rd889, 3449720151;
and.b64 %rd891, %rd890, 4294967295;
xor.b64 %rd892, %rd891, %rd888;
xor.b64 %rd893, %rd892, 2654435769;
mul.lo.s64 %rd2512, %rd893, 3528531795;
xor.b64 %rd2502, %rd890, %rd104;
// Constants for LBB30_14; LBB30_13 loads a different set into the same registers.
mov.u32 %r319, -845247145;
mov.u32 %r318, -616729560;
mov.u64 %rd2519, 3041712726;
mov.u64 %rd2518, 1401181199;
mov.u64 %rd2517, 2835769497;
mov.u64 %rd2516, 1684936478;
mov.u64 %rd2515, 2027808484;
mov.u64 %rd2514, 387276957;
mov.u64 %rd2513, 842468239;
mov.u64 %rd2511, 3986602516;
mov.u64 %rd2510, 1013904242;
mov.u64 %rd2508, 3668340011;
mov.u64 %rd2507, 3144134277;
mov.u64 %rd2505, 3449720151;
mov.u64 %rd2504, 1993301258;
mov.u64 %rd2503, 3528531795;
bra.uni LBB30_14;
LBB30_13:
// Taken arm of `@%p13 bra LBB30_13` (runs when %p13 is true).
// Produces the same registers as the fall-through arm (%rd2502..%rd2519,
// %r318, %r319) with the multipliers 3528531795 / 3449720151 exchanged and a
// different set of XOR constants. Falls through into LBB30_14.
// %p73 flags wrap-around of the add that produced %rd104, so this adds that
// carry to %rd2464.
selp.u64 %rd858, 1, 0, %p73;
add.s64 %rd859, %rd2464, %rd858;
and.b64 %rd860, %rd859, 4294967295;
mul.lo.s64 %rd2506, %rd860, 3449720151;
xor.b64 %rd861, %rd2506, %rd104;
shr.u64 %rd862, %rd861, 32;
mul.lo.s64 %rd2509, %rd862, 3528531795;
shr.u64 %rd863, %rd2509, 32;
mul.lo.s64 %rd865, %rd2433, 3528531795;
and.b64 %rd866, %rd865, 4294967295;
xor.b64 %rd867, %rd866, %rd863;
xor.b64 %rd868, %rd867, 3144134277;
mul.lo.s64 %rd2512, %rd868, 3449720151;
xor.b64 %rd2502, %rd859, %rd865;
// Constants for LBB30_14 (this arm's variant).
mov.u32 %r319, -766435501;
mov.u32 %r318, -239350328;
mov.u64 %rd2519, 1684936478;
mov.u64 %rd2518, 534103459;
mov.u64 %rd2517, 387276957;
mov.u64 %rd2516, 3041712726;
mov.u64 %rd2515, 3986602516;
mov.u64 %rd2514, 2835769497;
mov.u64 %rd2513, 3668340011;
mov.u64 %rd2511, 2027808484;
mov.u64 %rd2510, 1993301258;
mov.u64 %rd2508, 842468239;
mov.u64 %rd2507, 2654435769;
mov.u64 %rd2505, 3528531795;
mov.u64 %rd2504, 1013904242;
mov.u64 %rd2503, 3449720151;
LBB30_14:
// Join of the two arms selected by %p13. Runs a fixed chain of 64-bit
// multiply / shift / mask / XOR steps over the values seeded by either arm;
// every multiply uses %rd2503 or %rd2505, the multipliers the arm chose.
// The result is reduced to one 32-bit word (%r99).
shr.u64 %rd894, %rd2512, 32;
shr.u64 %rd895, %rd2502, 32;
mul.lo.s64 %rd896, %rd895, %rd2503;
and.b64 %rd897, %rd896, 4294967295;
xor.b64 %rd898, %rd897, %rd894;
xor.b64 %rd899, %rd898, %rd2504;
mul.lo.s64 %rd900, %rd899, %rd2505;
shr.u64 %rd901, %rd900, 32;
shr.u64 %rd902, %rd896, 32;
and.b64 %rd903, %rd2506, 4294967295;
xor.b64 %rd904, %rd903, %rd902;
xor.b64 %rd905, %rd904, %rd2507;
mul.lo.s64 %rd906, %rd905, %rd2505;
and.b64 %rd907, %rd906, 4294967295;
xor.b64 %rd908, %rd907, %rd901;
xor.b64 %rd909, %rd908, %rd2508;
mul.lo.s64 %rd910, %rd909, %rd2503;
shr.u64 %rd911, %rd910, 32;
shr.u64 %rd912, %rd906, 32;
and.b64 %rd913, %rd2509, 4294967295;
xor.b64 %rd914, %rd913, %rd912;
xor.b64 %rd915, %rd914, %rd2510;
mul.lo.s64 %rd916, %rd915, %rd2503;
and.b64 %rd917, %rd916, 4294967295;
xor.b64 %rd918, %rd917, %rd911;
xor.b64 %rd919, %rd918, %rd2511;
mul.lo.s64 %rd920, %rd919, %rd2505;
shr.u64 %rd921, %rd920, 32;
shr.u64 %rd922, %rd916, 32;
and.b64 %rd923, %rd2512, 4294967295;
xor.b64 %rd924, %rd923, %rd922;
xor.b64 %rd925, %rd924, %rd2513;
mul.lo.s64 %rd926, %rd925, %rd2505;
and.b64 %rd927, %rd926, 4294967295;
xor.b64 %rd928, %rd927, %rd921;
xor.b64 %rd929, %rd928, %rd2514;
mul.lo.s64 %rd930, %rd929, %rd2503;
shr.u64 %rd931, %rd930, 32;
shr.u64 %rd932, %rd926, 32;
and.b64 %rd933, %rd900, 4294967295;
xor.b64 %rd934, %rd933, %rd932;
xor.b64 %rd935, %rd934, %rd2515;
mul.lo.s64 %rd936, %rd935, %rd2503;
and.b64 %rd937, %rd936, 4294967295;
xor.b64 %rd938, %rd937, %rd931;
xor.b64 %rd939, %rd938, %rd2516;
mul.lo.s64 %rd940, %rd939, %rd2505;
shr.u64 %rd941, %rd940, 32;
shr.u64 %rd942, %rd936, 32;
and.b64 %rd943, %rd910, 4294967295;
xor.b64 %rd944, %rd943, %rd942;
xor.b64 %rd945, %rd944, %rd2517;
mul.lo.s64 %rd946, %rd945, %rd2505;
and.b64 %rd947, %rd946, 4294967295;
xor.b64 %rd948, %rd947, %rd941;
xor.b64 %rd949, %rd948, %rd2518;
mul.lo.s64 %rd950, %rd949, %rd2503;
shr.u64 %rd951, %rd950, 32;
shr.u64 %rd952, %rd946, 32;
xor.b64 %rd953, %rd920, %rd952;
xor.b64 %rd954, %rd953, %rd2519;
mul.lo.s64 %rd955, %rd954, %rd2503;
xor.b64 %rd956, %rd951, %rd955;
cvt.u32.u64 %r97, %rd956;
xor.b32 %r98, %r318, %r97;
mul.lo.s32 %r99, %r98, %r319;
// Keep the upper 23 bits and scale by 0f34000000 (2^-23): %f71 is in [0, 1).
shr.u32 %r100, %r99, 9;
cvt.rn.f32.u32 %f70, %r100;
mul.rn.f32 %f71, %f70, 0f34000000;
// %p17 = f16(%f71) >= 0x2E66 (about 0.1 as f16).
cvt.rn.f16.f32 %h28, %f71;
mov.b16 %h29, 0x2E66;
setp.ge.f16 %p17, %h28, %h29;
// %h35 = ([%rd45+258] + f16([%rd46+516])) * 0x3C72 (about 1.111 as f16) when
// %p17 holds, otherwise 0.
// NOTE(review): looks like dropout (rate 0.1, 1/0.9 rescale) -- confirm.
ld.global.nc.b16 %h30, [%rd45+258];
ld.global.nc.f32 %f72, [%rd46+516];
cvt.rn.f16.f32 %h31, %f72;
add.rn.f16 %h32, %h30, %h31;
mov.b16 %h33, 0x3C72;
mul.rn.f16 %h34, %h32, %h33;
selp.b16 %h35, %h34, 0x0000, %p17;
cvt.f32.f16 %f73, %h35;
// With s = %f1 * [%rd48+516] (%f1, %f2, %f3 are set earlier in the kernel):
// %f82 = s * f32([%rd47+258]) + ([%rd49+516] - %f2 * s) + %f73.
ld.global.nc.b16 %h36, [%rd47+258];
cvt.f32.f16 %f74, %h36;
ld.global.nc.f32 %f75, [%rd48+516];
mul.rn.f32 %f76, %f1, %f75;
mul.rn.f32 %f77, %f76, %f74;
ld.global.nc.f32 %f78, [%rd49+516];
mul.rn.f32 %f79, %f2, %f76;
sub.rn.f32 %f80, %f78, %f79;
add.rn.f32 %f81, %f77, %f80;
add.rn.f32 %f82, %f81, %f73;
// Running sum of squared deviations from %f3: %f7 = %f6 + (%f82 - %f3)^2.
sub.rn.f32 %f83, %f82, %f3;
mul.rn.f32 %f84, %f83, %f83;
add.rn.f32 %f7, %f6, %f84;
// Next element: %rd131 = %rd12 + ((%r73 | 256) >> 2), with %r73 set earlier
// in the kernel. %rd2429 is its low 32 bits; %p72 is true when the 64-bit add
// wrapped around.
or.b32 %r102, %r73, 256;
shr.u32 %r103, %r102, 2;
cvt.u64.u32 %rd957, %r103;
add.s64 %rd131, %rd12, %rd957;
and.b64 %rd2429, %rd131, 4294967295;
setp.lt.u64 %p72, %rd131, %rd12;
@%p8 bra LBB30_16;
// Fall-through arm of `@%p8 bra LBB30_16` (runs when %p8 is false).
// Seeds the inputs of LBB30_17: the products %rd2524, %rd2527, %rd2530, the
// mixed word %rd2520, and the per-step constants %rd2521..%rd2538 and
// %r320..%r322.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9,
// 3144134277 = 0xBB67AE85 (the Philox4x32 multipliers and key increments).
// NOTE(review): presumably an inlined Philox RNG expansion -- confirm.
mul.lo.s64 %rd2524, %rd2429, 3528531795;
// %p72 flags wrap-around of the add that produced %rd131, so this adds that
// carry to %rd2464.
selp.u64 %rd1000, 1, 0, %p72;
add.s64 %rd1001, %rd2464, %rd1000;
xor.b64 %rd1002, %rd1001, %rd2524;
shr.u64 %rd1003, %rd1002, 32;
mul.lo.s64 %rd2527, %rd1003, 3449720151;
shr.u64 %rd1004, %rd2527, 32;
and.b64 %rd1005, %rd1001, 4294967295;
mul.lo.s64 %rd1006, %rd1005, 3449720151;
and.b64 %rd1007, %rd1006, 4294967295;
xor.b64 %rd1008, %rd1007, %rd1004;
xor.b64 %rd1009, %rd1008, 2654435769;
mul.lo.s64 %rd2530, %rd1009, 3528531795;
xor.b64 %rd2520, %rd1006, %rd131;
// Constants for LBB30_17; LBB30_16 loads a different set into the same registers.
mov.u32 %r322, -1879881855;
mov.u32 %r321, -845247145;
mov.u32 %r320, 534103459;
mov.u64 %rd2538, 3678237736;
mov.u64 %rd2537, 3041712726;
mov.u64 %rd2536, 1401181199;
mov.u64 %rd2535, 2835769497;
mov.u64 %rd2534, 1684936478;
mov.u64 %rd2533, 2027808484;
mov.u64 %rd2532, 387276957;
mov.u64 %rd2531, 842468239;
mov.u64 %rd2529, 3986602516;
mov.u64 %rd2528, 1013904242;
mov.u64 %rd2526, 3668340011;
mov.u64 %rd2525, 3144134277;
mov.u64 %rd2523, 3449720151;
mov.u64 %rd2522, 1993301258;
mov.u64 %rd2521, 3528531795;
bra.uni LBB30_17;
LBB30_16:
// Taken arm of `@%p8 bra LBB30_16` (runs when %p8 is true, i.e. %r5 != 0).
// Produces the same registers as the fall-through arm (%rd2520..%rd2538,
// %r320..%r322) with the multipliers 3528531795 / 3449720151 exchanged and a
// different set of XOR constants. Falls through into LBB30_17.
// %p72 flags wrap-around of the add that produced %rd131, so this adds that
// carry to %rd2464.
selp.u64 %rd973, 1, 0, %p72;
add.s64 %rd974, %rd2464, %rd973;
and.b64 %rd975, %rd974, 4294967295;
mul.lo.s64 %rd2524, %rd975, 3449720151;
xor.b64 %rd976, %rd2524, %rd131;
shr.u64 %rd977, %rd976, 32;
mul.lo.s64 %rd2527, %rd977, 3528531795;
shr.u64 %rd978, %rd2527, 32;
mul.lo.s64 %rd980, %rd2429, 3528531795;
and.b64 %rd981, %rd980, 4294967295;
xor.b64 %rd982, %rd981, %rd978;
xor.b64 %rd983, %rd982, 3144134277;
mul.lo.s64 %rd2530, %rd983, 3449720151;
xor.b64 %rd2520, %rd974, %rd980;
// Constants for LBB30_17 (this arm's variant).
mov.u32 %r322, -1767562579;
mov.u32 %r321, -766435501;
mov.u32 %r320, 1401181199;
mov.u64 %rd2538, 4055616968;
mov.u64 %rd2537, 1684936478;
mov.u64 %rd2536, 534103459;
mov.u64 %rd2535, 387276957;
mov.u64 %rd2534, 3041712726;
mov.u64 %rd2533, 3986602516;
mov.u64 %rd2532, 2835769497;
mov.u64 %rd2531, 3668340011;
mov.u64 %rd2529, 2027808484;
mov.u64 %rd2528, 1993301258;
mov.u64 %rd2526, 842468239;
mov.u64 %rd2525, 2654435769;
mov.u64 %rd2523, 3528531795;
mov.u64 %rd2522, 1013904242;
mov.u64 %rd2521, 3449720151;
LBB30_17:
// Join of the two arms selected by %p8. Runs a fixed chain of 64-bit
// multiply / shift / mask / XOR steps over the values seeded by either arm;
// every multiply uses %rd2521 or %rd2523, the multipliers the arm chose.
// The result is reduced to one 32-bit word (%r115).
shr.u64 %rd1010, %rd2530, 32;
shr.u64 %rd1011, %rd2520, 32;
mul.lo.s64 %rd1012, %rd1011, %rd2521;
and.b64 %rd1013, %rd1012, 4294967295;
xor.b64 %rd1014, %rd1013, %rd1010;
xor.b64 %rd1015, %rd1014, %rd2522;
mul.lo.s64 %rd1016, %rd1015, %rd2523;
shr.u64 %rd1017, %rd1016, 32;
shr.u64 %rd1018, %rd1012, 32;
and.b64 %rd1019, %rd2524, 4294967295;
xor.b64 %rd1020, %rd1019, %rd1018;
xor.b64 %rd1021, %rd1020, %rd2525;
mul.lo.s64 %rd1022, %rd1021, %rd2523;
and.b64 %rd1023, %rd1022, 4294967295;
xor.b64 %rd1024, %rd1023, %rd1017;
xor.b64 %rd1025, %rd1024, %rd2526;
mul.lo.s64 %rd1026, %rd1025, %rd2521;
shr.u64 %rd1027, %rd1026, 32;
shr.u64 %rd1028, %rd1022, 32;
and.b64 %rd1029, %rd2527, 4294967295;
xor.b64 %rd1030, %rd1029, %rd1028;
xor.b64 %rd1031, %rd1030, %rd2528;
mul.lo.s64 %rd1032, %rd1031, %rd2521;
and.b64 %rd1033, %rd1032, 4294967295;
xor.b64 %rd1034, %rd1033, %rd1027;
xor.b64 %rd1035, %rd1034, %rd2529;
mul.lo.s64 %rd1036, %rd1035, %rd2523;
shr.u64 %rd1037, %rd1036, 32;
shr.u64 %rd1038, %rd1032, 32;
and.b64 %rd1039, %rd2530, 4294967295;
xor.b64 %rd1040, %rd1039, %rd1038;
xor.b64 %rd1041, %rd1040, %rd2531;
mul.lo.s64 %rd1042, %rd1041, %rd2523;
and.b64 %rd1043, %rd1042, 4294967295;
xor.b64 %rd1044, %rd1043, %rd1037;
xor.b64 %rd1045, %rd1044, %rd2532;
mul.lo.s64 %rd1046, %rd1045, %rd2521;
shr.u64 %rd1047, %rd1046, 32;
shr.u64 %rd1048, %rd1042, 32;
and.b64 %rd1049, %rd1016, 4294967295;
xor.b64 %rd1050, %rd1049, %rd1048;
xor.b64 %rd1051, %rd1050, %rd2533;
mul.lo.s64 %rd1052, %rd1051, %rd2521;
and.b64 %rd1053, %rd1052, 4294967295;
xor.b64 %rd1054, %rd1053, %rd1047;
xor.b64 %rd1055, %rd1054, %rd2534;
mul.lo.s64 %rd1056, %rd1055, %rd2523;
shr.u64 %rd1057, %rd1056, 32;
shr.u64 %rd1058, %rd1052, 32;
and.b64 %rd1059, %rd1026, 4294967295;
xor.b64 %rd1060, %rd1059, %rd1058;
xor.b64 %rd1061, %rd1060, %rd2535;
mul.lo.s64 %rd1062, %rd1061, %rd2523;
and.b64 %rd1063, %rd1062, 4294967295;
xor.b64 %rd1064, %rd1063, %rd1057;
xor.b64 %rd1065, %rd1064, %rd2536;
mul.lo.s64 %rd1066, %rd1065, %rd2521;
shr.u64 %rd1067, %rd1066, 32;
shr.u64 %rd1068, %rd1062, 32;
and.b64 %rd1069, %rd1036, 4294967295;
xor.b64 %rd1070, %rd1069, %rd1068;
xor.b64 %rd1071, %rd1070, %rd2537;
mul.lo.s64 %rd1072, %rd1071, %rd2521;
and.b64 %rd1073, %rd1072, 4294967295;
xor.b64 %rd1074, %rd1073, %rd1067;
xor.b64 %rd1075, %rd1074, %rd2538;
mul.lo.s64 %rd1076, %rd1075, %rd2523;
shr.u64 %rd1077, %rd1076, 32;
cvt.u32.u64 %r110, %rd1077;
shr.u64 %rd1078, %rd1072, 32;
xor.b64 %rd1079, %rd1078, %rd1046;
cvt.u32.u64 %r111, %rd1079;
xor.b32 %r112, %r320, %r111;
mul.lo.s32 %r113, %r112, %r321;
xor.b32 %r114, %r113, %r110;
xor.b32 %r115, %r114, %r322;
// Keep the upper 23 bits and scale by 0f34000000 (2^-23): %f86 is in [0, 1).
shr.u32 %r116, %r115, 9;
cvt.rn.f32.u32 %f85, %r116;
mul.rn.f32 %f86, %f85, 0f34000000;
// %p20 = f16(%f86) >= 0x2E66 (about 0.1 as f16).
cvt.rn.f16.f32 %h37, %f86;
mov.b16 %h38, 0x2E66;
setp.ge.f16 %p20, %h37, %h38;
// %h44 = ([%rd45+512] + f16([%rd46+1024])) * 0x3C72 (about 1.111 as f16) when
// %p20 holds, otherwise 0.
// NOTE(review): looks like dropout (rate 0.1, 1/0.9 rescale) -- confirm.
ld.global.nc.b16 %h39, [%rd45+512];
ld.global.nc.f32 %f87, [%rd46+1024];
cvt.rn.f16.f32 %h40, %f87;
add.rn.f16 %h41, %h39, %h40;
mov.b16 %h42, 0x3C72;
mul.rn.f16 %h43, %h41, %h42;
selp.b16 %h44, %h43, 0x0000, %p20;
cvt.f32.f16 %f88, %h44;
// With s = %f1 * [%rd48+1024] (%f1, %f2, %f3 are set earlier in the kernel):
// %f97 = s * f32([%rd47+512]) + ([%rd49+1024] - %f2 * s) + %f88.
ld.global.nc.b16 %h45, [%rd47+512];
cvt.f32.f16 %f89, %h45;
ld.global.nc.f32 %f90, [%rd48+1024];
mul.rn.f32 %f91, %f1, %f90;
mul.rn.f32 %f92, %f91, %f89;
ld.global.nc.f32 %f93, [%rd49+1024];
mul.rn.f32 %f94, %f2, %f91;
sub.rn.f32 %f95, %f93, %f94;
add.rn.f32 %f96, %f92, %f95;
add.rn.f32 %f97, %f96, %f88;
// Running sum of squared deviations from %f3: %f8 = %f7 + (%f97 - %f3)^2.
sub.rn.f32 %f98, %f97, %f3;
mul.rn.f32 %f99, %f98, %f98;
add.rn.f32 %f8, %f7, %f99;
// Next element: %rd159 = %rd12 + ((%r3 | 257 | %r4) >> 2). %rd2426 is its low
// 32 bits; %p71 is true when the 64-bit add wrapped around.
// %p21 = ((%r3 | 257) & 3) != 1 selects the arm.
or.b32 %r117, %r3, 257;
or.b32 %r118, %r117, %r4;
and.b32 %r119, %r117, 3;
shr.u32 %r120, %r118, 2;
setp.ne.s32 %p21, %r119, 1;
cvt.u64.u32 %rd1080, %r120;
add.s64 %rd159, %rd12, %rd1080;
and.b64 %rd2426, %rd159, 4294967295;
setp.lt.u64 %p71, %rd159, %rd12;
@%p21 bra LBB30_19;
// Fall-through arm of `@%p21 bra LBB30_19` (runs when %p21 is false).
// Seeds the inputs of LBB30_20: the products %rd2543, %rd2546, %rd2549, the
// mixed word %rd2539, and the per-step constants %rd2540..%rd2556, %r323 and
// %r324.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9,
// 3144134277 = 0xBB67AE85 (the Philox4x32 multipliers and key increments).
// NOTE(review): presumably an inlined Philox RNG expansion -- confirm.
mul.lo.s64 %rd2543, %rd2426, 3528531795;
// %p71 flags wrap-around of the add that produced %rd159, so this adds that
// carry to %rd2464.
selp.u64 %rd1121, 1, 0, %p71;
add.s64 %rd1122, %rd2464, %rd1121;
xor.b64 %rd1123, %rd1122, %rd2543;
shr.u64 %rd1124, %rd1123, 32;
mul.lo.s64 %rd2546, %rd1124, 3449720151;
shr.u64 %rd1125, %rd2546, 32;
and.b64 %rd1126, %rd1122, 4294967295;
mul.lo.s64 %rd1127, %rd1126, 3449720151;
and.b64 %rd1128, %rd1127, 4294967295;
xor.b64 %rd1129, %rd1128, %rd1125;
xor.b64 %rd1130, %rd1129, 2654435769;
mul.lo.s64 %rd2549, %rd1130, 3528531795;
xor.b64 %rd2539, %rd1127, %rd159;
// Constants for LBB30_20; LBB30_19 loads a different set into the same registers.
mov.u32 %r324, -845247145;
mov.u32 %r323, -616729560;
mov.u64 %rd2556, 3041712726;
mov.u64 %rd2555, 1401181199;
mov.u64 %rd2554, 2835769497;
mov.u64 %rd2553, 1684936478;
mov.u64 %rd2552, 2027808484;
mov.u64 %rd2551, 387276957;
mov.u64 %rd2550, 842468239;
mov.u64 %rd2548, 3986602516;
mov.u64 %rd2547, 1013904242;
mov.u64 %rd2545, 3668340011;
mov.u64 %rd2544, 3144134277;
mov.u64 %rd2542, 3449720151;
mov.u64 %rd2541, 1993301258;
mov.u64 %rd2540, 3528531795;
bra.uni LBB30_20;
LBB30_19:
// Taken arm of `@%p21 bra LBB30_19` (runs when %p21 is true).
// Produces the same registers as the fall-through arm (%rd2539..%rd2556,
// %r323, %r324) with the multipliers 3528531795 / 3449720151 exchanged and a
// different set of XOR constants. Falls through into LBB30_20.
// %p71 flags wrap-around of the add that produced %rd159, so this adds that
// carry to %rd2464.
selp.u64 %rd1095, 1, 0, %p71;
add.s64 %rd1096, %rd2464, %rd1095;
and.b64 %rd1097, %rd1096, 4294967295;
mul.lo.s64 %rd2543, %rd1097, 3449720151;
xor.b64 %rd1098, %rd2543, %rd159;
shr.u64 %rd1099, %rd1098, 32;
mul.lo.s64 %rd2546, %rd1099, 3528531795;
shr.u64 %rd1100, %rd2546, 32;
mul.lo.s64 %rd1102, %rd2426, 3528531795;
and.b64 %rd1103, %rd1102, 4294967295;
xor.b64 %rd1104, %rd1103, %rd1100;
xor.b64 %rd1105, %rd1104, 3144134277;
mul.lo.s64 %rd2549, %rd1105, 3449720151;
xor.b64 %rd2539, %rd1096, %rd1102;
// Constants for LBB30_20 (this arm's variant).
mov.u32 %r324, -766435501;
mov.u32 %r323, -239350328;
mov.u64 %rd2556, 1684936478;
mov.u64 %rd2555, 534103459;
mov.u64 %rd2554, 387276957;
mov.u64 %rd2553, 3041712726;
mov.u64 %rd2552, 3986602516;
mov.u64 %rd2551, 2835769497;
mov.u64 %rd2550, 3668340011;
mov.u64 %rd2548, 2027808484;
mov.u64 %rd2547, 1993301258;
mov.u64 %rd2545, 842468239;
mov.u64 %rd2544, 2654435769;
mov.u64 %rd2542, 3528531795;
mov.u64 %rd2541, 1013904242;
mov.u64 %rd2540, 3449720151;
LBB30_20:
// Join of the two arms selected by %p21. Runs a fixed chain of 64-bit
// multiply / shift / mask / XOR steps over the values seeded by either arm;
// every multiply uses %rd2540 or %rd2542, the multipliers the arm chose.
// The result is reduced to one 32-bit word (%r127).
shr.u64 %rd1131, %rd2549, 32;
shr.u64 %rd1132, %rd2539, 32;
mul.lo.s64 %rd1133, %rd1132, %rd2540;
and.b64 %rd1134, %rd1133, 4294967295;
xor.b64 %rd1135, %rd1134, %rd1131;
xor.b64 %rd1136, %rd1135, %rd2541;
mul.lo.s64 %rd1137, %rd1136, %rd2542;
shr.u64 %rd1138, %rd1137, 32;
shr.u64 %rd1139, %rd1133, 32;
and.b64 %rd1140, %rd2543, 4294967295;
xor.b64 %rd1141, %rd1140, %rd1139;
xor.b64 %rd1142, %rd1141, %rd2544;
mul.lo.s64 %rd1143, %rd1142, %rd2542;
and.b64 %rd1144, %rd1143, 4294967295;
xor.b64 %rd1145, %rd1144, %rd1138;
xor.b64 %rd1146, %rd1145, %rd2545;
mul.lo.s64 %rd1147, %rd1146, %rd2540;
shr.u64 %rd1148, %rd1147, 32;
shr.u64 %rd1149, %rd1143, 32;
and.b64 %rd1150, %rd2546, 4294967295;
xor.b64 %rd1151, %rd1150, %rd1149;
xor.b64 %rd1152, %rd1151, %rd2547;
mul.lo.s64 %rd1153, %rd1152, %rd2540;
and.b64 %rd1154, %rd1153, 4294967295;
xor.b64 %rd1155, %rd1154, %rd1148;
xor.b64 %rd1156, %rd1155, %rd2548;
mul.lo.s64 %rd1157, %rd1156, %rd2542;
shr.u64 %rd1158, %rd1157, 32;
shr.u64 %rd1159, %rd1153, 32;
and.b64 %rd1160, %rd2549, 4294967295;
xor.b64 %rd1161, %rd1160, %rd1159;
xor.b64 %rd1162, %rd1161, %rd2550;
mul.lo.s64 %rd1163, %rd1162, %rd2542;
and.b64 %rd1164, %rd1163, 4294967295;
xor.b64 %rd1165, %rd1164, %rd1158;
xor.b64 %rd1166, %rd1165, %rd2551;
mul.lo.s64 %rd1167, %rd1166, %rd2540;
shr.u64 %rd1168, %rd1167, 32;
shr.u64 %rd1169, %rd1163, 32;
and.b64 %rd1170, %rd1137, 4294967295;
xor.b64 %rd1171, %rd1170, %rd1169;
xor.b64 %rd1172, %rd1171, %rd2552;
mul.lo.s64 %rd1173, %rd1172, %rd2540;
and.b64 %rd1174, %rd1173, 4294967295;
xor.b64 %rd1175, %rd1174, %rd1168;
xor.b64 %rd1176, %rd1175, %rd2553;
mul.lo.s64 %rd1177, %rd1176, %rd2542;
shr.u64 %rd1178, %rd1177, 32;
shr.u64 %rd1179, %rd1173, 32;
and.b64 %rd1180, %rd1147, 4294967295;
xor.b64 %rd1181, %rd1180, %rd1179;
xor.b64 %rd1182, %rd1181, %rd2554;
mul.lo.s64 %rd1183, %rd1182, %rd2542;
and.b64 %rd1184, %rd1183, 4294967295;
xor.b64 %rd1185, %rd1184, %rd1178;
xor.b64 %rd1186, %rd1185, %rd2555;
mul.lo.s64 %rd1187, %rd1186, %rd2540;
shr.u64 %rd1188, %rd1187, 32;
shr.u64 %rd1189, %rd1183, 32;
xor.b64 %rd1190, %rd1157, %rd1189;
xor.b64 %rd1191, %rd1190, %rd2556;
mul.lo.s64 %rd1192, %rd1191, %rd2540;
xor.b64 %rd1193, %rd1188, %rd1192;
cvt.u32.u64 %r125, %rd1193;
xor.b32 %r126, %r323, %r125;
mul.lo.s32 %r127, %r126, %r324;
// Keep the upper 23 bits and scale by 0f34000000 (2^-23): %f101 is in [0, 1).
shr.u32 %r128, %r127, 9;
cvt.rn.f32.u32 %f100, %r128;
mul.rn.f32 %f101, %f100, 0f34000000;
// %p25 = f16(%f101) >= 0x2E66 (about 0.1 as f16).
cvt.rn.f16.f32 %h46, %f101;
mov.b16 %h47, 0x2E66;
setp.ge.f16 %p25, %h46, %h47;
// %h53 = ([%rd45+514] + f16([%rd46+1028])) * 0x3C72 (about 1.111 as f16) when
// %p25 holds, otherwise 0.
// NOTE(review): looks like dropout (rate 0.1, 1/0.9 rescale) -- confirm.
ld.global.nc.b16 %h48, [%rd45+514];
ld.global.nc.f32 %f102, [%rd46+1028];
cvt.rn.f16.f32 %h49, %f102;
add.rn.f16 %h50, %h48, %h49;
mov.b16 %h51, 0x3C72;
mul.rn.f16 %h52, %h50, %h51;
selp.b16 %h53, %h52, 0x0000, %p25;
cvt.f32.f16 %f103, %h53;
// With s = %f1 * [%rd48+1028] (%f1, %f2, %f3 are set earlier in the kernel):
// %f112 = s * f32([%rd47+514]) + ([%rd49+1028] - %f2 * s) + %f103.
ld.global.nc.b16 %h54, [%rd47+514];
cvt.f32.f16 %f104, %h54;
ld.global.nc.f32 %f105, [%rd48+1028];
mul.rn.f32 %f106, %f1, %f105;
mul.rn.f32 %f107, %f106, %f104;
ld.global.nc.f32 %f108, [%rd49+1028];
mul.rn.f32 %f109, %f2, %f106;
sub.rn.f32 %f110, %f108, %f109;
add.rn.f32 %f111, %f107, %f110;
add.rn.f32 %f112, %f111, %f103;
// Running sum of squared deviations from %f3: %f9 = %f8 + (%f112 - %f3)^2.
sub.rn.f32 %f113, %f112, %f3;
mul.rn.f32 %f114, %f113, %f113;
add.rn.f32 %f9, %f8, %f114;
// Next element: %rd186 = %rd12 + ((%r73 | 384) >> 2), with %r73 set earlier
// in the kernel. %rd2422 is its low 32 bits; %p70 is true when the 64-bit add
// wrapped around.
or.b32 %r130, %r73, 384;
shr.u32 %r131, %r130, 2;
cvt.u64.u32 %rd1194, %r131;
add.s64 %rd186, %rd12, %rd1194;
and.b64 %rd2422, %rd186, 4294967295;
setp.lt.u64 %p70, %rd186, %rd12;
@%p8 bra LBB30_22;
// Fall-through arm of `@%p8 bra LBB30_22` (runs when %p8 is false).
// Seeds the inputs of LBB30_23: the products %rd2561, %rd2564, %rd2567, the
// mixed word %rd2557, and the per-step constants %rd2558..%rd2575 and
// %r325..%r327.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9,
// 3144134277 = 0xBB67AE85 (the Philox4x32 multipliers and key increments).
// NOTE(review): presumably an inlined Philox RNG expansion -- confirm.
mul.lo.s64 %rd2561, %rd2422, 3528531795;
// %p70 flags wrap-around of the add that produced %rd186, so this adds that
// carry to %rd2464.
selp.u64 %rd1237, 1, 0, %p70;
add.s64 %rd1238, %rd2464, %rd1237;
xor.b64 %rd1239, %rd1238, %rd2561;
shr.u64 %rd1240, %rd1239, 32;
mul.lo.s64 %rd2564, %rd1240, 3449720151;
shr.u64 %rd1241, %rd2564, 32;
and.b64 %rd1242, %rd1238, 4294967295;
mul.lo.s64 %rd1243, %rd1242, 3449720151;
and.b64 %rd1244, %rd1243, 4294967295;
xor.b64 %rd1245, %rd1244, %rd1241;
xor.b64 %rd1246, %rd1245, 2654435769;
mul.lo.s64 %rd2567, %rd1246, 3528531795;
xor.b64 %rd2557, %rd1243, %rd186;
// Constants for LBB30_23; LBB30_22 loads a different set into the same registers.
mov.u32 %r327, -1879881855;
mov.u32 %r326, -845247145;
mov.u32 %r325, 534103459;
mov.u64 %rd2575, 3678237736;
mov.u64 %rd2574, 3041712726;
mov.u64 %rd2573, 1401181199;
mov.u64 %rd2572, 2835769497;
mov.u64 %rd2571, 1684936478;
mov.u64 %rd2570, 2027808484;
mov.u64 %rd2569, 387276957;
mov.u64 %rd2568, 842468239;
mov.u64 %rd2566, 3986602516;
mov.u64 %rd2565, 1013904242;
mov.u64 %rd2563, 3668340011;
mov.u64 %rd2562, 3144134277;
mov.u64 %rd2560, 3449720151;
mov.u64 %rd2559, 1993301258;
mov.u64 %rd2558, 3528531795;
bra.uni LBB30_23;
LBB30_22:
// Taken arm of `@%p8 bra LBB30_22` (runs when %p8 is true, i.e. %r5 != 0).
// Produces the same registers as the fall-through arm (%rd2557..%rd2575,
// %r325..%r327) with the multipliers 3528531795 / 3449720151 exchanged and a
// different set of XOR constants. Falls through into LBB30_23.
// %p70 flags wrap-around of the add that produced %rd186, so this adds that
// carry to %rd2464.
selp.u64 %rd1210, 1, 0, %p70;
add.s64 %rd1211, %rd2464, %rd1210;
and.b64 %rd1212, %rd1211, 4294967295;
mul.lo.s64 %rd2561, %rd1212, 3449720151;
xor.b64 %rd1213, %rd2561, %rd186;
shr.u64 %rd1214, %rd1213, 32;
mul.lo.s64 %rd2564, %rd1214, 3528531795;
shr.u64 %rd1215, %rd2564, 32;
mul.lo.s64 %rd1217, %rd2422, 3528531795;
and.b64 %rd1218, %rd1217, 4294967295;
xor.b64 %rd1219, %rd1218, %rd1215;
xor.b64 %rd1220, %rd1219, 3144134277;
mul.lo.s64 %rd2567, %rd1220, 3449720151;
xor.b64 %rd2557, %rd1211, %rd1217;
// Constants for LBB30_23 (this arm's variant).
mov.u32 %r327, -1767562579;
mov.u32 %r326, -766435501;
mov.u32 %r325, 1401181199;
mov.u64 %rd2575, 4055616968;
mov.u64 %rd2574, 1684936478;
mov.u64 %rd2573, 534103459;
mov.u64 %rd2572, 387276957;
mov.u64 %rd2571, 3041712726;
mov.u64 %rd2570, 3986602516;
mov.u64 %rd2569, 2835769497;
mov.u64 %rd2568, 3668340011;
mov.u64 %rd2566, 2027808484;
mov.u64 %rd2565, 1993301258;
mov.u64 %rd2563, 842468239;
mov.u64 %rd2562, 2654435769;
mov.u64 %rd2560, 3528531795;
mov.u64 %rd2559, 1013904242;
mov.u64 %rd2558, 3449720151;
LBB30_23:
// Join of the two arms selected by %p8. Runs a fixed chain of 64-bit
// multiply / shift / mask / XOR steps over the values seeded by either arm;
// every multiply uses %rd2558 or %rd2560, the multipliers the arm chose.
// The result is reduced to one 32-bit word (%r143).
shr.u64 %rd1247, %rd2567, 32;
shr.u64 %rd1248, %rd2557, 32;
mul.lo.s64 %rd1249, %rd1248, %rd2558;
and.b64 %rd1250, %rd1249, 4294967295;
xor.b64 %rd1251, %rd1250, %rd1247;
xor.b64 %rd1252, %rd1251, %rd2559;
mul.lo.s64 %rd1253, %rd1252, %rd2560;
shr.u64 %rd1254, %rd1253, 32;
shr.u64 %rd1255, %rd1249, 32;
and.b64 %rd1256, %rd2561, 4294967295;
xor.b64 %rd1257, %rd1256, %rd1255;
xor.b64 %rd1258, %rd1257, %rd2562;
mul.lo.s64 %rd1259, %rd1258, %rd2560;
and.b64 %rd1260, %rd1259, 4294967295;
xor.b64 %rd1261, %rd1260, %rd1254;
xor.b64 %rd1262, %rd1261, %rd2563;
mul.lo.s64 %rd1263, %rd1262, %rd2558;
shr.u64 %rd1264, %rd1263, 32;
shr.u64 %rd1265, %rd1259, 32;
and.b64 %rd1266, %rd2564, 4294967295;
xor.b64 %rd1267, %rd1266, %rd1265;
xor.b64 %rd1268, %rd1267, %rd2565;
mul.lo.s64 %rd1269, %rd1268, %rd2558;
and.b64 %rd1270, %rd1269, 4294967295;
xor.b64 %rd1271, %rd1270, %rd1264;
xor.b64 %rd1272, %rd1271, %rd2566;
mul.lo.s64 %rd1273, %rd1272, %rd2560;
shr.u64 %rd1274, %rd1273, 32;
shr.u64 %rd1275, %rd1269, 32;
and.b64 %rd1276, %rd2567, 4294967295;
xor.b64 %rd1277, %rd1276, %rd1275;
xor.b64 %rd1278, %rd1277, %rd2568;
mul.lo.s64 %rd1279, %rd1278, %rd2560;
and.b64 %rd1280, %rd1279, 4294967295;
xor.b64 %rd1281, %rd1280, %rd1274;
xor.b64 %rd1282, %rd1281, %rd2569;
mul.lo.s64 %rd1283, %rd1282, %rd2558;
shr.u64 %rd1284, %rd1283, 32;
shr.u64 %rd1285, %rd1279, 32;
and.b64 %rd1286, %rd1253, 4294967295;
xor.b64 %rd1287, %rd1286, %rd1285;
xor.b64 %rd1288, %rd1287, %rd2570;
mul.lo.s64 %rd1289, %rd1288, %rd2558;
and.b64 %rd1290, %rd1289, 4294967295;
xor.b64 %rd1291, %rd1290, %rd1284;
xor.b64 %rd1292, %rd1291, %rd2571;
mul.lo.s64 %rd1293, %rd1292, %rd2560;
shr.u64 %rd1294, %rd1293, 32;
shr.u64 %rd1295, %rd1289, 32;
and.b64 %rd1296, %rd1263, 4294967295;
xor.b64 %rd1297, %rd1296, %rd1295;
xor.b64 %rd1298, %rd1297, %rd2572;
mul.lo.s64 %rd1299, %rd1298, %rd2560;
and.b64 %rd1300, %rd1299, 4294967295;
xor.b64 %rd1301, %rd1300, %rd1294;
xor.b64 %rd1302, %rd1301, %rd2573;
mul.lo.s64 %rd1303, %rd1302, %rd2558;
shr.u64 %rd1304, %rd1303, 32;
shr.u64 %rd1305, %rd1299, 32;
and.b64 %rd1306, %rd1273, 4294967295;
xor.b64 %rd1307, %rd1306, %rd1305;
xor.b64 %rd1308, %rd1307, %rd2574;
mul.lo.s64 %rd1309, %rd1308, %rd2558;
and.b64 %rd1310, %rd1309, 4294967295;
xor.b64 %rd1311, %rd1310, %rd1304;
xor.b64 %rd1312, %rd1311, %rd2575;
mul.lo.s64 %rd1313, %rd1312, %rd2560;
shr.u64 %rd1314, %rd1313, 32;
cvt.u32.u64 %r138, %rd1314;
shr.u64 %rd1315, %rd1309, 32;
xor.b64 %rd1316, %rd1315, %rd1283;
cvt.u32.u64 %r139, %rd1316;
xor.b32 %r140, %r325, %r139;
mul.lo.s32 %r141, %r140, %r326;
xor.b32 %r142, %r141, %r138;
xor.b32 %r143, %r142, %r327;
// Keep the upper 23 bits and scale by 0f34000000 (2^-23): %f116 is in [0, 1).
shr.u32 %r144, %r143, 9;
cvt.rn.f32.u32 %f115, %r144;
mul.rn.f32 %f116, %f115, 0f34000000;
// %p28 = f16(%f116) >= 0x2E66 (about 0.1 as f16).
cvt.rn.f16.f32 %h55, %f116;
mov.b16 %h56, 0x2E66;
setp.ge.f16 %p28, %h55, %h56;
// %h62 = ([%rd45+768] + f16([%rd46+1536])) * 0x3C72 (about 1.111 as f16) when
// %p28 holds, otherwise 0.
// NOTE(review): looks like dropout (rate 0.1, 1/0.9 rescale) -- confirm.
ld.global.nc.b16 %h57, [%rd45+768];
ld.global.nc.f32 %f117, [%rd46+1536];
cvt.rn.f16.f32 %h58, %f117;
add.rn.f16 %h59, %h57, %h58;
mov.b16 %h60, 0x3C72;
mul.rn.f16 %h61, %h59, %h60;
selp.b16 %h62, %h61, 0x0000, %p28;
cvt.f32.f16 %f118, %h62;
// With s = %f1 * [%rd48+1536] (%f1, %f2, %f3 are set earlier in the kernel):
// %f127 = s * f32([%rd47+768]) + ([%rd49+1536] - %f2 * s) + %f118.
ld.global.nc.b16 %h63, [%rd47+768];
cvt.f32.f16 %f119, %h63;
ld.global.nc.f32 %f120, [%rd48+1536];
mul.rn.f32 %f121, %f1, %f120;
mul.rn.f32 %f122, %f121, %f119;
ld.global.nc.f32 %f123, [%rd49+1536];
mul.rn.f32 %f124, %f2, %f121;
sub.rn.f32 %f125, %f123, %f124;
add.rn.f32 %f126, %f122, %f125;
add.rn.f32 %f127, %f126, %f118;
// Running sum of squared deviations from %f3: %f10 = %f9 + (%f127 - %f3)^2.
sub.rn.f32 %f128, %f127, %f3;
mul.rn.f32 %f129, %f128, %f128;
add.rn.f32 %f10, %f9, %f129;
// Next element: %rd214 = %rd12 + ((%r3 | 385 | %r4) >> 2).
// %p29 = ((%r3 | 385) & 3) != 1 selects the arm; each arm computes the
// wrap-around test for this add itself.
or.b32 %r145, %r3, 385;
or.b32 %r146, %r145, %r4;
and.b32 %r147, %r145, 3;
shr.u32 %r148, %r146, 2;
setp.ne.s32 %p29, %r147, 1;
cvt.u64.u32 %rd1317, %r148;
add.s64 %rd214, %rd12, %rd1317;
@%p29 bra LBB30_25;
and.b64 %rd1357, %rd214, 4294967295;
mul.lo.s64 %rd2580, %rd1357, 3528531795;
setp.lt.u64 %p31, %rd214, %rd12;
selp.u64 %rd1358, 1, 0, %p31;
add.s64 %rd1359, %rd2464, %rd1358;
xor.b64 %rd1360, %rd1359, %rd2580;
shr.u64 %rd1361, %rd1360, 32;
mul.lo.s64 %rd2583, %rd1361, 3449720151;
shr.u64 %rd1362, %rd2583, 32;
and.b64 %rd1363, %rd1359, 4294967295;
mul.lo.s64 %rd1364, %rd1363, 3449720151;
and.b64 %rd1365, %rd1364, 4294967295;
xor.b64 %rd1366, %rd1365, %rd1362;
xor.b64 %rd1367, %rd1366, 2654435769;
mul.lo.s64 %rd2586, %rd1367, 3528531795;
xor.b64 %rd2576, %rd1364, %rd214;
mov.u32 %r329, -845247145;
mov.u32 %r328, -616729560;
mov.u64 %rd2593, 3041712726;
mov.u64 %rd2592, 1401181199;
mov.u64 %rd2591, 2835769497;
mov.u64 %rd2590, 1684936478;
mov.u64 %rd2589, 2027808484;
mov.u64 %rd2588, 387276957;
mov.u64 %rd2587, 842468239;
mov.u64 %rd2585, 3986602516;
mov.u64 %rd2584, 1013904242;
mov.u64 %rd2582, 3668340011;
mov.u64 %rd2581, 3144134277;
mov.u64 %rd2579, 3449720151;
mov.u64 %rd2578, 1993301258;
mov.u64 %rd2577, 3528531795;
bra.uni LBB30_26;
// LBB30_25: RNG state setup for counter %rd214, variant taken when
// (%r145 & 3) != 1 (predicate %p29). Compared with the fall-through variant
// the two multipliers and the two key sequences swap roles.
// NOTE(review): 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57,
// 2654435769 = 0x9E3779B9 and 3144134277 = 0xBB67AE85 match the published
// Philox4x32 constants -- confirm against the generator that emitted this.
LBB30_25:
// %rd214 = %rd12 + offset; a sum below %rd12 means the 64-bit add wrapped,
// so the carry is added to %rd2464.
setp.lt.u64 %p30, %rd214, %rd12;
selp.u64 %rd1332, 1, 0, %p30;
add.s64 %rd1333, %rd2464, %rd1332;
// First products: low 32 bits of the carried word * 0xCD9E8D57, then the
// high half of (product ^ counter) * 0xD2511F53.
and.b64 %rd1334, %rd1333, 4294967295;
mul.lo.s64 %rd2580, %rd1334, 3449720151;
xor.b64 %rd1335, %rd2580, %rd214;
shr.u64 %rd1336, %rd1335, 32;
mul.lo.s64 %rd2583, %rd1336, 3528531795;
shr.u64 %rd1337, %rd2583, 32;
// Low 32 bits of the counter * 0xD2511F53, mixed with the previous high half
// and the key 0xBB67AE85, then multiplied by 0xCD9E8D57.
and.b64 %rd1338, %rd214, 4294967295;
mul.lo.s64 %rd1339, %rd1338, 3528531795;
and.b64 %rd1340, %rd1339, 4294967295;
xor.b64 %rd1341, %rd1340, %rd1337;
xor.b64 %rd1342, %rd1341, 3144134277;
mul.lo.s64 %rd2586, %rd1342, 3449720151;
xor.b64 %rd2576, %rd1333, %rd1339;
// Constants consumed after the join at LBB30_26. %r329 is 0xD2511F53 and
// %r328 is 8*0x9E3779B9 mod 2^32, both written as signed 32-bit immediates.
mov.u32 %r329, -766435501;
mov.u32 %r328, -239350328;
// Round keys: each immediate is k*0x9E3779B9 or k*0xBB67AE85 reduced mod 2^32
// (k = 1..7); %rd2577 and %rd2579 hold the two multipliers.
mov.u64 %rd2593, 1684936478;
mov.u64 %rd2592, 534103459;
mov.u64 %rd2591, 387276957;
mov.u64 %rd2590, 3041712726;
mov.u64 %rd2589, 3986602516;
mov.u64 %rd2588, 2835769497;
mov.u64 %rd2587, 3668340011;
mov.u64 %rd2585, 2027808484;
mov.u64 %rd2584, 1993301258;
mov.u64 %rd2582, 842468239;
mov.u64 %rd2581, 2654435769;
mov.u64 %rd2579, 3528531795;
mov.u64 %rd2578, 1013904242;
mov.u64 %rd2577, 3449720151;
// LBB30_26: join point of the two setup variants for counter %rd214.
// Runs the remaining multiply/xor rounds on the state in %rd2576..%rd2593,
// turns one 32-bit result into a float in [0, 1), applies an f16 mask and
// scale, adds a squared difference to the running sum (%f10 -> %f11), and
// then begins the setup for the next counter (%rd241).
LBB30_26:
// Round pattern: take the high half of one earlier product, xor it with the
// low half of another and with a round key, then multiply by %rd2577 or
// %rd2579.
shr.u64 %rd1368, %rd2586, 32;
shr.u64 %rd1369, %rd2576, 32;
mul.lo.s64 %rd1370, %rd1369, %rd2577;
and.b64 %rd1371, %rd1370, 4294967295;
xor.b64 %rd1372, %rd1371, %rd1368;
xor.b64 %rd1373, %rd1372, %rd2578;
mul.lo.s64 %rd1374, %rd1373, %rd2579;
shr.u64 %rd1375, %rd1374, 32;
shr.u64 %rd1376, %rd1370, 32;
and.b64 %rd1377, %rd2580, 4294967295;
xor.b64 %rd1378, %rd1377, %rd1376;
xor.b64 %rd1379, %rd1378, %rd2581;
mul.lo.s64 %rd1380, %rd1379, %rd2579;
and.b64 %rd1381, %rd1380, 4294967295;
xor.b64 %rd1382, %rd1381, %rd1375;
xor.b64 %rd1383, %rd1382, %rd2582;
mul.lo.s64 %rd1384, %rd1383, %rd2577;
shr.u64 %rd1385, %rd1384, 32;
shr.u64 %rd1386, %rd1380, 32;
and.b64 %rd1387, %rd2583, 4294967295;
xor.b64 %rd1388, %rd1387, %rd1386;
xor.b64 %rd1389, %rd1388, %rd2584;
mul.lo.s64 %rd1390, %rd1389, %rd2577;
and.b64 %rd1391, %rd1390, 4294967295;
xor.b64 %rd1392, %rd1391, %rd1385;
xor.b64 %rd1393, %rd1392, %rd2585;
mul.lo.s64 %rd1394, %rd1393, %rd2579;
shr.u64 %rd1395, %rd1394, 32;
shr.u64 %rd1396, %rd1390, 32;
and.b64 %rd1397, %rd2586, 4294967295;
xor.b64 %rd1398, %rd1397, %rd1396;
xor.b64 %rd1399, %rd1398, %rd2587;
mul.lo.s64 %rd1400, %rd1399, %rd2579;
and.b64 %rd1401, %rd1400, 4294967295;
xor.b64 %rd1402, %rd1401, %rd1395;
xor.b64 %rd1403, %rd1402, %rd2588;
mul.lo.s64 %rd1404, %rd1403, %rd2577;
shr.u64 %rd1405, %rd1404, 32;
shr.u64 %rd1406, %rd1400, 32;
and.b64 %rd1407, %rd1374, 4294967295;
xor.b64 %rd1408, %rd1407, %rd1406;
xor.b64 %rd1409, %rd1408, %rd2589;
mul.lo.s64 %rd1410, %rd1409, %rd2577;
and.b64 %rd1411, %rd1410, 4294967295;
xor.b64 %rd1412, %rd1411, %rd1405;
xor.b64 %rd1413, %rd1412, %rd2590;
mul.lo.s64 %rd1414, %rd1413, %rd2579;
shr.u64 %rd1415, %rd1414, 32;
shr.u64 %rd1416, %rd1410, 32;
and.b64 %rd1417, %rd1384, 4294967295;
xor.b64 %rd1418, %rd1417, %rd1416;
xor.b64 %rd1419, %rd1418, %rd2591;
mul.lo.s64 %rd1420, %rd1419, %rd2579;
and.b64 %rd1421, %rd1420, 4294967295;
xor.b64 %rd1422, %rd1421, %rd1415;
xor.b64 %rd1423, %rd1422, %rd2592;
mul.lo.s64 %rd1424, %rd1423, %rd2577;
shr.u64 %rd1425, %rd1424, 32;
shr.u64 %rd1426, %rd1420, 32;
xor.b64 %rd1427, %rd1394, %rd1426;
xor.b64 %rd1428, %rd1427, %rd2593;
mul.lo.s64 %rd1429, %rd1428, %rd2577;
xor.b64 %rd1430, %rd1425, %rd1429;
// The last step is finished in 32 bits: only the truncated low word is kept.
cvt.u32.u64 %r153, %rd1430;
xor.b32 %r154, %r328, %r153;
mul.lo.s32 %r155, %r154, %r329;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): result is in [0, 1).
shr.u32 %r156, %r155, 9;
cvt.rn.f32.u32 %f130, %r156;
mul.rn.f32 %f131, %f130, 0f34000000;
cvt.rn.f16.f32 %h64, %f131;
// 0x2E66 is about 0.09998 in f16; %p33 is true when the random value is >= it.
mov.b16 %h65, 0x2E66;
setp.ge.f16 %p33, %h64, %h65;
// f16 sum of the two inputs, scaled by 0x3C72 (about 1.1113), or zero when
// %p33 is false.
// NOTE(review): threshold ~0.1 and scale ~1/0.9 look like dropout with
// rate 0.1 -- confirm against the HLO this kernel was generated from.
ld.global.nc.b16 %h66, [%rd45+770];
ld.global.nc.f32 %f132, [%rd46+1540];
cvt.rn.f16.f32 %h67, %f132;
add.rn.f16 %h68, %h66, %h67;
mov.b16 %h69, 0x3C72;
mul.rn.f16 %h70, %h68, %h69;
selp.b16 %h71, %h70, 0x0000, %p33;
cvt.f32.f16 %f133, %h71;
// %f136 = %f1 * [%rd48]; value = %f136 * [%rd47] + ([%rd49] - %f2 * %f136)
// + masked term; (value - %f3)^2 is added to %f10 to give %f11.
ld.global.nc.b16 %h72, [%rd47+770];
cvt.f32.f16 %f134, %h72;
ld.global.nc.f32 %f135, [%rd48+1540];
mul.rn.f32 %f136, %f1, %f135;
mul.rn.f32 %f137, %f136, %f134;
ld.global.nc.f32 %f138, [%rd49+1540];
mul.rn.f32 %f139, %f2, %f136;
sub.rn.f32 %f140, %f138, %f139;
add.rn.f32 %f141, %f137, %f140;
add.rn.f32 %f142, %f141, %f133;
sub.rn.f32 %f143, %f142, %f3;
mul.rn.f32 %f144, %f143, %f143;
add.rn.f32 %f11, %f10, %f144;
// Next counter: %rd12 + ((%r73 | 512) >> 2). %p8 is set outside this block
// and selects which setup variant runs.
or.b32 %r158, %r73, 512;
shr.u32 %r159, %r158, 2;
cvt.u64.u32 %rd1431, %r159;
add.s64 %rd241, %rd12, %rd1431;
@%p8 bra LBB30_28;
// Fall-through setup variant: low counter word * 0xD2511F53 (3528531795),
// carried word * 0xCD9E8D57 (3449720151), first key 0x9E3779B9 (2654435769).
and.b64 %rd1473, %rd241, 4294967295;
mul.lo.s64 %rd2598, %rd1473, 3528531795;
// A sum below %rd12 means the add above wrapped; carry goes into %rd2464.
setp.lt.u64 %p35, %rd241, %rd12;
selp.u64 %rd1474, 1, 0, %p35;
add.s64 %rd1475, %rd2464, %rd1474;
xor.b64 %rd1476, %rd1475, %rd2598;
shr.u64 %rd1477, %rd1476, 32;
mul.lo.s64 %rd2601, %rd1477, 3449720151;
shr.u64 %rd1478, %rd2601, 32;
and.b64 %rd1479, %rd1475, 4294967295;
mul.lo.s64 %rd1480, %rd1479, 3449720151;
and.b64 %rd1481, %rd1480, 4294967295;
xor.b64 %rd1482, %rd1481, %rd1478;
xor.b64 %rd1483, %rd1482, 2654435769;
mul.lo.s64 %rd2604, %rd1483, 3528531795;
xor.b64 %rd2594, %rd1480, %rd241;
// Constants for the rounds at LBB30_29: signed 32-bit forms of
// 9*0x9E3779B9 mod 2^32 and 0xCD9E8D57, then k*W mod 2^32 round keys and
// the two multipliers (%rd2595, %rd2597).
mov.u32 %r332, -1879881855;
mov.u32 %r331, -845247145;
mov.u32 %r330, 534103459;
mov.u64 %rd2612, 3678237736;
mov.u64 %rd2611, 3041712726;
mov.u64 %rd2610, 1401181199;
mov.u64 %rd2609, 2835769497;
mov.u64 %rd2608, 1684936478;
mov.u64 %rd2607, 2027808484;
mov.u64 %rd2606, 387276957;
mov.u64 %rd2605, 842468239;
mov.u64 %rd2603, 3986602516;
mov.u64 %rd2602, 1013904242;
mov.u64 %rd2600, 3668340011;
mov.u64 %rd2599, 3144134277;
mov.u64 %rd2597, 3449720151;
mov.u64 %rd2596, 1993301258;
mov.u64 %rd2595, 3528531795;
bra.uni LBB30_29;
// LBB30_28: RNG state setup for counter %rd241, variant taken when %p8 is
// true. Compared with the fall-through variant the two multipliers
// (3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57) and the two key
// sequences swap roles.
// NOTE(review): these values match the published Philox4x32 constants --
// confirm against the generator that emitted this kernel.
LBB30_28:
// A sum below %rd12 means the 64-bit add that produced %rd241 wrapped; the
// carry is added to %rd2464.
setp.lt.u64 %p34, %rd241, %rd12;
selp.u64 %rd1447, 1, 0, %p34;
add.s64 %rd1448, %rd2464, %rd1447;
// First products: low 32 bits of the carried word * 0xCD9E8D57, then the
// high half of (product ^ counter) * 0xD2511F53.
and.b64 %rd1449, %rd1448, 4294967295;
mul.lo.s64 %rd2598, %rd1449, 3449720151;
xor.b64 %rd1450, %rd2598, %rd241;
shr.u64 %rd1451, %rd1450, 32;
mul.lo.s64 %rd2601, %rd1451, 3528531795;
shr.u64 %rd1452, %rd2601, 32;
// Low 32 bits of the counter * 0xD2511F53, mixed with the previous high half
// and the key 0xBB67AE85 (3144134277), then multiplied by 0xCD9E8D57.
and.b64 %rd1453, %rd241, 4294967295;
mul.lo.s64 %rd1454, %rd1453, 3528531795;
and.b64 %rd1455, %rd1454, 4294967295;
xor.b64 %rd1456, %rd1455, %rd1452;
xor.b64 %rd1457, %rd1456, 3144134277;
mul.lo.s64 %rd2604, %rd1457, 3449720151;
xor.b64 %rd2594, %rd1448, %rd1454;
// Constants consumed after the join at LBB30_29. %r332 is 9*0xBB67AE85
// mod 2^32 and %r331 is 0xD2511F53, both as signed 32-bit immediates.
mov.u32 %r332, -1767562579;
mov.u32 %r331, -766435501;
mov.u32 %r330, 1401181199;
// Round keys: each immediate is k*0x9E3779B9 or k*0xBB67AE85 reduced mod 2^32;
// %rd2595 and %rd2597 hold the two multipliers.
mov.u64 %rd2612, 4055616968;
mov.u64 %rd2611, 1684936478;
mov.u64 %rd2610, 534103459;
mov.u64 %rd2609, 387276957;
mov.u64 %rd2608, 3041712726;
mov.u64 %rd2607, 3986602516;
mov.u64 %rd2606, 2835769497;
mov.u64 %rd2605, 3668340011;
mov.u64 %rd2603, 2027808484;
mov.u64 %rd2602, 1993301258;
mov.u64 %rd2600, 842468239;
mov.u64 %rd2599, 2654435769;
mov.u64 %rd2597, 3528531795;
mov.u64 %rd2596, 1013904242;
mov.u64 %rd2595, 3449720151;
// LBB30_29: join point of the two setup variants for counter %rd241.
// Runs the remaining multiply/xor rounds on the state in %rd2594..%rd2612,
// turns one 32-bit result into a float in [0, 1), applies an f16 mask and
// scale, adds a squared difference to the running sum (%f11 -> %f12), and
// then begins the setup for the next counter (%rd269).
LBB30_29:
// Round pattern: take the high half of one earlier product, xor it with the
// low half of another and with a round key, then multiply by %rd2595 or
// %rd2597.
shr.u64 %rd1484, %rd2604, 32;
shr.u64 %rd1485, %rd2594, 32;
mul.lo.s64 %rd1486, %rd1485, %rd2595;
and.b64 %rd1487, %rd1486, 4294967295;
xor.b64 %rd1488, %rd1487, %rd1484;
xor.b64 %rd1489, %rd1488, %rd2596;
mul.lo.s64 %rd1490, %rd1489, %rd2597;
shr.u64 %rd1491, %rd1490, 32;
shr.u64 %rd1492, %rd1486, 32;
and.b64 %rd1493, %rd2598, 4294967295;
xor.b64 %rd1494, %rd1493, %rd1492;
xor.b64 %rd1495, %rd1494, %rd2599;
mul.lo.s64 %rd1496, %rd1495, %rd2597;
and.b64 %rd1497, %rd1496, 4294967295;
xor.b64 %rd1498, %rd1497, %rd1491;
xor.b64 %rd1499, %rd1498, %rd2600;
mul.lo.s64 %rd1500, %rd1499, %rd2595;
shr.u64 %rd1501, %rd1500, 32;
shr.u64 %rd1502, %rd1496, 32;
and.b64 %rd1503, %rd2601, 4294967295;
xor.b64 %rd1504, %rd1503, %rd1502;
xor.b64 %rd1505, %rd1504, %rd2602;
mul.lo.s64 %rd1506, %rd1505, %rd2595;
and.b64 %rd1507, %rd1506, 4294967295;
xor.b64 %rd1508, %rd1507, %rd1501;
xor.b64 %rd1509, %rd1508, %rd2603;
mul.lo.s64 %rd1510, %rd1509, %rd2597;
shr.u64 %rd1511, %rd1510, 32;
shr.u64 %rd1512, %rd1506, 32;
and.b64 %rd1513, %rd2604, 4294967295;
xor.b64 %rd1514, %rd1513, %rd1512;
xor.b64 %rd1515, %rd1514, %rd2605;
mul.lo.s64 %rd1516, %rd1515, %rd2597;
and.b64 %rd1517, %rd1516, 4294967295;
xor.b64 %rd1518, %rd1517, %rd1511;
xor.b64 %rd1519, %rd1518, %rd2606;
mul.lo.s64 %rd1520, %rd1519, %rd2595;
shr.u64 %rd1521, %rd1520, 32;
shr.u64 %rd1522, %rd1516, 32;
and.b64 %rd1523, %rd1490, 4294967295;
xor.b64 %rd1524, %rd1523, %rd1522;
xor.b64 %rd1525, %rd1524, %rd2607;
mul.lo.s64 %rd1526, %rd1525, %rd2595;
and.b64 %rd1527, %rd1526, 4294967295;
xor.b64 %rd1528, %rd1527, %rd1521;
xor.b64 %rd1529, %rd1528, %rd2608;
mul.lo.s64 %rd1530, %rd1529, %rd2597;
shr.u64 %rd1531, %rd1530, 32;
shr.u64 %rd1532, %rd1526, 32;
and.b64 %rd1533, %rd1500, 4294967295;
xor.b64 %rd1534, %rd1533, %rd1532;
xor.b64 %rd1535, %rd1534, %rd2609;
mul.lo.s64 %rd1536, %rd1535, %rd2597;
and.b64 %rd1537, %rd1536, 4294967295;
xor.b64 %rd1538, %rd1537, %rd1531;
xor.b64 %rd1539, %rd1538, %rd2610;
mul.lo.s64 %rd1540, %rd1539, %rd2595;
shr.u64 %rd1541, %rd1540, 32;
shr.u64 %rd1542, %rd1536, 32;
and.b64 %rd1543, %rd1510, 4294967295;
xor.b64 %rd1544, %rd1543, %rd1542;
xor.b64 %rd1545, %rd1544, %rd2611;
mul.lo.s64 %rd1546, %rd1545, %rd2595;
and.b64 %rd1547, %rd1546, 4294967295;
xor.b64 %rd1548, %rd1547, %rd1541;
xor.b64 %rd1549, %rd1548, %rd2612;
mul.lo.s64 %rd1550, %rd1549, %rd2597;
// Final mixing is done in 32 bits: the high word of the last product is
// combined with a truncated 32-bit multiply and the constant in %r332.
shr.u64 %rd1551, %rd1550, 32;
cvt.u32.u64 %r166, %rd1551;
shr.u64 %rd1552, %rd1546, 32;
xor.b64 %rd1553, %rd1552, %rd1520;
cvt.u32.u64 %r167, %rd1553;
xor.b32 %r168, %r330, %r167;
mul.lo.s32 %r169, %r168, %r331;
xor.b32 %r170, %r169, %r166;
xor.b32 %r171, %r170, %r332;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): result is in [0, 1).
shr.u32 %r172, %r171, 9;
cvt.rn.f32.u32 %f145, %r172;
mul.rn.f32 %f146, %f145, 0f34000000;
cvt.rn.f16.f32 %h73, %f146;
// 0x2E66 is about 0.09998 in f16; %p36 is true when the random value is >= it.
mov.b16 %h74, 0x2E66;
setp.ge.f16 %p36, %h73, %h74;
// f16 sum of the two inputs, scaled by 0x3C72 (about 1.1113), or zero when
// %p36 is false.
// NOTE(review): threshold ~0.1 and scale ~1/0.9 look like dropout with
// rate 0.1 -- confirm against the HLO this kernel was generated from.
ld.global.nc.b16 %h75, [%rd45+1024];
ld.global.nc.f32 %f147, [%rd46+2048];
cvt.rn.f16.f32 %h76, %f147;
add.rn.f16 %h77, %h75, %h76;
mov.b16 %h78, 0x3C72;
mul.rn.f16 %h79, %h77, %h78;
selp.b16 %h80, %h79, 0x0000, %p36;
cvt.f32.f16 %f148, %h80;
// %f151 = %f1 * [%rd48]; value = %f151 * [%rd47] + ([%rd49] - %f2 * %f151)
// + masked term; (value - %f3)^2 is added to %f11 to give %f12.
ld.global.nc.b16 %h81, [%rd47+1024];
cvt.f32.f16 %f149, %h81;
ld.global.nc.f32 %f150, [%rd48+2048];
mul.rn.f32 %f151, %f1, %f150;
mul.rn.f32 %f152, %f151, %f149;
ld.global.nc.f32 %f153, [%rd49+2048];
mul.rn.f32 %f154, %f2, %f151;
sub.rn.f32 %f155, %f153, %f154;
add.rn.f32 %f156, %f152, %f155;
add.rn.f32 %f157, %f156, %f148;
sub.rn.f32 %f158, %f157, %f3;
mul.rn.f32 %f159, %f158, %f158;
add.rn.f32 %f12, %f11, %f159;
// Next counter: %rd12 + (((%r3 | 513) | %r4) >> 2). The setup variant is
// chosen by whether the low two bits of (%r3 | 513) equal 1.
or.b32 %r173, %r3, 513;
or.b32 %r174, %r173, %r4;
and.b32 %r175, %r173, 3;
shr.u32 %r176, %r174, 2;
setp.ne.s32 %p37, %r175, 1;
cvt.u64.u32 %rd1554, %r176;
add.s64 %rd269, %rd12, %rd1554;
@%p37 bra LBB30_31;
// Fall-through setup variant: low counter word * 0xD2511F53 (3528531795),
// carried word * 0xCD9E8D57 (3449720151), first key 0x9E3779B9 (2654435769).
and.b64 %rd1594, %rd269, 4294967295;
mul.lo.s64 %rd2617, %rd1594, 3528531795;
// A sum below %rd12 means the add above wrapped; carry goes into %rd2464.
setp.lt.u64 %p39, %rd269, %rd12;
selp.u64 %rd1595, 1, 0, %p39;
add.s64 %rd1596, %rd2464, %rd1595;
xor.b64 %rd1597, %rd1596, %rd2617;
shr.u64 %rd1598, %rd1597, 32;
mul.lo.s64 %rd2620, %rd1598, 3449720151;
shr.u64 %rd1599, %rd2620, 32;
and.b64 %rd1600, %rd1596, 4294967295;
mul.lo.s64 %rd1601, %rd1600, 3449720151;
and.b64 %rd1602, %rd1601, 4294967295;
xor.b64 %rd1603, %rd1602, %rd1599;
xor.b64 %rd1604, %rd1603, 2654435769;
mul.lo.s64 %rd2623, %rd1604, 3528531795;
xor.b64 %rd2613, %rd1601, %rd269;
// Constants for the rounds at LBB30_32: signed 32-bit forms of 0xCD9E8D57
// and 8*0xBB67AE85 mod 2^32, then k*W mod 2^32 round keys and the two
// multipliers (%rd2614, %rd2616).
mov.u32 %r334, -845247145;
mov.u32 %r333, -616729560;
mov.u64 %rd2630, 3041712726;
mov.u64 %rd2629, 1401181199;
mov.u64 %rd2628, 2835769497;
mov.u64 %rd2627, 1684936478;
mov.u64 %rd2626, 2027808484;
mov.u64 %rd2625, 387276957;
mov.u64 %rd2624, 842468239;
mov.u64 %rd2622, 3986602516;
mov.u64 %rd2621, 1013904242;
mov.u64 %rd2619, 3668340011;
mov.u64 %rd2618, 3144134277;
mov.u64 %rd2616, 3449720151;
mov.u64 %rd2615, 1993301258;
mov.u64 %rd2614, 3528531795;
bra.uni LBB30_32;
// LBB30_31: RNG state setup for counter %rd269, variant taken when the low
// two bits of (%r3 | 513) are not 1 (predicate %p37). Compared with the
// fall-through variant the two multipliers (3528531795 = 0xD2511F53,
// 3449720151 = 0xCD9E8D57) and the two key sequences swap roles.
// NOTE(review): these values match the published Philox4x32 constants --
// confirm against the generator that emitted this kernel.
LBB30_31:
// A sum below %rd12 means the 64-bit add that produced %rd269 wrapped; the
// carry is added to %rd2464.
setp.lt.u64 %p38, %rd269, %rd12;
selp.u64 %rd1569, 1, 0, %p38;
add.s64 %rd1570, %rd2464, %rd1569;
// First products: low 32 bits of the carried word * 0xCD9E8D57, then the
// high half of (product ^ counter) * 0xD2511F53.
and.b64 %rd1571, %rd1570, 4294967295;
mul.lo.s64 %rd2617, %rd1571, 3449720151;
xor.b64 %rd1572, %rd2617, %rd269;
shr.u64 %rd1573, %rd1572, 32;
mul.lo.s64 %rd2620, %rd1573, 3528531795;
shr.u64 %rd1574, %rd2620, 32;
// Low 32 bits of the counter * 0xD2511F53, mixed with the previous high half
// and the key 0xBB67AE85 (3144134277), then multiplied by 0xCD9E8D57.
and.b64 %rd1575, %rd269, 4294967295;
mul.lo.s64 %rd1576, %rd1575, 3528531795;
and.b64 %rd1577, %rd1576, 4294967295;
xor.b64 %rd1578, %rd1577, %rd1574;
xor.b64 %rd1579, %rd1578, 3144134277;
mul.lo.s64 %rd2623, %rd1579, 3449720151;
xor.b64 %rd2613, %rd1570, %rd1576;
// Constants consumed after the join at LBB30_32. %r334 is 0xD2511F53 and
// %r333 is 8*0x9E3779B9 mod 2^32, both written as signed 32-bit immediates.
mov.u32 %r334, -766435501;
mov.u32 %r333, -239350328;
// Round keys: each immediate is k*0x9E3779B9 or k*0xBB67AE85 reduced mod 2^32
// (k = 1..7); %rd2614 and %rd2616 hold the two multipliers.
mov.u64 %rd2630, 1684936478;
mov.u64 %rd2629, 534103459;
mov.u64 %rd2628, 387276957;
mov.u64 %rd2627, 3041712726;
mov.u64 %rd2626, 3986602516;
mov.u64 %rd2625, 2835769497;
mov.u64 %rd2624, 3668340011;
mov.u64 %rd2622, 2027808484;
mov.u64 %rd2621, 1993301258;
mov.u64 %rd2619, 842468239;
mov.u64 %rd2618, 2654435769;
mov.u64 %rd2616, 3528531795;
mov.u64 %rd2615, 1013904242;
mov.u64 %rd2614, 3449720151;
// LBB30_32: join point of the two setup variants for counter %rd269.
// Runs the remaining multiply/xor rounds on the state in %rd2613..%rd2630,
// turns one 32-bit result into a float in [0, 1), applies an f16 mask and
// scale, adds a squared difference to the running sum (%f12 -> %f13), and
// then begins the setup for the next counter (%rd296).
LBB30_32:
// Round pattern: take the high half of one earlier product, xor it with the
// low half of another and with a round key, then multiply by %rd2614 or
// %rd2616.
shr.u64 %rd1605, %rd2623, 32;
shr.u64 %rd1606, %rd2613, 32;
mul.lo.s64 %rd1607, %rd1606, %rd2614;
and.b64 %rd1608, %rd1607, 4294967295;
xor.b64 %rd1609, %rd1608, %rd1605;
xor.b64 %rd1610, %rd1609, %rd2615;
mul.lo.s64 %rd1611, %rd1610, %rd2616;
shr.u64 %rd1612, %rd1611, 32;
shr.u64 %rd1613, %rd1607, 32;
and.b64 %rd1614, %rd2617, 4294967295;
xor.b64 %rd1615, %rd1614, %rd1613;
xor.b64 %rd1616, %rd1615, %rd2618;
mul.lo.s64 %rd1617, %rd1616, %rd2616;
and.b64 %rd1618, %rd1617, 4294967295;
xor.b64 %rd1619, %rd1618, %rd1612;
xor.b64 %rd1620, %rd1619, %rd2619;
mul.lo.s64 %rd1621, %rd1620, %rd2614;
shr.u64 %rd1622, %rd1621, 32;
shr.u64 %rd1623, %rd1617, 32;
and.b64 %rd1624, %rd2620, 4294967295;
xor.b64 %rd1625, %rd1624, %rd1623;
xor.b64 %rd1626, %rd1625, %rd2621;
mul.lo.s64 %rd1627, %rd1626, %rd2614;
and.b64 %rd1628, %rd1627, 4294967295;
xor.b64 %rd1629, %rd1628, %rd1622;
xor.b64 %rd1630, %rd1629, %rd2622;
mul.lo.s64 %rd1631, %rd1630, %rd2616;
shr.u64 %rd1632, %rd1631, 32;
shr.u64 %rd1633, %rd1627, 32;
and.b64 %rd1634, %rd2623, 4294967295;
xor.b64 %rd1635, %rd1634, %rd1633;
xor.b64 %rd1636, %rd1635, %rd2624;
mul.lo.s64 %rd1637, %rd1636, %rd2616;
and.b64 %rd1638, %rd1637, 4294967295;
xor.b64 %rd1639, %rd1638, %rd1632;
xor.b64 %rd1640, %rd1639, %rd2625;
mul.lo.s64 %rd1641, %rd1640, %rd2614;
shr.u64 %rd1642, %rd1641, 32;
shr.u64 %rd1643, %rd1637, 32;
and.b64 %rd1644, %rd1611, 4294967295;
xor.b64 %rd1645, %rd1644, %rd1643;
xor.b64 %rd1646, %rd1645, %rd2626;
mul.lo.s64 %rd1647, %rd1646, %rd2614;
and.b64 %rd1648, %rd1647, 4294967295;
xor.b64 %rd1649, %rd1648, %rd1642;
xor.b64 %rd1650, %rd1649, %rd2627;
mul.lo.s64 %rd1651, %rd1650, %rd2616;
shr.u64 %rd1652, %rd1651, 32;
shr.u64 %rd1653, %rd1647, 32;
and.b64 %rd1654, %rd1621, 4294967295;
xor.b64 %rd1655, %rd1654, %rd1653;
xor.b64 %rd1656, %rd1655, %rd2628;
mul.lo.s64 %rd1657, %rd1656, %rd2616;
and.b64 %rd1658, %rd1657, 4294967295;
xor.b64 %rd1659, %rd1658, %rd1652;
xor.b64 %rd1660, %rd1659, %rd2629;
mul.lo.s64 %rd1661, %rd1660, %rd2614;
shr.u64 %rd1662, %rd1661, 32;
shr.u64 %rd1663, %rd1657, 32;
xor.b64 %rd1664, %rd1631, %rd1663;
xor.b64 %rd1665, %rd1664, %rd2630;
mul.lo.s64 %rd1666, %rd1665, %rd2614;
xor.b64 %rd1667, %rd1662, %rd1666;
// The last step is finished in 32 bits: only the truncated low word is kept.
cvt.u32.u64 %r181, %rd1667;
xor.b32 %r182, %r333, %r181;
mul.lo.s32 %r183, %r182, %r334;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): result is in [0, 1).
shr.u32 %r184, %r183, 9;
cvt.rn.f32.u32 %f160, %r184;
mul.rn.f32 %f161, %f160, 0f34000000;
cvt.rn.f16.f32 %h82, %f161;
// 0x2E66 is about 0.09998 in f16; %p41 is true when the random value is >= it.
mov.b16 %h83, 0x2E66;
setp.ge.f16 %p41, %h82, %h83;
// f16 sum of the two inputs, scaled by 0x3C72 (about 1.1113), or zero when
// %p41 is false.
// NOTE(review): threshold ~0.1 and scale ~1/0.9 look like dropout with
// rate 0.1 -- confirm against the HLO this kernel was generated from.
ld.global.nc.b16 %h84, [%rd45+1026];
ld.global.nc.f32 %f162, [%rd46+2052];
cvt.rn.f16.f32 %h85, %f162;
add.rn.f16 %h86, %h84, %h85;
mov.b16 %h87, 0x3C72;
mul.rn.f16 %h88, %h86, %h87;
selp.b16 %h89, %h88, 0x0000, %p41;
cvt.f32.f16 %f163, %h89;
// %f166 = %f1 * [%rd48]; value = %f166 * [%rd47] + ([%rd49] - %f2 * %f166)
// + masked term; (value - %f3)^2 is added to %f12 to give %f13.
ld.global.nc.b16 %h90, [%rd47+1026];
cvt.f32.f16 %f164, %h90;
ld.global.nc.f32 %f165, [%rd48+2052];
mul.rn.f32 %f166, %f1, %f165;
mul.rn.f32 %f167, %f166, %f164;
ld.global.nc.f32 %f168, [%rd49+2052];
mul.rn.f32 %f169, %f2, %f166;
sub.rn.f32 %f170, %f168, %f169;
add.rn.f32 %f171, %f167, %f170;
add.rn.f32 %f172, %f171, %f163;
sub.rn.f32 %f173, %f172, %f3;
mul.rn.f32 %f174, %f173, %f173;
add.rn.f32 %f13, %f12, %f174;
// Next counter: %rd12 + ((%r73 | 640) >> 2). %p8 is set outside this block
// and selects which setup variant runs.
or.b32 %r186, %r73, 640;
shr.u32 %r187, %r186, 2;
cvt.u64.u32 %rd1668, %r187;
add.s64 %rd296, %rd12, %rd1668;
@%p8 bra LBB30_34;
// Fall-through setup variant: low counter word * 0xD2511F53 (3528531795),
// carried word * 0xCD9E8D57 (3449720151), first key 0x9E3779B9 (2654435769).
and.b64 %rd1710, %rd296, 4294967295;
mul.lo.s64 %rd2635, %rd1710, 3528531795;
// A sum below %rd12 means the add above wrapped; carry goes into %rd2464.
setp.lt.u64 %p43, %rd296, %rd12;
selp.u64 %rd1711, 1, 0, %p43;
add.s64 %rd1712, %rd2464, %rd1711;
xor.b64 %rd1713, %rd1712, %rd2635;
shr.u64 %rd1714, %rd1713, 32;
mul.lo.s64 %rd2638, %rd1714, 3449720151;
shr.u64 %rd1715, %rd2638, 32;
and.b64 %rd1716, %rd1712, 4294967295;
mul.lo.s64 %rd1717, %rd1716, 3449720151;
and.b64 %rd1718, %rd1717, 4294967295;
xor.b64 %rd1719, %rd1718, %rd1715;
xor.b64 %rd1720, %rd1719, 2654435769;
mul.lo.s64 %rd2641, %rd1720, 3528531795;
xor.b64 %rd2631, %rd1717, %rd296;
// Constants for the rounds at LBB30_35: signed 32-bit forms of
// 9*0x9E3779B9 mod 2^32 and 0xCD9E8D57, then k*W mod 2^32 round keys and
// the two multipliers (%rd2632, %rd2634).
mov.u32 %r337, -1879881855;
mov.u32 %r336, -845247145;
mov.u32 %r335, 534103459;
mov.u64 %rd2649, 3678237736;
mov.u64 %rd2648, 3041712726;
mov.u64 %rd2647, 1401181199;
mov.u64 %rd2646, 2835769497;
mov.u64 %rd2645, 1684936478;
mov.u64 %rd2644, 2027808484;
mov.u64 %rd2643, 387276957;
mov.u64 %rd2642, 842468239;
mov.u64 %rd2640, 3986602516;
mov.u64 %rd2639, 1013904242;
mov.u64 %rd2637, 3668340011;
mov.u64 %rd2636, 3144134277;
mov.u64 %rd2634, 3449720151;
mov.u64 %rd2633, 1993301258;
mov.u64 %rd2632, 3528531795;
bra.uni LBB30_35;
// LBB30_34: RNG state setup for counter %rd296, variant taken when %p8 is
// true. Compared with the fall-through variant the two multipliers
// (3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57) and the two key
// sequences swap roles.
// NOTE(review): these values match the published Philox4x32 constants --
// confirm against the generator that emitted this kernel.
LBB30_34:
// A sum below %rd12 means the 64-bit add that produced %rd296 wrapped; the
// carry is added to %rd2464.
setp.lt.u64 %p42, %rd296, %rd12;
selp.u64 %rd1684, 1, 0, %p42;
add.s64 %rd1685, %rd2464, %rd1684;
// First products: low 32 bits of the carried word * 0xCD9E8D57, then the
// high half of (product ^ counter) * 0xD2511F53.
and.b64 %rd1686, %rd1685, 4294967295;
mul.lo.s64 %rd2635, %rd1686, 3449720151;
xor.b64 %rd1687, %rd2635, %rd296;
shr.u64 %rd1688, %rd1687, 32;
mul.lo.s64 %rd2638, %rd1688, 3528531795;
shr.u64 %rd1689, %rd2638, 32;
// Low 32 bits of the counter * 0xD2511F53, mixed with the previous high half
// and the key 0xBB67AE85 (3144134277), then multiplied by 0xCD9E8D57.
and.b64 %rd1690, %rd296, 4294967295;
mul.lo.s64 %rd1691, %rd1690, 3528531795;
and.b64 %rd1692, %rd1691, 4294967295;
xor.b64 %rd1693, %rd1692, %rd1689;
xor.b64 %rd1694, %rd1693, 3144134277;
mul.lo.s64 %rd2641, %rd1694, 3449720151;
xor.b64 %rd2631, %rd1685, %rd1691;
// Constants consumed after the join at LBB30_35. %r337 is 9*0xBB67AE85
// mod 2^32 and %r336 is 0xD2511F53, both as signed 32-bit immediates.
mov.u32 %r337, -1767562579;
mov.u32 %r336, -766435501;
mov.u32 %r335, 1401181199;
// Round keys: each immediate is k*0x9E3779B9 or k*0xBB67AE85 reduced mod 2^32;
// %rd2632 and %rd2634 hold the two multipliers.
mov.u64 %rd2649, 4055616968;
mov.u64 %rd2648, 1684936478;
mov.u64 %rd2647, 534103459;
mov.u64 %rd2646, 387276957;
mov.u64 %rd2645, 3041712726;
mov.u64 %rd2644, 3986602516;
mov.u64 %rd2643, 2835769497;
mov.u64 %rd2642, 3668340011;
mov.u64 %rd2640, 2027808484;
mov.u64 %rd2639, 1993301258;
mov.u64 %rd2637, 842468239;
mov.u64 %rd2636, 2654435769;
mov.u64 %rd2634, 3528531795;
mov.u64 %rd2633, 1013904242;
mov.u64 %rd2632, 3449720151;
// LBB30_35: join point of the two setup variants for counter %rd296.
// Runs the remaining multiply/xor rounds on the state in %rd2631..%rd2649,
// turns one 32-bit result into a float in [0, 1), applies an f16 mask and
// scale, adds a squared difference to the running sum (%f13 -> %f14), and
// then begins the setup for the next counter (%rd324).
LBB30_35:
// Round pattern: take the high half of one earlier product, xor it with the
// low half of another and with a round key, then multiply by %rd2632 or
// %rd2634.
shr.u64 %rd1721, %rd2641, 32;
shr.u64 %rd1722, %rd2631, 32;
mul.lo.s64 %rd1723, %rd1722, %rd2632;
and.b64 %rd1724, %rd1723, 4294967295;
xor.b64 %rd1725, %rd1724, %rd1721;
xor.b64 %rd1726, %rd1725, %rd2633;
mul.lo.s64 %rd1727, %rd1726, %rd2634;
shr.u64 %rd1728, %rd1727, 32;
shr.u64 %rd1729, %rd1723, 32;
and.b64 %rd1730, %rd2635, 4294967295;
xor.b64 %rd1731, %rd1730, %rd1729;
xor.b64 %rd1732, %rd1731, %rd2636;
mul.lo.s64 %rd1733, %rd1732, %rd2634;
and.b64 %rd1734, %rd1733, 4294967295;
xor.b64 %rd1735, %rd1734, %rd1728;
xor.b64 %rd1736, %rd1735, %rd2637;
mul.lo.s64 %rd1737, %rd1736, %rd2632;
shr.u64 %rd1738, %rd1737, 32;
shr.u64 %rd1739, %rd1733, 32;
and.b64 %rd1740, %rd2638, 4294967295;
xor.b64 %rd1741, %rd1740, %rd1739;
xor.b64 %rd1742, %rd1741, %rd2639;
mul.lo.s64 %rd1743, %rd1742, %rd2632;
and.b64 %rd1744, %rd1743, 4294967295;
xor.b64 %rd1745, %rd1744, %rd1738;
xor.b64 %rd1746, %rd1745, %rd2640;
mul.lo.s64 %rd1747, %rd1746, %rd2634;
shr.u64 %rd1748, %rd1747, 32;
shr.u64 %rd1749, %rd1743, 32;
and.b64 %rd1750, %rd2641, 4294967295;
xor.b64 %rd1751, %rd1750, %rd1749;
xor.b64 %rd1752, %rd1751, %rd2642;
mul.lo.s64 %rd1753, %rd1752, %rd2634;
and.b64 %rd1754, %rd1753, 4294967295;
xor.b64 %rd1755, %rd1754, %rd1748;
xor.b64 %rd1756, %rd1755, %rd2643;
mul.lo.s64 %rd1757, %rd1756, %rd2632;
shr.u64 %rd1758, %rd1757, 32;
shr.u64 %rd1759, %rd1753, 32;
and.b64 %rd1760, %rd1727, 4294967295;
xor.b64 %rd1761, %rd1760, %rd1759;
xor.b64 %rd1762, %rd1761, %rd2644;
mul.lo.s64 %rd1763, %rd1762, %rd2632;
and.b64 %rd1764, %rd1763, 4294967295;
xor.b64 %rd1765, %rd1764, %rd1758;
xor.b64 %rd1766, %rd1765, %rd2645;
mul.lo.s64 %rd1767, %rd1766, %rd2634;
shr.u64 %rd1768, %rd1767, 32;
shr.u64 %rd1769, %rd1763, 32;
and.b64 %rd1770, %rd1737, 4294967295;
xor.b64 %rd1771, %rd1770, %rd1769;
xor.b64 %rd1772, %rd1771, %rd2646;
mul.lo.s64 %rd1773, %rd1772, %rd2634;
and.b64 %rd1774, %rd1773, 4294967295;
xor.b64 %rd1775, %rd1774, %rd1768;
xor.b64 %rd1776, %rd1775, %rd2647;
mul.lo.s64 %rd1777, %rd1776, %rd2632;
shr.u64 %rd1778, %rd1777, 32;
shr.u64 %rd1779, %rd1773, 32;
and.b64 %rd1780, %rd1747, 4294967295;
xor.b64 %rd1781, %rd1780, %rd1779;
xor.b64 %rd1782, %rd1781, %rd2648;
mul.lo.s64 %rd1783, %rd1782, %rd2632;
and.b64 %rd1784, %rd1783, 4294967295;
xor.b64 %rd1785, %rd1784, %rd1778;
xor.b64 %rd1786, %rd1785, %rd2649;
mul.lo.s64 %rd1787, %rd1786, %rd2634;
// Final mixing is done in 32 bits: the high word of the last product is
// combined with a truncated 32-bit multiply and the constant in %r337.
shr.u64 %rd1788, %rd1787, 32;
cvt.u32.u64 %r194, %rd1788;
shr.u64 %rd1789, %rd1783, 32;
xor.b64 %rd1790, %rd1789, %rd1757;
cvt.u32.u64 %r195, %rd1790;
xor.b32 %r196, %r335, %r195;
mul.lo.s32 %r197, %r196, %r336;
xor.b32 %r198, %r197, %r194;
xor.b32 %r199, %r198, %r337;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): result is in [0, 1).
shr.u32 %r200, %r199, 9;
cvt.rn.f32.u32 %f175, %r200;
mul.rn.f32 %f176, %f175, 0f34000000;
cvt.rn.f16.f32 %h91, %f176;
// 0x2E66 is about 0.09998 in f16; %p44 is true when the random value is >= it.
mov.b16 %h92, 0x2E66;
setp.ge.f16 %p44, %h91, %h92;
// f16 sum of the two inputs, scaled by 0x3C72 (about 1.1113), or zero when
// %p44 is false.
// NOTE(review): threshold ~0.1 and scale ~1/0.9 look like dropout with
// rate 0.1 -- confirm against the HLO this kernel was generated from.
ld.global.nc.b16 %h93, [%rd45+1280];
ld.global.nc.f32 %f177, [%rd46+2560];
cvt.rn.f16.f32 %h94, %f177;
add.rn.f16 %h95, %h93, %h94;
mov.b16 %h96, 0x3C72;
mul.rn.f16 %h97, %h95, %h96;
selp.b16 %h98, %h97, 0x0000, %p44;
cvt.f32.f16 %f178, %h98;
// %f181 = %f1 * [%rd48]; value = %f181 * [%rd47] + ([%rd49] - %f2 * %f181)
// + masked term; (value - %f3)^2 is added to %f13 to give %f14.
ld.global.nc.b16 %h99, [%rd47+1280];
cvt.f32.f16 %f179, %h99;
ld.global.nc.f32 %f180, [%rd48+2560];
mul.rn.f32 %f181, %f1, %f180;
mul.rn.f32 %f182, %f181, %f179;
ld.global.nc.f32 %f183, [%rd49+2560];
mul.rn.f32 %f184, %f2, %f181;
sub.rn.f32 %f185, %f183, %f184;
add.rn.f32 %f186, %f182, %f185;
add.rn.f32 %f187, %f186, %f178;
sub.rn.f32 %f188, %f187, %f3;
mul.rn.f32 %f189, %f188, %f188;
add.rn.f32 %f14, %f13, %f189;
// Next counter: %rd12 + (((%r3 | 641) | %r4) >> 2). The setup variant is
// chosen by whether the low two bits of (%r3 | 641) equal 1.
or.b32 %r201, %r3, 641;
or.b32 %r202, %r201, %r4;
and.b32 %r203, %r201, 3;
shr.u32 %r204, %r202, 2;
setp.ne.s32 %p45, %r203, 1;
cvt.u64.u32 %rd1791, %r204;
add.s64 %rd324, %rd12, %rd1791;
@%p45 bra LBB30_37;
// Fall-through setup variant: low counter word * 0xD2511F53 (3528531795),
// carried word * 0xCD9E8D57 (3449720151), first key 0x9E3779B9 (2654435769).
and.b64 %rd1831, %rd324, 4294967295;
mul.lo.s64 %rd2654, %rd1831, 3528531795;
// A sum below %rd12 means the add above wrapped; carry goes into %rd2464.
setp.lt.u64 %p47, %rd324, %rd12;
selp.u64 %rd1832, 1, 0, %p47;
add.s64 %rd1833, %rd2464, %rd1832;
xor.b64 %rd1834, %rd1833, %rd2654;
shr.u64 %rd1835, %rd1834, 32;
mul.lo.s64 %rd2657, %rd1835, 3449720151;
shr.u64 %rd1836, %rd2657, 32;
and.b64 %rd1837, %rd1833, 4294967295;
mul.lo.s64 %rd1838, %rd1837, 3449720151;
and.b64 %rd1839, %rd1838, 4294967295;
xor.b64 %rd1840, %rd1839, %rd1836;
xor.b64 %rd1841, %rd1840, 2654435769;
mul.lo.s64 %rd2660, %rd1841, 3528531795;
xor.b64 %rd2650, %rd1838, %rd324;
// Constants for the rounds at LBB30_38: signed 32-bit forms of 0xCD9E8D57
// and 8*0xBB67AE85 mod 2^32, then k*W mod 2^32 round keys and the two
// multipliers (%rd2651, %rd2653).
mov.u32 %r339, -845247145;
mov.u32 %r338, -616729560;
mov.u64 %rd2667, 3041712726;
mov.u64 %rd2666, 1401181199;
mov.u64 %rd2665, 2835769497;
mov.u64 %rd2664, 1684936478;
mov.u64 %rd2663, 2027808484;
mov.u64 %rd2662, 387276957;
mov.u64 %rd2661, 842468239;
mov.u64 %rd2659, 3986602516;
mov.u64 %rd2658, 1013904242;
mov.u64 %rd2656, 3668340011;
mov.u64 %rd2655, 3144134277;
mov.u64 %rd2653, 3449720151;
mov.u64 %rd2652, 1993301258;
mov.u64 %rd2651, 3528531795;
bra.uni LBB30_38;
// LBB30_37: RNG state setup for counter %rd324, variant taken when the low
// two bits of (%r3 | 641) are not 1 (predicate %p45). Compared with the
// fall-through variant the two multipliers (3528531795 = 0xD2511F53,
// 3449720151 = 0xCD9E8D57) and the two key sequences swap roles.
// NOTE(review): these values match the published Philox4x32 constants --
// confirm against the generator that emitted this kernel.
LBB30_37:
// A sum below %rd12 means the 64-bit add that produced %rd324 wrapped; the
// carry is added to %rd2464.
setp.lt.u64 %p46, %rd324, %rd12;
selp.u64 %rd1806, 1, 0, %p46;
add.s64 %rd1807, %rd2464, %rd1806;
// First products: low 32 bits of the carried word * 0xCD9E8D57, then the
// high half of (product ^ counter) * 0xD2511F53.
and.b64 %rd1808, %rd1807, 4294967295;
mul.lo.s64 %rd2654, %rd1808, 3449720151;
xor.b64 %rd1809, %rd2654, %rd324;
shr.u64 %rd1810, %rd1809, 32;
mul.lo.s64 %rd2657, %rd1810, 3528531795;
shr.u64 %rd1811, %rd2657, 32;
// Low 32 bits of the counter * 0xD2511F53, mixed with the previous high half
// and the key 0xBB67AE85 (3144134277), then multiplied by 0xCD9E8D57.
and.b64 %rd1812, %rd324, 4294967295;
mul.lo.s64 %rd1813, %rd1812, 3528531795;
and.b64 %rd1814, %rd1813, 4294967295;
xor.b64 %rd1815, %rd1814, %rd1811;
xor.b64 %rd1816, %rd1815, 3144134277;
mul.lo.s64 %rd2660, %rd1816, 3449720151;
xor.b64 %rd2650, %rd1807, %rd1813;
// Constants consumed after the join at LBB30_38. %r339 is 0xD2511F53 and
// %r338 is 8*0x9E3779B9 mod 2^32, both written as signed 32-bit immediates.
mov.u32 %r339, -766435501;
mov.u32 %r338, -239350328;
// Round keys: each immediate is k*0x9E3779B9 or k*0xBB67AE85 reduced mod 2^32
// (k = 1..7); %rd2651 and %rd2653 hold the two multipliers.
mov.u64 %rd2667, 1684936478;
mov.u64 %rd2666, 534103459;
mov.u64 %rd2665, 387276957;
mov.u64 %rd2664, 3041712726;
mov.u64 %rd2663, 3986602516;
mov.u64 %rd2662, 2835769497;
mov.u64 %rd2661, 3668340011;
mov.u64 %rd2659, 2027808484;
mov.u64 %rd2658, 1993301258;
mov.u64 %rd2656, 842468239;
mov.u64 %rd2655, 2654435769;
mov.u64 %rd2653, 3528531795;
mov.u64 %rd2652, 1013904242;
mov.u64 %rd2651, 3449720151;
// LBB30_38: join point of the two setup variants for counter %rd324.
// Runs the remaining multiply/xor rounds on the state in %rd2650..%rd2667,
// turns one 32-bit result into a float in [0, 1), applies an f16 mask and
// scale, adds a squared difference to the running sum (%f14 -> %f15), and
// then begins the setup for the next counter (%rd351).
LBB30_38:
// Round pattern: take the high half of one earlier product, xor it with the
// low half of another and with a round key, then multiply by %rd2651 or
// %rd2653.
shr.u64 %rd1842, %rd2660, 32;
shr.u64 %rd1843, %rd2650, 32;
mul.lo.s64 %rd1844, %rd1843, %rd2651;
and.b64 %rd1845, %rd1844, 4294967295;
xor.b64 %rd1846, %rd1845, %rd1842;
xor.b64 %rd1847, %rd1846, %rd2652;
mul.lo.s64 %rd1848, %rd1847, %rd2653;
shr.u64 %rd1849, %rd1848, 32;
shr.u64 %rd1850, %rd1844, 32;
and.b64 %rd1851, %rd2654, 4294967295;
xor.b64 %rd1852, %rd1851, %rd1850;
xor.b64 %rd1853, %rd1852, %rd2655;
mul.lo.s64 %rd1854, %rd1853, %rd2653;
and.b64 %rd1855, %rd1854, 4294967295;
xor.b64 %rd1856, %rd1855, %rd1849;
xor.b64 %rd1857, %rd1856, %rd2656;
mul.lo.s64 %rd1858, %rd1857, %rd2651;
shr.u64 %rd1859, %rd1858, 32;
shr.u64 %rd1860, %rd1854, 32;
and.b64 %rd1861, %rd2657, 4294967295;
xor.b64 %rd1862, %rd1861, %rd1860;
xor.b64 %rd1863, %rd1862, %rd2658;
mul.lo.s64 %rd1864, %rd1863, %rd2651;
and.b64 %rd1865, %rd1864, 4294967295;
xor.b64 %rd1866, %rd1865, %rd1859;
xor.b64 %rd1867, %rd1866, %rd2659;
mul.lo.s64 %rd1868, %rd1867, %rd2653;
shr.u64 %rd1869, %rd1868, 32;
shr.u64 %rd1870, %rd1864, 32;
and.b64 %rd1871, %rd2660, 4294967295;
xor.b64 %rd1872, %rd1871, %rd1870;
xor.b64 %rd1873, %rd1872, %rd2661;
mul.lo.s64 %rd1874, %rd1873, %rd2653;
and.b64 %rd1875, %rd1874, 4294967295;
xor.b64 %rd1876, %rd1875, %rd1869;
xor.b64 %rd1877, %rd1876, %rd2662;
mul.lo.s64 %rd1878, %rd1877, %rd2651;
shr.u64 %rd1879, %rd1878, 32;
shr.u64 %rd1880, %rd1874, 32;
and.b64 %rd1881, %rd1848, 4294967295;
xor.b64 %rd1882, %rd1881, %rd1880;
xor.b64 %rd1883, %rd1882, %rd2663;
mul.lo.s64 %rd1884, %rd1883, %rd2651;
and.b64 %rd1885, %rd1884, 4294967295;
xor.b64 %rd1886, %rd1885, %rd1879;
xor.b64 %rd1887, %rd1886, %rd2664;
mul.lo.s64 %rd1888, %rd1887, %rd2653;
shr.u64 %rd1889, %rd1888, 32;
shr.u64 %rd1890, %rd1884, 32;
and.b64 %rd1891, %rd1858, 4294967295;
xor.b64 %rd1892, %rd1891, %rd1890;
xor.b64 %rd1893, %rd1892, %rd2665;
mul.lo.s64 %rd1894, %rd1893, %rd2653;
and.b64 %rd1895, %rd1894, 4294967295;
xor.b64 %rd1896, %rd1895, %rd1889;
xor.b64 %rd1897, %rd1896, %rd2666;
mul.lo.s64 %rd1898, %rd1897, %rd2651;
shr.u64 %rd1899, %rd1898, 32;
shr.u64 %rd1900, %rd1894, 32;
xor.b64 %rd1901, %rd1868, %rd1900;
xor.b64 %rd1902, %rd1901, %rd2667;
mul.lo.s64 %rd1903, %rd1902, %rd2651;
xor.b64 %rd1904, %rd1899, %rd1903;
// The last step is finished in 32 bits: only the truncated low word is kept.
cvt.u32.u64 %r209, %rd1904;
xor.b32 %r210, %r338, %r209;
mul.lo.s32 %r211, %r210, %r339;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): result is in [0, 1).
shr.u32 %r212, %r211, 9;
cvt.rn.f32.u32 %f190, %r212;
mul.rn.f32 %f191, %f190, 0f34000000;
cvt.rn.f16.f32 %h100, %f191;
// 0x2E66 is about 0.09998 in f16; %p49 is true when the random value is >= it.
mov.b16 %h101, 0x2E66;
setp.ge.f16 %p49, %h100, %h101;
// f16 sum of the two inputs, scaled by 0x3C72 (about 1.1113), or zero when
// %p49 is false.
// NOTE(review): threshold ~0.1 and scale ~1/0.9 look like dropout with
// rate 0.1 -- confirm against the HLO this kernel was generated from.
ld.global.nc.b16 %h102, [%rd45+1282];
ld.global.nc.f32 %f192, [%rd46+2564];
cvt.rn.f16.f32 %h103, %f192;
add.rn.f16 %h104, %h102, %h103;
mov.b16 %h105, 0x3C72;
mul.rn.f16 %h106, %h104, %h105;
selp.b16 %h107, %h106, 0x0000, %p49;
cvt.f32.f16 %f193, %h107;
// %f196 = %f1 * [%rd48]; value = %f196 * [%rd47] + ([%rd49] - %f2 * %f196)
// + masked term; (value - %f3)^2 is added to %f14 to give %f15.
ld.global.nc.b16 %h108, [%rd47+1282];
cvt.f32.f16 %f194, %h108;
ld.global.nc.f32 %f195, [%rd48+2564];
mul.rn.f32 %f196, %f1, %f195;
mul.rn.f32 %f197, %f196, %f194;
ld.global.nc.f32 %f198, [%rd49+2564];
mul.rn.f32 %f199, %f2, %f196;
sub.rn.f32 %f200, %f198, %f199;
add.rn.f32 %f201, %f197, %f200;
add.rn.f32 %f202, %f201, %f193;
sub.rn.f32 %f203, %f202, %f3;
mul.rn.f32 %f204, %f203, %f203;
add.rn.f32 %f15, %f14, %f204;
// Next counter: %rd12 + ((%r73 | 768) >> 2). %p8 is set outside this block
// and selects which setup variant runs.
or.b32 %r214, %r73, 768;
shr.u32 %r215, %r214, 2;
cvt.u64.u32 %rd1905, %r215;
add.s64 %rd351, %rd12, %rd1905;
@%p8 bra LBB30_40;
// Fall-through setup variant: low counter word * 0xD2511F53 (3528531795),
// carried word * 0xCD9E8D57 (3449720151), first key 0x9E3779B9 (2654435769).
and.b64 %rd1947, %rd351, 4294967295;
mul.lo.s64 %rd2672, %rd1947, 3528531795;
// A sum below %rd12 means the add above wrapped; carry goes into %rd2464.
setp.lt.u64 %p51, %rd351, %rd12;
selp.u64 %rd1948, 1, 0, %p51;
add.s64 %rd1949, %rd2464, %rd1948;
xor.b64 %rd1950, %rd1949, %rd2672;
shr.u64 %rd1951, %rd1950, 32;
mul.lo.s64 %rd2675, %rd1951, 3449720151;
shr.u64 %rd1952, %rd2675, 32;
and.b64 %rd1953, %rd1949, 4294967295;
mul.lo.s64 %rd1954, %rd1953, 3449720151;
and.b64 %rd1955, %rd1954, 4294967295;
xor.b64 %rd1956, %rd1955, %rd1952;
xor.b64 %rd1957, %rd1956, 2654435769;
mul.lo.s64 %rd2678, %rd1957, 3528531795;
xor.b64 %rd2668, %rd1954, %rd351;
// Constants for the rounds at LBB30_41: signed 32-bit forms of
// 9*0x9E3779B9 mod 2^32 and 0xCD9E8D57, then k*W mod 2^32 round keys and
// the two multipliers (%rd2669, %rd2671).
mov.u32 %r342, -1879881855;
mov.u32 %r341, -845247145;
mov.u32 %r340, 534103459;
mov.u64 %rd2686, 3678237736;
mov.u64 %rd2685, 3041712726;
mov.u64 %rd2684, 1401181199;
mov.u64 %rd2683, 2835769497;
mov.u64 %rd2682, 1684936478;
mov.u64 %rd2681, 2027808484;
mov.u64 %rd2680, 387276957;
mov.u64 %rd2679, 842468239;
mov.u64 %rd2677, 3986602516;
mov.u64 %rd2676, 1013904242;
mov.u64 %rd2674, 3668340011;
mov.u64 %rd2673, 3144134277;
mov.u64 %rd2671, 3449720151;
mov.u64 %rd2670, 1993301258;
mov.u64 %rd2669, 3528531795;
bra.uni LBB30_41;
// LBB30_40: RNG state setup for counter %rd351, variant taken when %p8 is
// true. Compared with the fall-through variant the two multipliers
// (3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57) and the two key
// sequences swap roles.
// NOTE(review): these values match the published Philox4x32 constants --
// confirm against the generator that emitted this kernel.
LBB30_40:
// A sum below %rd12 means the 64-bit add that produced %rd351 wrapped; the
// carry is added to %rd2464.
setp.lt.u64 %p50, %rd351, %rd12;
selp.u64 %rd1921, 1, 0, %p50;
add.s64 %rd1922, %rd2464, %rd1921;
// First products: low 32 bits of the carried word * 0xCD9E8D57, then the
// high half of (product ^ counter) * 0xD2511F53.
and.b64 %rd1923, %rd1922, 4294967295;
mul.lo.s64 %rd2672, %rd1923, 3449720151;
xor.b64 %rd1924, %rd2672, %rd351;
shr.u64 %rd1925, %rd1924, 32;
mul.lo.s64 %rd2675, %rd1925, 3528531795;
shr.u64 %rd1926, %rd2675, 32;
// Low 32 bits of the counter * 0xD2511F53, mixed with the previous high half
// and the key 0xBB67AE85 (3144134277), then multiplied by 0xCD9E8D57.
and.b64 %rd1927, %rd351, 4294967295;
mul.lo.s64 %rd1928, %rd1927, 3528531795;
and.b64 %rd1929, %rd1928, 4294967295;
xor.b64 %rd1930, %rd1929, %rd1926;
xor.b64 %rd1931, %rd1930, 3144134277;
mul.lo.s64 %rd2678, %rd1931, 3449720151;
xor.b64 %rd2668, %rd1922, %rd1928;
// Constants consumed after the join at LBB30_41. %r342 is 9*0xBB67AE85
// mod 2^32 and %r341 is 0xD2511F53, both as signed 32-bit immediates.
mov.u32 %r342, -1767562579;
mov.u32 %r341, -766435501;
mov.u32 %r340, 1401181199;
// Round keys: each immediate is k*0x9E3779B9 or k*0xBB67AE85 reduced mod 2^32;
// %rd2669 and %rd2671 hold the two multipliers.
mov.u64 %rd2686, 4055616968;
mov.u64 %rd2685, 1684936478;
mov.u64 %rd2684, 534103459;
mov.u64 %rd2683, 387276957;
mov.u64 %rd2682, 3041712726;
mov.u64 %rd2681, 3986602516;
mov.u64 %rd2680, 2835769497;
mov.u64 %rd2679, 3668340011;
mov.u64 %rd2677, 2027808484;
mov.u64 %rd2676, 1993301258;
mov.u64 %rd2674, 842468239;
mov.u64 %rd2673, 2654435769;
mov.u64 %rd2671, 3528531795;
mov.u64 %rd2670, 1013904242;
mov.u64 %rd2669, 3449720151;
// LBB30_41: join point of the two setup variants for counter %rd351.
// Runs the remaining multiply/xor rounds on the state in %rd2668..%rd2686,
// turns one 32-bit result into a float in [0, 1), applies an f16 mask and
// scale, adds a squared difference to the running sum (%f15 -> %f16), and
// then begins the setup for the next counter (%rd379).
LBB30_41:
// Round pattern: take the high half of one earlier product, xor it with the
// low half of another and with a round key, then multiply by %rd2669 or
// %rd2671.
shr.u64 %rd1958, %rd2678, 32;
shr.u64 %rd1959, %rd2668, 32;
mul.lo.s64 %rd1960, %rd1959, %rd2669;
and.b64 %rd1961, %rd1960, 4294967295;
xor.b64 %rd1962, %rd1961, %rd1958;
xor.b64 %rd1963, %rd1962, %rd2670;
mul.lo.s64 %rd1964, %rd1963, %rd2671;
shr.u64 %rd1965, %rd1964, 32;
shr.u64 %rd1966, %rd1960, 32;
and.b64 %rd1967, %rd2672, 4294967295;
xor.b64 %rd1968, %rd1967, %rd1966;
xor.b64 %rd1969, %rd1968, %rd2673;
mul.lo.s64 %rd1970, %rd1969, %rd2671;
and.b64 %rd1971, %rd1970, 4294967295;
xor.b64 %rd1972, %rd1971, %rd1965;
xor.b64 %rd1973, %rd1972, %rd2674;
mul.lo.s64 %rd1974, %rd1973, %rd2669;
shr.u64 %rd1975, %rd1974, 32;
shr.u64 %rd1976, %rd1970, 32;
and.b64 %rd1977, %rd2675, 4294967295;
xor.b64 %rd1978, %rd1977, %rd1976;
xor.b64 %rd1979, %rd1978, %rd2676;
mul.lo.s64 %rd1980, %rd1979, %rd2669;
and.b64 %rd1981, %rd1980, 4294967295;
xor.b64 %rd1982, %rd1981, %rd1975;
xor.b64 %rd1983, %rd1982, %rd2677;
mul.lo.s64 %rd1984, %rd1983, %rd2671;
shr.u64 %rd1985, %rd1984, 32;
shr.u64 %rd1986, %rd1980, 32;
and.b64 %rd1987, %rd2678, 4294967295;
xor.b64 %rd1988, %rd1987, %rd1986;
xor.b64 %rd1989, %rd1988, %rd2679;
mul.lo.s64 %rd1990, %rd1989, %rd2671;
and.b64 %rd1991, %rd1990, 4294967295;
xor.b64 %rd1992, %rd1991, %rd1985;
xor.b64 %rd1993, %rd1992, %rd2680;
mul.lo.s64 %rd1994, %rd1993, %rd2669;
shr.u64 %rd1995, %rd1994, 32;
shr.u64 %rd1996, %rd1990, 32;
and.b64 %rd1997, %rd1964, 4294967295;
xor.b64 %rd1998, %rd1997, %rd1996;
xor.b64 %rd1999, %rd1998, %rd2681;
mul.lo.s64 %rd2000, %rd1999, %rd2669;
and.b64 %rd2001, %rd2000, 4294967295;
xor.b64 %rd2002, %rd2001, %rd1995;
xor.b64 %rd2003, %rd2002, %rd2682;
mul.lo.s64 %rd2004, %rd2003, %rd2671;
shr.u64 %rd2005, %rd2004, 32;
shr.u64 %rd2006, %rd2000, 32;
and.b64 %rd2007, %rd1974, 4294967295;
xor.b64 %rd2008, %rd2007, %rd2006;
xor.b64 %rd2009, %rd2008, %rd2683;
mul.lo.s64 %rd2010, %rd2009, %rd2671;
and.b64 %rd2011, %rd2010, 4294967295;
xor.b64 %rd2012, %rd2011, %rd2005;
xor.b64 %rd2013, %rd2012, %rd2684;
mul.lo.s64 %rd2014, %rd2013, %rd2669;
shr.u64 %rd2015, %rd2014, 32;
shr.u64 %rd2016, %rd2010, 32;
and.b64 %rd2017, %rd1984, 4294967295;
xor.b64 %rd2018, %rd2017, %rd2016;
xor.b64 %rd2019, %rd2018, %rd2685;
mul.lo.s64 %rd2020, %rd2019, %rd2669;
and.b64 %rd2021, %rd2020, 4294967295;
xor.b64 %rd2022, %rd2021, %rd2015;
xor.b64 %rd2023, %rd2022, %rd2686;
mul.lo.s64 %rd2024, %rd2023, %rd2671;
// Final mixing is done in 32 bits: the high word of the last product is
// combined with a truncated 32-bit multiply and the constant in %r342.
shr.u64 %rd2025, %rd2024, 32;
cvt.u32.u64 %r222, %rd2025;
shr.u64 %rd2026, %rd2020, 32;
xor.b64 %rd2027, %rd2026, %rd1994;
cvt.u32.u64 %r223, %rd2027;
xor.b32 %r224, %r340, %r223;
mul.lo.s32 %r225, %r224, %r341;
xor.b32 %r226, %r225, %r222;
xor.b32 %r227, %r226, %r342;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): result is in [0, 1).
shr.u32 %r228, %r227, 9;
cvt.rn.f32.u32 %f205, %r228;
mul.rn.f32 %f206, %f205, 0f34000000;
cvt.rn.f16.f32 %h109, %f206;
// 0x2E66 is about 0.09998 in f16; %p52 is true when the random value is >= it.
mov.b16 %h110, 0x2E66;
setp.ge.f16 %p52, %h109, %h110;
// f16 sum of the two inputs, scaled by 0x3C72 (about 1.1113), or zero when
// %p52 is false.
// NOTE(review): threshold ~0.1 and scale ~1/0.9 look like dropout with
// rate 0.1 -- confirm against the HLO this kernel was generated from.
ld.global.nc.b16 %h111, [%rd45+1536];
ld.global.nc.f32 %f207, [%rd46+3072];
cvt.rn.f16.f32 %h112, %f207;
add.rn.f16 %h113, %h111, %h112;
mov.b16 %h114, 0x3C72;
mul.rn.f16 %h115, %h113, %h114;
selp.b16 %h116, %h115, 0x0000, %p52;
cvt.f32.f16 %f208, %h116;
// %f211 = %f1 * [%rd48]; value = %f211 * [%rd47] + ([%rd49] - %f2 * %f211)
// + masked term; (value - %f3)^2 is added to %f15 to give %f16.
ld.global.nc.b16 %h117, [%rd47+1536];
cvt.f32.f16 %f209, %h117;
ld.global.nc.f32 %f210, [%rd48+3072];
mul.rn.f32 %f211, %f1, %f210;
mul.rn.f32 %f212, %f211, %f209;
ld.global.nc.f32 %f213, [%rd49+3072];
mul.rn.f32 %f214, %f2, %f211;
sub.rn.f32 %f215, %f213, %f214;
add.rn.f32 %f216, %f212, %f215;
add.rn.f32 %f217, %f216, %f208;
sub.rn.f32 %f218, %f217, %f3;
mul.rn.f32 %f219, %f218, %f218;
add.rn.f32 %f16, %f15, %f219;
// Next counter: %rd12 + (((%r3 | 769) | %r4) >> 2). The setup variant is
// chosen by whether the low two bits of (%r3 | 769) equal 1.
or.b32 %r229, %r3, 769;
or.b32 %r230, %r229, %r4;
and.b32 %r231, %r229, 3;
shr.u32 %r232, %r230, 2;
setp.ne.s32 %p53, %r231, 1;
cvt.u64.u32 %rd2028, %r232;
add.s64 %rd379, %rd12, %rd2028;
@%p53 bra LBB30_43;
// Fall-through setup variant: low counter word * 0xD2511F53 (3528531795),
// carried word * 0xCD9E8D57 (3449720151), first key 0x9E3779B9 (2654435769).
and.b64 %rd2068, %rd379, 4294967295;
mul.lo.s64 %rd2691, %rd2068, 3528531795;
// A sum below %rd12 means the add above wrapped; carry goes into %rd2464.
setp.lt.u64 %p55, %rd379, %rd12;
selp.u64 %rd2069, 1, 0, %p55;
add.s64 %rd2070, %rd2464, %rd2069;
xor.b64 %rd2071, %rd2070, %rd2691;
shr.u64 %rd2072, %rd2071, 32;
mul.lo.s64 %rd2694, %rd2072, 3449720151;
shr.u64 %rd2073, %rd2694, 32;
and.b64 %rd2074, %rd2070, 4294967295;
mul.lo.s64 %rd2075, %rd2074, 3449720151;
and.b64 %rd2076, %rd2075, 4294967295;
xor.b64 %rd2077, %rd2076, %rd2073;
xor.b64 %rd2078, %rd2077, 2654435769;
mul.lo.s64 %rd2697, %rd2078, 3528531795;
xor.b64 %rd2687, %rd2075, %rd379;
// Constants for the rounds at LBB30_44: signed 32-bit forms of 0xCD9E8D57
// and 8*0xBB67AE85 mod 2^32, then k*W mod 2^32 round keys and the two
// multipliers (%rd2688, %rd2690).
mov.u32 %r344, -845247145;
mov.u32 %r343, -616729560;
mov.u64 %rd2704, 3041712726;
mov.u64 %rd2703, 1401181199;
mov.u64 %rd2702, 2835769497;
mov.u64 %rd2701, 1684936478;
mov.u64 %rd2700, 2027808484;
mov.u64 %rd2699, 387276957;
mov.u64 %rd2698, 842468239;
mov.u64 %rd2696, 3986602516;
mov.u64 %rd2695, 1013904242;
mov.u64 %rd2693, 3668340011;
mov.u64 %rd2692, 3144134277;
mov.u64 %rd2690, 3449720151;
mov.u64 %rd2689, 1993301258;
mov.u64 %rd2688, 3528531795;
bra.uni LBB30_44;
LBB30_43:
setp.lt.u64 %p54, %rd379, %rd12;
selp.u64 %rd2043, 1, 0, %p54;
add.s64 %rd2044, %rd2464, %rd2043;
and.b64 %rd2045, %rd2044, 4294967295;
mul.lo.s64 %rd2691, %rd2045, 3449720151;
xor.b64 %rd2046, %rd2691, %rd379;
shr.u64 %rd2047, %rd2046, 32;
mul.lo.s64 %rd2694, %rd2047, 3528531795;
shr.u64 %rd2048, %rd2694, 32;
and.b64 %rd2049, %rd379, 4294967295;
mul.lo.s64 %rd2050, %rd2049, 3528531795;
and.b64 %rd2051, %rd2050, 4294967295;
xor.b64 %rd2052, %rd2051, %rd2048;
xor.b64 %rd2053, %rd2052, 3144134277;
mul.lo.s64 %rd2697, %rd2053, 3449720151;
xor.b64 %rd2687, %rd2044, %rd2050;
mov.u32 %r344, -766435501;
mov.u32 %r343, -239350328;
mov.u64 %rd2704, 1684936478;
mov.u64 %rd2703, 534103459;
mov.u64 %rd2702, 387276957;
mov.u64 %rd2701, 3041712726;
mov.u64 %rd2700, 3986602516;
mov.u64 %rd2699, 2835769497;
mov.u64 %rd2698, 3668340011;
mov.u64 %rd2696, 2027808484;
mov.u64 %rd2695, 1993301258;
mov.u64 %rd2693, 842468239;
mov.u64 %rd2692, 2654435769;
mov.u64 %rd2690, 3528531795;
mov.u64 %rd2689, 1013904242;
mov.u64 %rd2688, 3449720151;
LBB30_44:
shr.u64 %rd2079, %rd2697, 32;
shr.u64 %rd2080, %rd2687, 32;
mul.lo.s64 %rd2081, %rd2080, %rd2688;
and.b64 %rd2082, %rd2081, 4294967295;
xor.b64 %rd2083, %rd2082, %rd2079;
xor.b64 %rd2084, %rd2083, %rd2689;
mul.lo.s64 %rd2085, %rd2084, %rd2690;
shr.u64 %rd2086, %rd2085, 32;
shr.u64 %rd2087, %rd2081, 32;
and.b64 %rd2088, %rd2691, 4294967295;
xor.b64 %rd2089, %rd2088, %rd2087;
xor.b64 %rd2090, %rd2089, %rd2692;
mul.lo.s64 %rd2091, %rd2090, %rd2690;
and.b64 %rd2092, %rd2091, 4294967295;
xor.b64 %rd2093, %rd2092, %rd2086;
xor.b64 %rd2094, %rd2093, %rd2693;
mul.lo.s64 %rd2095, %rd2094, %rd2688;
shr.u64 %rd2096, %rd2095, 32;
shr.u64 %rd2097, %rd2091, 32;
and.b64 %rd2098, %rd2694, 4294967295;
xor.b64 %rd2099, %rd2098, %rd2097;
xor.b64 %rd2100, %rd2099, %rd2695;
mul.lo.s64 %rd2101, %rd2100, %rd2688;
and.b64 %rd2102, %rd2101, 4294967295;
xor.b64 %rd2103, %rd2102, %rd2096;
xor.b64 %rd2104, %rd2103, %rd2696;
mul.lo.s64 %rd2105, %rd2104, %rd2690;
shr.u64 %rd2106, %rd2105, 32;
shr.u64 %rd2107, %rd2101, 32;
and.b64 %rd2108, %rd2697, 4294967295;
xor.b64 %rd2109, %rd2108, %rd2107;
xor.b64 %rd2110, %rd2109, %rd2698;
mul.lo.s64 %rd2111, %rd2110, %rd2690;
and.b64 %rd2112, %rd2111, 4294967295;
xor.b64 %rd2113, %rd2112, %rd2106;
xor.b64 %rd2114, %rd2113, %rd2699;
mul.lo.s64 %rd2115, %rd2114, %rd2688;
shr.u64 %rd2116, %rd2115, 32;
shr.u64 %rd2117, %rd2111, 32;
and.b64 %rd2118, %rd2085, 4294967295;
xor.b64 %rd2119, %rd2118, %rd2117;
xor.b64 %rd2120, %rd2119, %rd2700;
mul.lo.s64 %rd2121, %rd2120, %rd2688;
and.b64 %rd2122, %rd2121, 4294967295;
xor.b64 %rd2123, %rd2122, %rd2116;
xor.b64 %rd2124, %rd2123, %rd2701;
mul.lo.s64 %rd2125, %rd2124, %rd2690;
shr.u64 %rd2126, %rd2125, 32;
shr.u64 %rd2127, %rd2121, 32;
and.b64 %rd2128, %rd2095, 4294967295;
xor.b64 %rd2129, %rd2128, %rd2127;
xor.b64 %rd2130, %rd2129, %rd2702;
mul.lo.s64 %rd2131, %rd2130, %rd2690;
and.b64 %rd2132, %rd2131, 4294967295;
xor.b64 %rd2133, %rd2132, %rd2126;
xor.b64 %rd2134, %rd2133, %rd2703;
mul.lo.s64 %rd2135, %rd2134, %rd2688;
shr.u64 %rd2136, %rd2135, 32;
shr.u64 %rd2137, %rd2131, 32;
xor.b64 %rd2138, %rd2105, %rd2137;
xor.b64 %rd2139, %rd2138, %rd2704;
mul.lo.s64 %rd2140, %rd2139, %rd2688;
xor.b64 %rd2141, %rd2136, %rd2140;
cvt.u32.u64 %r237, %rd2141;
xor.b32 %r238, %r343, %r237;
mul.lo.s32 %r239, %r238, %r344;
shr.u32 %r240, %r239, 9;
cvt.rn.f32.u32 %f220, %r240;
mul.rn.f32 %f221, %f220, 0f34000000;
cvt.rn.f16.f32 %h118, %f221;
mov.b16 %h119, 0x2E66;
setp.ge.f16 %p57, %h118, %h119;
ld.global.nc.b16 %h120, [%rd45+1538];
ld.global.nc.f32 %f222, [%rd46+3076];
cvt.rn.f16.f32 %h121, %f222;
add.rn.f16 %h122, %h120, %h121;
mov.b16 %h123, 0x3C72;
mul.rn.f16 %h124, %h122, %h123;
selp.b16 %h125, %h124, 0x0000, %p57;
cvt.f32.f16 %f223, %h125;
ld.global.nc.b16 %h126, [%rd47+1538];
cvt.f32.f16 %f224, %h126;
ld.global.nc.f32 %f225, [%rd48+3076];
mul.rn.f32 %f226, %f1, %f225;
mul.rn.f32 %f227, %f226, %f224;
ld.global.nc.f32 %f228, [%rd49+3076];
mul.rn.f32 %f229, %f2, %f226;
sub.rn.f32 %f230, %f228, %f229;
add.rn.f32 %f231, %f227, %f230;
add.rn.f32 %f232, %f231, %f223;
sub.rn.f32 %f233, %f232, %f3;
mul.rn.f32 %f234, %f233, %f233;
add.rn.f32 %f17, %f16, %f234;
or.b32 %r242, %r73, 896;
shr.u32 %r243, %r242, 2;
cvt.u64.u32 %rd2142, %r243;
add.s64 %rd406, %rd12, %rd2142;
@%p8 bra LBB30_46;
mov.u32 %r347, -1879881855;
mov.u32 %r345, 534103459;
mov.u64 %rd2723, 3678237736;
and.b64 %rd2184, %rd406, 4294967295;
mul.lo.s64 %rd2709, %rd2184, 3528531795;
setp.lt.u64 %p59, %rd406, %rd12;
selp.u64 %rd2185, 1, 0, %p59;
add.s64 %rd2186, %rd2464, %rd2185;
xor.b64 %rd2187, %rd2186, %rd2709;
shr.u64 %rd2188, %rd2187, 32;
mul.lo.s64 %rd2712, %rd2188, 3449720151;
shr.u64 %rd2189, %rd2712, 32;
and.b64 %rd2190, %rd2186, 4294967295;
mul.lo.s64 %rd2191, %rd2190, 3449720151;
and.b64 %rd2192, %rd2191, 4294967295;
xor.b64 %rd2193, %rd2192, %rd2189;
xor.b64 %rd2194, %rd2193, 2654435769;
mul.lo.s64 %rd2715, %rd2194, 3528531795;
xor.b64 %rd2705, %rd2191, %rd406;
mov.u32 %r346, -845247145;
mov.u64 %rd2722, 3041712726;
mov.u64 %rd2721, 1401181199;
mov.u64 %rd2720, 2835769497;
mov.u64 %rd2719, 1684936478;
mov.u64 %rd2718, 2027808484;
mov.u64 %rd2717, 387276957;
mov.u64 %rd2716, 842468239;
mov.u64 %rd2714, 3986602516;
mov.u64 %rd2713, 1013904242;
mov.u64 %rd2711, 3668340011;
mov.u64 %rd2710, 3144134277;
mov.u64 %rd2708, 3449720151;
mov.u64 %rd2707, 1993301258;
mov.u64 %rd2706, 3528531795;
bra.uni LBB30_47;
LBB30_46:
setp.lt.u64 %p58, %rd406, %rd12;
selp.u64 %rd2158, 1, 0, %p58;
add.s64 %rd2159, %rd2464, %rd2158;
and.b64 %rd2160, %rd2159, 4294967295;
mul.lo.s64 %rd2709, %rd2160, 3449720151;
xor.b64 %rd2161, %rd2709, %rd406;
shr.u64 %rd2162, %rd2161, 32;
mul.lo.s64 %rd2712, %rd2162, 3528531795;
shr.u64 %rd2163, %rd2712, 32;
and.b64 %rd2164, %rd406, 4294967295;
mul.lo.s64 %rd2165, %rd2164, 3528531795;
and.b64 %rd2166, %rd2165, 4294967295;
xor.b64 %rd2167, %rd2166, %rd2163;
xor.b64 %rd2168, %rd2167, 3144134277;
mul.lo.s64 %rd2715, %rd2168, 3449720151;
xor.b64 %rd2705, %rd2159, %rd2165;
mov.u32 %r347, -1767562579;
mov.u32 %r346, -766435501;
mov.u32 %r345, 1401181199;
mov.u64 %rd2723, 4055616968;
mov.u64 %rd2722, 1684936478;
mov.u64 %rd2721, 534103459;
mov.u64 %rd2720, 387276957;
mov.u64 %rd2719, 3041712726;
mov.u64 %rd2718, 3986602516;
mov.u64 %rd2717, 2835769497;
mov.u64 %rd2716, 3668340011;
mov.u64 %rd2714, 2027808484;
mov.u64 %rd2713, 1993301258;
mov.u64 %rd2711, 842468239;
mov.u64 %rd2710, 2654435769;
mov.u64 %rd2708, 3528531795;
mov.u64 %rd2707, 1013904242;
mov.u64 %rd2706, 3449720151;
LBB30_47:
shr.u64 %rd2195, %rd2715, 32;
shr.u64 %rd2196, %rd2705, 32;
mul.lo.s64 %rd2197, %rd2196, %rd2706;
and.b64 %rd2198, %rd2197, 4294967295;
xor.b64 %rd2199, %rd2198, %rd2195;
xor.b64 %rd2200, %rd2199, %rd2707;
mul.lo.s64 %rd2201, %rd2200, %rd2708;
shr.u64 %rd2202, %rd2201, 32;
shr.u64 %rd2203, %rd2197, 32;
and.b64 %rd2204, %rd2709, 4294967295;
xor.b64 %rd2205, %rd2204, %rd2203;
xor.b64 %rd2206, %rd2205, %rd2710;
mul.lo.s64 %rd2207, %rd2206, %rd2708;
and.b64 %rd2208, %rd2207, 4294967295;
xor.b64 %rd2209, %rd2208, %rd2202;
xor.b64 %rd2210, %rd2209, %rd2711;
mul.lo.s64 %rd2211, %rd2210, %rd2706;
shr.u64 %rd2212, %rd2211, 32;
shr.u64 %rd2213, %rd2207, 32;
and.b64 %rd2214, %rd2712, 4294967295;
xor.b64 %rd2215, %rd2214, %rd2213;
xor.b64 %rd2216, %rd2215, %rd2713;
mul.lo.s64 %rd2217, %rd2216, %rd2706;
and.b64 %rd2218, %rd2217, 4294967295;
xor.b64 %rd2219, %rd2218, %rd2212;
xor.b64 %rd2220, %rd2219, %rd2714;
mul.lo.s64 %rd2221, %rd2220, %rd2708;
shr.u64 %rd2222, %rd2221, 32;
shr.u64 %rd2223, %rd2217, 32;
and.b64 %rd2224, %rd2715, 4294967295;
xor.b64 %rd2225, %rd2224, %rd2223;
xor.b64 %rd2226, %rd2225, %rd2716;
mul.lo.s64 %rd2227, %rd2226, %rd2708;
and.b64 %rd2228, %rd2227, 4294967295;
xor.b64 %rd2229, %rd2228, %rd2222;
xor.b64 %rd2230, %rd2229, %rd2717;
mul.lo.s64 %rd2231, %rd2230, %rd2706;
shr.u64 %rd2232, %rd2231, 32;
shr.u64 %rd2233, %rd2227, 32;
and.b64 %rd2234, %rd2201, 4294967295;
xor.b64 %rd2235, %rd2234, %rd2233;
xor.b64 %rd2236, %rd2235, %rd2718;
mul.lo.s64 %rd2237, %rd2236, %rd2706;
and.b64 %rd2238, %rd2237, 4294967295;
xor.b64 %rd2239, %rd2238, %rd2232;
xor.b64 %rd2240, %rd2239, %rd2719;
mul.lo.s64 %rd2241, %rd2240, %rd2708;
shr.u64 %rd2242, %rd2241, 32;
shr.u64 %rd2243, %rd2237, 32;
and.b64 %rd2244, %rd2211, 4294967295;
xor.b64 %rd2245, %rd2244, %rd2243;
xor.b64 %rd2246, %rd2245, %rd2720;
mul.lo.s64 %rd2247, %rd2246, %rd2708;
and.b64 %rd2248, %rd2247, 4294967295;
xor.b64 %rd2249, %rd2248, %rd2242;
xor.b64 %rd2250, %rd2249, %rd2721;
mul.lo.s64 %rd2251, %rd2250, %rd2706;
shr.u64 %rd2252, %rd2251, 32;
shr.u64 %rd2253, %rd2247, 32;
and.b64 %rd2254, %rd2221, 4294967295;
xor.b64 %rd2255, %rd2254, %rd2253;
xor.b64 %rd2256, %rd2255, %rd2722;
mul.lo.s64 %rd2257, %rd2256, %rd2706;
and.b64 %rd2258, %rd2257, 4294967295;
xor.b64 %rd2259, %rd2258, %rd2252;
xor.b64 %rd2260, %rd2259, %rd2723;
mul.lo.s64 %rd2261, %rd2260, %rd2708;
shr.u64 %rd2262, %rd2261, 32;
cvt.u32.u64 %r250, %rd2262;
shr.u64 %rd2263, %rd2257, 32;
xor.b64 %rd2264, %rd2263, %rd2231;
cvt.u32.u64 %r251, %rd2264;
xor.b32 %r252, %r345, %r251;
mul.lo.s32 %r253, %r252, %r346;
xor.b32 %r254, %r253, %r250;
xor.b32 %r255, %r254, %r347;
shr.u32 %r256, %r255, 9;
cvt.rn.f32.u32 %f235, %r256;
mul.rn.f32 %f236, %f235, 0f34000000;
cvt.rn.f16.f32 %h127, %f236;
mov.b16 %h128, 0x2E66;
setp.ge.f16 %p60, %h127, %h128;
ld.global.nc.b16 %h129, [%rd45+1792];
ld.global.nc.f32 %f237, [%rd46+3584];
cvt.rn.f16.f32 %h130, %f237;
add.rn.f16 %h131, %h129, %h130;
mov.b16 %h132, 0x3C72;
mul.rn.f16 %h133, %h131, %h132;
selp.b16 %h134, %h133, 0x0000, %p60;
cvt.f32.f16 %f238, %h134;
ld.global.nc.b16 %h135, [%rd47+1792];
cvt.f32.f16 %f239, %h135;
ld.global.nc.f32 %f240, [%rd48+3584];
mul.rn.f32 %f241, %f1, %f240;
mul.rn.f32 %f242, %f241, %f239;
ld.global.nc.f32 %f243, [%rd49+3584];
mul.rn.f32 %f244, %f2, %f241;
sub.rn.f32 %f245, %f243, %f244;
add.rn.f32 %f246, %f242, %f245;
add.rn.f32 %f247, %f246, %f238;
sub.rn.f32 %f248, %f247, %f3;
mul.rn.f32 %f249, %f248, %f248;
add.rn.f32 %f18, %f17, %f249;
or.b32 %r257, %r3, 897;
or.b32 %r258, %r257, %r4;
and.b32 %r259, %r257, 3;
shr.u32 %r260, %r258, 2;
setp.ne.s32 %p61, %r259, 1;
cvt.u64.u32 %rd2265, %r260;
add.s64 %rd434, %rd12, %rd2265;
@%p61 bra LBB30_49;
mov.u32 %r349, -845247145;
mov.u64 %rd2740, 1401181199;
mov.u64 %rd2729, 3144134277;
mov.u32 %r348, -616729560;
and.b64 %rd2305, %rd434, 4294967295;
mul.lo.s64 %rd2728, %rd2305, 3528531795;
setp.lt.u64 %p63, %rd434, %rd12;
selp.u64 %rd2306, 1, 0, %p63;
add.s64 %rd2307, %rd2464, %rd2306;
xor.b64 %rd2308, %rd2307, %rd2728;
shr.u64 %rd2309, %rd2308, 32;
mul.lo.s64 %rd2731, %rd2309, 3449720151;
shr.u64 %rd2310, %rd2731, 32;
and.b64 %rd2311, %rd2307, 4294967295;
mul.lo.s64 %rd2312, %rd2311, 3449720151;
and.b64 %rd2313, %rd2312, 4294967295;
xor.b64 %rd2314, %rd2313, %rd2310;
xor.b64 %rd2315, %rd2314, 2654435769;
mul.lo.s64 %rd2734, %rd2315, 3528531795;
xor.b64 %rd2724, %rd2312, %rd434;
mov.u64 %rd2741, 3041712726;
mov.u64 %rd2739, 2835769497;
mov.u64 %rd2738, 1684936478;
mov.u64 %rd2737, 2027808484;
mov.u64 %rd2736, 387276957;
mov.u64 %rd2735, 842468239;
mov.u64 %rd2733, 3986602516;
mov.u64 %rd2732, 1013904242;
mov.u64 %rd2730, 3668340011;
mov.u64 %rd2727, 3449720151;
mov.u64 %rd2726, 1993301258;
mov.u64 %rd2725, 3528531795;
bra.uni LBB30_50;
LBB30_49:
setp.lt.u64 %p62, %rd434, %rd12;
selp.u64 %rd2280, 1, 0, %p62;
add.s64 %rd2281, %rd2464, %rd2280;
and.b64 %rd2282, %rd2281, 4294967295;
mul.lo.s64 %rd2728, %rd2282, 3449720151;
xor.b64 %rd2283, %rd2728, %rd434;
shr.u64 %rd2284, %rd2283, 32;
mul.lo.s64 %rd2731, %rd2284, 3528531795;
shr.u64 %rd2285, %rd2731, 32;
and.b64 %rd2286, %rd434, 4294967295;
mul.lo.s64 %rd2287, %rd2286, 3528531795;
and.b64 %rd2288, %rd2287, 4294967295;
xor.b64 %rd2289, %rd2288, %rd2285;
xor.b64 %rd2290, %rd2289, 3144134277;
mul.lo.s64 %rd2734, %rd2290, 3449720151;
xor.b64 %rd2724, %rd2281, %rd2287;
mov.u32 %r349, -766435501;
mov.u32 %r348, -239350328;
mov.u64 %rd2741, 1684936478;
mov.u64 %rd2740, 534103459;
mov.u64 %rd2739, 387276957;
mov.u64 %rd2738, 3041712726;
mov.u64 %rd2737, 3986602516;
mov.u64 %rd2736, 2835769497;
mov.u64 %rd2735, 3668340011;
mov.u64 %rd2733, 2027808484;
mov.u64 %rd2732, 1993301258;
mov.u64 %rd2730, 842468239;
mov.u64 %rd2729, 2654435769;
mov.u64 %rd2727, 3528531795;
mov.u64 %rd2726, 1013904242;
mov.u64 %rd2725, 3449720151;
LBB30_50:
shr.u64 %rd2316, %rd2734, 32;
shr.u64 %rd2317, %rd2724, 32;
mul.lo.s64 %rd2318, %rd2317, %rd2725;
and.b64 %rd2319, %rd2318, 4294967295;
xor.b64 %rd2320, %rd2319, %rd2316;
xor.b64 %rd2321, %rd2320, %rd2726;
mul.lo.s64 %rd2322, %rd2321, %rd2727;
shr.u64 %rd2323, %rd2322, 32;
shr.u64 %rd2324, %rd2318, 32;
and.b64 %rd2325, %rd2728, 4294967295;
xor.b64 %rd2326, %rd2325, %rd2324;
xor.b64 %rd2327, %rd2326, %rd2729;
mul.lo.s64 %rd2328, %rd2327, %rd2727;
and.b64 %rd2329, %rd2328, 4294967295;
xor.b64 %rd2330, %rd2329, %rd2323;
xor.b64 %rd2331, %rd2330, %rd2730;
mul.lo.s64 %rd2332, %rd2331, %rd2725;
shr.u64 %rd2333, %rd2332, 32;
shr.u64 %rd2334, %rd2328, 32;
and.b64 %rd2335, %rd2731, 4294967295;
xor.b64 %rd2336, %rd2335, %rd2334;
xor.b64 %rd2337, %rd2336, %rd2732;
mul.lo.s64 %rd2338, %rd2337, %rd2725;
and.b64 %rd2339, %rd2338, 4294967295;
xor.b64 %rd2340, %rd2339, %rd2333;
xor.b64 %rd2341, %rd2340, %rd2733;
mul.lo.s64 %rd2342, %rd2341, %rd2727;
shr.u64 %rd2343, %rd2342, 32;
shr.u64 %rd2344, %rd2338, 32;
and.b64 %rd2345, %rd2734, 4294967295;
xor.b64 %rd2346, %rd2345, %rd2344;
xor.b64 %rd2347, %rd2346, %rd2735;
mul.lo.s64 %rd2348, %rd2347, %rd2727;
and.b64 %rd2349, %rd2348, 4294967295;
xor.b64 %rd2350, %rd2349, %rd2343;
xor.b64 %rd2351, %rd2350, %rd2736;
mul.lo.s64 %rd2352, %rd2351, %rd2725;
shr.u64 %rd2353, %rd2352, 32;
shr.u64 %rd2354, %rd2348, 32;
and.b64 %rd2355, %rd2322, 4294967295;
xor.b64 %rd2356, %rd2355, %rd2354;
xor.b64 %rd2357, %rd2356, %rd2737;
mul.lo.s64 %rd2358, %rd2357, %rd2725;
and.b64 %rd2359, %rd2358, 4294967295;
xor.b64 %rd2360, %rd2359, %rd2353;
xor.b64 %rd2361, %rd2360, %rd2738;
mul.lo.s64 %rd2362, %rd2361, %rd2727;
shr.u64 %rd2363, %rd2362, 32;
shr.u64 %rd2364, %rd2358, 32;
and.b64 %rd2365, %rd2332, 4294967295;
xor.b64 %rd2366, %rd2365, %rd2364;
xor.b64 %rd2367, %rd2366, %rd2739;
mul.lo.s64 %rd2368, %rd2367, %rd2727;
and.b64 %rd2369, %rd2368, 4294967295;
xor.b64 %rd2370, %rd2369, %rd2363;
xor.b64 %rd2371, %rd2370, %rd2740;
mul.lo.s64 %rd2372, %rd2371, %rd2725;
shr.u64 %rd2373, %rd2372, 32;
shr.u64 %rd2374, %rd2368, 32;
xor.b64 %rd2375, %rd2342, %rd2374;
xor.b64 %rd2376, %rd2375, %rd2741;
mul.lo.s64 %rd2377, %rd2376, %rd2725;
xor.b64 %rd2378, %rd2373, %rd2377;
cvt.u32.u64 %r265, %rd2378;
xor.b32 %r266, %r348, %r265;
mul.lo.s32 %r267, %r266, %r349;
shr.u32 %r268, %r267, 9;
cvt.rn.f32.u32 %f250, %r268;
mul.rn.f32 %f251, %f250, 0f34000000;
cvt.rn.f16.f32 %h136, %f251;
mov.b16 %h137, 0x2E66;
setp.ge.f16 %p64, %h136, %h137;
ld.global.nc.b16 %h138, [%rd45+1794];
ld.global.nc.f32 %f252, [%rd46+3588];
cvt.rn.f16.f32 %h139, %f252;
add.rn.f16 %h140, %h138, %h139;
mov.b16 %h141, 0x3C72;
mul.rn.f16 %h142, %h140, %h141;
selp.b16 %h143, %h142, 0x0000, %p64;
cvt.f32.f16 %f253, %h143;
ld.global.nc.b16 %h144, [%rd47+1794];
cvt.f32.f16 %f254, %h144;
ld.global.nc.f32 %f255, [%rd48+3588];
mul.rn.f32 %f256, %f1, %f255;
mul.rn.f32 %f257, %f256, %f254;
ld.global.nc.f32 %f258, [%rd49+3588];
mul.rn.f32 %f259, %f2, %f256;
sub.rn.f32 %f260, %f258, %f259;
add.rn.f32 %f261, %f257, %f260;
add.rn.f32 %f262, %f261, %f253;
sub.rn.f32 %f263, %f262, %f3;
mul.rn.f32 %f264, %f263, %f263;
add.rn.f32 %f265, %f18, %f264;
and.b32 %r46, %r1, 31;
shfl.sync.down.b32 %f266, %f265, 16, 31, -1;
add.rn.f32 %f267, %f266, %f265;
shfl.sync.down.b32 %f268, %f267, 8, 31, -1;
add.rn.f32 %f269, %f268, %f267;
shfl.sync.down.b32 %f270, %f269, 4, 31, -1;
add.rn.f32 %f271, %f270, %f269;
shfl.sync.down.b32 %f272, %f271, 2, 31, -1;
add.rn.f32 %f273, %f272, %f271;
shfl.sync.down.b32 %f274, %f273, 1, 31, -1;
shr.u32 %r47, %r1, 5;
setp.ne.s32 %p65, %r46, 0;
mov.u64 %rd2381, shared_cache_07;
@%p65 bra LBB30_2;
mul.wide.u32 %rd2380, %r47, 4;
add.s64 %rd462, %rd2381, %rd2380;
add.rn.f32 %f19, %f274, %f273;
st.shared.f32 [%rd462], %f19;
LBB30_2:
bar.sync 0;
setp.eq.s32 %p66, %r47, 0;
@%p66 bra LBB30_52;
bra.uni LBB30_3;
LBB30_52:
add.u64 %rd474, %SP, 0;
add.u64 %rd11, %SPL, 0;
mul.wide.u32 %rd2382, %r46, 4;
add.s64 %rd463, %rd2381, %rd2382;
cvta.shared.u64 %rd2384, %rd463;
mov.u32 %r269, 0;
st.local.u32 [%rd11], %r269;
setp.lt.u32 %p67, %r1, 2;
selp.b64 %rd2386, %rd2384, %rd474, %p67;
ld.f32 %f275, [%rd2386];
shfl.sync.down.b32 %f276, %f275, 16, 31, -1;
add.rn.f32 %f277, %f275, %f276;
shfl.sync.down.b32 %f278, %f277, 8, 31, -1;
add.rn.f32 %f279, %f277, %f278;
shfl.sync.down.b32 %f280, %f279, 4, 31, -1;
add.rn.f32 %f281, %f279, %f280;
shfl.sync.down.b32 %f282, %f281, 2, 31, -1;
add.rn.f32 %f283, %f281, %f282;
shfl.sync.down.b32 %f284, %f283, 1, 31, -1;
add.rn.f32 %f285, %f283, %f284;
st.f32 [%rd2386], %f285;
setp.ne.s32 %p68, %r1, 0;
@%p68 bra LBB30_3;
ld.param.u64 %rd470, [fusion_2246_param_3];
cvt.u64.u32 %rd44, %r2;
cvta.to.global.u64 %rd7, %rd470;
shl.b64 %rd2379, %rd44, 2;
add.s64 %rd461, %rd7, %rd2379;
ld.shared.f32 %f286, [%rd463];
atom.global.add.f32 %f287, [%rd461], %f286;
LBB30_3:
ret;
}
// .globl fusion_2243
//
// fusion_2243: each thread writes four consecutive f16 values to param_0.
//
// Element index of the first value: e = (tid.x << 2) | (ctaid.x << 10).
// Parameters as they are used below (all converted to global addresses):
//   param_0        : destination, one v4.b16 store at e * 2 bytes
//   param_1        : f16 input, one v4.b16 load at e * 2 bytes
//   param_2        : f16 input, one v4.b16 load at e * 2 bytes
//   param_3        : two u64 words, loaded once as a v2.u64
//   param_4..7     : f32 inputs indexed by ctaid.x
//   param_8..12    : f32 inputs indexed by (tid.x << 2) + k, k = 0..3
//   param_13       : declared but never read in this kernel
//
// For every k the kernel derives a 32-bit word from the two u64 words and
// the value (e >> 2), turns it into a number in [0, 1) and uses it to
// either keep or zero one addend of the arithmetic.
// NOTE(review): the arithmetic looks like a dropout-masked residual followed
// by two layer-norm style scale/shift steps -- inferred from the instruction
// pattern only, confirm against the originating HLO.
.visible .entry fusion_2243(
.param .u64 fusion_2243_param_0,
.param .u64 fusion_2243_param_1,
.param .u64 fusion_2243_param_2,
.param .u64 fusion_2243_param_3,
.param .u64 fusion_2243_param_4,
.param .u64 fusion_2243_param_5,
.param .u64 fusion_2243_param_6,
.param .u64 fusion_2243_param_7,
.param .u64 fusion_2243_param_8,
.param .u64 fusion_2243_param_9,
.param .u64 fusion_2243_param_10,
.param .u64 fusion_2243_param_11,
.param .u64 fusion_2243_param_12,
.param .u64 fusion_2243_param_13
)
.reqntid 256, 1, 1
{
.reg .pred %p<6>;
.reg .b16 %h<39>;
.reg .b32 %hh<5>;
.reg .f32 %f<97>;
.reg .b32 %r<31>;
.reg .b64 %rd<162>;
// Load the parameter pointers and convert them to global-space addresses.
// Resulting bases: %rd26=param_0, %rd25=param_1, %rd24=param_2, %rd23=param_3,
// %rd22=param_4, %rd21=param_5, %rd20=param_6, %rd18=param_7, %rd15=param_8,
// %rd12=param_9, %rd9=param_10, %rd6=param_11, %rd3=param_12.
ld.param.u64 %rd1, [fusion_2243_param_0];
ld.param.u64 %rd2, [fusion_2243_param_12];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2243_param_1];
ld.param.u64 %rd5, [fusion_2243_param_11];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2243_param_2];
ld.param.u64 %rd8, [fusion_2243_param_10];
cvta.to.global.u64 %rd9, %rd8;
ld.param.u64 %rd10, [fusion_2243_param_3];
ld.param.u64 %rd11, [fusion_2243_param_9];
cvta.to.global.u64 %rd12, %rd11;
ld.param.u64 %rd13, [fusion_2243_param_4];
ld.param.u64 %rd14, [fusion_2243_param_8];
cvta.to.global.u64 %rd15, %rd14;
ld.param.u64 %rd16, [fusion_2243_param_5];
ld.param.u64 %rd17, [fusion_2243_param_7];
cvta.to.global.u64 %rd18, %rd17;
ld.param.u64 %rd19, [fusion_2243_param_6];
cvta.to.global.u64 %rd20, %rd19;
cvta.to.global.u64 %rd21, %rd16;
cvta.to.global.u64 %rd22, %rd13;
cvta.to.global.u64 %rd23, %rd10;
cvta.to.global.u64 %rd24, %rd7;
cvta.to.global.u64 %rd25, %rd4;
cvta.to.global.u64 %rd26, %rd1;
// Index setup: %r4 = tid.x * 4, %r5 = e (see header), %r6 = %r4 | 1,
// %r7 = %r4 | 2, %r8 = e >> 2.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
shr.u32 %r8, %r5, 2;
// Load the two u64 words from param_3 and add %r8 to the first one.
// %p1 is true when that unsigned add wrapped; the wrap is carried into the
// second word (%rd34), i.e. the pair is treated as one 128-bit value.
ld.global.nc.v2.u64 {%rd27, %rd28}, [%rd23];
cvt.u64.u32 %rd29, %r8;
add.s64 %rd30, %rd27, %rd29;
setp.lt.u64 %p1, %rd30, %rd27;
// Mixing chain: repeated 32x32->64 multiplies by 3528531795 (0xD2511F53) and
// 3449720151 (0xCD9E8D57); the high half of one product is xor-ed with the
// low half of another and with a per-step constant.
// NOTE(review): the multipliers and the first xor constants (0x9E3779B9,
// 0xBB67AE85) match the published Philox4x32 constants -- presumably a
// Philox-style counter-based generator; confirm against the generator source.
and.b64 %rd31, %rd30, 4294967295;
mul.lo.s64 %rd32, %rd31, 3528531795;
selp.u64 %rd33, 1, 0, %p1;
add.s64 %rd34, %rd28, %rd33;
xor.b64 %rd35, %rd34, %rd32;
shr.u64 %rd36, %rd35, 32;
mul.lo.s64 %rd37, %rd36, 3449720151;
shr.u64 %rd38, %rd37, 32;
and.b64 %rd39, %rd34, 4294967295;
mul.lo.s64 %rd40, %rd39, 3449720151;
and.b64 %rd41, %rd40, 4294967295;
xor.b64 %rd42, %rd41, %rd38;
xor.b64 %rd43, %rd42, 2654435769;
mul.lo.s64 %rd44, %rd43, 3528531795;
shr.u64 %rd45, %rd44, 32;
xor.b64 %rd46, %rd40, %rd30;
shr.u64 %rd47, %rd46, 32;
mul.lo.s64 %rd48, %rd47, 3528531795;
and.b64 %rd49, %rd48, 4294967295;
xor.b64 %rd50, %rd49, %rd45;
xor.b64 %rd51, %rd50, 1993301258;
mul.lo.s64 %rd52, %rd51, 3449720151;
shr.u64 %rd53, %rd52, 32;
shr.u64 %rd54, %rd48, 32;
and.b64 %rd55, %rd32, 4294967295;
xor.b64 %rd56, %rd55, %rd54;
xor.b64 %rd57, %rd56, 3144134277;
mul.lo.s64 %rd58, %rd57, 3449720151;
and.b64 %rd59, %rd58, 4294967295;
xor.b64 %rd60, %rd59, %rd53;
xor.b64 %rd61, %rd60, 3668340011;
mul.lo.s64 %rd62, %rd61, 3528531795;
shr.u64 %rd63, %rd62, 32;
shr.u64 %rd64, %rd58, 32;
and.b64 %rd65, %rd37, 4294967295;
xor.b64 %rd66, %rd65, %rd64;
xor.b64 %rd67, %rd66, 1013904242;
mul.lo.s64 %rd68, %rd67, 3528531795;
and.b64 %rd69, %rd68, 4294967295;
xor.b64 %rd70, %rd69, %rd63;
xor.b64 %rd71, %rd70, 3986602516;
mul.lo.s64 %rd72, %rd71, 3449720151;
shr.u64 %rd73, %rd72, 32;
shr.u64 %rd74, %rd68, 32;
and.b64 %rd75, %rd44, 4294967295;
xor.b64 %rd76, %rd75, %rd74;
xor.b64 %rd77, %rd76, 842468239;
mul.lo.s64 %rd78, %rd77, 3449720151;
and.b64 %rd79, %rd78, 4294967295;
xor.b64 %rd80, %rd79, %rd73;
xor.b64 %rd81, %rd80, 387276957;
mul.lo.s64 %rd82, %rd81, 3528531795;
shr.u64 %rd83, %rd82, 32;
shr.u64 %rd84, %rd78, 32;
and.b64 %rd85, %rd52, 4294967295;
xor.b64 %rd86, %rd85, %rd84;
xor.b64 %rd87, %rd86, 2027808484;
mul.lo.s64 %rd88, %rd87, 3528531795;
and.b64 %rd89, %rd88, 4294967295;
shr.u64 %rd90, %rd88, 32;
and.b64 %rd91, %rd62, 4294967295;
xor.b64 %rd92, %rd91, %rd90;
xor.b64 %rd93, %rd92, 2835769497;
mul.lo.s64 %rd94, %rd93, 3449720151;
and.b64 %rd95, %rd94, 4294967295;
shr.u64 %rd96, %rd94, 32;
and.b64 %rd97, %rd72, 4294967295;
xor.b64 %rd98, %rd97, %rd96;
xor.b64 %rd99, %rd98, 3041712726;
mul.lo.s64 %rd100, %rd99, 3528531795;
and.b64 %rd101, %rd100, 4294967295;
xor.b64 %rd102, %rd89, %rd83;
xor.b64 %rd103, %rd102, 1684936478;
mul.lo.s64 %rd104, %rd103, 3449720151;
shr.u64 %rd105, %rd104, 32;
xor.b64 %rd106, %rd95, %rd105;
xor.b64 %rd107, %rd106, 1401181199;
mul.lo.s64 %rd108, %rd107, 3528531795;
shr.u64 %rd109, %rd108, 32;
xor.b64 %rd110, %rd101, %rd109;
xor.b64 %rd111, %rd110, 3678237736;
mul.lo.s64 %rd112, %rd111, 3449720151;
shr.u64 %rd113, %rd112, 32;
cvt.u32.u64 %r9, %rd113;
shr.u64 %rd114, %rd100, 32;
xor.b64 %rd115, %rd114, %rd82;
cvt.u32.u64 %r10, %rd115;
xor.b32 %r11, %r10, 534103459;
mul.lo.s32 %r12, %r11, -845247145;
xor.b32 %r13, %r12, %r9;
shr.u32 %r14, %r13, 9;
xor.b32 %r15, %r14, 4716963;
// Element 0 mask: keep the top 23 bits of the mixed word, scale by
// 0f34000000 (2^-23), round to f16, and compare against 0x2E66
// (f16, approximately 0.1). %p2 is true when the value is >= that threshold.
cvt.rn.f32.u32 %f1, %r15;
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h1, %f2;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p2, %h1, %h2;
// Load four f16 values from param_1 at element e. The pack/unpack pair is a
// round trip: %h7..%h10 hold the same values as %h3..%h6.
mul.wide.u32 %rd116, %r5, 2;
add.s64 %rd117, %rd25, %rd116;
ld.global.nc.v4.b16 {%h3, %h4, %h5, %h6}, [%rd117];
mov.b32 %hh1, {%h5, %h6};
mov.b32 %hh2, {%h3, %h4};
mov.b32 {%h7, %h8}, %hh2;
mov.b32 {%h9, %h10}, %hh1;
// Element 0 masked term: (param_1 value + f16(param_11[%r4])) * 0x3C72
// (f16, approximately 1.111), replaced by 0.0 when %p2 is false.
mul.wide.u32 %rd118, %r4, 4;
add.s64 %rd119, %rd6, %rd118;
ld.global.nc.f32 %f3, [%rd119];
cvt.rn.f16.f32 %h11, %f3;
add.rn.f16 %h12, %h7, %h11;
mov.b16 %h13, 0x3C72;
mul.rn.f16 %h14, %h12, %h13;
cvt.f32.f16 %f4, %h14;
selp.f32 %f5, %f4, 0f00000000, %p2;
// Load four f16 values from param_2 at element e (%h19..%h22 after the
// same pack/unpack round trip); %f6 is element 0 widened to f32.
add.s64 %rd120, %rd24, %rd116;
ld.global.nc.v4.b16 {%h15, %h16, %h17, %h18}, [%rd120];
mov.b32 %hh3, {%h17, %h18};
mov.b32 %hh4, {%h15, %h16};
mov.b32 {%h19, %h20}, %hh4;
mov.b32 {%h21, %h22}, %hh3;
cvt.f32.f16 %f6, %h19;
// Per-CTA scalar shared by all four elements:
// %f10 = rsqrt(param_6[ctaid.x] * 2^-10 + 0f2B8CBCCC), the added constant
// being approximately 1e-12.
mul.wide.u32 %rd121, %r1, 4;
add.s64 %rd122, %rd20, %rd121;
ld.global.nc.f32 %f7, [%rd122];
mul.rn.f32 %f8, %f7, 0f3A800000;
add.rn.f32 %f9, %f8, 0f2B8CBCCC;
rsqrt.approx.f32 %f10, %f9;
// First scale/shift for element 0:
// %f12 = %f10 * param_10[%r4]; %f16 = param_7[ctaid.x] * 2^-10;
// %f20 = %f12 * %f6 + (param_9[%r4] - %f12 * %f16) + masked term %f5.
add.s64 %rd123, %rd9, %rd118;
ld.global.nc.f32 %f11, [%rd123];
mul.rn.f32 %f12, %f10, %f11;
mul.rn.f32 %f13, %f12, %f6;
add.s64 %rd124, %rd12, %rd118;
ld.global.nc.f32 %f14, [%rd124];
add.s64 %rd125, %rd18, %rd121;
ld.global.nc.f32 %f15, [%rd125];
mul.rn.f32 %f16, %f15, 0f3A800000;
mul.rn.f32 %f17, %f12, %f16;
sub.rn.f32 %f18, %f14, %f17;
add.rn.f32 %f19, %f13, %f18;
add.rn.f32 %f20, %f19, %f5;
// Second scale/shift for element 0, same shape as the first:
// %f24 = rsqrt(param_4[ctaid.x] * 2^-10 + ~1e-12); %f26 = %f24 * param_8[%r4];
// %f30 = param_5[ctaid.x] * 2^-10;
// %f33 = (param_12[%r4] - %f26 * %f30) + %f26 * %f20, rounded to f16 in %h23.
add.s64 %rd126, %rd22, %rd121;
ld.global.nc.f32 %f21, [%rd126];
mul.rn.f32 %f22, %f21, 0f3A800000;
add.rn.f32 %f23, %f22, 0f2B8CBCCC;
rsqrt.approx.f32 %f24, %f23;
add.s64 %rd127, %rd15, %rd118;
ld.global.nc.f32 %f25, [%rd127];
mul.rn.f32 %f26, %f24, %f25;
mul.rn.f32 %f27, %f26, %f20;
add.s64 %rd128, %rd3, %rd118;
ld.global.nc.f32 %f28, [%rd128];
add.s64 %rd129, %rd21, %rd121;
ld.global.nc.f32 %f29, [%rd129];
mul.rn.f32 %f30, %f29, 0f3A800000;
mul.rn.f32 %f31, %f26, %f30;
sub.rn.f32 %f32, %f28, %f31;
add.rn.f32 %f33, %f32, %f27;
cvt.rn.f16.f32 %h23, %f33;
// Destination address for the final v4 store: param_0 + e * 2 bytes.
add.s64 %rd130, %rd26, %rd116;
// Element 1 mask (%p3): a second 32-bit word taken from intermediates of the
// same mixing chain, converted and thresholded exactly like element 0.
xor.b64 %rd131, %rd72, %rd96;
xor.b64 %rd132, %rd131, 3041712726;
mul.lo.s64 %rd133, %rd132, 3528531795;
xor.b64 %rd134, %rd109, %rd133;
cvt.u32.u64 %r16, %rd134;
xor.b32 %r17, %r16, -616729560;
mul.lo.s32 %r18, %r17, -845247145;
shr.u32 %r19, %r18, 9;
cvt.rn.f32.u32 %f34, %r19;
mul.rn.f32 %f35, %f34, 0f34000000;
cvt.rn.f16.f32 %h24, %f35;
setp.ge.f16 %p3, %h24, %h2;
// Element 1 arithmetic: same formula as element 0 with the per-thread f32
// inputs indexed by %r6 = %r4 | 1 and the f16 inputs %h8 / %h20.
// The per-CTA scalars %f10, %f16, %f24 and %f30 are reused.
mul.wide.u32 %rd135, %r6, 4;
add.s64 %rd136, %rd6, %rd135;
ld.global.nc.f32 %f36, [%rd136];
cvt.rn.f16.f32 %h25, %f36;
add.rn.f16 %h26, %h8, %h25;
mul.rn.f16 %h27, %h26, %h13;
cvt.f32.f16 %f37, %h27;
selp.f32 %f38, %f37, 0f00000000, %p3;
cvt.f32.f16 %f39, %h20;
add.s64 %rd137, %rd9, %rd135;
ld.global.nc.f32 %f40, [%rd137];
mul.rn.f32 %f41, %f10, %f40;
mul.rn.f32 %f42, %f41, %f39;
add.s64 %rd138, %rd12, %rd135;
ld.global.nc.f32 %f43, [%rd138];
mul.rn.f32 %f44, %f16, %f41;
sub.rn.f32 %f45, %f43, %f44;
add.rn.f32 %f46, %f42, %f45;
add.rn.f32 %f47, %f46, %f38;
add.s64 %rd139, %rd15, %rd135;
ld.global.nc.f32 %f48, [%rd139];
mul.rn.f32 %f49, %f24, %f48;
mul.rn.f32 %f50, %f49, %f47;
add.s64 %rd140, %rd3, %rd135;
ld.global.nc.f32 %f51, [%rd140];
mul.rn.f32 %f52, %f30, %f49;
sub.rn.f32 %f53, %f51, %f52;
add.rn.f32 %f54, %f53, %f50;
cvt.rn.f16.f32 %h28, %f54;
// Element 2 mask (%p4): third 32-bit word from the mixing chain.
and.b64 %rd141, %rd104, 4294967295;
and.b64 %rd142, %rd82, 4294967295;
xor.b64 %rd143, %rd142, %rd114;
xor.b64 %rd144, %rd143, 534103459;
mul.lo.s64 %rd145, %rd144, 3449720151;
shr.u64 %rd146, %rd145, 32;
xor.b64 %rd147, %rd141, %rd146;
xor.b64 %rd148, %rd147, 4055616968;
mul.lo.s64 %rd149, %rd148, 3528531795;
shr.u64 %rd150, %rd149, 32;
cvt.u32.u64 %r20, %rd150;
xor.b64 %rd151, %rd105, %rd94;
cvt.u32.u64 %r21, %rd151;
xor.b32 %r22, %r21, 1401181199;
mul.lo.s32 %r23, %r22, -766435501;
xor.b32 %r24, %r23, %r20;
shr.u32 %r25, %r24, 9;
xor.b32 %r26, %r25, 4936337;
cvt.rn.f32.u32 %f55, %r26;
mul.rn.f32 %f56, %f55, 0f34000000;
cvt.rn.f16.f32 %h29, %f56;
setp.ge.f16 %p4, %h29, %h2;
// Element 2 arithmetic: same formula, per-thread f32 inputs indexed by
// %r7 = %r4 | 2, f16 inputs %h9 / %h21.
mul.wide.u32 %rd152, %r7, 4;
add.s64 %rd153, %rd6, %rd152;
ld.global.nc.f32 %f57, [%rd153];
cvt.rn.f16.f32 %h30, %f57;
add.rn.f16 %h31, %h9, %h30;
mul.rn.f16 %h32, %h31, %h13;
cvt.f32.f16 %f58, %h32;
selp.f32 %f59, %f58, 0f00000000, %p4;
cvt.f32.f16 %f60, %h21;
add.s64 %rd154, %rd9, %rd152;
ld.global.nc.f32 %f61, [%rd154];
mul.rn.f32 %f62, %f10, %f61;
mul.rn.f32 %f63, %f62, %f60;
add.s64 %rd155, %rd12, %rd152;
ld.global.nc.f32 %f64, [%rd155];
mul.rn.f32 %f65, %f16, %f62;
sub.rn.f32 %f66, %f64, %f65;
add.rn.f32 %f67, %f63, %f66;
add.rn.f32 %f68, %f67, %f59;
add.s64 %rd156, %rd15, %rd152;
ld.global.nc.f32 %f69, [%rd156];
mul.rn.f32 %f70, %f24, %f69;
mul.rn.f32 %f71, %f70, %f68;
add.s64 %rd157, %rd3, %rd152;
ld.global.nc.f32 %f72, [%rd157];
mul.rn.f32 %f73, %f30, %f70;
sub.rn.f32 %f74, %f72, %f73;
add.rn.f32 %f75, %f74, %f71;
cvt.rn.f16.f32 %h33, %f75;
// Element 3 mask (%p5): fourth 32-bit word from the mixing chain.
xor.b64 %rd158, %rd83, %rd88;
xor.b64 %rd159, %rd158, 1684936478;
mul.lo.s64 %rd160, %rd159, 3449720151;
xor.b64 %rd161, %rd146, %rd160;
cvt.u32.u64 %r27, %rd161;
xor.b32 %r28, %r27, -239350328;
mul.lo.s32 %r29, %r28, -766435501;
shr.u32 %r30, %r29, 9;
cvt.rn.f32.u32 %f76, %r30;
mul.rn.f32 %f77, %f76, 0f34000000;
cvt.rn.f16.f32 %h34, %f77;
setp.ge.f16 %p5, %h34, %h2;
// Element 3 arithmetic: same formula; the per-thread f32 inputs are read at
// a +12 byte offset from the element 0 addresses (index %r4 + 3),
// f16 inputs %h10 / %h22.
ld.global.nc.f32 %f78, [%rd119+12];
cvt.rn.f16.f32 %h35, %f78;
add.rn.f16 %h36, %h10, %h35;
mul.rn.f16 %h37, %h36, %h13;
cvt.f32.f16 %f79, %h37;
selp.f32 %f80, %f79, 0f00000000, %p5;
cvt.f32.f16 %f81, %h22;
ld.global.nc.f32 %f82, [%rd123+12];
mul.rn.f32 %f83, %f10, %f82;
mul.rn.f32 %f84, %f83, %f81;
ld.global.nc.f32 %f85, [%rd124+12];
mul.rn.f32 %f86, %f16, %f83;
sub.rn.f32 %f87, %f85, %f86;
add.rn.f32 %f88, %f84, %f87;
add.rn.f32 %f89, %f88, %f80;
ld.global.nc.f32 %f90, [%rd127+12];
mul.rn.f32 %f91, %f24, %f90;
mul.rn.f32 %f92, %f91, %f89;
ld.global.nc.f32 %f93, [%rd128+12];
mul.rn.f32 %f94, %f30, %f91;
sub.rn.f32 %f95, %f93, %f94;
add.rn.f32 %f96, %f95, %f92;
cvt.rn.f16.f32 %h38, %f96;
// Write the four f16 results with a single vector store.
st.global.v4.b16 [%rd130], {%h23, %h28, %h33, %h38};
ret;
}
// .globl fusion_2706
//
// fusion_2706: narrow four consecutive f32 values to f16.
//   param_0 : f32 source, read with one non-coherent v4.f32 load
//   param_1 : f16 destination, written with one v4.b16 store
//   param_2 : declared but never read in this kernel
// Each thread handles elements [e, e + 4) with
//   e = (tid.x << 2) | (ctaid.x << 10).
.visible .entry fusion_2706(
.param .u64 fusion_2706_param_0,
.param .u64 fusion_2706_param_1,
.param .u64 fusion_2706_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %narrow<4>;
    .reg .f32 %wide<4>;
    .reg .b32 %cta_id, %thr_id, %cta_base, %thr_base, %elem;
    .reg .b64 %src_arg, %dst_arg, %src, %dst;
    .reg .b64 %src_off, %dst_off, %src_ptr, %dst_ptr;

    // First element handled by this thread.
    mov.u32 %thr_id, %tid.x;
    mov.u32 %cta_id, %ctaid.x;
    shl.b32 %thr_base, %thr_id, 2;
    shl.b32 %cta_base, %cta_id, 10;
    or.b32 %elem, %thr_base, %cta_base;

    // Source: param_0 + elem * 4 bytes.
    ld.param.u64 %src_arg, [fusion_2706_param_0];
    cvta.to.global.u64 %src, %src_arg;
    mul.wide.u32 %src_off, %elem, 4;
    add.s64 %src_ptr, %src, %src_off;
    ld.global.nc.v4.f32 {%wide0, %wide1, %wide2, %wide3}, [%src_ptr];

    // Round-to-nearest conversion of every lane.
    cvt.rn.f16.f32 %narrow0, %wide0;
    cvt.rn.f16.f32 %narrow1, %wide1;
    cvt.rn.f16.f32 %narrow2, %wide2;
    cvt.rn.f16.f32 %narrow3, %wide3;

    // Destination: param_1 + elem * 2 bytes.
    ld.param.u64 %dst_arg, [fusion_2706_param_1];
    cvta.to.global.u64 %dst, %dst_arg;
    mul.wide.u32 %dst_off, %elem, 2;
    add.s64 %dst_ptr, %dst, %dst_off;
    st.global.v4.b16 [%dst_ptr], {%narrow0, %narrow1, %narrow2, %narrow3};
    ret;
}
// .globl fusion_2239
//
// fusion_2239: each thread writes four consecutive f16 sums to param_0.
//   param_0 : f16 destination, one v4.b16 store at element e
//   param_1 : f16 source, read one element at a time (ld.global.nc.b16)
//   param_2 : f32 source, read one element at a time and rounded to f16
//   param_3 : declared but never read in this kernel
// With e = (tid.x << 2) | (ctaid.x << 10) and, for k = 0..3,
//   col_k = ((tid.x << 2) | k) & (60 + k)
//   grp   = ctaid.x >> 5
//   row   = bits [6, 15) of e
// the value stored at e + k is
//   param_1[row * 1024 + (col_k | (grp << 6))] + f16(param_2[grp * 64 + col_k])
// (element indices; the byte strides below are 2048 / 2 and 256 / 4).
.visible .entry fusion_2239(
.param .u64 fusion_2239_param_0,
.param .u64 fusion_2239_param_1,
.param .u64 fusion_2239_param_2,
.param .u64 fusion_2239_param_3
)
.reqntid 256, 1, 1
{
    .reg .b16 %lhs<4>;
    .reg .b16 %rhs<4>;
    .reg .b16 %sum<4>;
    .reg .f32 %rhs_f32_<4>;
    .reg .b32 %cta_id, %thr_id, %cta_base, %thr_base, %elem;
    .reg .b32 %grp, %grp_base, %row;
    .reg .b32 %sel<4>;
    .reg .b32 %col<4>;
    .reg .b32 %pos<4>;
    .reg .b64 %dst_arg, %lhs_arg, %rhs_arg, %dst, %lhs_base, %rhs_base;
    .reg .b64 %row_off, %row_ptr, %grp_off, %grp_ptr, %dst_off, %dst_ptr;
    .reg .b64 %lhs_off<4>;
    .reg .b64 %lhs_ptr<4>;
    .reg .b64 %rhs_off<4>;
    .reg .b64 %rhs_ptr<4>;

    // Flat index of the first output element.
    mov.u32 %thr_id, %tid.x;
    mov.u32 %cta_id, %ctaid.x;
    shl.b32 %thr_base, %thr_id, 2;
    shl.b32 %cta_base, %cta_id, 10;
    or.b32 %elem, %thr_base, %cta_base;

    // Index pieces shared by all four lanes.
    bfe.u32 %row, %elem, 6, 9;
    shr.u32 %grp, %cta_id, 5;
    shl.b32 %grp_base, %grp, 6;

    // Per-lane column inside a 64-wide group.
    and.b32 %col0, %thr_base, 60;
    or.b32 %sel1, %thr_base, 1;
    and.b32 %col1, %sel1, 61;
    or.b32 %sel2, %thr_base, 2;
    and.b32 %col2, %sel2, 62;
    or.b32 %sel3, %thr_base, 3;
    and.b32 %col3, %sel3, 63;

    // Base of the selected param_1 row (2048 bytes per row).
    ld.param.u64 %lhs_arg, [fusion_2239_param_1];
    cvta.to.global.u64 %lhs_base, %lhs_arg;
    mul.wide.u32 %row_off, %row, 2048;
    add.s64 %row_ptr, %lhs_base, %row_off;

    // Base of the selected param_2 group (256 bytes per group).
    ld.param.u64 %rhs_arg, [fusion_2239_param_2];
    cvta.to.global.u64 %rhs_base, %rhs_arg;
    mul.wide.u32 %grp_off, %grp, 256;
    add.s64 %grp_ptr, %rhs_base, %grp_off;

    // Destination address: param_0 + elem * 2 bytes.
    ld.param.u64 %dst_arg, [fusion_2239_param_0];
    cvta.to.global.u64 %dst, %dst_arg;
    mul.wide.u32 %dst_off, %elem, 2;
    add.s64 %dst_ptr, %dst, %dst_off;

    // Lane 0.
    or.b32 %pos0, %col0, %grp_base;
    mul.wide.u32 %lhs_off0, %pos0, 2;
    add.s64 %lhs_ptr0, %row_ptr, %lhs_off0;
    ld.global.nc.b16 %lhs0, [%lhs_ptr0];
    mul.wide.u32 %rhs_off0, %col0, 4;
    add.s64 %rhs_ptr0, %grp_ptr, %rhs_off0;
    ld.global.nc.f32 %rhs_f32_0, [%rhs_ptr0];
    cvt.rn.f16.f32 %rhs0, %rhs_f32_0;
    add.rn.f16 %sum0, %lhs0, %rhs0;

    // Lane 1.
    or.b32 %pos1, %col1, %grp_base;
    mul.wide.u32 %lhs_off1, %pos1, 2;
    add.s64 %lhs_ptr1, %row_ptr, %lhs_off1;
    ld.global.nc.b16 %lhs1, [%lhs_ptr1];
    mul.wide.u32 %rhs_off1, %col1, 4;
    add.s64 %rhs_ptr1, %grp_ptr, %rhs_off1;
    ld.global.nc.f32 %rhs_f32_1, [%rhs_ptr1];
    cvt.rn.f16.f32 %rhs1, %rhs_f32_1;
    add.rn.f16 %sum1, %lhs1, %rhs1;

    // Lane 2.
    or.b32 %pos2, %col2, %grp_base;
    mul.wide.u32 %lhs_off2, %pos2, 2;
    add.s64 %lhs_ptr2, %row_ptr, %lhs_off2;
    ld.global.nc.b16 %lhs2, [%lhs_ptr2];
    mul.wide.u32 %rhs_off2, %col2, 4;
    add.s64 %rhs_ptr2, %grp_ptr, %rhs_off2;
    ld.global.nc.f32 %rhs_f32_2, [%rhs_ptr2];
    cvt.rn.f16.f32 %rhs2, %rhs_f32_2;
    add.rn.f16 %sum2, %lhs2, %rhs2;

    // Lane 3.
    or.b32 %pos3, %col3, %grp_base;
    mul.wide.u32 %lhs_off3, %pos3, 2;
    add.s64 %lhs_ptr3, %row_ptr, %lhs_off3;
    ld.global.nc.b16 %lhs3, [%lhs_ptr3];
    mul.wide.u32 %rhs_off3, %col3, 4;
    add.s64 %rhs_ptr3, %grp_ptr, %rhs_off3;
    ld.global.nc.f32 %rhs_f32_3, [%rhs_ptr3];
    cvt.rn.f16.f32 %rhs3, %rhs_f32_3;
    add.rn.f16 %sum3, %lhs3, %rhs3;

    // One vector store for all four results.
    st.global.v4.b16 [%dst_ptr], {%sum0, %sum1, %sum2, %sum3};
    ret;
}
// .globl fusion_2707
// fusion_2707: element-wise f32 -> f16 conversion.
//   param_0 : source buffer, read as f32 (generic address, converted to global)
//   param_1 : destination buffer, written as f16 (generic address, converted to global)
//   param_2 : declared but never read by this kernel
// Launch contract: exactly 256 threads per CTA (.reqntid). Each thread converts
// four consecutive elements starting at index (ctaid.x << 10) | (tid.x << 2).
.visible .entry fusion_2707(
    .param .u64 fusion_2707_param_0,
    .param .u64 fusion_2707_param_1,
    .param .u64 fusion_2707_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %half<4>;                 // converted f16 results
    .reg .f32 %val<4>;                  // loaded f32 inputs
    .reg .b32 %blk, %thr, %blk_base, %thr_base, %elem;
    .reg .b64 %src_generic, %dst_generic, %src, %dst;
    .reg .b64 %src_off, %dst_off, %src_ptr, %dst_ptr;

    // Resolve both buffer pointers into the global address space.
    ld.param.u64        %src_generic, [fusion_2707_param_0];
    ld.param.u64        %dst_generic, [fusion_2707_param_1];
    cvta.to.global.u64  %dst, %dst_generic;
    cvta.to.global.u64  %src, %src_generic;

    // elem = ctaid.x * 1024 + tid.x * 4 (bit ranges are disjoint, so OR == ADD).
    mov.u32             %blk, %ctaid.x;
    mov.u32             %thr, %tid.x;
    shl.b32             %blk_base, %blk, 10;
    shl.b32             %thr_base, %thr, 2;
    or.b32              %elem, %thr_base, %blk_base;

    // Byte offsets: 4 bytes per f32 on the source side, 2 bytes per f16 on the
    // destination side.
    mul.wide.u32        %src_off, %elem, 4;
    mul.wide.u32        %dst_off, %elem, 2;
    add.s64             %src_ptr, %src, %src_off;
    add.s64             %dst_ptr, %dst, %dst_off;

    // One 16-byte vector load through the non-coherent (read-only) path.
    ld.global.nc.v4.f32 {%val0, %val1, %val2, %val3}, [%src_ptr];

    // Round-to-nearest-even narrowing of each lane.
    cvt.rn.f16.f32      %half0, %val0;
    cvt.rn.f16.f32      %half1, %val1;
    cvt.rn.f16.f32      %half2, %val2;
    cvt.rn.f16.f32      %half3, %val3;

    // One 8-byte vector store of the four f16 results.
    st.global.v4.b16    [%dst_ptr], {%half0, %half1, %half2, %half3};
    ret;
}
// .globl fusion_2240
// fusion_2240: out[i] = in_f16[gather(i)] + (f16) in_f32[bias(i)], four elements per thread.
//   param_0 : f16 output buffer, written at linear index i
//   param_1 : f16 input buffer, read at a permuted index (see below)
//   param_2 : f32 input buffer, converted to f16 before the add
//   param_3 : declared but never read by this kernel
// Launch contract: exactly 256 threads per CTA (.reqntid).
// Index decomposition of i = (ctaid.x << 10) | (tid.x << 2) + {0,1,2,3}:
//   c = i & 63          (bits 0..5)
//   b = (i >> 6) & 511  (bits 6..14)
//   a = ctaid.x >> 5    (= i >> 15)
//   f16 input element index = b*1024 + a*64 + c
//   f32 input element index = a*64 + c
// NOTE(review): this looks like a broadcast add fused with a dimension
// transpose; the logical tensor shapes are not visible here — confirm against the HLO.
.visible .entry fusion_2240(
.param .u64 fusion_2240_param_0,
.param .u64 fusion_2240_param_1,
.param .u64 fusion_2240_param_2,
.param .u64 fusion_2240_param_3
)
.reqntid 256, 1, 1
{
.reg .b16 %h<13>;
.reg .f32 %f<5>;
.reg .b32 %r<20>;
.reg .b64 %rd<29>;
// Load the three used buffer pointers and convert them to global addresses:
// %rd6 = param_0 (output), %rd5 = param_1 (f16 input), %rd3 = param_2 (f32 input).
ld.param.u64 %rd1, [fusion_2240_param_0];
ld.param.u64 %rd2, [fusion_2240_param_2];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2240_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd1;
// %r5 = linear index of this thread's first element (OR acts as ADD: tid.x*4 < 1024).
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
// %r6, %r9, %r11, %r13 = c for elements 0..3 (low six bits of i+0 .. i+3).
and.b32 %r6, %r4, 60;
// %r7 = a = ctaid.x >> 5.
shr.u32 %r7, %r1, 5;
or.b32 %r8, %r4, 1;
and.b32 %r9, %r8, 61;
or.b32 %r10, %r4, 2;
and.b32 %r11, %r10, 62;
or.b32 %r12, %r4, 3;
and.b32 %r13, %r12, 63;
// %r14 = b = 9-bit field of %r5 starting at bit 6.
bfe.u32 %r14, %r5, 6, 9;
// %r15 = a * 64.
shl.b32 %r15, %r7, 6;
or.b32 %r16, %r6, %r15;
// %rd8 = param_1 + b * 2048 bytes (i.e. b * 1024 f16 elements); shared by all four elements.
mul.wide.u32 %rd7, %r14, 2048;
add.s64 %rd8, %rd5, %rd7;
// ---- element 0 ----
mul.wide.u32 %rd9, %r16, 2;
add.s64 %rd10, %rd8, %rd9;
ld.global.nc.b16 %h1, [%rd10];
// %rd12 = param_2 + a * 256 bytes (i.e. a * 64 f32 elements); shared by all four elements.
mul.wide.u32 %rd11, %r7, 256;
add.s64 %rd12, %rd3, %rd11;
mul.wide.u32 %rd13, %r6, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.nc.f32 %f1, [%rd14];
// The f32 operand is rounded to f16 first; the add itself is done in f16.
cvt.rn.f16.f32 %h2, %f1;
add.rn.f16 %h3, %h1, %h2;
// %rd16 = output address = param_0 + i * 2 bytes.
mul.wide.u32 %rd15, %r5, 2;
add.s64 %rd16, %rd6, %rd15;
// ---- element 1 ----
or.b32 %r17, %r9, %r15;
mul.wide.u32 %rd17, %r17, 2;
add.s64 %rd18, %rd8, %rd17;
ld.global.nc.b16 %h4, [%rd18];
mul.wide.u32 %rd19, %r9, 4;
add.s64 %rd20, %rd12, %rd19;
ld.global.nc.f32 %f2, [%rd20];
cvt.rn.f16.f32 %h5, %f2;
add.rn.f16 %h6, %h4, %h5;
// ---- element 2 ----
or.b32 %r18, %r11, %r15;
mul.wide.u32 %rd21, %r18, 2;
add.s64 %rd22, %rd8, %rd21;
ld.global.nc.b16 %h7, [%rd22];
mul.wide.u32 %rd23, %r11, 4;
add.s64 %rd24, %rd12, %rd23;
ld.global.nc.f32 %f3, [%rd24];
cvt.rn.f16.f32 %h8, %f3;
add.rn.f16 %h9, %h7, %h8;
// ---- element 3 ----
or.b32 %r19, %r13, %r15;
mul.wide.u32 %rd25, %r19, 2;
add.s64 %rd26, %rd8, %rd25;
ld.global.nc.b16 %h10, [%rd26];
mul.wide.u32 %rd27, %r13, 4;
add.s64 %rd28, %rd12, %rd27;
ld.global.nc.f32 %f4, [%rd28];
cvt.rn.f16.f32 %h11, %f4;
add.rn.f16 %h12, %h10, %h11;
// Single 8-byte vector store of the four f16 results.
st.global.v4.b16 [%rd16], {%h3, %h6, %h9, %h12};
ret;
}
// .globl fusion_2237
// fusion_2237: per-CTA maximum of 512 transformed f16 values, merged into an
// f32 output slot with an atomic compare-and-swap loop.
//   param_0 : f16 input; the CTA reads elements [ctaid.x*512, ctaid.x*512 + 512)
//   param_1 : f32 output; slot ctaid.x is updated with max(old, CTA result)
//   param_2 : s32 input of 512 entries, indexed by the position inside the CTA's chunk
//   param_3 : declared but never read by this kernel
// Launch contract: exactly 32 threads per CTA (.reqntid), i.e. a single warp.
// Per element: v = x - (1.0 - (f16) s) * 10000.0, computed in f16
// (0x3C00 is 1.0 and 0x70E2 is 10000.0 in IEEE half precision), then widened to f32.
// NOTE(review): the s32 operand is presumably a 0/1 attention mask, and the
// output slot presumably needs to be pre-initialised by the caller — confirm.
.visible .entry fusion_2237(
.param .u64 fusion_2237_param_0,
.param .u64 fusion_2237_param_1,
.param .u64 fusion_2237_param_2,
.param .u64 fusion_2237_param_3
)
.reqntid 32, 1, 1
{
// 4-byte per-thread local slot; used below as a neutral (-inf) source for
// lanes other than lane 0 in the second reduction stage.
.local .align 4 .b8 __local_depot36[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<4>;
.reg .b16 %h<83>;
.reg .b32 %hh<9>;
.reg .f32 %f<57>;
.reg .b32 %r<37>;
.reg .b64 %rd<37>;
mov.u64 %SPL, __local_depot36;
cvta.local.u64 %SP, %SPL;
// %rd9 = param_0 (f16 input), %rd6 = param_2 (s32 input), both as global addresses.
ld.param.u64 %rd4, [fusion_2237_param_0];
ld.param.u64 %rd5, [fusion_2237_param_2];
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd9, %rd4;
// %rd10 = generic address of the local slot, %rd1 = its local-space address.
add.u64 %rd10, %SP, 0;
add.u64 %rd1, %SPL, 0;
// %r6 = tid.x * 2 (column of this thread's first element), %r8 = ctaid.x*512 + tid.x*2.
mov.u32 %r1, %tid.x;
mov.u32 %r5, %ctaid.x;
shl.b32 %r6, %r1, 1;
shl.b32 %r7, %r5, 9;
or.b32 %r8, %r7, %r6;
// %rd12 = address of this thread's first f16 pair. Each thread handles eight
// pairs spaced 128 bytes (64 f16 elements) apart: 16 values per thread.
mul.wide.u32 %rd11, %r8, 2;
add.s64 %rd12, %rd9, %rd11;
// ---- pair 0: columns tid.x*2 and tid.x*2 + 1 ----
ld.global.nc.b32 %hh1, [%rd12];
mov.b32 {%h1, %h2}, %hh1;
// %rd14 = address of the s32 entry for column tid.x*2; both entries are loaded at once.
mul.wide.u32 %rd13, %r6, 4;
add.s64 %rd14, %rd6, %rd13;
ld.global.nc.v2.u32 {%r9, %r10}, [%rd14];
cvt.rn.f16.s32 %h3, %r9;
// %h4 = 1.0 (f16), %h6 = 10000.0 (f16); reused for every element below.
mov.b16 %h4, 0x3C00;
sub.rn.f16 %h5, %h4, %h3;
mov.b16 %h6, 0x70E2;
mul.rn.f16 %h7, %h5, %h6;
sub.rn.f16 %h8, %h1, %h7;
cvt.f32.f16 %f2, %h8;
// Running maximum starts from -inf (0fFF800000).
max.f32 %f3, %f2, 0fFF800000;
cvt.rn.f16.s32 %h9, %r10;
sub.rn.f16 %h10, %h4, %h9;
mul.rn.f16 %h11, %h10, %h6;
sub.rn.f16 %h12, %h2, %h11;
cvt.f32.f16 %f4, %h12;
max.f32 %f5, %f3, %f4;
// ---- pair 1: columns tid.x*2 + 64 and + 65 ----
// The s32 entry of the low element is addressed via (tid.x*2 | 64) * 4; the
// entry of the high element via %rd14 + 64*4 + 4. Later pairs follow the same scheme.
or.b32 %r11, %r6, 64;
ld.global.nc.b32 %hh2, [%rd12+128];
mov.b32 {%h13, %h14}, %hh2;
mul.wide.u32 %rd15, %r11, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.u32 %r12, [%rd16];
cvt.rn.f16.s32 %h15, %r12;
sub.rn.f16 %h16, %h4, %h15;
mul.rn.f16 %h17, %h16, %h6;
sub.rn.f16 %h18, %h13, %h17;
cvt.f32.f16 %f6, %h18;
max.f32 %f7, %f5, %f6;
ld.global.nc.u32 %r13, [%rd14+260];
cvt.rn.f16.s32 %h19, %r13;
sub.rn.f16 %h20, %h4, %h19;
mul.rn.f16 %h21, %h20, %h6;
sub.rn.f16 %h22, %h14, %h21;
cvt.f32.f16 %f8, %h22;
max.f32 %f9, %f7, %f8;
// ---- pair 2: columns tid.x*2 + 128 and + 129 ----
or.b32 %r14, %r6, 128;
ld.global.nc.b32 %hh3, [%rd12+256];
mov.b32 {%h23, %h24}, %hh3;
mul.wide.u32 %rd17, %r14, 4;
add.s64 %rd18, %rd6, %rd17;
ld.global.nc.u32 %r15, [%rd18];
cvt.rn.f16.s32 %h25, %r15;
sub.rn.f16 %h26, %h4, %h25;
mul.rn.f16 %h27, %h26, %h6;
sub.rn.f16 %h28, %h23, %h27;
cvt.f32.f16 %f10, %h28;
max.f32 %f11, %f9, %f10;
ld.global.nc.u32 %r16, [%rd14+516];
cvt.rn.f16.s32 %h29, %r16;
sub.rn.f16 %h30, %h4, %h29;
mul.rn.f16 %h31, %h30, %h6;
sub.rn.f16 %h32, %h24, %h31;
cvt.f32.f16 %f12, %h32;
max.f32 %f13, %f11, %f12;
// ---- pair 3: columns tid.x*2 + 192 and + 193 ----
or.b32 %r17, %r6, 192;
ld.global.nc.b32 %hh4, [%rd12+384];
mov.b32 {%h33, %h34}, %hh4;
mul.wide.u32 %rd19, %r17, 4;
add.s64 %rd20, %rd6, %rd19;
ld.global.nc.u32 %r18, [%rd20];
cvt.rn.f16.s32 %h35, %r18;
sub.rn.f16 %h36, %h4, %h35;
mul.rn.f16 %h37, %h36, %h6;
sub.rn.f16 %h38, %h33, %h37;
cvt.f32.f16 %f14, %h38;
max.f32 %f15, %f13, %f14;
ld.global.nc.u32 %r19, [%rd14+772];
cvt.rn.f16.s32 %h39, %r19;
sub.rn.f16 %h40, %h4, %h39;
mul.rn.f16 %h41, %h40, %h6;
sub.rn.f16 %h42, %h34, %h41;
cvt.f32.f16 %f16, %h42;
max.f32 %f17, %f15, %f16;
// ---- pair 4: columns tid.x*2 + 256 and + 257 ----
or.b32 %r20, %r6, 256;
ld.global.nc.b32 %hh5, [%rd12+512];
mov.b32 {%h43, %h44}, %hh5;
mul.wide.u32 %rd21, %r20, 4;
add.s64 %rd22, %rd6, %rd21;
ld.global.nc.u32 %r21, [%rd22];
cvt.rn.f16.s32 %h45, %r21;
sub.rn.f16 %h46, %h4, %h45;
mul.rn.f16 %h47, %h46, %h6;
sub.rn.f16 %h48, %h43, %h47;
cvt.f32.f16 %f18, %h48;
max.f32 %f19, %f17, %f18;
ld.global.nc.u32 %r22, [%rd14+1028];
cvt.rn.f16.s32 %h49, %r22;
sub.rn.f16 %h50, %h4, %h49;
mul.rn.f16 %h51, %h50, %h6;
sub.rn.f16 %h52, %h44, %h51;
cvt.f32.f16 %f20, %h52;
max.f32 %f21, %f19, %f20;
// ---- pair 5: columns tid.x*2 + 320 and + 321 ----
or.b32 %r23, %r6, 320;
ld.global.nc.b32 %hh6, [%rd12+640];
mov.b32 {%h53, %h54}, %hh6;
mul.wide.u32 %rd23, %r23, 4;
add.s64 %rd24, %rd6, %rd23;
ld.global.nc.u32 %r24, [%rd24];
cvt.rn.f16.s32 %h55, %r24;
sub.rn.f16 %h56, %h4, %h55;
mul.rn.f16 %h57, %h56, %h6;
sub.rn.f16 %h58, %h53, %h57;
cvt.f32.f16 %f22, %h58;
max.f32 %f23, %f21, %f22;
ld.global.nc.u32 %r25, [%rd14+1284];
cvt.rn.f16.s32 %h59, %r25;
sub.rn.f16 %h60, %h4, %h59;
mul.rn.f16 %h61, %h60, %h6;
sub.rn.f16 %h62, %h54, %h61;
cvt.f32.f16 %f24, %h62;
max.f32 %f25, %f23, %f24;
// ---- pair 6: columns tid.x*2 + 384 and + 385 ----
or.b32 %r26, %r6, 384;
ld.global.nc.b32 %hh7, [%rd12+768];
mov.b32 {%h63, %h64}, %hh7;
mul.wide.u32 %rd25, %r26, 4;
add.s64 %rd26, %rd6, %rd25;
ld.global.nc.u32 %r27, [%rd26];
cvt.rn.f16.s32 %h65, %r27;
sub.rn.f16 %h66, %h4, %h65;
mul.rn.f16 %h67, %h66, %h6;
sub.rn.f16 %h68, %h63, %h67;
cvt.f32.f16 %f26, %h68;
max.f32 %f27, %f25, %f26;
ld.global.nc.u32 %r28, [%rd14+1540];
cvt.rn.f16.s32 %h69, %r28;
sub.rn.f16 %h70, %h4, %h69;
mul.rn.f16 %h71, %h70, %h6;
sub.rn.f16 %h72, %h64, %h71;
cvt.f32.f16 %f28, %h72;
max.f32 %f29, %f27, %f28;
// ---- pair 7: columns tid.x*2 + 448 and + 449 ----
or.b32 %r29, %r6, 448;
ld.global.nc.b32 %hh8, [%rd12+896];
mov.b32 {%h73, %h74}, %hh8;
mul.wide.u32 %rd27, %r29, 4;
add.s64 %rd28, %rd6, %rd27;
ld.global.nc.u32 %r30, [%rd28];
cvt.rn.f16.s32 %h75, %r30;
sub.rn.f16 %h76, %h4, %h75;
mul.rn.f16 %h77, %h76, %h6;
sub.rn.f16 %h78, %h73, %h77;
cvt.f32.f16 %f30, %h78;
max.f32 %f31, %f29, %f30;
ld.global.nc.u32 %r31, [%rd14+1796];
cvt.rn.f16.s32 %h79, %r31;
sub.rn.f16 %h80, %h4, %h79;
mul.rn.f16 %h81, %h80, %h6;
sub.rn.f16 %h82, %h74, %h81;
cvt.f32.f16 %f32, %h82;
max.f32 %f33, %f31, %f32;
// ---- stage 1: warp-level max via shuffle-down with offsets 16, 8, 4, 2, 1 ----
// Member mask -1 names all 32 lanes; after the last step lane 0 holds the warp maximum.
shfl.sync.down.b32 %f34, %f33, 16, 31, -1;
max.f32 %f35, %f33, %f34;
shfl.sync.down.b32 %f36, %f35, 8, 31, -1;
max.f32 %f37, %f35, %f36;
shfl.sync.down.b32 %f38, %f37, 4, 31, -1;
max.f32 %f39, %f37, %f38;
shfl.sync.down.b32 %f40, %f39, 2, 31, -1;
max.f32 %f41, %f39, %f40;
shfl.sync.down.b32 %f42, %f41, 1, 31, -1;
// %p1 = (tid.x == 0); only lane 0 performs the final combine and publishes it.
setp.eq.s32 %p1, %r1, 0;
@%p1 bra LBB36_3;
bra.uni LBB36_1;
LBB36_3:
max.f32 %f1, %f41, %f42;
st.shared.f32 [shared_cache_08], %f1;
LBB36_1:
// CTA-wide barrier so the shared-memory write above is visible before it is read.
bar.sync 0;
// ---- stage 2: reduce the per-warp partials ----
// %rd3 = &shared_cache_08[tid.x] (shared space), %rd34 = the same as a generic address.
mul.wide.u32 %rd32, %r1, 4;
mov.u64 %rd33, shared_cache_08;
add.s64 %rd3, %rd33, %rd32;
cvta.shared.u64 %rd34, %rd3;
// -8388608 is the bit pattern 0xFF800000, i.e. f32 -inf: the identity for max.
mov.u32 %r34, -8388608;
st.local.u32 [%rd1], %r34;
// Lane 0 reads its shared slot; every other lane reads its private -inf local slot.
selp.b64 %rd36, %rd34, %rd10, %p1;
ld.f32 %f43, [%rd36];
shfl.sync.down.b32 %f44, %f43, 16, 31, -1;
max.f32 %f45, %f43, %f44;
shfl.sync.down.b32 %f46, %f45, 8, 31, -1;
max.f32 %f47, %f45, %f46;
shfl.sync.down.b32 %f48, %f47, 4, 31, -1;
max.f32 %f49, %f47, %f48;
shfl.sync.down.b32 %f50, %f49, 2, 31, -1;
max.f32 %f51, %f49, %f50;
shfl.sync.down.b32 %f52, %f51, 1, 31, -1;
max.f32 %f53, %f51, %f52;
// Written back through the same generic address the value was read from.
st.f32 [%rd36], %f53;
@%p1 bra LBB36_4;
bra.uni LBB36_2;
LBB36_4:
// Lane 0 only: %rd2 = param_1 + (ctaid.x >> 9)*2048 + (ctaid.x & 511)*4 bytes,
// which equals param_1 + ctaid.x * 4.
ld.param.u64 %rd7, [fusion_2237_param_1];
shr.u32 %r33, %r5, 9;
cvta.to.global.u64 %rd8, %rd7;
and.b32 %r32, %r5, 511;
mul.wide.u32 %rd29, %r33, 2048;
add.s64 %rd30, %rd8, %rd29;
mul.wide.u32 %rd31, %r32, 4;
add.s64 %rd2, %rd30, %rd31;
// %r36 = currently observed bits of the output slot (the "expected" CAS operand).
ld.global.u32 %r36, [%rd2];
LBB36_5:
// Atomic float max built from a CAS loop: compute max(observed, CTA result) and
// try to swap it in; on failure retry with the value the CAS returned.
mov.b32 %f54, %r36;
ld.shared.f32 %f55, [%rd3];
max.f32 %f56, %f54, %f55;
mov.b32 %r35, %f56;
atom.global.cas.b32 %r4, [%rd2], %r36, %r35;
// %p3 is evaluated against the old expectation before %r36 is overwritten.
setp.eq.s32 %p3, %r4, %r36;
mov.u32 %r36, %r4;
@%p3 bra LBB36_2;
bra.uni LBB36_5;
LBB36_2:
ret;
}
// .globl fusion_2235
// fusion_2235: per-CTA sum of exp(v - m) over 512 transformed f16 values,
// accumulated into an f32 output slot with an atomic add.
//   param_0 : f16 input; the CTA reads elements [ctaid.x*512, ctaid.x*512 + 512)
//   param_1 : f32 output; slot ctaid.x receives the CTA's sum via atom.add
//   param_2 : f32 input; element ctaid.x is the value m subtracted before exp
//   param_3 : s32 input of 512 entries, indexed by the position inside the CTA's chunk
//   param_4 : declared but never read by this kernel
// Launch contract: exactly 32 threads per CTA (.reqntid), i.e. a single warp.
// Per element: v = x - (1.0 - (f16) s) * 10000.0 in f16 (0x3C00 = 1.0,
// 0x70E2 = 10000.0), widened to f32, then d = v - m and e = exp(d).
// exp(d) is expanded inline as:
//   n = trunc(d * log2(e))                      (0f3FB8AA3B ~ 1.442695)
//   r = d - n*ln2, with ln2 split in two parts  (0fBF317200 ~ -0.693146, 0fB5BFBE8E ~ -1.4286e-6)
//   e = 2^n * 2^(r * log2(e))
//   e = 0 if d < -105, e = +inf if d > 105      (0fC2D20000, 0f42D20000, 0f7F800000)
// NOTE(review): m is presumably the row maximum produced by a max-reduction
// kernel and s a 0/1 attention mask (softmax denominator) — confirm. The output
// slot presumably has to be zeroed by the caller before launch — confirm.
.visible .entry fusion_2235(
.param .u64 fusion_2235_param_0,
.param .u64 fusion_2235_param_1,
.param .u64 fusion_2235_param_2,
.param .u64 fusion_2235_param_3,
.param .u64 fusion_2235_param_4
)
.reqntid 32, 1, 1
{
// 4-byte per-thread local slot; used below as a neutral (0.0) source for
// lanes other than lane 0 in the second reduction stage.
.local .align 4 .b8 __local_depot37[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<35>;
.reg .b16 %h<83>;
.reg .b32 %hh<9>;
.reg .f32 %f<249>;
.reg .b32 %r<32>;
.reg .b64 %rd<41>;
mov.u64 %SPL, __local_depot37;
cvta.local.u64 %SP, %SPL;
// %rd11 = param_0 (f16 input), %rd6 = param_3 (s32 input), %rd9 = param_2 (f32 m),
// all converted to global addresses.
ld.param.u64 %rd4, [fusion_2235_param_0];
ld.param.u64 %rd5, [fusion_2235_param_3];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd8, [fusion_2235_param_2];
cvta.to.global.u64 %rd9, %rd8;
cvta.to.global.u64 %rd11, %rd4;
// %rd12 = generic address of the local slot, %rd1 = its local-space address.
add.u64 %rd12, %SP, 0;
add.u64 %rd1, %SPL, 0;
// %r3 = tid.x * 2 (column of this thread's first element), %r5 = ctaid.x*512 + tid.x*2.
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
shl.b32 %r3, %r1, 1;
shl.b32 %r4, %r2, 9;
or.b32 %r5, %r4, %r3;
// %rd14 = address of this thread's first f16 pair. Each thread handles eight
// pairs spaced 128 bytes (64 f16 elements) apart: 16 values per thread.
mul.wide.u32 %rd13, %r5, 2;
add.s64 %rd14, %rd11, %rd13;
// ---- pair 0, low half: column tid.x*2 ----
ld.global.nc.b32 %hh1, [%rd14];
mov.b32 {%h1, %h2}, %hh1;
// %rd16 = address of the s32 entry for column tid.x*2; both entries are loaded at once.
mul.wide.u32 %rd15, %r3, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.v2.u32 {%r6, %r7}, [%rd16];
cvt.rn.f16.s32 %h3, %r6;
// %h4 = 1.0 (f16), %h6 = 10000.0 (f16); reused for every element below.
mov.b16 %h4, 0x3C00;
sub.rn.f16 %h5, %h4, %h3;
mov.b16 %h6, 0x70E2;
mul.rn.f16 %h7, %h5, %h6;
sub.rn.f16 %h8, %h1, %h7;
cvt.f32.f16 %f2, %h8;
// %f3 = m = param_2[ctaid.x]; shared by all 16 elements of this thread.
mul.wide.u32 %rd17, %r2, 4;
add.s64 %rd18, %rd9, %rd17;
ld.global.nc.f32 %f3, [%rd18];
// d = v - m, then the inline exp(d) expansion described in the header.
sub.rn.f32 %f4, %f2, %f3;
mul.rn.f32 %f5, %f4, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f6, %f5;
add.rn.f32 %f7, %f6, 0f00000000;
ex2.approx.f32 %f8, %f7;
fma.rn.f32 %f9, %f6, 0fBF317200, %f4;
fma.rn.f32 %f10, %f6, 0fB5BFBE8E, %f9;
mul.rn.f32 %f11, %f10, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f12, %f11;
mul.rn.f32 %f13, %f8, %f12;
setp.lt.f32 %p1, %f4, 0fC2D20000;
setp.gt.f32 %p2, %f4, 0f42D20000;
// Only the first term has this extra "+ 0.0"; it looks like the folded
// initial value of the running sum (NOTE(review): confirm).
add.rn.f32 %f14, %f13, 0f00000000;
selp.f32 %f15, 0f00000000, %f14, %p1;
selp.f32 %f16, 0f7F800000, %f15, %p2;
// ---- pair 0, high half: column tid.x*2 + 1 ----
cvt.rn.f16.s32 %h9, %r7;
sub.rn.f16 %h10, %h4, %h9;
mul.rn.f16 %h11, %h10, %h6;
sub.rn.f16 %h12, %h2, %h11;
cvt.f32.f16 %f17, %h12;
sub.rn.f32 %f18, %f17, %f3;
mul.rn.f32 %f19, %f18, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f20, %f19;
add.rn.f32 %f21, %f20, 0f00000000;
ex2.approx.f32 %f22, %f21;
fma.rn.f32 %f23, %f20, 0fBF317200, %f18;
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23;
mul.rn.f32 %f25, %f24, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f26, %f25;
mul.rn.f32 %f27, %f22, %f26;
setp.lt.f32 %p3, %f18, 0fC2D20000;
selp.f32 %f28, 0f00000000, %f27, %p3;
setp.gt.f32 %p4, %f18, 0f42D20000;
selp.f32 %f29, 0f7F800000, %f28, %p4;
// Running sum of the exp terms (sequential f32 adds, in element order).
add.rn.f32 %f30, %f16, %f29;
// ---- pair 1, low half: column tid.x*2 + 64 ----
// The s32 entry of the low element is addressed via (tid.x*2 | 64) * 4; the
// entry of the high element via %rd16 + 64*4 + 4. Later pairs follow the same scheme.
or.b32 %r8, %r3, 64;
ld.global.nc.b32 %hh2, [%rd14+128];
mov.b32 {%h13, %h14}, %hh2;
mul.wide.u32 %rd19, %r8, 4;
add.s64 %rd20, %rd6, %rd19;
ld.global.nc.u32 %r9, [%rd20];
cvt.rn.f16.s32 %h15, %r9;
sub.rn.f16 %h16, %h4, %h15;
mul.rn.f16 %h17, %h16, %h6;
sub.rn.f16 %h18, %h13, %h17;
cvt.f32.f16 %f31, %h18;
sub.rn.f32 %f32, %f31, %f3;
mul.rn.f32 %f33, %f32, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f34, %f33;
add.rn.f32 %f35, %f34, 0f00000000;
ex2.approx.f32 %f36, %f35;
fma.rn.f32 %f37, %f34, 0fBF317200, %f32;
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37;
mul.rn.f32 %f39, %f38, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f40, %f39;
mul.rn.f32 %f41, %f36, %f40;
setp.lt.f32 %p5, %f32, 0fC2D20000;
selp.f32 %f42, 0f00000000, %f41, %p5;
setp.gt.f32 %p6, %f32, 0f42D20000;
selp.f32 %f43, 0f7F800000, %f42, %p6;
add.rn.f32 %f44, %f30, %f43;
// ---- pair 1, high half: column tid.x*2 + 65 ----
ld.global.nc.u32 %r10, [%rd16+260];
cvt.rn.f16.s32 %h19, %r10;
sub.rn.f16 %h20, %h4, %h19;
mul.rn.f16 %h21, %h20, %h6;
sub.rn.f16 %h22, %h14, %h21;
cvt.f32.f16 %f45, %h22;
sub.rn.f32 %f46, %f45, %f3;
mul.rn.f32 %f47, %f46, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f48, %f47;
add.rn.f32 %f49, %f48, 0f00000000;
ex2.approx.f32 %f50, %f49;
fma.rn.f32 %f51, %f48, 0fBF317200, %f46;
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51;
mul.rn.f32 %f53, %f52, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f54, %f53;
mul.rn.f32 %f55, %f50, %f54;
setp.lt.f32 %p7, %f46, 0fC2D20000;
selp.f32 %f56, 0f00000000, %f55, %p7;
setp.gt.f32 %p8, %f46, 0f42D20000;
selp.f32 %f57, 0f7F800000, %f56, %p8;
add.rn.f32 %f58, %f44, %f57;
// ---- pair 2, low half: column tid.x*2 + 128 ----
or.b32 %r11, %r3, 128;
ld.global.nc.b32 %hh3, [%rd14+256];
mov.b32 {%h23, %h24}, %hh3;
mul.wide.u32 %rd21, %r11, 4;
add.s64 %rd22, %rd6, %rd21;
ld.global.nc.u32 %r12, [%rd22];
cvt.rn.f16.s32 %h25, %r12;
sub.rn.f16 %h26, %h4, %h25;
mul.rn.f16 %h27, %h26, %h6;
sub.rn.f16 %h28, %h23, %h27;
cvt.f32.f16 %f59, %h28;
sub.rn.f32 %f60, %f59, %f3;
mul.rn.f32 %f61, %f60, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f62, %f61;
add.rn.f32 %f63, %f62, 0f00000000;
ex2.approx.f32 %f64, %f63;
fma.rn.f32 %f65, %f62, 0fBF317200, %f60;
fma.rn.f32 %f66, %f62, 0fB5BFBE8E, %f65;
mul.rn.f32 %f67, %f66, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f68, %f67;
mul.rn.f32 %f69, %f64, %f68;
setp.lt.f32 %p9, %f60, 0fC2D20000;
selp.f32 %f70, 0f00000000, %f69, %p9;
setp.gt.f32 %p10, %f60, 0f42D20000;
selp.f32 %f71, 0f7F800000, %f70, %p10;
add.rn.f32 %f72, %f58, %f71;
// ---- pair 2, high half: column tid.x*2 + 129 ----
ld.global.nc.u32 %r13, [%rd16+516];
cvt.rn.f16.s32 %h29, %r13;
sub.rn.f16 %h30, %h4, %h29;
mul.rn.f16 %h31, %h30, %h6;
sub.rn.f16 %h32, %h24, %h31;
cvt.f32.f16 %f73, %h32;
sub.rn.f32 %f74, %f73, %f3;
mul.rn.f32 %f75, %f74, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f76, %f75;
add.rn.f32 %f77, %f76, 0f00000000;
ex2.approx.f32 %f78, %f77;
fma.rn.f32 %f79, %f76, 0fBF317200, %f74;
fma.rn.f32 %f80, %f76, 0fB5BFBE8E, %f79;
mul.rn.f32 %f81, %f80, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f82, %f81;
mul.rn.f32 %f83, %f78, %f82;
setp.lt.f32 %p11, %f74, 0fC2D20000;
selp.f32 %f84, 0f00000000, %f83, %p11;
setp.gt.f32 %p12, %f74, 0f42D20000;
selp.f32 %f85, 0f7F800000, %f84, %p12;
add.rn.f32 %f86, %f72, %f85;
// ---- pair 3, low half: column tid.x*2 + 192 ----
or.b32 %r14, %r3, 192;
ld.global.nc.b32 %hh4, [%rd14+384];
mov.b32 {%h33, %h34}, %hh4;
mul.wide.u32 %rd23, %r14, 4;
add.s64 %rd24, %rd6, %rd23;
ld.global.nc.u32 %r15, [%rd24];
cvt.rn.f16.s32 %h35, %r15;
sub.rn.f16 %h36, %h4, %h35;
mul.rn.f16 %h37, %h36, %h6;
sub.rn.f16 %h38, %h33, %h37;
cvt.f32.f16 %f87, %h38;
sub.rn.f32 %f88, %f87, %f3;
mul.rn.f32 %f89, %f88, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f90, %f89;
add.rn.f32 %f91, %f90, 0f00000000;
ex2.approx.f32 %f92, %f91;
fma.rn.f32 %f93, %f90, 0fBF317200, %f88;
fma.rn.f32 %f94, %f90, 0fB5BFBE8E, %f93;
mul.rn.f32 %f95, %f94, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f96, %f95;
mul.rn.f32 %f97, %f92, %f96;
setp.lt.f32 %p13, %f88, 0fC2D20000;
selp.f32 %f98, 0f00000000, %f97, %p13;
setp.gt.f32 %p14, %f88, 0f42D20000;
selp.f32 %f99, 0f7F800000, %f98, %p14;
add.rn.f32 %f100, %f86, %f99;
// ---- pair 3, high half: column tid.x*2 + 193 ----
ld.global.nc.u32 %r16, [%rd16+772];
cvt.rn.f16.s32 %h39, %r16;
sub.rn.f16 %h40, %h4, %h39;
mul.rn.f16 %h41, %h40, %h6;
sub.rn.f16 %h42, %h34, %h41;
cvt.f32.f16 %f101, %h42;
sub.rn.f32 %f102, %f101, %f3;
mul.rn.f32 %f103, %f102, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f104, %f103;
add.rn.f32 %f105, %f104, 0f00000000;
ex2.approx.f32 %f106, %f105;
fma.rn.f32 %f107, %f104, 0fBF317200, %f102;
fma.rn.f32 %f108, %f104, 0fB5BFBE8E, %f107;
mul.rn.f32 %f109, %f108, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f110, %f109;
mul.rn.f32 %f111, %f106, %f110;
setp.lt.f32 %p15, %f102, 0fC2D20000;
selp.f32 %f112, 0f00000000, %f111, %p15;
setp.gt.f32 %p16, %f102, 0f42D20000;
selp.f32 %f113, 0f7F800000, %f112, %p16;
add.rn.f32 %f114, %f100, %f113;
// ---- pair 4, low half: column tid.x*2 + 256 ----
or.b32 %r17, %r3, 256;
ld.global.nc.b32 %hh5, [%rd14+512];
mov.b32 {%h43, %h44}, %hh5;
mul.wide.u32 %rd25, %r17, 4;
add.s64 %rd26, %rd6, %rd25;
ld.global.nc.u32 %r18, [%rd26];
cvt.rn.f16.s32 %h45, %r18;
sub.rn.f16 %h46, %h4, %h45;
mul.rn.f16 %h47, %h46, %h6;
sub.rn.f16 %h48, %h43, %h47;
cvt.f32.f16 %f115, %h48;
sub.rn.f32 %f116, %f115, %f3;
mul.rn.f32 %f117, %f116, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f118, %f117;
add.rn.f32 %f119, %f118, 0f00000000;
ex2.approx.f32 %f120, %f119;
fma.rn.f32 %f121, %f118, 0fBF317200, %f116;
fma.rn.f32 %f122, %f118, 0fB5BFBE8E, %f121;
mul.rn.f32 %f123, %f122, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f124, %f123;
mul.rn.f32 %f125, %f120, %f124;
setp.lt.f32 %p17, %f116, 0fC2D20000;
selp.f32 %f126, 0f00000000, %f125, %p17;
setp.gt.f32 %p18, %f116, 0f42D20000;
selp.f32 %f127, 0f7F800000, %f126, %p18;
add.rn.f32 %f128, %f114, %f127;
// ---- pair 4, high half: column tid.x*2 + 257 ----
ld.global.nc.u32 %r19, [%rd16+1028];
cvt.rn.f16.s32 %h49, %r19;
sub.rn.f16 %h50, %h4, %h49;
mul.rn.f16 %h51, %h50, %h6;
sub.rn.f16 %h52, %h44, %h51;
cvt.f32.f16 %f129, %h52;
sub.rn.f32 %f130, %f129, %f3;
mul.rn.f32 %f131, %f130, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f132, %f131;
add.rn.f32 %f133, %f132, 0f00000000;
ex2.approx.f32 %f134, %f133;
fma.rn.f32 %f135, %f132, 0fBF317200, %f130;
fma.rn.f32 %f136, %f132, 0fB5BFBE8E, %f135;
mul.rn.f32 %f137, %f136, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f138, %f137;
mul.rn.f32 %f139, %f134, %f138;
setp.lt.f32 %p19, %f130, 0fC2D20000;
selp.f32 %f140, 0f00000000, %f139, %p19;
setp.gt.f32 %p20, %f130, 0f42D20000;
selp.f32 %f141, 0f7F800000, %f140, %p20;
add.rn.f32 %f142, %f128, %f141;
// ---- pair 5, low half: column tid.x*2 + 320 ----
or.b32 %r20, %r3, 320;
ld.global.nc.b32 %hh6, [%rd14+640];
mov.b32 {%h53, %h54}, %hh6;
mul.wide.u32 %rd27, %r20, 4;
add.s64 %rd28, %rd6, %rd27;
ld.global.nc.u32 %r21, [%rd28];
cvt.rn.f16.s32 %h55, %r21;
sub.rn.f16 %h56, %h4, %h55;
mul.rn.f16 %h57, %h56, %h6;
sub.rn.f16 %h58, %h53, %h57;
cvt.f32.f16 %f143, %h58;
sub.rn.f32 %f144, %f143, %f3;
mul.rn.f32 %f145, %f144, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f146, %f145;
add.rn.f32 %f147, %f146, 0f00000000;
ex2.approx.f32 %f148, %f147;
fma.rn.f32 %f149, %f146, 0fBF317200, %f144;
fma.rn.f32 %f150, %f146, 0fB5BFBE8E, %f149;
mul.rn.f32 %f151, %f150, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f152, %f151;
mul.rn.f32 %f153, %f148, %f152;
setp.lt.f32 %p21, %f144, 0fC2D20000;
selp.f32 %f154, 0f00000000, %f153, %p21;
setp.gt.f32 %p22, %f144, 0f42D20000;
selp.f32 %f155, 0f7F800000, %f154, %p22;
add.rn.f32 %f156, %f142, %f155;
// ---- pair 5, high half: column tid.x*2 + 321 ----
ld.global.nc.u32 %r22, [%rd16+1284];
cvt.rn.f16.s32 %h59, %r22;
sub.rn.f16 %h60, %h4, %h59;
mul.rn.f16 %h61, %h60, %h6;
sub.rn.f16 %h62, %h54, %h61;
cvt.f32.f16 %f157, %h62;
sub.rn.f32 %f158, %f157, %f3;
mul.rn.f32 %f159, %f158, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f160, %f159;
add.rn.f32 %f161, %f160, 0f00000000;
ex2.approx.f32 %f162, %f161;
fma.rn.f32 %f163, %f160, 0fBF317200, %f158;
fma.rn.f32 %f164, %f160, 0fB5BFBE8E, %f163;
mul.rn.f32 %f165, %f164, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f166, %f165;
mul.rn.f32 %f167, %f162, %f166;
setp.lt.f32 %p23, %f158, 0fC2D20000;
selp.f32 %f168, 0f00000000, %f167, %p23;
setp.gt.f32 %p24, %f158, 0f42D20000;
selp.f32 %f169, 0f7F800000, %f168, %p24;
add.rn.f32 %f170, %f156, %f169;
// ---- pair 6, low half: column tid.x*2 + 384 ----
or.b32 %r23, %r3, 384;
ld.global.nc.b32 %hh7, [%rd14+768];
mov.b32 {%h63, %h64}, %hh7;
mul.wide.u32 %rd29, %r23, 4;
add.s64 %rd30, %rd6, %rd29;
ld.global.nc.u32 %r24, [%rd30];
cvt.rn.f16.s32 %h65, %r24;
sub.rn.f16 %h66, %h4, %h65;
mul.rn.f16 %h67, %h66, %h6;
sub.rn.f16 %h68, %h63, %h67;
cvt.f32.f16 %f171, %h68;
sub.rn.f32 %f172, %f171, %f3;
mul.rn.f32 %f173, %f172, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f174, %f173;
add.rn.f32 %f175, %f174, 0f00000000;
ex2.approx.f32 %f176, %f175;
fma.rn.f32 %f177, %f174, 0fBF317200, %f172;
fma.rn.f32 %f178, %f174, 0fB5BFBE8E, %f177;
mul.rn.f32 %f179, %f178, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f180, %f179;
mul.rn.f32 %f181, %f176, %f180;
setp.lt.f32 %p25, %f172, 0fC2D20000;
selp.f32 %f182, 0f00000000, %f181, %p25;
setp.gt.f32 %p26, %f172, 0f42D20000;
selp.f32 %f183, 0f7F800000, %f182, %p26;
add.rn.f32 %f184, %f170, %f183;
// ---- pair 6, high half: column tid.x*2 + 385 ----
ld.global.nc.u32 %r25, [%rd16+1540];
cvt.rn.f16.s32 %h69, %r25;
sub.rn.f16 %h70, %h4, %h69;
mul.rn.f16 %h71, %h70, %h6;
sub.rn.f16 %h72, %h64, %h71;
cvt.f32.f16 %f185, %h72;
sub.rn.f32 %f186, %f185, %f3;
mul.rn.f32 %f187, %f186, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f188, %f187;
add.rn.f32 %f189, %f188, 0f00000000;
ex2.approx.f32 %f190, %f189;
fma.rn.f32 %f191, %f188, 0fBF317200, %f186;
fma.rn.f32 %f192, %f188, 0fB5BFBE8E, %f191;
mul.rn.f32 %f193, %f192, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f194, %f193;
mul.rn.f32 %f195, %f190, %f194;
setp.lt.f32 %p27, %f186, 0fC2D20000;
selp.f32 %f196, 0f00000000, %f195, %p27;
setp.gt.f32 %p28, %f186, 0f42D20000;
selp.f32 %f197, 0f7F800000, %f196, %p28;
add.rn.f32 %f198, %f184, %f197;
// ---- pair 7, low half: column tid.x*2 + 448 ----
or.b32 %r26, %r3, 448;
ld.global.nc.b32 %hh8, [%rd14+896];
mov.b32 {%h73, %h74}, %hh8;
mul.wide.u32 %rd31, %r26, 4;
add.s64 %rd32, %rd6, %rd31;
ld.global.nc.u32 %r27, [%rd32];
cvt.rn.f16.s32 %h75, %r27;
sub.rn.f16 %h76, %h4, %h75;
mul.rn.f16 %h77, %h76, %h6;
sub.rn.f16 %h78, %h73, %h77;
cvt.f32.f16 %f199, %h78;
sub.rn.f32 %f200, %f199, %f3;
mul.rn.f32 %f201, %f200, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f202, %f201;
add.rn.f32 %f203, %f202, 0f00000000;
ex2.approx.f32 %f204, %f203;
fma.rn.f32 %f205, %f202, 0fBF317200, %f200;
fma.rn.f32 %f206, %f202, 0fB5BFBE8E, %f205;
mul.rn.f32 %f207, %f206, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f208, %f207;
mul.rn.f32 %f209, %f204, %f208;
setp.lt.f32 %p29, %f200, 0fC2D20000;
selp.f32 %f210, 0f00000000, %f209, %p29;
setp.gt.f32 %p30, %f200, 0f42D20000;
selp.f32 %f211, 0f7F800000, %f210, %p30;
add.rn.f32 %f212, %f198, %f211;
// ---- pair 7, high half: column tid.x*2 + 449 ----
ld.global.nc.u32 %r28, [%rd16+1796];
cvt.rn.f16.s32 %h79, %r28;
sub.rn.f16 %h80, %h4, %h79;
mul.rn.f16 %h81, %h80, %h6;
sub.rn.f16 %h82, %h74, %h81;
cvt.f32.f16 %f213, %h82;
sub.rn.f32 %f214, %f213, %f3;
mul.rn.f32 %f215, %f214, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f216, %f215;
add.rn.f32 %f217, %f216, 0f00000000;
ex2.approx.f32 %f218, %f217;
fma.rn.f32 %f219, %f216, 0fBF317200, %f214;
fma.rn.f32 %f220, %f216, 0fB5BFBE8E, %f219;
mul.rn.f32 %f221, %f220, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f222, %f221;
mul.rn.f32 %f223, %f218, %f222;
setp.lt.f32 %p31, %f214, 0fC2D20000;
selp.f32 %f224, 0f00000000, %f223, %p31;
setp.gt.f32 %p32, %f214, 0f42D20000;
selp.f32 %f225, 0f7F800000, %f224, %p32;
add.rn.f32 %f226, %f212, %f225;
// ---- stage 1: warp-level sum via shuffle-down with offsets 16, 8, 4, 2, 1 ----
// Member mask -1 names all 32 lanes; after the last step lane 0 holds the warp total.
shfl.sync.down.b32 %f227, %f226, 16, 31, -1;
add.rn.f32 %f228, %f227, %f226;
shfl.sync.down.b32 %f229, %f228, 8, 31, -1;
add.rn.f32 %f230, %f229, %f228;
shfl.sync.down.b32 %f231, %f230, 4, 31, -1;
add.rn.f32 %f232, %f231, %f230;
shfl.sync.down.b32 %f233, %f232, 2, 31, -1;
add.rn.f32 %f234, %f233, %f232;
shfl.sync.down.b32 %f235, %f234, 1, 31, -1;
// %p33 = (tid.x == 0); only lane 0 performs the final combine and publishes it.
setp.eq.s32 %p33, %r1, 0;
@%p33 bra LBB37_3;
bra.uni LBB37_1;
LBB37_3:
add.rn.f32 %f1, %f235, %f234;
st.shared.f32 [shared_cache_09], %f1;
LBB37_1:
// CTA-wide barrier so the shared-memory write above is visible before it is read.
bar.sync 0;
// ---- stage 2: reduce the per-warp partials ----
// %rd3 = &shared_cache_09[tid.x] (shared space), %rd38 = the same as a generic address.
mul.wide.u32 %rd36, %r1, 4;
mov.u64 %rd37, shared_cache_09;
add.s64 %rd3, %rd37, %rd36;
cvta.shared.u64 %rd38, %rd3;
// The local slot is set to 0 (f32 +0.0): the identity for addition.
mov.u32 %r31, 0;
st.local.u32 [%rd1], %r31;
// Lane 0 reads its shared slot; every other lane reads its private zero local slot.
selp.b64 %rd40, %rd38, %rd12, %p33;
ld.f32 %f236, [%rd40];
shfl.sync.down.b32 %f237, %f236, 16, 31, -1;
add.rn.f32 %f238, %f236, %f237;
shfl.sync.down.b32 %f239, %f238, 8, 31, -1;
add.rn.f32 %f240, %f238, %f239;
shfl.sync.down.b32 %f241, %f240, 4, 31, -1;
add.rn.f32 %f242, %f240, %f241;
shfl.sync.down.b32 %f243, %f242, 2, 31, -1;
add.rn.f32 %f244, %f242, %f243;
shfl.sync.down.b32 %f245, %f244, 1, 31, -1;
add.rn.f32 %f246, %f244, %f245;
// Written back through the same generic address the value was read from.
st.f32 [%rd40], %f246;
@%p33 bra LBB37_4;
bra.uni LBB37_2;
LBB37_4:
// Lane 0 only: %rd2 = param_1 + (ctaid.x >> 9)*2048 + (ctaid.x & 511)*4 bytes,
// which equals param_1 + ctaid.x * 4.
ld.param.u64 %rd7, [fusion_2235_param_1];
shr.u32 %r30, %r2, 9;
cvta.to.global.u64 %rd10, %rd7;
and.b32 %r29, %r2, 511;
mul.wide.u32 %rd33, %r30, 2048;
add.s64 %rd34, %rd10, %rd33;
mul.wide.u32 %rd35, %r29, 4;
add.s64 %rd2, %rd34, %rd35;
// Accumulate the CTA total into the output slot; the returned old value (%f248) is unused.
ld.shared.f32 %f247, [%rd3];
atom.global.add.f32 %f248, [%rd2], %f247;
LBB37_2:
ret;
}
// .globl fusion_2234
// fusion_2234: out[i] = exp(v[i] - m[row]) / s[row], four elements per thread.
//   param_0 : f16 input, read at linear index i
//   param_1 : f32 output, written at linear index i
//   param_2 : f32 input; element `row` is the divisor s
//   param_3 : f32 input; element `row` is the value m subtracted before exp
//   param_4 : s32 input, indexed by column (i & 511)
//   param_5 : declared but never read by this kernel
// Launch contract: exactly 256 threads per CTA (.reqntid).
// Indexing: i = (ctaid.x << 10) | (tid.x << 2) + {0,1,2,3}; row = i >> 9; column = i & 511.
// Per element: v = x - (1.0 - (f16) mask) * 10000.0 in f16 (0x3C00 = 1.0,
// 0x70E2 = 10000.0), widened to f32, d = v - m, then exp(d) expanded inline as
//   n = trunc(d * log2(e)); r = d - n*ln2 (two-part ln2); e = 2^n * 2^(r*log2(e)),
//   clamped to 0 for d < -105 and to +inf for d > 105,
// and finally divided by s with div.full.f32 (an approximate divide).
// NOTE(review): m and s are presumably the row maximum and row sum of
// exponentials produced by earlier reduction kernels (softmax) — confirm.
.visible .entry fusion_2234(
.param .u64 fusion_2234_param_0,
.param .u64 fusion_2234_param_1,
.param .u64 fusion_2234_param_2,
.param .u64 fusion_2234_param_3,
.param .u64 fusion_2234_param_4,
.param .u64 fusion_2234_param_5
)
.reqntid 256, 1, 1
{
.reg .pred %p<9>;
.reg .b16 %h<27>;
.reg .b32 %hh<3>;
.reg .f32 %f<59>;
.reg .b32 %r<18>;
.reg .b64 %rd<26>;
// Global addresses: %rd10 = param_0, %rd9 = param_1, %rd8 = param_2,
// %rd6 = param_3, %rd3 = param_4.
ld.param.u64 %rd1, [fusion_2234_param_0];
ld.param.u64 %rd2, [fusion_2234_param_4];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2234_param_1];
ld.param.u64 %rd5, [fusion_2234_param_3];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2234_param_2];
cvta.to.global.u64 %rd8, %rd7;
cvta.to.global.u64 %rd9, %rd4;
cvta.to.global.u64 %rd10, %rd1;
// %r5 = linear index of this thread's first element (OR acts as ADD: tid.x*4 < 1024).
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
or.b32 %r8, %r4, 3;
// %r9 = row = i >> 9 (identical for all four elements of the thread).
shr.u32 %r9, %r5, 9;
// %r13, %r12, %r11, %r10 = column of elements 0, 1, 2, 3.
and.b32 %r10, %r8, 511;
and.b32 %r11, %r7, 510;
and.b32 %r12, %r6, 509;
and.b32 %r13, %r4, 508;
// One 8-byte vector load of the four f16 inputs.
mul.wide.u32 %rd11, %r5, 2;
add.s64 %rd12, %rd10, %rd11;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12];
// Pack/unpack round trip: %h5..%h8 end up equal to %h1..%h4.
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// ---- element 0 ----
mul.wide.u32 %rd13, %r13, 4;
add.s64 %rd14, %rd3, %rd13;
ld.global.nc.u32 %r14, [%rd14];
cvt.rn.f16.s32 %h9, %r14;
// %h10 = 1.0 (f16), %h12 = 10000.0 (f16); reused for every element below.
mov.b16 %h10, 0x3C00;
sub.rn.f16 %h11, %h10, %h9;
mov.b16 %h12, 0x70E2;
mul.rn.f16 %h13, %h11, %h12;
sub.rn.f16 %h14, %h5, %h13;
cvt.f32.f16 %f1, %h14;
// %f2 = m = param_3[row]; %rd15 (row * 4 bytes) is reused for param_2 below.
mul.wide.u32 %rd15, %r9, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.f32 %f2, [%rd16];
// d = v - m, then the inline exp(d) expansion described in the header.
sub.rn.f32 %f3, %f1, %f2;
mul.rn.f32 %f4, %f3, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f5, %f4;
add.rn.f32 %f6, %f5, 0f00000000;
ex2.approx.f32 %f7, %f6;
fma.rn.f32 %f8, %f5, 0fBF317200, %f3;
fma.rn.f32 %f9, %f5, 0fB5BFBE8E, %f8;
mul.rn.f32 %f10, %f9, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f11, %f10;
mul.rn.f32 %f12, %f7, %f11;
setp.lt.f32 %p1, %f3, 0fC2D20000;
selp.f32 %f13, 0f00000000, %f12, %p1;
setp.gt.f32 %p2, %f3, 0f42D20000;
selp.f32 %f14, 0f7F800000, %f13, %p2;
// %f15 = s = param_2[row]; shared divisor for all four elements.
add.s64 %rd17, %rd8, %rd15;
ld.global.nc.f32 %f15, [%rd17];
div.full.f32 %f16, %f14, %f15;
// %rd19 = output address = param_1 + i * 4 bytes.
mul.wide.u32 %rd18, %r5, 4;
add.s64 %rd19, %rd9, %rd18;
// ---- element 1 ----
mul.wide.u32 %rd20, %r12, 4;
add.s64 %rd21, %rd3, %rd20;
ld.global.nc.u32 %r15, [%rd21];
cvt.rn.f16.s32 %h15, %r15;
sub.rn.f16 %h16, %h10, %h15;
mul.rn.f16 %h17, %h16, %h12;
sub.rn.f16 %h18, %h6, %h17;
cvt.f32.f16 %f17, %h18;
sub.rn.f32 %f18, %f17, %f2;
mul.rn.f32 %f19, %f18, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f20, %f19;
add.rn.f32 %f21, %f20, 0f00000000;
ex2.approx.f32 %f22, %f21;
fma.rn.f32 %f23, %f20, 0fBF317200, %f18;
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23;
mul.rn.f32 %f25, %f24, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f26, %f25;
mul.rn.f32 %f27, %f22, %f26;
setp.lt.f32 %p3, %f18, 0fC2D20000;
selp.f32 %f28, 0f00000000, %f27, %p3;
setp.gt.f32 %p4, %f18, 0f42D20000;
selp.f32 %f29, 0f7F800000, %f28, %p4;
div.full.f32 %f30, %f29, %f15;
// ---- element 2 ----
mul.wide.u32 %rd22, %r11, 4;
add.s64 %rd23, %rd3, %rd22;
ld.global.nc.u32 %r16, [%rd23];
cvt.rn.f16.s32 %h19, %r16;
sub.rn.f16 %h20, %h10, %h19;
mul.rn.f16 %h21, %h20, %h12;
sub.rn.f16 %h22, %h7, %h21;
cvt.f32.f16 %f31, %h22;
sub.rn.f32 %f32, %f31, %f2;
mul.rn.f32 %f33, %f32, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f34, %f33;
add.rn.f32 %f35, %f34, 0f00000000;
ex2.approx.f32 %f36, %f35;
fma.rn.f32 %f37, %f34, 0fBF317200, %f32;
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37;
mul.rn.f32 %f39, %f38, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f40, %f39;
mul.rn.f32 %f41, %f36, %f40;
setp.lt.f32 %p5, %f32, 0fC2D20000;
selp.f32 %f42, 0f00000000, %f41, %p5;
setp.gt.f32 %p6, %f32, 0f42D20000;
selp.f32 %f43, 0f7F800000, %f42, %p6;
div.full.f32 %f44, %f43, %f15;
// ---- element 3 ----
mul.wide.u32 %rd24, %r10, 4;
add.s64 %rd25, %rd3, %rd24;
ld.global.nc.u32 %r17, [%rd25];
cvt.rn.f16.s32 %h23, %r17;
sub.rn.f16 %h24, %h10, %h23;
mul.rn.f16 %h25, %h24, %h12;
sub.rn.f16 %h26, %h8, %h25;
cvt.f32.f16 %f45, %h26;
sub.rn.f32 %f46, %f45, %f2;
mul.rn.f32 %f47, %f46, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f48, %f47;
add.rn.f32 %f49, %f48, 0f00000000;
ex2.approx.f32 %f50, %f49;
fma.rn.f32 %f51, %f48, 0fBF317200, %f46;
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51;
mul.rn.f32 %f53, %f52, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f54, %f53;
mul.rn.f32 %f55, %f50, %f54;
setp.lt.f32 %p7, %f46, 0fC2D20000;
selp.f32 %f56, 0f00000000, %f55, %p7;
setp.gt.f32 %p8, %f46, 0f42D20000;
selp.f32 %f57, 0f7F800000, %f56, %p8;
div.full.f32 %f58, %f57, %f15;
// Single 16-byte vector store of the four f32 results.
st.global.v4.f32 [%rd19], {%f16, %f30, %f44, %f58};
ret;
}
// .globl rng_get_and_update_state_6
// rng_get_and_update_state_6: snapshot the 128-bit counter held in the global
// `rng_state` and advance it by 4194304 (2^22).
//   param_0 : destination for the pre-increment state; low 64 bits are written
//             at offset 0, high 64 bits at offset 8
//   param_1 : declared but never read by this kernel
// The read-modify-write of `rng_state` uses plain loads and stores (no atomics).
// NOTE(review): presumably launched with a single thread — confirm at the call site.
.visible .entry rng_get_and_update_state_6(
    .param .u64 rng_get_and_update_state_6_param_0,
    .param .u64 rng_get_and_update_state_6_param_1
)
{
    .reg .pred %wrap_vs_old, %wrap_vs_step;
    .reg .b64 %out_generic, %out;
    .reg .b64 %state_hi, %state_lo, %next_lo, %next_hi;
    .reg .b64 %carry_old, %carry;

    // Resolve the output pointer into the global address space.
    ld.param.u64        %out_generic, [rng_get_and_update_state_6_param_0];
    cvta.to.global.u64  %out, %out_generic;

    // Current state: high word first, then low word.
    ld.global.u64       %state_hi, [rng_state+8];
    ld.global.u64       %state_lo, [rng_state];

    // Low word advances by 2^22.
    add.s64             %next_lo, %state_lo, 4194304;

    // Carry out of the low word: an unsigned sum smaller than either operand
    // means it wrapped. Both comparisons are evaluated, as in the generated code.
    setp.lt.u64         %wrap_vs_old, %next_lo, %state_lo;
    setp.lt.u64         %wrap_vs_step, %next_lo, 4194304;
    selp.u64            %carry_old, 1, 0, %wrap_vs_old;
    selp.b64            %carry, 1, %carry_old, %wrap_vs_step;

    // Propagate the carry into the high word.
    add.s64             %next_hi, %state_hi, %carry;

    // Publish the advanced state, then hand the old state to the caller.
    st.global.u64       [rng_state], %next_lo;
    st.global.u64       [rng_state+8], %next_hi;
    st.global.u64       [%out+8], %state_hi;
    st.global.u64       [%out], %state_lo;
    ret;
}
// .globl fusion_2233
// fusion_2233: random per-element mask-and-scale, f32 input -> f16 output.
// Launch shape: 256 threads per CTA (.reqntid); each thread handles 4
// consecutive elements starting at e = (ctaid.x << 10) | (tid.x << 2).
//   param_0: f16 output buffer, written at element e with one v4.b16 store.
//   param_1: f32 input buffer, read at element e with one v4.f32 load.
//   param_2: 16-byte RNG state, read as two u64 words {low, high}.
//   param_3: declared but never read in this kernel.
// The multipliers 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) and the
// constants 2654435769 (0x9E3779B9) / 3144134277 (0xBB67AE85) match the
// published Philox-4x32 constants; presumably this is an inlined Philox
// generator -- confirm against the producer of this PTX.
// NOTE(review): the threshold/scale pair (f16 0x2E66 ~= 0.1, 0x3C72 ~= 1.111)
// looks like dropout with rate 0.1 and scale 1/0.9 -- confirm.
.visible .entry fusion_2233(
.param .u64 fusion_2233_param_0,
.param .u64 fusion_2233_param_1,
.param .u64 fusion_2233_param_2,
.param .u64 fusion_2233_param_3
)
.reqntid 256, 1, 1
{
.reg .pred %p<6>;
.reg .b16 %h<19>;
.reg .f32 %f<13>;
.reg .b32 %r<29>;
.reg .b64 %rd<119>;
// %rd6 = output (param_0), %rd5 = input (param_1), %rd3 = RNG state (param_2).
ld.param.u64 %rd1, [fusion_2233_param_0];
ld.param.u64 %rd2, [fusion_2233_param_2];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2233_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd1;
// %r5 = linear index of this thread's first element.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
// Load the 128-bit state and add the per-thread offset (%r5 >> 2) to the low
// word; %p1 detects the wrap and the carry is added to the high word.
ld.global.nc.v2.u64 {%rd7, %rd8}, [%rd3];
shr.u32 %r6, %r5, 2;
cvt.u64.u32 %rd9, %r6;
add.s64 %rd10, %rd7, %rd9;
setp.lt.u64 %p1, %rd10, %rd7;
and.b64 %rd11, %rd10, 4294967295;
mul.lo.s64 %rd12, %rd11, 3528531795;
selp.u64 %rd13, 1, 0, %p1;
add.s64 %rd14, %rd8, %rd13;
// Mixing chain: 32x32->64-bit products whose high (shr 32) and low
// (and 0xFFFFFFFF) halves are XORed with each other and with round constants.
xor.b64 %rd15, %rd14, %rd12;
shr.u64 %rd16, %rd15, 32;
mul.lo.s64 %rd17, %rd16, 3449720151;
shr.u64 %rd18, %rd17, 32;
and.b64 %rd19, %rd14, 4294967295;
mul.lo.s64 %rd20, %rd19, 3449720151;
and.b64 %rd21, %rd20, 4294967295;
xor.b64 %rd22, %rd21, %rd18;
xor.b64 %rd23, %rd22, 2654435769;
mul.lo.s64 %rd24, %rd23, 3528531795;
shr.u64 %rd25, %rd24, 32;
xor.b64 %rd26, %rd20, %rd10;
shr.u64 %rd27, %rd26, 32;
mul.lo.s64 %rd28, %rd27, 3528531795;
and.b64 %rd29, %rd28, 4294967295;
xor.b64 %rd30, %rd29, %rd25;
xor.b64 %rd31, %rd30, 1993301258;
mul.lo.s64 %rd32, %rd31, 3449720151;
shr.u64 %rd33, %rd32, 32;
shr.u64 %rd34, %rd28, 32;
and.b64 %rd35, %rd12, 4294967295;
xor.b64 %rd36, %rd35, %rd34;
xor.b64 %rd37, %rd36, 3144134277;
mul.lo.s64 %rd38, %rd37, 3449720151;
and.b64 %rd39, %rd38, 4294967295;
xor.b64 %rd40, %rd39, %rd33;
xor.b64 %rd41, %rd40, 3668340011;
mul.lo.s64 %rd42, %rd41, 3528531795;
shr.u64 %rd43, %rd42, 32;
shr.u64 %rd44, %rd38, 32;
and.b64 %rd45, %rd17, 4294967295;
xor.b64 %rd46, %rd45, %rd44;
xor.b64 %rd47, %rd46, 1013904242;
mul.lo.s64 %rd48, %rd47, 3528531795;
and.b64 %rd49, %rd48, 4294967295;
xor.b64 %rd50, %rd49, %rd43;
xor.b64 %rd51, %rd50, 3986602516;
mul.lo.s64 %rd52, %rd51, 3449720151;
shr.u64 %rd53, %rd52, 32;
shr.u64 %rd54, %rd48, 32;
and.b64 %rd55, %rd24, 4294967295;
xor.b64 %rd56, %rd55, %rd54;
xor.b64 %rd57, %rd56, 842468239;
mul.lo.s64 %rd58, %rd57, 3449720151;
and.b64 %rd59, %rd58, 4294967295;
xor.b64 %rd60, %rd59, %rd53;
xor.b64 %rd61, %rd60, 387276957;
mul.lo.s64 %rd62, %rd61, 3528531795;
shr.u64 %rd63, %rd62, 32;
shr.u64 %rd64, %rd58, 32;
and.b64 %rd65, %rd32, 4294967295;
xor.b64 %rd66, %rd65, %rd64;
xor.b64 %rd67, %rd66, 2027808484;
mul.lo.s64 %rd68, %rd67, 3528531795;
and.b64 %rd69, %rd68, 4294967295;
xor.b64 %rd70, %rd69, %rd63;
xor.b64 %rd71, %rd70, 1684936478;
mul.lo.s64 %rd72, %rd71, 3449720151;
shr.u64 %rd73, %rd72, 32;
shr.u64 %rd74, %rd68, 32;
and.b64 %rd75, %rd42, 4294967295;
xor.b64 %rd76, %rd75, %rd74;
xor.b64 %rd77, %rd76, 2835769497;
mul.lo.s64 %rd78, %rd77, 3449720151;
and.b64 %rd79, %rd78, 4294967295;
xor.b64 %rd80, %rd79, %rd73;
xor.b64 %rd81, %rd80, 1401181199;
mul.lo.s64 %rd82, %rd81, 3528531795;
shr.u64 %rd83, %rd82, 32;
shr.u64 %rd84, %rd78, 32;
and.b64 %rd85, %rd52, 4294967295;
xor.b64 %rd86, %rd85, %rd84;
xor.b64 %rd87, %rd86, 3041712726;
mul.lo.s64 %rd88, %rd87, 3528531795;
and.b64 %rd89, %rd88, 4294967295;
xor.b64 %rd90, %rd89, %rd83;
xor.b64 %rd91, %rd90, 3678237736;
mul.lo.s64 %rd92, %rd91, 3449720151;
shr.u64 %rd93, %rd92, 32;
// Element 0: form a 32-bit random word, keep its top 23 bits (shr 9), convert
// to f32 and scale by 0f34000000 (2^-23) to get a value in [0,1), then round
// to f16 and compare against the f16 threshold 0x2E66.
cvt.u32.u64 %r7, %rd93;
shr.u64 %rd94, %rd88, 32;
xor.b64 %rd95, %rd94, %rd62;
cvt.u32.u64 %r8, %rd95;
xor.b32 %r9, %r8, 534103459;
mul.lo.s32 %r10, %r9, -845247145;
xor.b32 %r11, %r10, %r7;
shr.u32 %r12, %r11, 9;
xor.b32 %r13, %r12, 4716963;
cvt.rn.f32.u32 %f1, %r13;
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h1, %f2;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p2, %h1, %h2;
// Load the four f32 inputs; each is rounded to f16, multiplied by the f16
// constant 0x3C72, and replaced by +0.0 when its random value is below the
// threshold.
mul.wide.u32 %rd96, %r5, 4;
add.s64 %rd97, %rd5, %rd96;
ld.global.nc.v4.f32 {%f3, %f4, %f5, %f6}, [%rd97];
cvt.rn.f16.f32 %h3, %f3;
mov.b16 %h4, 0x3C72;
mul.rn.f16 %h5, %h3, %h4;
selp.b16 %h6, %h5, 0x0000, %p2;
// %rd99 = address of the output element (2 bytes per element).
mul.wide.u32 %rd98, %r5, 2;
add.s64 %rd99, %rd6, %rd98;
// Element 1: second random word taken from other words of the same chain.
xor.b64 %rd100, %rd84, %rd52;
xor.b64 %rd101, %rd100, 3041712726;
mul.lo.s64 %rd102, %rd101, 3528531795;
xor.b64 %rd103, %rd83, %rd102;
cvt.u32.u64 %r14, %rd103;
xor.b32 %r15, %r14, -616729560;
mul.lo.s32 %r16, %r15, -845247145;
shr.u32 %r17, %r16, 9;
cvt.rn.f32.u32 %f7, %r17;
mul.rn.f32 %f8, %f7, 0f34000000;
cvt.rn.f16.f32 %h7, %f8;
setp.ge.f16 %p3, %h7, %h2;
cvt.rn.f16.f32 %h8, %f4;
mul.rn.f16 %h9, %h8, %h4;
selp.b16 %h10, %h9, 0x0000, %p3;
// Element 2: third random word.
and.b64 %rd104, %rd62, 4294967295;
xor.b64 %rd105, %rd104, %rd94;
xor.b64 %rd106, %rd105, 534103459;
mul.lo.s64 %rd107, %rd106, 3449720151;
shr.u64 %rd108, %rd107, 32;
and.b64 %rd109, %rd72, 4294967295;
xor.b64 %rd110, %rd109, %rd108;
xor.b64 %rd111, %rd110, 4055616968;
mul.lo.s64 %rd112, %rd111, 3528531795;
shr.u64 %rd113, %rd112, 32;
cvt.u32.u64 %r18, %rd113;
xor.b64 %rd114, %rd73, %rd78;
cvt.u32.u64 %r19, %rd114;
xor.b32 %r20, %r19, 1401181199;
mul.lo.s32 %r21, %r20, -766435501;
xor.b32 %r22, %r21, %r18;
shr.u32 %r23, %r22, 9;
xor.b32 %r24, %r23, 4936337;
cvt.rn.f32.u32 %f9, %r24;
mul.rn.f32 %f10, %f9, 0f34000000;
cvt.rn.f16.f32 %h11, %f10;
setp.ge.f16 %p4, %h11, %h2;
cvt.rn.f16.f32 %h12, %f5;
mul.rn.f16 %h13, %h12, %h4;
selp.b16 %h14, %h13, 0x0000, %p4;
// Element 3: fourth random word.
xor.b64 %rd115, %rd63, %rd68;
xor.b64 %rd116, %rd115, 1684936478;
mul.lo.s64 %rd117, %rd116, 3449720151;
xor.b64 %rd118, %rd108, %rd117;
cvt.u32.u64 %r25, %rd118;
xor.b32 %r26, %r25, -239350328;
mul.lo.s32 %r27, %r26, -766435501;
shr.u32 %r28, %r27, 9;
cvt.rn.f32.u32 %f11, %r28;
mul.rn.f32 %f12, %f11, 0f34000000;
cvt.rn.f16.f32 %h15, %f12;
setp.ge.f16 %p5, %h15, %h2;
cvt.rn.f16.f32 %h16, %f6;
mul.rn.f16 %h17, %h16, %h4;
selp.b16 %h18, %h17, 0x0000, %p5;
// Write the four masked/scaled f16 results in one vector store.
st.global.v4.b16 [%rd99], {%h6, %h10, %h14, %h18};
ret;
}
// .globl fusion_2705
// fusion_2705: element-wise f32 -> f16 conversion (round-to-nearest).
// Launch shape: 256 threads per CTA (.reqntid); each thread converts the 4
// consecutive elements starting at e = (ctaid.x << 10) | (tid.x << 2).
//   param_0: f32 source buffer (one v4.f32 load at element e).
//   param_1: f16 destination buffer (one v4.b16 store at element e).
//   param_2: declared but never read in this kernel.
.visible .entry fusion_2705(
.param .u64 fusion_2705_param_0,
.param .u64 fusion_2705_param_1,
.param .u64 fusion_2705_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %half<4>;
    .reg .f32 %single<4>;
    .reg .b32 %blk, %tix, %blk_base, %tix_base, %elem;
    .reg .b64 %src_arg, %dst_arg, %src, %dst, %src_off, %dst_off, %src_ptr, %dst_ptr;

    // Linear index of this thread's first element.
    mov.u32 %tix, %tid.x;
    mov.u32 %blk, %ctaid.x;
    shl.b32 %tix_base, %tix, 2;
    shl.b32 %blk_base, %blk, 10;
    or.b32 %elem, %tix_base, %blk_base;

    // Fetch four f32 values (4 bytes per element).
    ld.param.u64 %src_arg, [fusion_2705_param_0];
    cvta.to.global.u64 %src, %src_arg;
    mul.wide.u32 %src_off, %elem, 4;
    add.s64 %src_ptr, %src, %src_off;
    ld.global.nc.v4.f32 {%single0, %single1, %single2, %single3}, [%src_ptr];

    // Destination address (2 bytes per element).
    ld.param.u64 %dst_arg, [fusion_2705_param_1];
    cvta.to.global.u64 %dst, %dst_arg;
    mul.wide.u32 %dst_off, %elem, 2;
    add.s64 %dst_ptr, %dst, %dst_off;

    // Narrow each value and write all four at once.
    cvt.rn.f16.f32 %half0, %single0;
    cvt.rn.f16.f32 %half1, %single1;
    cvt.rn.f16.f32 %half2, %single2;
    cvt.rn.f16.f32 %half3, %single3;
    st.global.v4.b16 [%dst_ptr], {%half0, %half1, %half2, %half3};
    ret;
}
// .globl fusion_2231
// fusion_2231: gather f16 values, add a per-column f32 term (rounded to f16),
// and write the f16 sums contiguously.
// Launch shape: 256 threads per CTA (.reqntid); each thread produces the 4
// consecutive outputs starting at e = (ctaid.x << 10) | (tid.x << 2).
// With g = ctaid.x >> 5, row = bits [6,15) of e and c = (e & 63) per element:
//   param_0: f16 source, read at element row*1024 + g*64 + c.
//   param_1: f16 destination, written at element e.
//   param_2: f32 addend, read at element g*64 + c.
//   param_3: declared but never read in this kernel.
.visible .entry fusion_2231(
.param .u64 fusion_2231_param_0,
.param .u64 fusion_2231_param_1,
.param .u64 fusion_2231_param_2,
.param .u64 fusion_2231_param_3
)
.reqntid 256, 1, 1
{
    .reg .b16 %x<4>, %addh<4>, %y<4>;
    .reg .f32 %addf<4>;
    .reg .b32 %blk, %tix, %blk_base, %t4, %flat, %t4_1, %t4_2, %t4_3;
    .reg .b32 %c0, %c1, %c2, %c3, %grp, %grp_base, %row, %g0, %g1, %g2, %g3;
    .reg .b64 %in_arg, %out_arg, %add_arg, %in, %out, %add_base;
    .reg .b64 %row_off, %row_ptr, %grp_off, %add_grp, %out_off, %out_ptr;
    .reg .b64 %eoff, %eptr, %aoff, %aptr;

    // Index arithmetic.
    mov.u32 %tix, %tid.x;
    mov.u32 %blk, %ctaid.x;
    shl.b32 %t4, %tix, 2;
    shl.b32 %blk_base, %blk, 10;
    or.b32 %flat, %t4, %blk_base;
    and.b32 %c0, %t4, 60;
    or.b32 %t4_1, %t4, 1;
    and.b32 %c1, %t4_1, 61;
    or.b32 %t4_2, %t4, 2;
    and.b32 %c2, %t4_2, 62;
    or.b32 %t4_3, %t4, 3;
    and.b32 %c3, %t4_3, 63;
    shr.u32 %grp, %blk, 5;
    shl.b32 %grp_base, %grp, 6;
    bfe.u32 %row, %flat, 6, 9;
    or.b32 %g0, %c0, %grp_base;
    or.b32 %g1, %c1, %grp_base;
    or.b32 %g2, %c2, %grp_base;
    or.b32 %g3, %c3, %grp_base;

    // Base pointers: source row (2048 bytes per row), addend group (256 bytes
    // per group) and output element (2 bytes per element).
    ld.param.u64 %in_arg, [fusion_2231_param_0];
    cvta.to.global.u64 %in, %in_arg;
    mul.wide.u32 %row_off, %row, 2048;
    add.s64 %row_ptr, %in, %row_off;
    ld.param.u64 %add_arg, [fusion_2231_param_2];
    cvta.to.global.u64 %add_base, %add_arg;
    mul.wide.u32 %grp_off, %grp, 256;
    add.s64 %add_grp, %add_base, %grp_off;
    ld.param.u64 %out_arg, [fusion_2231_param_1];
    cvta.to.global.u64 %out, %out_arg;
    mul.wide.u32 %out_off, %flat, 2;
    add.s64 %out_ptr, %out, %out_off;

    // Element 0.
    mul.wide.u32 %eoff, %g0, 2;
    add.s64 %eptr, %row_ptr, %eoff;
    ld.global.nc.b16 %x0, [%eptr];
    mul.wide.u32 %aoff, %c0, 4;
    add.s64 %aptr, %add_grp, %aoff;
    ld.global.nc.f32 %addf0, [%aptr];
    cvt.rn.f16.f32 %addh0, %addf0;
    add.rn.f16 %y0, %x0, %addh0;

    // Element 1.
    mul.wide.u32 %eoff, %g1, 2;
    add.s64 %eptr, %row_ptr, %eoff;
    ld.global.nc.b16 %x1, [%eptr];
    mul.wide.u32 %aoff, %c1, 4;
    add.s64 %aptr, %add_grp, %aoff;
    ld.global.nc.f32 %addf1, [%aptr];
    cvt.rn.f16.f32 %addh1, %addf1;
    add.rn.f16 %y1, %x1, %addh1;

    // Element 2.
    mul.wide.u32 %eoff, %g2, 2;
    add.s64 %eptr, %row_ptr, %eoff;
    ld.global.nc.b16 %x2, [%eptr];
    mul.wide.u32 %aoff, %c2, 4;
    add.s64 %aptr, %add_grp, %aoff;
    ld.global.nc.f32 %addf2, [%aptr];
    cvt.rn.f16.f32 %addh2, %addf2;
    add.rn.f16 %y2, %x2, %addh2;

    // Element 3.
    mul.wide.u32 %eoff, %g3, 2;
    add.s64 %eptr, %row_ptr, %eoff;
    ld.global.nc.b16 %x3, [%eptr];
    mul.wide.u32 %aoff, %c3, 4;
    add.s64 %aptr, %add_grp, %aoff;
    ld.global.nc.f32 %addf3, [%aptr];
    cvt.rn.f16.f32 %addh3, %addf3;
    add.rn.f16 %y3, %x3, %addh3;

    st.global.v4.b16 [%out_ptr], {%y0, %y1, %y2, %y3};
    ret;
}
// .globl fusion_2230
// fusion_2230: f16 gather/copy with a re-ordered source index.
// Launch shape: 256 threads per CTA (.reqntid); each thread writes the 4
// consecutive outputs starting at e = (ctaid.x << 10) | (tid.x << 2).
//   param_0: f16 destination, written at element e.
//   param_1: f16 source, read at byte offset
//            (tid.x >> 4)*65536 + ctaid.x*128 + 2*c, with c = (tid.x*4 + k) & 63
//            for k = 0..3.
//   param_2: declared but never read in this kernel.
.visible .entry fusion_2230(
.param .u64 fusion_2230_param_0,
.param .u64 fusion_2230_param_1,
.param .u64 fusion_2230_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %v<4>;
    .reg .b32 %blk, %tix, %blk_base, %t4, %flat, %t4_1, %t4_2, %c0, %c1, %c2, %plane;
    .reg .b64 %out_arg, %in_arg, %out, %in, %plane_off, %plane_ptr, %blk_off, %tile_ptr;
    .reg .b64 %off0, %off1, %off2, %src0, %src1, %src2, %out_off, %out_ptr;

    // Index arithmetic.
    mov.u32 %tix, %tid.x;
    mov.u32 %blk, %ctaid.x;
    shl.b32 %t4, %tix, 2;
    shl.b32 %blk_base, %blk, 10;
    or.b32 %flat, %t4, %blk_base;
    and.b32 %c0, %t4, 60;
    or.b32 %t4_1, %t4, 1;
    and.b32 %c1, %t4_1, 61;
    or.b32 %t4_2, %t4, 2;
    and.b32 %c2, %t4_2, 62;
    shr.u32 %plane, %tix, 4;

    // Source tile base = source + plane*65536 + ctaid.x*128 (bytes).
    ld.param.u64 %in_arg, [fusion_2230_param_1];
    cvta.to.global.u64 %in, %in_arg;
    mul.wide.u32 %plane_off, %plane, 65536;
    add.s64 %plane_ptr, %in, %plane_off;
    mul.wide.u32 %blk_off, %blk, 128;
    add.s64 %tile_ptr, %plane_ptr, %blk_off;

    // Destination address (2 bytes per element).
    ld.param.u64 %out_arg, [fusion_2230_param_0];
    cvta.to.global.u64 %out, %out_arg;
    mul.wide.u32 %out_off, %flat, 2;
    add.s64 %out_ptr, %out, %out_off;

    // Four scalar f16 loads; the last one reuses the first address + 6 bytes.
    mul.wide.u32 %off0, %c0, 2;
    add.s64 %src0, %tile_ptr, %off0;
    ld.global.nc.b16 %v0, [%src0];
    mul.wide.u32 %off1, %c1, 2;
    add.s64 %src1, %tile_ptr, %off1;
    ld.global.nc.b16 %v1, [%src1];
    mul.wide.u32 %off2, %c2, 2;
    add.s64 %src2, %tile_ptr, %off2;
    ld.global.nc.b16 %v2, [%src2];
    ld.global.nc.b16 %v3, [%src0+6];

    st.global.v4.b16 [%out_ptr], {%v0, %v1, %v2, %v3};
    ret;
}
// .globl fusion_2704
// fusion_2704: element-wise f32 -> f16 conversion (round-to-nearest).
// Launch shape: 256 threads per CTA (.reqntid); each thread converts the 4
// consecutive elements starting at e = (ctaid.x << 10) | (tid.x << 2).
//   param_0: f32 source buffer (one v4.f32 load at element e).
//   param_1: f16 destination buffer (one v4.b16 store at element e).
//   param_2: declared but never read in this kernel.
.visible .entry fusion_2704(
.param .u64 fusion_2704_param_0,
.param .u64 fusion_2704_param_1,
.param .u64 fusion_2704_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %half<4>;
    .reg .f32 %single<4>;
    .reg .b32 %blk, %tix, %blk_base, %tix_base, %elem;
    .reg .b64 %src_arg, %dst_arg, %src, %dst, %src_off, %dst_off, %src_ptr, %dst_ptr;

    // Linear index of this thread's first element.
    mov.u32 %tix, %tid.x;
    mov.u32 %blk, %ctaid.x;
    shl.b32 %tix_base, %tix, 2;
    shl.b32 %blk_base, %blk, 10;
    or.b32 %elem, %tix_base, %blk_base;

    // Fetch four f32 values (4 bytes per element).
    ld.param.u64 %src_arg, [fusion_2704_param_0];
    cvta.to.global.u64 %src, %src_arg;
    mul.wide.u32 %src_off, %elem, 4;
    add.s64 %src_ptr, %src, %src_off;
    ld.global.nc.v4.f32 {%single0, %single1, %single2, %single3}, [%src_ptr];

    // Destination address (2 bytes per element).
    ld.param.u64 %dst_arg, [fusion_2704_param_1];
    cvta.to.global.u64 %dst, %dst_arg;
    mul.wide.u32 %dst_off, %elem, 2;
    add.s64 %dst_ptr, %dst, %dst_off;

    // Narrow each value and write all four at once.
    cvt.rn.f16.f32 %half0, %single0;
    cvt.rn.f16.f32 %half1, %single1;
    cvt.rn.f16.f32 %half2, %single2;
    cvt.rn.f16.f32 %half3, %single3;
    st.global.v4.b16 [%dst_ptr], {%half0, %half1, %half2, %half3};
    ret;
}
// .globl rng_get_and_update_state_4
// rng_get_and_update_state_4: snapshot the 16-byte global `rng_state` and then
// advance it by 524288.
//   param_0: destination pointer; receives the state as it was BEFORE the
//            advance (low u64 at +0, high u64 at +8).
//   param_1: declared but never read in this kernel.
// The state is handled as two u64 words: the low word is incremented and a
// carry of 1 is added to the high word when the increment wraps around.
// NOTE(review): there is no thread-index guard and no atomic, so this
// presumably expects a single-thread launch -- confirm with the launcher.
.visible .entry rng_get_and_update_state_4(
.param .u64 rng_get_and_update_state_4_param_0,
.param .u64 rng_get_and_update_state_4_param_1
)
{
    .reg .pred %wrapped, %below_step;
    .reg .b64 %dst_arg, %dst, %prev_hi, %prev_lo, %next_lo, %carry_wrapped, %carry, %next_hi;

    // Current state: high word first, then low word.
    ld.global.u64 %prev_hi, [rng_state+8];
    ld.global.u64 %prev_lo, [rng_state];

    // Destination for the pre-advance snapshot.
    ld.param.u64 %dst_arg, [rng_get_and_update_state_4_param_0];
    cvta.to.global.u64 %dst, %dst_arg;

    // Advance the low word; either comparison being true yields carry = 1.
    add.s64 %next_lo, %prev_lo, 524288;
    setp.lt.u64 %wrapped, %next_lo, %prev_lo;
    setp.lt.u64 %below_step, %next_lo, 524288;
    selp.u64 %carry_wrapped, 1, 0, %wrapped;
    selp.b64 %carry, 1, %carry_wrapped, %below_step;
    add.s64 %next_hi, %prev_hi, %carry;

    // Publish the advanced state, then hand the old state to the caller.
    st.global.u64 [rng_state], %next_lo;
    st.global.u64 [rng_state+8], %next_hi;
    st.global.u64 [%dst+8], %prev_hi;
    st.global.u64 [%dst], %prev_lo;
    ret;
}
// .globl fusion_2229
// fusion_2229: out = A + mask_scale(B + f16(V)), all arithmetic in f16.
// Launch shape: 256 threads per CTA (.reqntid); each thread handles 4
// consecutive elements starting at e = (ctaid.x << 10) | (tid.x << 2).
//   param_0: f16 output, written at element e (one v4.b16 store).
//   param_1: f16 input A, read at element e; added after the mask/scale step.
//   param_2: f16 input B, read at element e.
//   param_3: 16-byte RNG state, read as two u64 words {low, high}.
//   param_4: f32 vector V, read at element tid.x*4 + k (k = 0..3); it does not
//            depend on ctaid.x.
//   param_5: declared but never read in this kernel.
// Per element: t = (B + f16(V)) * 0x3C72; t is replaced by +0.0 when the
// element's random value is below the f16 threshold 0x2E66; out = A + t.
// The multipliers 3528531795 (0xD2511F53) / 3449720151 (0xCD9E8D57) and the
// constants 2654435769 (0x9E3779B9) / 3144134277 (0xBB67AE85) match the
// published Philox-4x32 constants; presumably an inlined Philox generator --
// confirm against the producer of this PTX.
// NOTE(review): 0x2E66 ~= 0.1 and 0x3C72 ~= 1.111 look like dropout (rate 0.1,
// scale 1/0.9) followed by a residual add -- confirm.
.visible .entry fusion_2229(
.param .u64 fusion_2229_param_0,
.param .u64 fusion_2229_param_1,
.param .u64 fusion_2229_param_2,
.param .u64 fusion_2229_param_3,
.param .u64 fusion_2229_param_4,
.param .u64 fusion_2229_param_5
)
.reqntid 256, 1, 1
{
.reg .pred %p<6>;
.reg .b16 %h<43>;
.reg .b32 %hh<5>;
.reg .f32 %f<13>;
.reg .b32 %r<31>;
.reg .b64 %rd<129>;
// %rd10 = output (param_0), %rd9 = A (param_1), %rd8 = B (param_2),
// %rd6 = RNG state (param_3), %rd3 = V (param_4).
ld.param.u64 %rd1, [fusion_2229_param_0];
ld.param.u64 %rd2, [fusion_2229_param_4];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2229_param_1];
ld.param.u64 %rd5, [fusion_2229_param_3];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2229_param_2];
cvta.to.global.u64 %rd8, %rd7;
cvta.to.global.u64 %rd9, %rd4;
cvta.to.global.u64 %rd10, %rd1;
// %r5 = linear index of this thread's first element; %r4/%r6/%r7 index V.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
// Load the four A values; the pack/unpack moves leave them in %h5..%h8.
mul.wide.u32 %rd11, %r5, 2;
add.s64 %rd12, %rd9, %rd11;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12];
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// Load the 128-bit state and add the per-thread offset (%r5 >> 2) to the low
// word; %p1 detects the wrap and the carry is added to the high word.
shr.u32 %r8, %r5, 2;
ld.global.nc.v2.u64 {%rd13, %rd14}, [%rd6];
cvt.u64.u32 %rd15, %r8;
add.s64 %rd16, %rd13, %rd15;
setp.lt.u64 %p1, %rd16, %rd13;
and.b64 %rd17, %rd16, 4294967295;
mul.lo.s64 %rd18, %rd17, 3528531795;
selp.u64 %rd19, 1, 0, %p1;
add.s64 %rd20, %rd14, %rd19;
// Mixing chain: 32x32->64-bit products whose high (shr 32) and low
// (and 0xFFFFFFFF) halves are XORed with each other and with round constants.
xor.b64 %rd21, %rd20, %rd18;
shr.u64 %rd22, %rd21, 32;
mul.lo.s64 %rd23, %rd22, 3449720151;
shr.u64 %rd24, %rd23, 32;
and.b64 %rd25, %rd20, 4294967295;
mul.lo.s64 %rd26, %rd25, 3449720151;
and.b64 %rd27, %rd26, 4294967295;
xor.b64 %rd28, %rd27, %rd24;
xor.b64 %rd29, %rd28, 2654435769;
mul.lo.s64 %rd30, %rd29, 3528531795;
shr.u64 %rd31, %rd30, 32;
xor.b64 %rd32, %rd26, %rd16;
shr.u64 %rd33, %rd32, 32;
mul.lo.s64 %rd34, %rd33, 3528531795;
and.b64 %rd35, %rd34, 4294967295;
xor.b64 %rd36, %rd35, %rd31;
xor.b64 %rd37, %rd36, 1993301258;
mul.lo.s64 %rd38, %rd37, 3449720151;
shr.u64 %rd39, %rd38, 32;
shr.u64 %rd40, %rd34, 32;
and.b64 %rd41, %rd18, 4294967295;
xor.b64 %rd42, %rd41, %rd40;
xor.b64 %rd43, %rd42, 3144134277;
mul.lo.s64 %rd44, %rd43, 3449720151;
and.b64 %rd45, %rd44, 4294967295;
xor.b64 %rd46, %rd45, %rd39;
xor.b64 %rd47, %rd46, 3668340011;
mul.lo.s64 %rd48, %rd47, 3528531795;
shr.u64 %rd49, %rd48, 32;
shr.u64 %rd50, %rd44, 32;
and.b64 %rd51, %rd23, 4294967295;
xor.b64 %rd52, %rd51, %rd50;
xor.b64 %rd53, %rd52, 1013904242;
mul.lo.s64 %rd54, %rd53, 3528531795;
and.b64 %rd55, %rd54, 4294967295;
xor.b64 %rd56, %rd55, %rd49;
xor.b64 %rd57, %rd56, 3986602516;
mul.lo.s64 %rd58, %rd57, 3449720151;
shr.u64 %rd59, %rd58, 32;
shr.u64 %rd60, %rd54, 32;
and.b64 %rd61, %rd30, 4294967295;
xor.b64 %rd62, %rd61, %rd60;
xor.b64 %rd63, %rd62, 842468239;
mul.lo.s64 %rd64, %rd63, 3449720151;
and.b64 %rd65, %rd64, 4294967295;
xor.b64 %rd66, %rd65, %rd59;
xor.b64 %rd67, %rd66, 387276957;
mul.lo.s64 %rd68, %rd67, 3528531795;
shr.u64 %rd69, %rd68, 32;
shr.u64 %rd70, %rd64, 32;
and.b64 %rd71, %rd38, 4294967295;
xor.b64 %rd72, %rd71, %rd70;
xor.b64 %rd73, %rd72, 2027808484;
mul.lo.s64 %rd74, %rd73, 3528531795;
and.b64 %rd75, %rd74, 4294967295;
shr.u64 %rd76, %rd74, 32;
and.b64 %rd77, %rd48, 4294967295;
xor.b64 %rd78, %rd77, %rd76;
xor.b64 %rd79, %rd78, 2835769497;
mul.lo.s64 %rd80, %rd79, 3449720151;
and.b64 %rd81, %rd80, 4294967295;
shr.u64 %rd82, %rd80, 32;
and.b64 %rd83, %rd58, 4294967295;
xor.b64 %rd84, %rd83, %rd82;
xor.b64 %rd85, %rd84, 3041712726;
mul.lo.s64 %rd86, %rd85, 3528531795;
and.b64 %rd87, %rd86, 4294967295;
xor.b64 %rd88, %rd75, %rd69;
xor.b64 %rd89, %rd88, 1684936478;
mul.lo.s64 %rd90, %rd89, 3449720151;
shr.u64 %rd91, %rd90, 32;
xor.b64 %rd92, %rd81, %rd91;
xor.b64 %rd93, %rd92, 1401181199;
mul.lo.s64 %rd94, %rd93, 3528531795;
shr.u64 %rd95, %rd94, 32;
xor.b64 %rd96, %rd87, %rd95;
xor.b64 %rd97, %rd96, 3678237736;
mul.lo.s64 %rd98, %rd97, 3449720151;
shr.u64 %rd99, %rd98, 32;
// Element 0: form a 32-bit random word, keep its top 23 bits (shr 9), convert
// to f32 and scale by 0f34000000 (2^-23) to get a value in [0,1), then round
// to f16 and compare against the f16 threshold 0x2E66.
cvt.u32.u64 %r9, %rd99;
shr.u64 %rd100, %rd86, 32;
xor.b64 %rd101, %rd100, %rd68;
cvt.u32.u64 %r10, %rd101;
xor.b32 %r11, %r10, 534103459;
mul.lo.s32 %r12, %r11, -845247145;
xor.b32 %r13, %r12, %r9;
shr.u32 %r14, %r13, 9;
xor.b32 %r15, %r14, 4716963;
cvt.rn.f32.u32 %f1, %r15;
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h9, %f2;
mov.b16 %h10, 0x2E66;
setp.ge.f16 %p2, %h9, %h10;
// Load the four B values; the pack/unpack moves leave them in %h15..%h18.
add.s64 %rd102, %rd8, %rd11;
ld.global.nc.v4.b16 {%h11, %h12, %h13, %h14}, [%rd102];
mov.b32 %hh3, {%h13, %h14};
mov.b32 %hh4, {%h11, %h12};
mov.b32 {%h15, %h16}, %hh4;
mov.b32 {%h17, %h18}, %hh3;
// Element 0: (B0 + f16(V[tid*4])) * 0x3C72, masked by %p2, then added to A0.
mul.wide.u32 %rd103, %r4, 4;
add.s64 %rd104, %rd3, %rd103;
ld.global.nc.f32 %f3, [%rd104];
cvt.rn.f16.f32 %h19, %f3;
add.rn.f16 %h20, %h15, %h19;
mov.b16 %h21, 0x3C72;
mul.rn.f16 %h22, %h20, %h21;
selp.b16 %h23, %h22, 0x0000, %p2;
add.rn.f16 %h24, %h5, %h23;
// %rd105 = address of the output element (same byte offset as A and B).
add.s64 %rd105, %rd10, %rd11;
// Element 1: second random word, then the same add/scale/mask/add sequence.
xor.b64 %rd106, %rd58, %rd82;
xor.b64 %rd107, %rd106, 3041712726;
mul.lo.s64 %rd108, %rd107, 3528531795;
xor.b64 %rd109, %rd95, %rd108;
cvt.u32.u64 %r16, %rd109;
xor.b32 %r17, %r16, -616729560;
mul.lo.s32 %r18, %r17, -845247145;
shr.u32 %r19, %r18, 9;
cvt.rn.f32.u32 %f4, %r19;
mul.rn.f32 %f5, %f4, 0f34000000;
cvt.rn.f16.f32 %h25, %f5;
setp.ge.f16 %p3, %h25, %h10;
mul.wide.u32 %rd110, %r6, 4;
add.s64 %rd111, %rd3, %rd110;
ld.global.nc.f32 %f6, [%rd111];
cvt.rn.f16.f32 %h26, %f6;
add.rn.f16 %h27, %h16, %h26;
mul.rn.f16 %h28, %h27, %h21;
selp.b16 %h29, %h28, 0x0000, %p3;
add.rn.f16 %h30, %h6, %h29;
// Element 2: third random word.
and.b64 %rd112, %rd90, 4294967295;
and.b64 %rd113, %rd68, 4294967295;
xor.b64 %rd114, %rd113, %rd100;
xor.b64 %rd115, %rd114, 534103459;
mul.lo.s64 %rd116, %rd115, 3449720151;
shr.u64 %rd117, %rd116, 32;
xor.b64 %rd118, %rd112, %rd117;
xor.b64 %rd119, %rd118, 4055616968;
mul.lo.s64 %rd120, %rd119, 3528531795;
shr.u64 %rd121, %rd120, 32;
cvt.u32.u64 %r20, %rd121;
xor.b64 %rd122, %rd91, %rd80;
cvt.u32.u64 %r21, %rd122;
xor.b32 %r22, %r21, 1401181199;
mul.lo.s32 %r23, %r22, -766435501;
xor.b32 %r24, %r23, %r20;
shr.u32 %r25, %r24, 9;
xor.b32 %r26, %r25, 4936337;
cvt.rn.f32.u32 %f7, %r26;
mul.rn.f32 %f8, %f7, 0f34000000;
cvt.rn.f16.f32 %h31, %f8;
setp.ge.f16 %p4, %h31, %h10;
mul.wide.u32 %rd123, %r7, 4;
add.s64 %rd124, %rd3, %rd123;
ld.global.nc.f32 %f9, [%rd124];
cvt.rn.f16.f32 %h32, %f9;
add.rn.f16 %h33, %h17, %h32;
mul.rn.f16 %h34, %h33, %h21;
selp.b16 %h35, %h34, 0x0000, %p4;
add.rn.f16 %h36, %h7, %h35;
// Element 3: fourth random word; V is read at the element-0 address + 12 bytes.
xor.b64 %rd125, %rd69, %rd74;
xor.b64 %rd126, %rd125, 1684936478;
mul.lo.s64 %rd127, %rd126, 3449720151;
xor.b64 %rd128, %rd117, %rd127;
cvt.u32.u64 %r27, %rd128;
xor.b32 %r28, %r27, -239350328;
mul.lo.s32 %r29, %r28, -766435501;
shr.u32 %r30, %r29, 9;
cvt.rn.f32.u32 %f10, %r30;
mul.rn.f32 %f11, %f10, 0f34000000;
cvt.rn.f16.f32 %h37, %f11;
setp.ge.f16 %p5, %h37, %h10;
ld.global.nc.f32 %f12, [%rd104+12];
cvt.rn.f16.f32 %h38, %f12;
add.rn.f16 %h39, %h18, %h38;
mul.rn.f16 %h40, %h39, %h21;
selp.b16 %h41, %h40, 0x0000, %p5;
add.rn.f16 %h42, %h8, %h41;
// Write the four f16 results in one vector store.
st.global.v4.b16 [%rd105], {%h24, %h30, %h36, %h42};
ret;
}
// .globl fusion_2228
// fusion_2228: per-CTA sum of 1024 f16 values, accumulated in f32 and added
// atomically to one f32 output slot.
// Launch shape: 64 threads per CTA (.reqntid), i.e. two 32-lane groups
// (lane = tid.x & 31, group = tid.x >> 5).
//   param_0: f16 input; CTA b reads the 2048 bytes starting at byte b*2048.
//   param_1: f32 output; thread 0 of CTA b does atom.add on element b.
//   param_2: declared but never read in this kernel.
// Stages: (1) each thread sums 16 values (8 packed f16 pairs, 256 bytes apart),
// (2) shuffle-down reduction inside each 32-lane group, (3) lane 0 of each
// group stores its partial into shared_cache_010[group], (4) after the
// barrier, group 0 reduces the partials and thread 0 issues the atomic add.
// NOTE(review): the output is only ever added to, so it presumably must be
// zero-initialised before launch -- confirm with the caller.
.visible .entry fusion_2228(
.param .u64 fusion_2228_param_0,
.param .u64 fusion_2228_param_1,
.param .u64 fusion_2228_param_2
)
.reqntid 64, 1, 1
{
// 4-byte per-thread local slot, used in stage 4 as a zero operand for lanes
// that have no shared partial to read.
.local .align 4 .b8 __local_depot47[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b16 %h<17>;
.reg .b32 %hh<9>;
.reg .f32 %f<56>;
.reg .b32 %r<7>;
.reg .b64 %rd<22>;
// %SPL = local-space address of the slot, %SP = its generic address.
mov.u64 %SPL, __local_depot47;
cvta.local.u64 %SP, %SPL;
// %rd13 = input + ctaid.x*2048 + tid.x*4 bytes (two f16 values per thread).
ld.param.u64 %rd5, [fusion_2228_param_0];
cvta.to.global.u64 %rd8, %rd5;
mov.u32 %r1, %tid.x;
mov.u32 %r4, %ctaid.x;
shl.b32 %r5, %r1, 1;
mul.wide.u32 %rd10, %r4, 2048;
add.s64 %rd11, %rd8, %rd10;
mul.wide.u32 %rd12, %r5, 2;
add.s64 %rd13, %rd11, %rd12;
// Stage 1: serial f32 accumulation, starting from +0.0, over 8 packed pairs.
ld.global.nc.b32 %hh1, [%rd13];
mov.b32 {%h1, %h2}, %hh1;
cvt.f32.f16 %f2, %h1;
add.rn.f32 %f3, %f2, 0f00000000;
cvt.f32.f16 %f4, %h2;
add.rn.f32 %f5, %f3, %f4;
ld.global.nc.b32 %hh2, [%rd13+256];
mov.b32 {%h3, %h4}, %hh2;
cvt.f32.f16 %f6, %h3;
add.rn.f32 %f7, %f5, %f6;
cvt.f32.f16 %f8, %h4;
add.rn.f32 %f9, %f7, %f8;
ld.global.nc.b32 %hh3, [%rd13+512];
mov.b32 {%h5, %h6}, %hh3;
cvt.f32.f16 %f10, %h5;
add.rn.f32 %f11, %f9, %f10;
cvt.f32.f16 %f12, %h6;
add.rn.f32 %f13, %f11, %f12;
ld.global.nc.b32 %hh4, [%rd13+768];
mov.b32 {%h7, %h8}, %hh4;
cvt.f32.f16 %f14, %h7;
add.rn.f32 %f15, %f13, %f14;
cvt.f32.f16 %f16, %h8;
add.rn.f32 %f17, %f15, %f16;
ld.global.nc.b32 %hh5, [%rd13+1024];
mov.b32 {%h9, %h10}, %hh5;
cvt.f32.f16 %f18, %h9;
add.rn.f32 %f19, %f17, %f18;
cvt.f32.f16 %f20, %h10;
add.rn.f32 %f21, %f19, %f20;
ld.global.nc.b32 %hh6, [%rd13+1280];
mov.b32 {%h11, %h12}, %hh6;
cvt.f32.f16 %f22, %h11;
add.rn.f32 %f23, %f21, %f22;
cvt.f32.f16 %f24, %h12;
add.rn.f32 %f25, %f23, %f24;
ld.global.nc.b32 %hh7, [%rd13+1536];
mov.b32 {%h13, %h14}, %hh7;
cvt.f32.f16 %f26, %h13;
add.rn.f32 %f27, %f25, %f26;
cvt.f32.f16 %f28, %h14;
add.rn.f32 %f29, %f27, %f28;
ld.global.nc.b32 %hh8, [%rd13+1792];
mov.b32 {%h15, %h16}, %hh8;
cvt.f32.f16 %f30, %h15;
add.rn.f32 %f31, %f29, %f30;
cvt.f32.f16 %f32, %h16;
add.rn.f32 %f33, %f31, %f32;
// Stage 2: shuffle-down reduction with offsets 16, 8, 4, 2, 1 and member mask
// -1 (all lanes). The final add for offset 1 is performed only by lane 0 below.
and.b32 %r2, %r1, 31;
shfl.sync.down.b32 %f34, %f33, 16, 31, -1;
add.rn.f32 %f35, %f34, %f33;
shfl.sync.down.b32 %f36, %f35, 8, 31, -1;
add.rn.f32 %f37, %f36, %f35;
shfl.sync.down.b32 %f38, %f37, 4, 31, -1;
add.rn.f32 %f39, %f38, %f37;
shfl.sync.down.b32 %f40, %f39, 2, 31, -1;
add.rn.f32 %f41, %f40, %f39;
shfl.sync.down.b32 %f42, %f41, 1, 31, -1;
shr.u32 %r3, %r1, 5;
setp.eq.s32 %p1, %r2, 0;
mov.u64 %rd16, shared_cache_010;
// Stage 3: only lane 0 of each group writes its partial sum.
@%p1 bra LBB47_3;
bra.uni LBB47_1;
LBB47_3:
mul.wide.u32 %rd15, %r3, 4;
add.s64 %rd3, %rd16, %rd15;
add.rn.f32 %f1, %f42, %f41;
st.shared.f32 [%rd3], %f1;
LBB47_1:
// All threads of the CTA meet here so the shared partials are visible.
bar.sync 0;
// Stage 4 runs only in group 0 (tid.x < 32).
setp.eq.s32 %p2, %r3, 0;
@%p2 bra LBB47_4;
bra.uni LBB47_2;
LBB47_4:
add.u64 %rd9, %SP, 0;
add.u64 %rd1, %SPL, 0;
mul.wide.u32 %rd17, %r2, 4;
add.s64 %rd4, %rd16, %rd17;
cvta.shared.u64 %rd19, %rd4;
// Zero the local slot, then pick the operand address: threads 0 and 1 use
// shared_cache_010[lane] (one partial per group), all others use the zero slot.
mov.u32 %r6, 0;
st.local.u32 [%rd1], %r6;
setp.lt.u32 %p3, %r1, 2;
selp.b64 %rd21, %rd19, %rd9, %p3;
// Generic-address load, since the address may be shared or local.
ld.f32 %f43, [%rd21];
shfl.sync.down.b32 %f44, %f43, 16, 31, -1;
add.rn.f32 %f45, %f43, %f44;
shfl.sync.down.b32 %f46, %f45, 8, 31, -1;
add.rn.f32 %f47, %f45, %f46;
shfl.sync.down.b32 %f48, %f47, 4, 31, -1;
add.rn.f32 %f49, %f47, %f48;
shfl.sync.down.b32 %f50, %f49, 2, 31, -1;
add.rn.f32 %f51, %f49, %f50;
shfl.sync.down.b32 %f52, %f51, 1, 31, -1;
add.rn.f32 %f53, %f51, %f52;
// Each lane writes its result back to the address it read from; for thread 0
// that is shared_cache_010[0], which then holds the CTA total.
st.f32 [%rd21], %f53;
setp.ne.s32 %p4, %r1, 0;
@%p4 bra LBB47_2;
// Thread 0 only: atomically add the CTA total to output element ctaid.x.
ld.param.u64 %rd6, [fusion_2228_param_1];
cvta.to.global.u64 %rd7, %rd6;
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd2, %rd7, %rd14;
ld.shared.f32 %f54, [%rd4];
atom.global.add.f32 %f55, [%rd2], %f54;
LBB47_2:
ret;
}
// .globl fusion_2225
// fusion_2225: per-CTA sum of squared differences (x - m)^2 over 1024 f16
// values, accumulated in f32 and added atomically to one f32 output slot.
// Launch shape: 64 threads per CTA (.reqntid), i.e. two 32-lane groups
// (lane = tid.x & 31, group = tid.x >> 5).
//   param_0: f16 input; CTA b reads the 2048 bytes starting at byte b*2048.
//   param_1: f32 output; thread 0 of CTA b does atom.add on element b.
//   param_2: f32 per-CTA value s, read at element b; m = s * 0f3A800000 (2^-10).
//   param_3: declared but never read in this kernel.
// NOTE(review): m = s/1024 with 1024 values per CTA suggests s is a row sum and
// this kernel produces the variance numerator -- confirm with the caller.
// NOTE(review): the output is only ever added to, so it presumably must be
// zero-initialised before launch -- confirm with the caller.
.visible .entry fusion_2225(
.param .u64 fusion_2225_param_0,
.param .u64 fusion_2225_param_1,
.param .u64 fusion_2225_param_2,
.param .u64 fusion_2225_param_3
)
.reqntid 64, 1, 1
{
// 4-byte per-thread local slot, used later as a zero operand for lanes that
// have no shared partial to read.
.local .align 4 .b8 __local_depot48[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b16 %h<17>;
.reg .b32 %hh<9>;
.reg .f32 %f<90>;
.reg .b32 %r<7>;
.reg .b64 %rd<25>;
// %SPL = local-space address of the slot, %SP = its generic address.
mov.u64 %SPL, __local_depot48;
cvta.local.u64 %SP, %SPL;
// %rd15 = input + ctaid.x*2048 + tid.x*4 bytes (two f16 values per thread).
ld.param.u64 %rd5, [fusion_2225_param_0];
ld.param.u64 %rd6, [fusion_2225_param_2];
cvta.to.global.u64 %rd7, %rd6;
cvta.to.global.u64 %rd10, %rd5;
mov.u32 %r1, %tid.x;
mov.u32 %r4, %ctaid.x;
shl.b32 %r5, %r1, 1;
mul.wide.u32 %rd12, %r4, 2048;
add.s64 %rd13, %rd10, %rd12;
mul.wide.u32 %rd14, %r5, 2;
add.s64 %rd15, %rd13, %rd14;
ld.global.nc.b32 %hh1, [%rd15];
mov.b32 {%h1, %h2}, %hh1;
cvt.f32.f16 %f2, %h1;
// %f4 = m = param_2[ctaid.x] * 2^-10; %rd16 (ctaid.x*4) is reused for the
// output address at the end.
mul.wide.u32 %rd16, %r4, 4;
add.s64 %rd17, %rd7, %rd16;
ld.global.nc.f32 %f3, [%rd17];
mul.rn.f32 %f4, %f3, 0f3A800000;
// Serial f32 accumulation of (x - m)^2, starting from +0.0, over 8 packed
// pairs located 256 bytes apart.
sub.rn.f32 %f5, %f2, %f4;
mul.rn.f32 %f6, %f5, %f5;
add.rn.f32 %f7, %f6, 0f00000000;
cvt.f32.f16 %f8, %h2;
sub.rn.f32 %f9, %f8, %f4;
mul.rn.f32 %f10, %f9, %f9;
add.rn.f32 %f11, %f7, %f10;
ld.global.nc.b32 %hh2, [%rd15+256];
mov.b32 {%h3, %h4}, %hh2;
cvt.f32.f16 %f12, %h3;
sub.rn.f32 %f13, %f12, %f4;
mul.rn.f32 %f14, %f13, %f13;
add.rn.f32 %f15, %f11, %f14;
cvt.f32.f16 %f16, %h4;
sub.rn.f32 %f17, %f16, %f4;
mul.rn.f32 %f18, %f17, %f17;
add.rn.f32 %f19, %f15, %f18;
ld.global.nc.b32 %hh3, [%rd15+512];
mov.b32 {%h5, %h6}, %hh3;
cvt.f32.f16 %f20, %h5;
sub.rn.f32 %f21, %f20, %f4;
mul.rn.f32 %f22, %f21, %f21;
add.rn.f32 %f23, %f19, %f22;
cvt.f32.f16 %f24, %h6;
sub.rn.f32 %f25, %f24, %f4;
mul.rn.f32 %f26, %f25, %f25;
add.rn.f32 %f27, %f23, %f26;
ld.global.nc.b32 %hh4, [%rd15+768];
mov.b32 {%h7, %h8}, %hh4;
cvt.f32.f16 %f28, %h7;
sub.rn.f32 %f29, %f28, %f4;
mul.rn.f32 %f30, %f29, %f29;
add.rn.f32 %f31, %f27, %f30;
cvt.f32.f16 %f32, %h8;
sub.rn.f32 %f33, %f32, %f4;
mul.rn.f32 %f34, %f33, %f33;
add.rn.f32 %f35, %f31, %f34;
ld.global.nc.b32 %hh5, [%rd15+1024];
mov.b32 {%h9, %h10}, %hh5;
cvt.f32.f16 %f36, %h9;
sub.rn.f32 %f37, %f36, %f4;
mul.rn.f32 %f38, %f37, %f37;
add.rn.f32 %f39, %f35, %f38;
cvt.f32.f16 %f40, %h10;
sub.rn.f32 %f41, %f40, %f4;
mul.rn.f32 %f42, %f41, %f41;
add.rn.f32 %f43, %f39, %f42;
ld.global.nc.b32 %hh6, [%rd15+1280];
mov.b32 {%h11, %h12}, %hh6;
cvt.f32.f16 %f44, %h11;
sub.rn.f32 %f45, %f44, %f4;
mul.rn.f32 %f46, %f45, %f45;
add.rn.f32 %f47, %f43, %f46;
cvt.f32.f16 %f48, %h12;
sub.rn.f32 %f49, %f48, %f4;
mul.rn.f32 %f50, %f49, %f49;
add.rn.f32 %f51, %f47, %f50;
ld.global.nc.b32 %hh7, [%rd15+1536];
mov.b32 {%h13, %h14}, %hh7;
cvt.f32.f16 %f52, %h13;
sub.rn.f32 %f53, %f52, %f4;
mul.rn.f32 %f54, %f53, %f53;
add.rn.f32 %f55, %f51, %f54;
cvt.f32.f16 %f56, %h14;
sub.rn.f32 %f57, %f56, %f4;
mul.rn.f32 %f58, %f57, %f57;
add.rn.f32 %f59, %f55, %f58;
ld.global.nc.b32 %hh8, [%rd15+1792];
mov.b32 {%h15, %h16}, %hh8;
cvt.f32.f16 %f60, %h15;
sub.rn.f32 %f61, %f60, %f4;
mul.rn.f32 %f62, %f61, %f61;
add.rn.f32 %f63, %f59, %f62;
cvt.f32.f16 %f64, %h16;
sub.rn.f32 %f65, %f64, %f4;
mul.rn.f32 %f66, %f65, %f65;
add.rn.f32 %f67, %f63, %f66;
// Shuffle-down reduction with offsets 16, 8, 4, 2, 1 and member mask -1 (all
// lanes). The final add for offset 1 is performed only by lane 0 below.
and.b32 %r2, %r1, 31;
shfl.sync.down.b32 %f68, %f67, 16, 31, -1;
add.rn.f32 %f69, %f68, %f67;
shfl.sync.down.b32 %f70, %f69, 8, 31, -1;
add.rn.f32 %f71, %f70, %f69;
shfl.sync.down.b32 %f72, %f71, 4, 31, -1;
add.rn.f32 %f73, %f72, %f71;
shfl.sync.down.b32 %f74, %f73, 2, 31, -1;
add.rn.f32 %f75, %f74, %f73;
shfl.sync.down.b32 %f76, %f75, 1, 31, -1;
shr.u32 %r3, %r1, 5;
setp.eq.s32 %p1, %r2, 0;
mov.u64 %rd19, shared_cache_011;
// Only lane 0 of each group writes its partial sum to shared_cache_011[group].
@%p1 bra LBB48_3;
bra.uni LBB48_1;
LBB48_3:
mul.wide.u32 %rd18, %r3, 4;
add.s64 %rd3, %rd19, %rd18;
add.rn.f32 %f1, %f76, %f75;
st.shared.f32 [%rd3], %f1;
LBB48_1:
// All threads of the CTA meet here so the shared partials are visible.
bar.sync 0;
// The second reduction runs only in group 0 (tid.x < 32).
setp.eq.s32 %p2, %r3, 0;
@%p2 bra LBB48_4;
bra.uni LBB48_2;
LBB48_4:
add.u64 %rd11, %SP, 0;
add.u64 %rd1, %SPL, 0;
mul.wide.u32 %rd20, %r2, 4;
add.s64 %rd4, %rd19, %rd20;
cvta.shared.u64 %rd22, %rd4;
// Zero the local slot, then pick the operand address: threads 0 and 1 use
// shared_cache_011[lane] (one partial per group), all others use the zero slot.
mov.u32 %r6, 0;
st.local.u32 [%rd1], %r6;
setp.lt.u32 %p3, %r1, 2;
selp.b64 %rd24, %rd22, %rd11, %p3;
// Generic-address load, since the address may be shared or local.
ld.f32 %f77, [%rd24];
shfl.sync.down.b32 %f78, %f77, 16, 31, -1;
add.rn.f32 %f79, %f77, %f78;
shfl.sync.down.b32 %f80, %f79, 8, 31, -1;
add.rn.f32 %f81, %f79, %f80;
shfl.sync.down.b32 %f82, %f81, 4, 31, -1;
add.rn.f32 %f83, %f81, %f82;
shfl.sync.down.b32 %f84, %f83, 2, 31, -1;
add.rn.f32 %f85, %f83, %f84;
shfl.sync.down.b32 %f86, %f85, 1, 31, -1;
add.rn.f32 %f87, %f85, %f86;
// Each lane writes its result back to the address it read from; for thread 0
// that is shared_cache_011[0], which then holds the CTA total.
st.f32 [%rd24], %f87;
setp.ne.s32 %p4, %r1, 0;
@%p4 bra LBB48_2;
// Thread 0 only: atomically add the CTA total to output element ctaid.x.
ld.param.u64 %rd8, [fusion_2225_param_1];
cvta.to.global.u64 %rd9, %rd8;
add.s64 %rd2, %rd9, %rd16;
ld.shared.f32 %f88, [%rd4];
atom.global.add.f32 %f89, [%rd2], %f88;
LBB48_2:
ret;
}
// .globl fusion_2221
// fusion_2221: per-element affine transform of f16 data using two per-CTA f32
// statistics and two per-column f32 vectors; results are written as f16.
// Launch shape: 256 threads per CTA (.reqntid); each thread handles the 4
// consecutive elements starting at e = (ctaid.x << 10) | (tid.x << 2).
//   param_0: f16 output, written at element e.
//   param_1: f16 input x, read at element e.
//   param_2: f32 per-CTA value s2, read at element ctaid.x.
//   param_3: f32 per-CTA value s3, read at element ctaid.x.
//   param_4: f32 vector b, read at element tid.x*4 + k (k = 0..3).
//   param_5: f32 vector w, read at element tid.x*4 + k (k = 0..3).
//   param_6: declared but never read in this kernel.
// Per element, in f32:
//   r   = rsqrt.approx(s2 * 2^-10 + 0f2B8CBCCC)
//   out = f16( (r*w)*x + (b - (r*w)*(s3 * 2^-10)) )
// NOTE(review): this has the shape of layer normalisation (s3/1024 as mean,
// s2/1024 as variance, epsilon ~= 1e-12, w/b as scale/shift) -- confirm.
.visible .entry fusion_2221(
.param .u64 fusion_2221_param_0,
.param .u64 fusion_2221_param_1,
.param .u64 fusion_2221_param_2,
.param .u64 fusion_2221_param_3,
.param .u64 fusion_2221_param_4,
.param .u64 fusion_2221_param_5,
.param .u64 fusion_2221_param_6
)
.reqntid 256, 1, 1
{
    .reg .b16 %x<4>, %y<4>;
    .reg .f32 %xf, %s2, %s2_scaled, %s2_eps, %rs, %w, %rw, %prod;
    .reg .f32 %b, %s3, %s3_scaled, %corr, %adj, %res;
    .reg .b32 %blk, %tix, %blk_base, %t4, %flat, %t4_1, %t4_2;
    .reg .b64 %out_arg, %x_arg, %s2_arg, %s3_arg, %b_arg, %w_arg;
    .reg .b64 %out, %x_base, %s2_base, %s3_base, %b_base, %w_base;
    .reg .b64 %elem_off, %x_ptr, %out_ptr, %blk_off, %s2_ptr, %s3_ptr;
    .reg .b64 %col_off, %w_ptr0, %b_ptr0, %w_ptr, %b_ptr;

    // Index arithmetic.
    mov.u32 %tix, %tid.x;
    mov.u32 %blk, %ctaid.x;
    shl.b32 %t4, %tix, 2;
    shl.b32 %blk_base, %blk, 10;
    or.b32 %flat, %t4, %blk_base;
    or.b32 %t4_1, %t4, 1;
    or.b32 %t4_2, %t4, 2;

    // Parameter pointers.
    ld.param.u64 %out_arg, [fusion_2221_param_0];
    cvta.to.global.u64 %out, %out_arg;
    ld.param.u64 %x_arg, [fusion_2221_param_1];
    cvta.to.global.u64 %x_base, %x_arg;
    ld.param.u64 %s2_arg, [fusion_2221_param_2];
    cvta.to.global.u64 %s2_base, %s2_arg;
    ld.param.u64 %s3_arg, [fusion_2221_param_3];
    cvta.to.global.u64 %s3_base, %s3_arg;
    ld.param.u64 %b_arg, [fusion_2221_param_4];
    cvta.to.global.u64 %b_base, %b_arg;
    ld.param.u64 %w_arg, [fusion_2221_param_5];
    cvta.to.global.u64 %w_base, %w_arg;

    // Input and output element addresses share the same byte offset.
    mul.wide.u32 %elem_off, %flat, 2;
    add.s64 %x_ptr, %x_base, %elem_off;
    add.s64 %out_ptr, %out, %elem_off;
    ld.global.nc.v4.b16 {%x0, %x1, %x2, %x3}, [%x_ptr];

    // Per-CTA reciprocal square root.
    mul.wide.u32 %blk_off, %blk, 4;
    add.s64 %s2_ptr, %s2_base, %blk_off;
    ld.global.nc.f32 %s2, [%s2_ptr];
    mul.rn.f32 %s2_scaled, %s2, 0f3A800000;
    add.rn.f32 %s2_eps, %s2_scaled, 0f2B8CBCCC;
    rsqrt.approx.f32 %rs, %s2_eps;

    // Per-CTA scaled s3.
    add.s64 %s3_ptr, %s3_base, %blk_off;
    ld.global.nc.f32 %s3, [%s3_ptr];
    mul.rn.f32 %s3_scaled, %s3, 0f3A800000;

    // Element 0.
    mul.wide.u32 %col_off, %t4, 4;
    add.s64 %w_ptr0, %w_base, %col_off;
    add.s64 %b_ptr0, %b_base, %col_off;
    cvt.f32.f16 %xf, %x0;
    ld.global.nc.f32 %w, [%w_ptr0];
    mul.rn.f32 %rw, %rs, %w;
    mul.rn.f32 %prod, %rw, %xf;
    ld.global.nc.f32 %b, [%b_ptr0];
    mul.rn.f32 %corr, %rw, %s3_scaled;
    sub.rn.f32 %adj, %b, %corr;
    add.rn.f32 %res, %prod, %adj;
    cvt.rn.f16.f32 %y0, %res;

    // Element 1.
    mul.wide.u32 %col_off, %t4_1, 4;
    add.s64 %w_ptr, %w_base, %col_off;
    add.s64 %b_ptr, %b_base, %col_off;
    cvt.f32.f16 %xf, %x1;
    ld.global.nc.f32 %w, [%w_ptr];
    mul.rn.f32 %rw, %rs, %w;
    mul.rn.f32 %prod, %rw, %xf;
    ld.global.nc.f32 %b, [%b_ptr];
    mul.rn.f32 %corr, %s3_scaled, %rw;
    sub.rn.f32 %adj, %b, %corr;
    add.rn.f32 %res, %prod, %adj;
    cvt.rn.f16.f32 %y1, %res;

    // Element 2.
    mul.wide.u32 %col_off, %t4_2, 4;
    add.s64 %w_ptr, %w_base, %col_off;
    add.s64 %b_ptr, %b_base, %col_off;
    cvt.f32.f16 %xf, %x2;
    ld.global.nc.f32 %w, [%w_ptr];
    mul.rn.f32 %rw, %rs, %w;
    mul.rn.f32 %prod, %rw, %xf;
    ld.global.nc.f32 %b, [%b_ptr];
    mul.rn.f32 %corr, %s3_scaled, %rw;
    sub.rn.f32 %adj, %b, %corr;
    add.rn.f32 %res, %prod, %adj;
    cvt.rn.f16.f32 %y2, %res;

    // Element 3: w and b are read 12 bytes past the element-0 addresses.
    cvt.f32.f16 %xf, %x3;
    ld.global.nc.f32 %w, [%w_ptr0+12];
    mul.rn.f32 %rw, %rs, %w;
    mul.rn.f32 %prod, %rw, %xf;
    ld.global.nc.f32 %b, [%b_ptr0+12];
    mul.rn.f32 %corr, %s3_scaled, %rw;
    sub.rn.f32 %adj, %b, %corr;
    add.rn.f32 %res, %prod, %adj;
    cvt.rn.f16.f32 %y3, %res;

    st.global.v4.b16 [%out_ptr], {%y0, %y1, %y2, %y3};
    ret;
}
// .globl convert_1525
// convert_1525: f32 -> f16 conversion (round-to-nearest) over up to seven
// tiles per thread, each tile 655360 elements apart.
// Launch shape: 128 threads per CTA (.reqntid); a thread's first element is
// e = (ctaid.x << 9) | (tid.x << 2) and it converts 4 elements per tile.
//   param_0: f16 destination, written at elements e + t*655360.
//   param_1: f32 source, read at elements e + t*655360.
//   param_2: declared but never read in this kernel.
// Tiles 0..4 are unconditional; tiles 5 and 6 are skipped when their first
// element index exceeds 4194303.
.visible .entry convert_1525(
.param .u64 convert_1525_param_0,
.param .u64 convert_1525_param_1,
.param .u64 convert_1525_param_2
)
.reqntid 128, 1, 1
{
    .reg .pred %skip5, %skip6;
    .reg .b16 %hv<4>;
    .reg .f32 %sv<4>;
    .reg .b32 %blk, %tix, %blk_base, %t4, %flat, %idx5, %blk6, %idx6;
    .reg .b64 %dst_arg, %src_arg, %dst, %src, %src_off, %dst_off, %src_ptr, %dst_ptr;

    // Index arithmetic and base addresses for tile 0.
    mov.u32 %tix, %tid.x;
    mov.u32 %blk, %ctaid.x;
    shl.b32 %t4, %tix, 2;
    shl.b32 %blk_base, %blk, 9;
    or.b32 %flat, %blk_base, %t4;
    ld.param.u64 %src_arg, [convert_1525_param_1];
    cvta.to.global.u64 %src, %src_arg;
    mul.wide.u32 %src_off, %flat, 4;
    add.s64 %src_ptr, %src, %src_off;
    ld.param.u64 %dst_arg, [convert_1525_param_0];
    cvta.to.global.u64 %dst, %dst_arg;
    mul.wide.u32 %dst_off, %flat, 2;
    add.s64 %dst_ptr, %dst, %dst_off;

    // Tile 0.
    ld.global.nc.v4.f32 {%sv0, %sv1, %sv2, %sv3}, [%src_ptr];
    cvt.rn.f16.f32 %hv0, %sv0;
    cvt.rn.f16.f32 %hv1, %sv1;
    cvt.rn.f16.f32 %hv2, %sv2;
    cvt.rn.f16.f32 %hv3, %sv3;
    st.global.v4.b16 [%dst_ptr], {%hv0, %hv1, %hv2, %hv3};

    // Tile 1.
    ld.global.nc.v4.f32 {%sv0, %sv1, %sv2, %sv3}, [%src_ptr+2621440];
    cvt.rn.f16.f32 %hv0, %sv0;
    cvt.rn.f16.f32 %hv1, %sv1;
    cvt.rn.f16.f32 %hv2, %sv2;
    cvt.rn.f16.f32 %hv3, %sv3;
    st.global.v4.b16 [%dst_ptr+1310720], {%hv0, %hv1, %hv2, %hv3};

    // Tile 2.
    ld.global.nc.v4.f32 {%sv0, %sv1, %sv2, %sv3}, [%src_ptr+5242880];
    cvt.rn.f16.f32 %hv0, %sv0;
    cvt.rn.f16.f32 %hv1, %sv1;
    cvt.rn.f16.f32 %hv2, %sv2;
    cvt.rn.f16.f32 %hv3, %sv3;
    st.global.v4.b16 [%dst_ptr+2621440], {%hv0, %hv1, %hv2, %hv3};

    // Tile 3.
    ld.global.nc.v4.f32 {%sv0, %sv1, %sv2, %sv3}, [%src_ptr+7864320];
    cvt.rn.f16.f32 %hv0, %sv0;
    cvt.rn.f16.f32 %hv1, %sv1;
    cvt.rn.f16.f32 %hv2, %sv2;
    cvt.rn.f16.f32 %hv3, %sv3;
    st.global.v4.b16 [%dst_ptr+3932160], {%hv0, %hv1, %hv2, %hv3};

    // Tile 4.
    ld.global.nc.v4.f32 {%sv0, %sv1, %sv2, %sv3}, [%src_ptr+10485760];
    cvt.rn.f16.f32 %hv0, %sv0;
    cvt.rn.f16.f32 %hv1, %sv1;
    cvt.rn.f16.f32 %hv2, %sv2;
    cvt.rn.f16.f32 %hv3, %sv3;
    st.global.v4.b16 [%dst_ptr+5242880], {%hv0, %hv1, %hv2, %hv3};

    // Tile 5, guarded.
    add.s32 %idx5, %flat, 3276800;
    setp.gt.u32 %skip5, %idx5, 4194303;
    @%skip5 bra LBB50_2;
    ld.global.nc.v4.f32 {%sv0, %sv1, %sv2, %sv3}, [%src_ptr+13107200];
    cvt.rn.f16.f32 %hv0, %sv0;
    cvt.rn.f16.f32 %hv1, %sv1;
    cvt.rn.f16.f32 %hv2, %sv2;
    cvt.rn.f16.f32 %hv3, %sv3;
    st.global.v4.b16 [%dst_ptr+6553600], {%hv0, %hv1, %hv2, %hv3};
LBB50_2:
    // Tile 6, guarded.
    add.s32 %blk6, %blk_base, 3932160;
    or.b32 %idx6, %blk6, %t4;
    setp.gt.u32 %skip6, %idx6, 4194303;
    @%skip6 bra LBB50_4;
    ld.global.nc.v4.f32 {%sv0, %sv1, %sv2, %sv3}, [%src_ptr+15728640];
    cvt.rn.f16.f32 %hv0, %sv0;
    cvt.rn.f16.f32 %hv1, %sv1;
    cvt.rn.f16.f32 %hv2, %sv2;
    cvt.rn.f16.f32 %hv3, %sv3;
    st.global.v4.b16 [%dst_ptr+7864320], {%hv0, %hv1, %hv2, %hv3};
LBB50_4:
    ret;
}
// .globl fusion_2216
// fusion_2216: fused element-wise kernel over f16 data.
// For every element index i it computes, in this order:
//   x      = f32( in[i] + f16(vec[i & 4095]) )      (the add itself is done in f16)
//   t      = 0.7978846 * (x + 0.044715 * x^3)
//   out[i] = f16( 0.5 * x * (1 + T(t)) )
// where T(t) is the clamped rational function evaluated below; T(t) = t for
// |t| < 0.0004 and T(t) = +/-1 (sign of t) once |t| >= 20.
// NOTE(review): this has the shape of the tanh-form GELU with T ~= tanh, so
// this is presumably a bias-add + GELU fusion -- confirm against the HLO.
//   param_0: output, f16 (written with st.global.v4.b16)
//   param_1: input, f16 (read with ld.global.nc.v4.b16)
//   param_2: f32 vector, indexed by the element index masked to 12 bits
//   param_3: declared but never read in this body
// 256 threads per CTA, 4 consecutive elements per thread, i.e. 1024 elements
// per CTA. There is no bounds check on the loads or on the store.
.visible .entry fusion_2216(
.param .u64 fusion_2216_param_0,
.param .u64 fusion_2216_param_1,
.param .u64 fusion_2216_param_2,
.param .u64 fusion_2216_param_3
)
.reqntid 256, 1, 1
{
.reg .pred %p<21>;
.reg .b16 %h<21>;
.reg .b32 %hh<3>;
.reg .f32 %f<150>;
.reg .b32 %r<25>;
.reg .b64 %rd<18>;
// Global-space base pointers: %rd6 = param_0 (output), %rd5 = param_1 (f16
// input), %rd3 = param_2 (f32 vector).
ld.param.u64 %rd1, [fusion_2216_param_0];
ld.param.u64 %rd2, [fusion_2216_param_2];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2216_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd1;
// %r5 = (ctaid.x << 10) | (tid.x << 2): index of the first of this thread's
// four elements; %r6..%r8 are the next three.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r5, 1;
or.b32 %r7, %r5, 2;
or.b32 %r8, %r5, 3;
// Indices into the f32 vector: each element index reduced to its low 12 bits.
// The masks differ (4095/4094/4093/4092) only in bits that are already zero
// in the corresponding index, so all four are effectively "index & 4095".
and.b32 %r9, %r8, 4095;
and.b32 %r10, %r7, 4094;
and.b32 %r11, %r6, 4093;
and.b32 %r12, %r5, 4092;
// Load the four f16 inputs with one vector load (2 bytes per element).
mul.wide.u32 %rd7, %r5, 2;
add.s64 %rd8, %rd5, %rd7;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd8];
// Pack/unpack round trip through 32-bit registers; net effect is
// %h5..%h8 = %h1..%h4.
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// ---- Lane 0 (element %r5, input %h5) ----
// x = f32(in + f16(vec[idx])): the vector value is rounded to f16 first and
// the sum is formed in f16, then widened.
mul.wide.u32 %rd9, %r12, 4;
add.s64 %rd10, %rd3, %rd9;
ld.global.nc.f32 %f1, [%rd10];
cvt.rn.f16.f32 %h9, %f1;
add.rn.f16 %h10, %h5, %h9;
cvt.f32.f16 %f2, %h10;
// t (%f7) = 0.7978846 * (x + 0.044715 * x^3); %f8 = |t|.
mul.rn.f32 %f3, %f2, %f2;
mul.rn.f32 %f4, %f3, %f2;
mul.rn.f32 %f5, %f4, 0f3D372713;
add.rn.f32 %f6, %f5, %f2;
mul.rn.f32 %f7, %f6, 0f3F4C422A;
abs.f32 %f8, %f7;
// %p1: |t| < 0.0004 -> T(t) is taken to be t itself (see selp into %f33).
setp.lt.f32 %p1, %f8, 0f39D1B717;
// Clamp t to [-9, 9] -> %f10; %f11 = clamped t squared.
setp.lt.f32 %p2, %f7, 0fC1100000;
selp.f32 %f9, 0fC1100000, %f7, %p2;
setp.gt.f32 %p3, %f9, 0f41100000;
selp.f32 %f10, 0f41100000, %f9, %p3;
mul.rn.f32 %f11, %f10, %f10;
// Numerator: clamped t times a polynomial in t^2, evaluated Horner-style
// with separate mul/add steps. %f13 holds one coefficient and is reused
// unchanged by lanes 1-3.
mul.rn.f32 %f12, %f11, 0f259F25C0;
mov.f32 %f13, 0f2A61337E;
sub.rn.f32 %f14, %f13, %f12;
mul.rn.f32 %f15, %f11, %f14;
add.rn.f32 %f16, %f15, 0fAEBD37FF;
mul.rn.f32 %f17, %f11, %f16;
add.rn.f32 %f18, %f17, 0f335C0041;
mul.rn.f32 %f19, %f11, %f18;
add.rn.f32 %f20, %f19, 0f3779434A;
mul.rn.f32 %f21, %f11, %f20;
add.rn.f32 %f22, %f21, 0f3A270DED;
mul.rn.f32 %f23, %f11, %f22;
add.rn.f32 %f24, %f23, 0f3BA059DC;
mul.rn.f32 %f25, %f10, %f24;
// Denominator: polynomial in t^2, same evaluation scheme.
mul.rn.f32 %f26, %f11, 0f35A0D3D8;
add.rn.f32 %f27, %f26, 0f38F895D6;
mul.rn.f32 %f28, %f11, %f27;
add.rn.f32 %f29, %f28, 0f3B14AA05;
mul.rn.f32 %f30, %f11, %f29;
add.rn.f32 %f31, %f30, 0f3BA059DD;
// Rational result, replaced by t when |t| is tiny.
div.full.f32 %f32, %f25, %f31;
selp.f32 %f33, %f7, %f32, %p1;
// %f34 = -1.0 if the sign bit of t is set, else +1.0.
mov.b32 %r13, %f7;
shr.u32 %r14, %r13, 31;
and.b32 %r15, %r14, 1;
setp.eq.b32 %p4, %r15, 1;
selp.f32 %f34, 0fBF800000, 0f3F800000, %p4;
// Keep %f33 when |t| < 20 or the compare is unordered (NaN); otherwise
// saturate to the signed one in %f34.
setp.ltu.f32 %p5, %f8, 0f41A00000;
selp.f32 %f35, %f33, %f34, %p5;
// y = ((T + 1) * 0.5) * x, rounded to f16 -> %h11.
add.rn.f32 %f36, %f35, 0f3F800000;
mul.rn.f32 %f37, %f36, 0f3F000000;
mul.rn.f32 %f38, %f37, %f2;
cvt.rn.f16.f32 %h11, %f38;
// Output address for the final vector store (same byte offset as the input).
add.s64 %rd11, %rd6, %rd7;
// ---- Lane 1 (element %r6, input %h6): same sequence as lane 0 ----
mul.wide.u32 %rd12, %r11, 4;
add.s64 %rd13, %rd3, %rd12;
ld.global.nc.f32 %f39, [%rd13];
cvt.rn.f16.f32 %h12, %f39;
add.rn.f16 %h13, %h6, %h12;
cvt.f32.f16 %f40, %h13;
// t = %f45, |t| = %f46.
mul.rn.f32 %f41, %f40, %f40;
mul.rn.f32 %f42, %f41, %f40;
mul.rn.f32 %f43, %f42, 0f3D372713;
add.rn.f32 %f44, %f43, %f40;
mul.rn.f32 %f45, %f44, 0f3F4C422A;
abs.f32 %f46, %f45;
setp.lt.f32 %p6, %f46, 0f39D1B717;
// Clamp to [-9, 9] -> %f48; square -> %f49.
setp.lt.f32 %p7, %f45, 0fC1100000;
selp.f32 %f47, 0fC1100000, %f45, %p7;
setp.gt.f32 %p8, %f47, 0f41100000;
selp.f32 %f48, 0f41100000, %f47, %p8;
mul.rn.f32 %f49, %f48, %f48;
// Numerator -> %f62 (uses the constant left in %f13 by lane 0).
mul.rn.f32 %f50, %f49, 0f259F25C0;
sub.rn.f32 %f51, %f13, %f50;
mul.rn.f32 %f52, %f49, %f51;
add.rn.f32 %f53, %f52, 0fAEBD37FF;
mul.rn.f32 %f54, %f49, %f53;
add.rn.f32 %f55, %f54, 0f335C0041;
mul.rn.f32 %f56, %f49, %f55;
add.rn.f32 %f57, %f56, 0f3779434A;
mul.rn.f32 %f58, %f49, %f57;
add.rn.f32 %f59, %f58, 0f3A270DED;
mul.rn.f32 %f60, %f49, %f59;
add.rn.f32 %f61, %f60, 0f3BA059DC;
mul.rn.f32 %f62, %f48, %f61;
// Denominator -> %f68.
mul.rn.f32 %f63, %f49, 0f35A0D3D8;
add.rn.f32 %f64, %f63, 0f38F895D6;
mul.rn.f32 %f65, %f49, %f64;
add.rn.f32 %f66, %f65, 0f3B14AA05;
mul.rn.f32 %f67, %f49, %f66;
add.rn.f32 %f68, %f67, 0f3BA059DD;
div.full.f32 %f69, %f62, %f68;
selp.f32 %f70, %f45, %f69, %p6;
// Signed one from the sign bit of t, used when |t| >= 20.
mov.b32 %r16, %f45;
shr.u32 %r17, %r16, 31;
and.b32 %r18, %r17, 1;
setp.eq.b32 %p9, %r18, 1;
selp.f32 %f71, 0fBF800000, 0f3F800000, %p9;
setp.ltu.f32 %p10, %f46, 0f41A00000;
selp.f32 %f72, %f70, %f71, %p10;
// y = ((T + 1) * 0.5) * x -> %h14.
add.rn.f32 %f73, %f72, 0f3F800000;
mul.rn.f32 %f74, %f73, 0f3F000000;
mul.rn.f32 %f75, %f74, %f40;
cvt.rn.f16.f32 %h14, %f75;
// ---- Lane 2 (element %r7, input %h7): same sequence as lane 0 ----
mul.wide.u32 %rd14, %r10, 4;
add.s64 %rd15, %rd3, %rd14;
ld.global.nc.f32 %f76, [%rd15];
cvt.rn.f16.f32 %h15, %f76;
add.rn.f16 %h16, %h7, %h15;
cvt.f32.f16 %f77, %h16;
// t = %f82, |t| = %f83.
mul.rn.f32 %f78, %f77, %f77;
mul.rn.f32 %f79, %f78, %f77;
mul.rn.f32 %f80, %f79, 0f3D372713;
add.rn.f32 %f81, %f80, %f77;
mul.rn.f32 %f82, %f81, 0f3F4C422A;
abs.f32 %f83, %f82;
setp.lt.f32 %p11, %f83, 0f39D1B717;
// Clamp to [-9, 9] -> %f85; square -> %f86.
setp.lt.f32 %p12, %f82, 0fC1100000;
selp.f32 %f84, 0fC1100000, %f82, %p12;
setp.gt.f32 %p13, %f84, 0f41100000;
selp.f32 %f85, 0f41100000, %f84, %p13;
mul.rn.f32 %f86, %f85, %f85;
// Numerator -> %f99.
mul.rn.f32 %f87, %f86, 0f259F25C0;
sub.rn.f32 %f88, %f13, %f87;
mul.rn.f32 %f89, %f86, %f88;
add.rn.f32 %f90, %f89, 0fAEBD37FF;
mul.rn.f32 %f91, %f86, %f90;
add.rn.f32 %f92, %f91, 0f335C0041;
mul.rn.f32 %f93, %f86, %f92;
add.rn.f32 %f94, %f93, 0f3779434A;
mul.rn.f32 %f95, %f86, %f94;
add.rn.f32 %f96, %f95, 0f3A270DED;
mul.rn.f32 %f97, %f86, %f96;
add.rn.f32 %f98, %f97, 0f3BA059DC;
mul.rn.f32 %f99, %f85, %f98;
// Denominator -> %f105.
mul.rn.f32 %f100, %f86, 0f35A0D3D8;
add.rn.f32 %f101, %f100, 0f38F895D6;
mul.rn.f32 %f102, %f86, %f101;
add.rn.f32 %f103, %f102, 0f3B14AA05;
mul.rn.f32 %f104, %f86, %f103;
add.rn.f32 %f105, %f104, 0f3BA059DD;
div.full.f32 %f106, %f99, %f105;
selp.f32 %f107, %f82, %f106, %p11;
// Signed one from the sign bit of t, used when |t| >= 20.
mov.b32 %r19, %f82;
shr.u32 %r20, %r19, 31;
and.b32 %r21, %r20, 1;
setp.eq.b32 %p14, %r21, 1;
selp.f32 %f108, 0fBF800000, 0f3F800000, %p14;
setp.ltu.f32 %p15, %f83, 0f41A00000;
selp.f32 %f109, %f107, %f108, %p15;
// y = ((T + 1) * 0.5) * x -> %h17.
add.rn.f32 %f110, %f109, 0f3F800000;
mul.rn.f32 %f111, %f110, 0f3F000000;
mul.rn.f32 %f112, %f111, %f77;
cvt.rn.f16.f32 %h17, %f112;
// ---- Lane 3 (element %r8, input %h8): same sequence as lane 0 ----
mul.wide.u32 %rd16, %r9, 4;
add.s64 %rd17, %rd3, %rd16;
ld.global.nc.f32 %f113, [%rd17];
cvt.rn.f16.f32 %h18, %f113;
add.rn.f16 %h19, %h8, %h18;
cvt.f32.f16 %f114, %h19;
// t = %f119, |t| = %f120.
mul.rn.f32 %f115, %f114, %f114;
mul.rn.f32 %f116, %f115, %f114;
mul.rn.f32 %f117, %f116, 0f3D372713;
add.rn.f32 %f118, %f117, %f114;
mul.rn.f32 %f119, %f118, 0f3F4C422A;
abs.f32 %f120, %f119;
setp.lt.f32 %p16, %f120, 0f39D1B717;
// Clamp to [-9, 9] -> %f122; square -> %f123.
setp.lt.f32 %p17, %f119, 0fC1100000;
selp.f32 %f121, 0fC1100000, %f119, %p17;
setp.gt.f32 %p18, %f121, 0f41100000;
selp.f32 %f122, 0f41100000, %f121, %p18;
mul.rn.f32 %f123, %f122, %f122;
// Numerator -> %f136.
mul.rn.f32 %f124, %f123, 0f259F25C0;
sub.rn.f32 %f125, %f13, %f124;
mul.rn.f32 %f126, %f123, %f125;
add.rn.f32 %f127, %f126, 0fAEBD37FF;
mul.rn.f32 %f128, %f123, %f127;
add.rn.f32 %f129, %f128, 0f335C0041;
mul.rn.f32 %f130, %f123, %f129;
add.rn.f32 %f131, %f130, 0f3779434A;
mul.rn.f32 %f132, %f123, %f131;
add.rn.f32 %f133, %f132, 0f3A270DED;
mul.rn.f32 %f134, %f123, %f133;
add.rn.f32 %f135, %f134, 0f3BA059DC;
mul.rn.f32 %f136, %f122, %f135;
// Denominator -> %f142.
mul.rn.f32 %f137, %f123, 0f35A0D3D8;
add.rn.f32 %f138, %f137, 0f38F895D6;
mul.rn.f32 %f139, %f123, %f138;
add.rn.f32 %f140, %f139, 0f3B14AA05;
mul.rn.f32 %f141, %f123, %f140;
add.rn.f32 %f142, %f141, 0f3BA059DD;
div.full.f32 %f143, %f136, %f142;
selp.f32 %f144, %f119, %f143, %p16;
// Signed one from the sign bit of t, used when |t| >= 20.
mov.b32 %r22, %f119;
shr.u32 %r23, %r22, 31;
and.b32 %r24, %r23, 1;
setp.eq.b32 %p19, %r24, 1;
selp.f32 %f145, 0fBF800000, 0f3F800000, %p19;
setp.ltu.f32 %p20, %f120, 0f41A00000;
selp.f32 %f146, %f144, %f145, %p20;
// y = ((T + 1) * 0.5) * x -> %h20.
add.rn.f32 %f147, %f146, 0f3F800000;
mul.rn.f32 %f148, %f147, 0f3F000000;
mul.rn.f32 %f149, %f148, %f114;
cvt.rn.f16.f32 %h20, %f149;
// Write all four f16 results with a single vector store.
st.global.v4.b16 [%rd11], {%h11, %h14, %h17, %h20};
ret;
}
// .globl convert_1527
// convert_1527: element-wise f32 -> f16 conversion (cvt.rn rounding).
//   param_0: destination buffer, written as f16
//   param_1: source buffer, read as f32 through the non-coherent path
//   param_2: declared but not referenced in this body
// Each thread converts 4 consecutive elements per chunk. Its first chunk
// starts at element (ctaid.x << 9) | (tid.x << 2); later chunks sit
// 655360 elements apart (2621440 source bytes / 1310720 destination bytes).
// Chunks 0-4 are unconditional; chunks 5 and 6 are skipped once their
// element index reaches 4194304.
.visible .entry convert_1527(
.param .u64 convert_1527_param_0,
.param .u64 convert_1527_param_1,
.param .u64 convert_1527_param_2
)
.reqntid 128, 1, 1
{
.reg .pred %p<3>;
.reg .b16 %h<5>;
.reg .f32 %f<5>;
.reg .b32 %r<9>;
.reg .b64 %rd<9>;
// Global-space views of the source (%rd2) and the destination (%rd4).
ld.param.u64 %rd1, [convert_1527_param_1];
cvta.to.global.u64 %rd2, %rd1;
ld.param.u64 %rd3, [convert_1527_param_0];
cvta.to.global.u64 %rd4, %rd3;
// %r3 = tid.x * 4, %r4 = ctaid.x * 512, %r5 = first element of chunk 0.
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
shl.b32 %r3, %r1, 2;
shl.b32 %r4, %r2, 9;
or.b32 %r5, %r4, %r3;
// Per-thread byte addresses: 4 bytes per source element, 2 per destination.
mul.wide.u32 %rd5, %r5, 4;
add.s64 %rd6, %rd2, %rd5;
mul.wide.u32 %rd7, %r5, 2;
add.s64 %rd8, %rd4, %rd7;
// The %f1..%f4 / %h1..%h4 temporaries are reused by every chunk.
// Chunk 0.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6];
cvt.rn.f16.f32 %h1, %f1;
cvt.rn.f16.f32 %h2, %f2;
cvt.rn.f16.f32 %h3, %f3;
cvt.rn.f16.f32 %h4, %f4;
st.global.v4.b16 [%rd8], {%h1, %h2, %h3, %h4};
// Chunk 1.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6+2621440];
cvt.rn.f16.f32 %h1, %f1;
cvt.rn.f16.f32 %h2, %f2;
cvt.rn.f16.f32 %h3, %f3;
cvt.rn.f16.f32 %h4, %f4;
st.global.v4.b16 [%rd8+1310720], {%h1, %h2, %h3, %h4};
// Chunk 2.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6+5242880];
cvt.rn.f16.f32 %h1, %f1;
cvt.rn.f16.f32 %h2, %f2;
cvt.rn.f16.f32 %h3, %f3;
cvt.rn.f16.f32 %h4, %f4;
st.global.v4.b16 [%rd8+2621440], {%h1, %h2, %h3, %h4};
// Chunk 3.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6+7864320];
cvt.rn.f16.f32 %h1, %f1;
cvt.rn.f16.f32 %h2, %f2;
cvt.rn.f16.f32 %h3, %f3;
cvt.rn.f16.f32 %h4, %f4;
st.global.v4.b16 [%rd8+3932160], {%h1, %h2, %h3, %h4};
// Chunk 4.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6+10485760];
cvt.rn.f16.f32 %h1, %f1;
cvt.rn.f16.f32 %h2, %f2;
cvt.rn.f16.f32 %h3, %f3;
cvt.rn.f16.f32 %h4, %f4;
st.global.v4.b16 [%rd8+5242880], {%h1, %h2, %h3, %h4};
// Chunk 5: only while (first element + 3276800) is below 4194304.
add.s32 %r6, %r5, 3276800;
setp.lt.u32 %p1, %r6, 4194304;
@!%p1 bra LBB52_after_chunk5;
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6+13107200];
cvt.rn.f16.f32 %h1, %f1;
cvt.rn.f16.f32 %h2, %f2;
cvt.rn.f16.f32 %h3, %f3;
cvt.rn.f16.f32 %h4, %f4;
st.global.v4.b16 [%rd8+6553600], {%h1, %h2, %h3, %h4};
LBB52_after_chunk5:
// Chunk 6: index rebuilt as ((ctaid.x * 512) + 3932160) | (tid.x * 4),
// guarded by the same 4194304 limit.
add.s32 %r7, %r4, 3932160;
or.b32 %r8, %r7, %r3;
setp.lt.u32 %p2, %r8, 4194304;
@!%p2 bra LBB52_exit;
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd6+15728640];
cvt.rn.f16.f32 %h1, %f1;
cvt.rn.f16.f32 %h2, %f2;
cvt.rn.f16.f32 %h3, %f3;
cvt.rn.f16.f32 %h4, %f4;
st.global.v4.b16 [%rd8+7864320], {%h1, %h2, %h3, %h4};
LBB52_exit:
ret;
}
// .globl rng_get_and_update_state_5
// rng_get_and_update_state_5: copies the current 16-byte `rng_state` to the
// buffer addressed by param_0, and advances `rng_state` by 524288, treating
// it as a low 64-bit word (offset 0) plus a high 64-bit word (offset 8)
// that receives the carry.
//   param_0: destination for the pre-update state (two u64 stores)
//   param_1: declared but not referenced in this body
// All accesses are plain ld.global / st.global; nothing here is atomic.
.visible .entry rng_get_and_update_state_5(
.param .u64 rng_get_and_update_state_5_param_0,
.param .u64 rng_get_and_update_state_5_param_1
)
{
.reg .pred %p<4>;
.reg .b64 %rd<8>;
// Snapshot the state: %rd1 = high word, %rd2 = low word.
ld.global.u64 %rd1, [rng_state+8];
ld.global.u64 %rd2, [rng_state];
// Advance the low word; %p3 is set when the 64-bit add wrapped, detected
// as "sum below either addend".
add.s64 %rd3, %rd2, 524288;
setp.lt.u64 %p1, %rd3, %rd2;
setp.lt.u64 %p2, %rd3, 524288;
or.pred %p3, %p1, %p2;
selp.u64 %rd4, 1, 0, %p3;
// Propagate the carry into the high word.
add.s64 %rd5, %rd1, %rd4;
// Publish the advanced state.
st.global.u64 [rng_state], %rd3;
st.global.u64 [rng_state+8], %rd5;
// Hand the pre-update state back through param_0 (high word at +8).
ld.param.u64 %rd6, [rng_get_and_update_state_5_param_0];
cvta.to.global.u64 %rd7, %rd6;
st.global.u64 [%rd7+8], %rd1;
st.global.u64 [%rd7], %rd2;
ret;
}
// .globl fusion_2214
.visible .entry fusion_2214(
.param .u64 fusion_2214_param_0,
.param .u64 fusion_2214_param_1,
.param .u64 fusion_2214_param_2,
.param .u64 fusion_2214_param_3,
.param .u64 fusion_2214_param_4,
.param .u64 fusion_2214_param_5,
.param .u64 fusion_2214_param_6,
.param .u64 fusion_2214_param_7,
.param .u64 fusion_2214_param_8,
.param .u64 fusion_2214_param_9
)
.reqntid 64, 1, 1
{
.local .align 4 .b8 __local_depot54[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<75>;
.reg .b16 %h<145>;
.reg .f32 %f<254>;
.reg .b32 %r<350>;
.reg .b64 %rd<2739>;
mov.u64 %SPL, __local_depot54;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd463, [fusion_2214_param_0];
ld.param.u64 %rd464, [fusion_2214_param_8];
cvta.to.global.u64 %rd1, %rd464;
ld.param.u64 %rd465, [fusion_2214_param_1];
ld.param.u64 %rd466, [fusion_2214_param_7];
cvta.to.global.u64 %rd2, %rd466;
ld.param.u64 %rd467, [fusion_2214_param_2];
ld.param.u64 %rd468, [fusion_2214_param_6];
cvta.to.global.u64 %rd3, %rd468;
ld.param.u64 %rd470, [fusion_2214_param_5];
cvta.to.global.u64 %rd4, %rd470;
ld.param.u64 %rd471, [fusion_2214_param_4];
cvta.to.global.u64 %rd5, %rd471;
cvta.to.global.u64 %rd7, %rd467;
cvta.to.global.u64 %rd8, %rd465;
cvta.to.global.u64 %rd9, %rd463;
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
shl.b32 %r3, %r1, 1;
shl.b32 %r4, %r2, 10;
or.b32 %r48, %r4, %r3;
shr.u32 %r49, %r48, 2;
and.b32 %r5, %r1, 1;
setp.eq.s32 %p1, %r5, 0;
ld.global.nc.u64 %rd11, [%rd7];
cvt.u64.u32 %rd473, %r49;
add.s64 %rd12, %rd11, %rd473;
setp.lt.u64 %p69, %rd12, %rd11;
and.b64 %rd2384, %rd12, 4294967295;
@%p1 bra LBB54_1;
bra.uni LBB54_4;
LBB54_1:
mul.lo.s64 %rd2446, %rd2384, 3528531795;
ld.global.nc.u64 %rd2461, [%rd7+8];
selp.u64 %rd516, 1, 0, %p69;
add.s64 %rd517, %rd2461, %rd516;
xor.b64 %rd518, %rd517, %rd2446;
shr.u64 %rd519, %rd518, 32;
mul.lo.s64 %rd2449, %rd519, 3449720151;
shr.u64 %rd520, %rd2449, 32;
and.b64 %rd521, %rd517, 4294967295;
mul.lo.s64 %rd522, %rd521, 3449720151;
and.b64 %rd523, %rd522, 4294967295;
xor.b64 %rd524, %rd523, %rd520;
xor.b64 %rd525, %rd524, 2654435769;
mul.lo.s64 %rd2452, %rd525, 3528531795;
xor.b64 %rd2442, %rd522, %rd12;
mov.u32 %r312, -1879881855;
mov.u32 %r311, -845247145;
mov.u32 %r310, 534103459;
mov.u64 %rd2460, 3678237736;
mov.u64 %rd2459, 3041712726;
mov.u64 %rd2458, 1401181199;
mov.u64 %rd2457, 2835769497;
mov.u64 %rd2456, 1684936478;
mov.u64 %rd2455, 2027808484;
mov.u64 %rd2454, 387276957;
mov.u64 %rd2453, 842468239;
mov.u64 %rd2451, 3986602516;
mov.u64 %rd2450, 1013904242;
mov.u64 %rd2448, 3668340011;
mov.u64 %rd2447, 3144134277;
mov.u64 %rd2445, 3449720151;
mov.u64 %rd2444, 1993301258;
mov.u64 %rd2443, 3528531795;
bra.uni LBB54_5;
LBB54_4:
mov.u32 %r311, -766435501;
mov.u64 %rd2459, 1684936478;
mov.u64 %rd2458, 534103459;
mov.u64 %rd2457, 387276957;
mov.u64 %rd2456, 3041712726;
mov.u64 %rd2455, 3986602516;
mov.u64 %rd2454, 2835769497;
mov.u64 %rd2453, 3668340011;
mov.u64 %rd2451, 2027808484;
mov.u64 %rd2450, 1993301258;
mov.u64 %rd2448, 842468239;
mov.u64 %rd2447, 2654435769;
mov.u64 %rd2445, 3528531795;
mov.u64 %rd2444, 1013904242;
mov.u64 %rd2443, 3449720151;
mov.u32 %r312, -1767562579;
mov.u32 %r310, 1401181199;
mov.u64 %rd2460, 4055616968;
ld.global.nc.u64 %rd2461, [%rd7+8];
selp.u64 %rd489, 1, 0, %p69;
add.s64 %rd490, %rd2461, %rd489;
and.b64 %rd491, %rd490, 4294967295;
mul.lo.s64 %rd2446, %rd491, 3449720151;
xor.b64 %rd492, %rd2446, %rd12;
shr.u64 %rd493, %rd492, 32;
mul.lo.s64 %rd2449, %rd493, 3528531795;
shr.u64 %rd494, %rd2449, 32;
mul.lo.s64 %rd496, %rd2384, 3528531795;
and.b64 %rd497, %rd496, 4294967295;
xor.b64 %rd498, %rd497, %rd494;
xor.b64 %rd499, %rd498, 3144134277;
mul.lo.s64 %rd2452, %rd499, 3449720151;
xor.b64 %rd2442, %rd490, %rd496;
LBB54_5:
shr.u64 %rd526, %rd2452, 32;
shr.u64 %rd527, %rd2442, 32;
mul.lo.s64 %rd528, %rd527, %rd2443;
and.b64 %rd529, %rd528, 4294967295;
xor.b64 %rd530, %rd529, %rd526;
xor.b64 %rd531, %rd530, %rd2444;
mul.lo.s64 %rd532, %rd531, %rd2445;
shr.u64 %rd533, %rd532, 32;
shr.u64 %rd534, %rd528, 32;
and.b64 %rd535, %rd2446, 4294967295;
xor.b64 %rd536, %rd535, %rd534;
xor.b64 %rd537, %rd536, %rd2447;
mul.lo.s64 %rd538, %rd537, %rd2445;
and.b64 %rd539, %rd538, 4294967295;
xor.b64 %rd540, %rd539, %rd533;
xor.b64 %rd541, %rd540, %rd2448;
mul.lo.s64 %rd542, %rd541, %rd2443;
shr.u64 %rd543, %rd542, 32;
shr.u64 %rd544, %rd538, 32;
and.b64 %rd545, %rd2449, 4294967295;
xor.b64 %rd546, %rd545, %rd544;
xor.b64 %rd547, %rd546, %rd2450;
mul.lo.s64 %rd548, %rd547, %rd2443;
and.b64 %rd549, %rd548, 4294967295;
xor.b64 %rd550, %rd549, %rd543;
xor.b64 %rd551, %rd550, %rd2451;
mul.lo.s64 %rd552, %rd551, %rd2445;
shr.u64 %rd553, %rd552, 32;
shr.u64 %rd554, %rd548, 32;
and.b64 %rd555, %rd2452, 4294967295;
xor.b64 %rd556, %rd555, %rd554;
xor.b64 %rd557, %rd556, %rd2453;
mul.lo.s64 %rd558, %rd557, %rd2445;
and.b64 %rd559, %rd558, 4294967295;
xor.b64 %rd560, %rd559, %rd553;
xor.b64 %rd561, %rd560, %rd2454;
mul.lo.s64 %rd562, %rd561, %rd2443;
shr.u64 %rd563, %rd562, 32;
shr.u64 %rd564, %rd558, 32;
and.b64 %rd565, %rd532, 4294967295;
xor.b64 %rd566, %rd565, %rd564;
xor.b64 %rd567, %rd566, %rd2455;
mul.lo.s64 %rd568, %rd567, %rd2443;
and.b64 %rd569, %rd568, 4294967295;
xor.b64 %rd570, %rd569, %rd563;
xor.b64 %rd571, %rd570, %rd2456;
mul.lo.s64 %rd572, %rd571, %rd2445;
shr.u64 %rd573, %rd572, 32;
shr.u64 %rd574, %rd568, 32;
and.b64 %rd575, %rd542, 4294967295;
xor.b64 %rd576, %rd575, %rd574;
xor.b64 %rd577, %rd576, %rd2457;
mul.lo.s64 %rd578, %rd577, %rd2445;
and.b64 %rd579, %rd578, 4294967295;
xor.b64 %rd580, %rd579, %rd573;
xor.b64 %rd581, %rd580, %rd2458;
mul.lo.s64 %rd582, %rd581, %rd2443;
shr.u64 %rd583, %rd582, 32;
shr.u64 %rd584, %rd578, 32;
and.b64 %rd585, %rd552, 4294967295;
xor.b64 %rd586, %rd585, %rd584;
xor.b64 %rd587, %rd586, %rd2459;
mul.lo.s64 %rd588, %rd587, %rd2443;
and.b64 %rd589, %rd588, 4294967295;
xor.b64 %rd590, %rd589, %rd583;
xor.b64 %rd591, %rd590, %rd2460;
mul.lo.s64 %rd592, %rd591, %rd2445;
shr.u64 %rd593, %rd592, 32;
cvt.u32.u64 %r56, %rd593;
shr.u64 %rd594, %rd588, 32;
xor.b64 %rd595, %rd594, %rd562;
cvt.u32.u64 %r57, %rd595;
xor.b32 %r58, %r310, %r57;
mul.lo.s32 %r59, %r58, %r311;
xor.b32 %r60, %r59, %r56;
xor.b32 %r61, %r60, %r312;
shr.u32 %r62, %r61, 9;
cvt.rn.f32.u32 %f19, %r62;
mul.rn.f32 %f20, %f19, 0f34000000;
cvt.rn.f16.f32 %h1, %f20;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p4, %h1, %h2;
mul.wide.u32 %rd596, %r2, 2048;
add.s64 %rd597, %rd9, %rd596;
mul.wide.u32 %rd598, %r3, 2;
add.s64 %rd44, %rd597, %rd598;
ld.global.nc.b16 %h3, [%rd44];
mul.wide.u32 %rd599, %r3, 4;
add.s64 %rd45, %rd1, %rd599;
ld.global.nc.f32 %f21, [%rd45];
cvt.rn.f16.f32 %h4, %f21;
add.rn.f16 %h5, %h3, %h4;
mov.b16 %h6, 0x3C72;
mul.rn.f16 %h7, %h5, %h6;
selp.b16 %h8, %h7, 0x0000, %p4;
cvt.f32.f16 %f22, %h8;
add.s64 %rd600, %rd8, %rd596;
add.s64 %rd46, %rd600, %rd598;
ld.global.nc.b16 %h9, [%rd46];
cvt.f32.f16 %f23, %h9;
mul.wide.u32 %rd601, %r2, 4;
add.s64 %rd602, %rd5, %rd601;
ld.global.nc.f32 %f24, [%rd602];
mul.rn.f32 %f25, %f24, 0f3A800000;
add.rn.f32 %f26, %f25, 0f2B8CBCCC;
rsqrt.approx.f32 %f1, %f26;
add.s64 %rd47, %rd2, %rd599;
ld.global.nc.f32 %f27, [%rd47];
mul.rn.f32 %f28, %f1, %f27;
mul.rn.f32 %f29, %f28, %f23;
add.s64 %rd48, %rd3, %rd599;
ld.global.nc.f32 %f30, [%rd48];
add.s64 %rd603, %rd4, %rd601;
ld.global.nc.f32 %f31, [%rd603];
mul.rn.f32 %f2, %f31, 0f3A800000;
mul.rn.f32 %f32, %f28, %f2;
sub.rn.f32 %f33, %f30, %f32;
add.rn.f32 %f34, %f29, %f33;
add.rn.f32 %f35, %f34, %f22;
add.rn.f32 %f3, %f35, 0f00000000;
or.b32 %r63, %r3, 1;
and.b32 %r64, %r63, 3;
setp.ne.s32 %p5, %r64, 1;
@%p5 bra LBB54_7;
mul.lo.s64 %rd2466, %rd2384, 3528531795;
selp.u64 %rd644, 1, 0, %p69;
add.s64 %rd645, %rd2461, %rd644;
xor.b64 %rd646, %rd645, %rd2466;
shr.u64 %rd647, %rd646, 32;
mul.lo.s64 %rd2469, %rd647, 3449720151;
shr.u64 %rd648, %rd2469, 32;
and.b64 %rd649, %rd645, 4294967295;
mul.lo.s64 %rd650, %rd649, 3449720151;
and.b64 %rd651, %rd650, 4294967295;
xor.b64 %rd652, %rd651, %rd648;
xor.b64 %rd653, %rd652, 2654435769;
mul.lo.s64 %rd2472, %rd653, 3528531795;
xor.b64 %rd2462, %rd650, %rd12;
mov.u32 %r314, -845247145;
mov.u32 %r313, -616729560;
mov.u64 %rd2479, 3041712726;
mov.u64 %rd2478, 1401181199;
mov.u64 %rd2477, 2835769497;
mov.u64 %rd2476, 1684936478;
mov.u64 %rd2475, 2027808484;
mov.u64 %rd2474, 387276957;
mov.u64 %rd2473, 842468239;
mov.u64 %rd2471, 3986602516;
mov.u64 %rd2470, 1013904242;
mov.u64 %rd2468, 3668340011;
mov.u64 %rd2467, 3144134277;
mov.u64 %rd2465, 3449720151;
mov.u64 %rd2464, 1993301258;
mov.u64 %rd2463, 3528531795;
bra.uni LBB54_8;
LBB54_7:
mov.u32 %r313, -239350328;
selp.u64 %rd618, 1, 0, %p69;
add.s64 %rd619, %rd2461, %rd618;
and.b64 %rd620, %rd619, 4294967295;
mul.lo.s64 %rd2466, %rd620, 3449720151;
xor.b64 %rd621, %rd2466, %rd12;
shr.u64 %rd622, %rd621, 32;
mul.lo.s64 %rd2469, %rd622, 3528531795;
shr.u64 %rd623, %rd2469, 32;
mul.lo.s64 %rd625, %rd2384, 3528531795;
and.b64 %rd626, %rd625, 4294967295;
xor.b64 %rd627, %rd626, %rd623;
xor.b64 %rd628, %rd627, 3144134277;
mul.lo.s64 %rd2472, %rd628, 3449720151;
xor.b64 %rd2462, %rd619, %rd625;
mov.u32 %r314, -766435501;
mov.u64 %rd2479, 1684936478;
mov.u64 %rd2478, 534103459;
mov.u64 %rd2477, 387276957;
mov.u64 %rd2476, 3041712726;
mov.u64 %rd2475, 3986602516;
mov.u64 %rd2474, 2835769497;
mov.u64 %rd2473, 3668340011;
mov.u64 %rd2471, 2027808484;
mov.u64 %rd2470, 1993301258;
mov.u64 %rd2468, 842468239;
mov.u64 %rd2467, 2654435769;
mov.u64 %rd2465, 3528531795;
mov.u64 %rd2464, 1013904242;
mov.u64 %rd2463, 3449720151;
LBB54_8:
setp.ne.s32 %p8, %r5, 0;
shr.u64 %rd654, %rd2472, 32;
shr.u64 %rd655, %rd2462, 32;
mul.lo.s64 %rd656, %rd655, %rd2463;
and.b64 %rd657, %rd656, 4294967295;
xor.b64 %rd658, %rd657, %rd654;
xor.b64 %rd659, %rd658, %rd2464;
mul.lo.s64 %rd660, %rd659, %rd2465;
shr.u64 %rd661, %rd660, 32;
shr.u64 %rd662, %rd656, 32;
and.b64 %rd663, %rd2466, 4294967295;
xor.b64 %rd664, %rd663, %rd662;
xor.b64 %rd665, %rd664, %rd2467;
mul.lo.s64 %rd666, %rd665, %rd2465;
and.b64 %rd667, %rd666, 4294967295;
xor.b64 %rd668, %rd667, %rd661;
xor.b64 %rd669, %rd668, %rd2468;
mul.lo.s64 %rd670, %rd669, %rd2463;
shr.u64 %rd671, %rd670, 32;
shr.u64 %rd672, %rd666, 32;
and.b64 %rd673, %rd2469, 4294967295;
xor.b64 %rd674, %rd673, %rd672;
xor.b64 %rd675, %rd674, %rd2470;
mul.lo.s64 %rd676, %rd675, %rd2463;
and.b64 %rd677, %rd676, 4294967295;
xor.b64 %rd678, %rd677, %rd671;
xor.b64 %rd679, %rd678, %rd2471;
mul.lo.s64 %rd680, %rd679, %rd2465;
shr.u64 %rd681, %rd680, 32;
shr.u64 %rd682, %rd676, 32;
and.b64 %rd683, %rd2472, 4294967295;
xor.b64 %rd684, %rd683, %rd682;
xor.b64 %rd685, %rd684, %rd2473;
mul.lo.s64 %rd686, %rd685, %rd2465;
and.b64 %rd687, %rd686, 4294967295;
xor.b64 %rd688, %rd687, %rd681;
xor.b64 %rd689, %rd688, %rd2474;
mul.lo.s64 %rd690, %rd689, %rd2463;
shr.u64 %rd691, %rd690, 32;
shr.u64 %rd692, %rd686, 32;
and.b64 %rd693, %rd660, 4294967295;
xor.b64 %rd694, %rd693, %rd692;
xor.b64 %rd695, %rd694, %rd2475;
mul.lo.s64 %rd696, %rd695, %rd2463;
and.b64 %rd697, %rd696, 4294967295;
xor.b64 %rd698, %rd697, %rd691;
xor.b64 %rd699, %rd698, %rd2476;
mul.lo.s64 %rd700, %rd699, %rd2465;
shr.u64 %rd701, %rd700, 32;
shr.u64 %rd702, %rd696, 32;
and.b64 %rd703, %rd670, 4294967295;
xor.b64 %rd704, %rd703, %rd702;
xor.b64 %rd705, %rd704, %rd2477;
mul.lo.s64 %rd706, %rd705, %rd2465;
and.b64 %rd707, %rd706, 4294967295;
xor.b64 %rd708, %rd707, %rd701;
xor.b64 %rd709, %rd708, %rd2478;
mul.lo.s64 %rd710, %rd709, %rd2463;
shr.u64 %rd711, %rd710, 32;
shr.u64 %rd712, %rd706, 32;
xor.b64 %rd713, %rd680, %rd712;
xor.b64 %rd714, %rd713, %rd2479;
mul.lo.s64 %rd715, %rd714, %rd2463;
xor.b64 %rd716, %rd711, %rd715;
cvt.u32.u64 %r69, %rd716;
xor.b32 %r70, %r313, %r69;
mul.lo.s32 %r71, %r70, %r314;
shr.u32 %r72, %r71, 9;
cvt.rn.f32.u32 %f36, %r72;
mul.rn.f32 %f37, %f36, 0f34000000;
cvt.rn.f16.f32 %h10, %f37;
mov.b16 %h11, 0x2E66;
setp.ge.f16 %p9, %h10, %h11;
ld.global.nc.b16 %h12, [%rd44+2];
ld.global.nc.f32 %f38, [%rd45+4];
cvt.rn.f16.f32 %h13, %f38;
add.rn.f16 %h14, %h12, %h13;
mov.b16 %h15, 0x3C72;
mul.rn.f16 %h16, %h14, %h15;
selp.b16 %h17, %h16, 0x0000, %p9;
cvt.f32.f16 %f39, %h17;
ld.global.nc.b16 %h18, [%rd46+2];
cvt.f32.f16 %f40, %h18;
ld.global.nc.f32 %f41, [%rd47+4];
mul.rn.f32 %f42, %f1, %f41;
mul.rn.f32 %f43, %f42, %f40;
ld.global.nc.f32 %f44, [%rd48+4];
mul.rn.f32 %f45, %f2, %f42;
sub.rn.f32 %f46, %f44, %f45;
add.rn.f32 %f47, %f43, %f46;
add.rn.f32 %f48, %f47, %f39;
add.rn.f32 %f4, %f3, %f48;
or.b32 %r73, %r3, %r4;
or.b32 %r74, %r73, 128;
shr.u32 %r75, %r74, 2;
cvt.u64.u32 %rd717, %r75;
add.s64 %rd75, %rd11, %rd717;
and.b64 %rd2433, %rd75, 4294967295;
setp.lt.u64 %p74, %rd75, %rd11;
@%p8 bra LBB54_10;
mul.lo.s64 %rd2484, %rd2433, 3528531795;
selp.u64 %rd760, 1, 0, %p74;
add.s64 %rd761, %rd2461, %rd760;
xor.b64 %rd762, %rd761, %rd2484;
shr.u64 %rd763, %rd762, 32;
mul.lo.s64 %rd2487, %rd763, 3449720151;
shr.u64 %rd764, %rd2487, 32;
and.b64 %rd765, %rd761, 4294967295;
mul.lo.s64 %rd766, %rd765, 3449720151;
and.b64 %rd767, %rd766, 4294967295;
xor.b64 %rd768, %rd767, %rd764;
xor.b64 %rd769, %rd768, 2654435769;
mul.lo.s64 %rd2490, %rd769, 3528531795;
xor.b64 %rd2480, %rd766, %rd75;
mov.u32 %r317, -1879881855;
mov.u32 %r316, -845247145;
mov.u32 %r315, 534103459;
mov.u64 %rd2498, 3678237736;
mov.u64 %rd2497, 3041712726;
mov.u64 %rd2496, 1401181199;
mov.u64 %rd2495, 2835769497;
mov.u64 %rd2494, 1684936478;
mov.u64 %rd2493, 2027808484;
mov.u64 %rd2492, 387276957;
mov.u64 %rd2491, 842468239;
mov.u64 %rd2489, 3986602516;
mov.u64 %rd2488, 1013904242;
mov.u64 %rd2486, 3668340011;
mov.u64 %rd2485, 3144134277;
mov.u64 %rd2483, 3449720151;
mov.u64 %rd2482, 1993301258;
mov.u64 %rd2481, 3528531795;
bra.uni LBB54_11;
LBB54_10:
selp.u64 %rd733, 1, 0, %p74;
add.s64 %rd734, %rd2461, %rd733;
and.b64 %rd735, %rd734, 4294967295;
mul.lo.s64 %rd2484, %rd735, 3449720151;
xor.b64 %rd736, %rd2484, %rd75;
shr.u64 %rd737, %rd736, 32;
mul.lo.s64 %rd2487, %rd737, 3528531795;
shr.u64 %rd738, %rd2487, 32;
mul.lo.s64 %rd740, %rd2433, 3528531795;
and.b64 %rd741, %rd740, 4294967295;
xor.b64 %rd742, %rd741, %rd738;
xor.b64 %rd743, %rd742, 3144134277;
mul.lo.s64 %rd2490, %rd743, 3449720151;
xor.b64 %rd2480, %rd734, %rd740;
mov.u32 %r317, -1767562579;
mov.u32 %r316, -766435501;
mov.u32 %r315, 1401181199;
mov.u64 %rd2498, 4055616968;
mov.u64 %rd2497, 1684936478;
mov.u64 %rd2496, 534103459;
mov.u64 %rd2495, 387276957;
mov.u64 %rd2494, 3041712726;
mov.u64 %rd2493, 3986602516;
mov.u64 %rd2492, 2835769497;
mov.u64 %rd2491, 3668340011;
mov.u64 %rd2489, 2027808484;
mov.u64 %rd2488, 1993301258;
mov.u64 %rd2486, 842468239;
mov.u64 %rd2485, 2654435769;
mov.u64 %rd2483, 3528531795;
mov.u64 %rd2482, 1013904242;
mov.u64 %rd2481, 3449720151;
LBB54_11:
shr.u64 %rd770, %rd2490, 32;
shr.u64 %rd771, %rd2480, 32;
mul.lo.s64 %rd772, %rd771, %rd2481;
and.b64 %rd773, %rd772, 4294967295;
xor.b64 %rd774, %rd773, %rd770;
xor.b64 %rd775, %rd774, %rd2482;
mul.lo.s64 %rd776, %rd775, %rd2483;
shr.u64 %rd777, %rd776, 32;
shr.u64 %rd778, %rd772, 32;
and.b64 %rd779, %rd2484, 4294967295;
xor.b64 %rd780, %rd779, %rd778;
xor.b64 %rd781, %rd780, %rd2485;
mul.lo.s64 %rd782, %rd781, %rd2483;
and.b64 %rd783, %rd782, 4294967295;
xor.b64 %rd784, %rd783, %rd777;
xor.b64 %rd785, %rd784, %rd2486;
mul.lo.s64 %rd786, %rd785, %rd2481;
shr.u64 %rd787, %rd786, 32;
shr.u64 %rd788, %rd782, 32;
and.b64 %rd789, %rd2487, 4294967295;
xor.b64 %rd790, %rd789, %rd788;
xor.b64 %rd791, %rd790, %rd2488;
mul.lo.s64 %rd792, %rd791, %rd2481;
and.b64 %rd793, %rd792, 4294967295;
xor.b64 %rd794, %rd793, %rd787;
xor.b64 %rd795, %rd794, %rd2489;
mul.lo.s64 %rd796, %rd795, %rd2483;
shr.u64 %rd797, %rd796, 32;
shr.u64 %rd798, %rd792, 32;
and.b64 %rd799, %rd2490, 4294967295;
xor.b64 %rd800, %rd799, %rd798;
xor.b64 %rd801, %rd800, %rd2491;
mul.lo.s64 %rd802, %rd801, %rd2483;
and.b64 %rd803, %rd802, 4294967295;
xor.b64 %rd804, %rd803, %rd797;
xor.b64 %rd805, %rd804, %rd2492;
mul.lo.s64 %rd806, %rd805, %rd2481;
shr.u64 %rd807, %rd806, 32;
shr.u64 %rd808, %rd802, 32;
and.b64 %rd809, %rd776, 4294967295;
xor.b64 %rd810, %rd809, %rd808;
xor.b64 %rd811, %rd810, %rd2493;
mul.lo.s64 %rd812, %rd811, %rd2481;
and.b64 %rd813, %rd812, 4294967295;
xor.b64 %rd814, %rd813, %rd807;
xor.b64 %rd815, %rd814, %rd2494;
mul.lo.s64 %rd816, %rd815, %rd2483;
shr.u64 %rd817, %rd816, 32;
shr.u64 %rd818, %rd812, 32;
and.b64 %rd819, %rd786, 4294967295;
xor.b64 %rd820, %rd819, %rd818;
xor.b64 %rd821, %rd820, %rd2495;
mul.lo.s64 %rd822, %rd821, %rd2483;
and.b64 %rd823, %rd822, 4294967295;
xor.b64 %rd824, %rd823, %rd817;
xor.b64 %rd825, %rd824, %rd2496;
mul.lo.s64 %rd826, %rd825, %rd2481;
shr.u64 %rd827, %rd826, 32;
shr.u64 %rd828, %rd822, 32;
and.b64 %rd829, %rd796, 4294967295;
xor.b64 %rd830, %rd829, %rd828;
xor.b64 %rd831, %rd830, %rd2497;
mul.lo.s64 %rd832, %rd831, %rd2481;
and.b64 %rd833, %rd832, 4294967295;
xor.b64 %rd834, %rd833, %rd827;
xor.b64 %rd835, %rd834, %rd2498;
mul.lo.s64 %rd836, %rd835, %rd2483;
shr.u64 %rd837, %rd836, 32;
cvt.u32.u64 %r82, %rd837;
shr.u64 %rd838, %rd832, 32;
xor.b64 %rd839, %rd838, %rd806;
cvt.u32.u64 %r83, %rd839;
xor.b32 %r84, %r315, %r83;
mul.lo.s32 %r85, %r84, %r316;
xor.b32 %r86, %r85, %r82;
xor.b32 %r87, %r86, %r317;
shr.u32 %r88, %r87, 9;
cvt.rn.f32.u32 %f49, %r88;
mul.rn.f32 %f50, %f49, 0f34000000;
cvt.rn.f16.f32 %h19, %f50;
mov.b16 %h20, 0x2E66;
setp.ge.f16 %p12, %h19, %h20;
ld.global.nc.b16 %h21, [%rd44+256];
ld.global.nc.f32 %f51, [%rd45+512];
cvt.rn.f16.f32 %h22, %f51;
add.rn.f16 %h23, %h21, %h22;
mov.b16 %h24, 0x3C72;
mul.rn.f16 %h25, %h23, %h24;
selp.b16 %h26, %h25, 0x0000, %p12;
cvt.f32.f16 %f52, %h26;
ld.global.nc.b16 %h27, [%rd46+256];
cvt.f32.f16 %f53, %h27;
ld.global.nc.f32 %f54, [%rd47+512];
mul.rn.f32 %f55, %f1, %f54;
mul.rn.f32 %f56, %f55, %f53;
ld.global.nc.f32 %f57, [%rd48+512];
mul.rn.f32 %f58, %f2, %f55;
sub.rn.f32 %f59, %f57, %f58;
add.rn.f32 %f60, %f56, %f59;
add.rn.f32 %f61, %f60, %f52;
add.rn.f32 %f5, %f4, %f61;
or.b32 %r89, %r3, 129;
or.b32 %r90, %r89, %r4;
and.b32 %r91, %r89, 3;
shr.u32 %r92, %r90, 2;
setp.ne.s32 %p13, %r91, 1;
cvt.u64.u32 %rd840, %r92;
add.s64 %rd103, %rd11, %rd840;
and.b64 %rd2430, %rd103, 4294967295;
setp.lt.u64 %p73, %rd103, %rd11;
@%p13 bra LBB54_13;
mul.lo.s64 %rd2503, %rd2430, 3528531795;
selp.u64 %rd881, 1, 0, %p73;
add.s64 %rd882, %rd2461, %rd881;
xor.b64 %rd883, %rd882, %rd2503;
shr.u64 %rd884, %rd883, 32;
mul.lo.s64 %rd2506, %rd884, 3449720151;
shr.u64 %rd885, %rd2506, 32;
and.b64 %rd886, %rd882, 4294967295;
mul.lo.s64 %rd887, %rd886, 3449720151;
and.b64 %rd888, %rd887, 4294967295;
xor.b64 %rd889, %rd888, %rd885;
xor.b64 %rd890, %rd889, 2654435769;
mul.lo.s64 %rd2509, %rd890, 3528531795;
xor.b64 %rd2499, %rd887, %rd103;
mov.u32 %r319, -845247145;
mov.u32 %r318, -616729560;
mov.u64 %rd2516, 3041712726;
mov.u64 %rd2515, 1401181199;
mov.u64 %rd2514, 2835769497;
mov.u64 %rd2513, 1684936478;
mov.u64 %rd2512, 2027808484;
mov.u64 %rd2511, 387276957;
mov.u64 %rd2510, 842468239;
mov.u64 %rd2508, 3986602516;
mov.u64 %rd2507, 1013904242;
mov.u64 %rd2505, 3668340011;
mov.u64 %rd2504, 3144134277;
mov.u64 %rd2502, 3449720151;
mov.u64 %rd2501, 1993301258;
mov.u64 %rd2500, 3528531795;
bra.uni LBB54_14;
LBB54_13:
selp.u64 %rd855, 1, 0, %p73;
add.s64 %rd856, %rd2461, %rd855;
and.b64 %rd857, %rd856, 4294967295;
mul.lo.s64 %rd2503, %rd857, 3449720151;
xor.b64 %rd858, %rd2503, %rd103;
shr.u64 %rd859, %rd858, 32;
mul.lo.s64 %rd2506, %rd859, 3528531795;
shr.u64 %rd860, %rd2506, 32;
mul.lo.s64 %rd862, %rd2430, 3528531795;
and.b64 %rd863, %rd862, 4294967295;
xor.b64 %rd864, %rd863, %rd860;
xor.b64 %rd865, %rd864, 3144134277;
mul.lo.s64 %rd2509, %rd865, 3449720151;
xor.b64 %rd2499, %rd856, %rd862;
mov.u32 %r319, -766435501;
mov.u32 %r318, -239350328;
mov.u64 %rd2516, 1684936478;
mov.u64 %rd2515, 534103459;
mov.u64 %rd2514, 387276957;
mov.u64 %rd2513, 3041712726;
mov.u64 %rd2512, 3986602516;
mov.u64 %rd2511, 2835769497;
mov.u64 %rd2510, 3668340011;
mov.u64 %rd2508, 2027808484;
mov.u64 %rd2507, 1993301258;
mov.u64 %rd2505, 842468239;
mov.u64 %rd2504, 2654435769;
mov.u64 %rd2502, 3528531795;
mov.u64 %rd2501, 1013904242;
mov.u64 %rd2500, 3449720151;
LBB54_14:
shr.u64 %rd891, %rd2509, 32;
shr.u64 %rd892, %rd2499, 32;
mul.lo.s64 %rd893, %rd892, %rd2500;
and.b64 %rd894, %rd893, 4294967295;
xor.b64 %rd895, %rd894, %rd891;
xor.b64 %rd896, %rd895, %rd2501;
mul.lo.s64 %rd897, %rd896, %rd2502;
shr.u64 %rd898, %rd897, 32;
shr.u64 %rd899, %rd893, 32;
and.b64 %rd900, %rd2503, 4294967295;
xor.b64 %rd901, %rd900, %rd899;
xor.b64 %rd902, %rd901, %rd2504;
mul.lo.s64 %rd903, %rd902, %rd2502;
and.b64 %rd904, %rd903, 4294967295;
xor.b64 %rd905, %rd904, %rd898;
xor.b64 %rd906, %rd905, %rd2505;
mul.lo.s64 %rd907, %rd906, %rd2500;
shr.u64 %rd908, %rd907, 32;
shr.u64 %rd909, %rd903, 32;
and.b64 %rd910, %rd2506, 4294967295;
xor.b64 %rd911, %rd910, %rd909;
xor.b64 %rd912, %rd911, %rd2507;
mul.lo.s64 %rd913, %rd912, %rd2500;
and.b64 %rd914, %rd913, 4294967295;
xor.b64 %rd915, %rd914, %rd908;
xor.b64 %rd916, %rd915, %rd2508;
mul.lo.s64 %rd917, %rd916, %rd2502;
shr.u64 %rd918, %rd917, 32;
shr.u64 %rd919, %rd913, 32;
and.b64 %rd920, %rd2509, 4294967295;
xor.b64 %rd921, %rd920, %rd919;
xor.b64 %rd922, %rd921, %rd2510;
mul.lo.s64 %rd923, %rd922, %rd2502;
and.b64 %rd924, %rd923, 4294967295;
xor.b64 %rd925, %rd924, %rd918;
xor.b64 %rd926, %rd925, %rd2511;
mul.lo.s64 %rd927, %rd926, %rd2500;
shr.u64 %rd928, %rd927, 32;
shr.u64 %rd929, %rd923, 32;
and.b64 %rd930, %rd897, 4294967295;
xor.b64 %rd931, %rd930, %rd929;
xor.b64 %rd932, %rd931, %rd2512;
mul.lo.s64 %rd933, %rd932, %rd2500;
and.b64 %rd934, %rd933, 4294967295;
xor.b64 %rd935, %rd934, %rd928;
xor.b64 %rd936, %rd935, %rd2513;
mul.lo.s64 %rd937, %rd936, %rd2502;
shr.u64 %rd938, %rd937, 32;
shr.u64 %rd939, %rd933, 32;
and.b64 %rd940, %rd907, 4294967295;
xor.b64 %rd941, %rd940, %rd939;
xor.b64 %rd942, %rd941, %rd2514;
mul.lo.s64 %rd943, %rd942, %rd2502;
and.b64 %rd944, %rd943, 4294967295;
xor.b64 %rd945, %rd944, %rd938;
xor.b64 %rd946, %rd945, %rd2515;
mul.lo.s64 %rd947, %rd946, %rd2500;
shr.u64 %rd948, %rd947, 32;
shr.u64 %rd949, %rd943, 32;
xor.b64 %rd950, %rd917, %rd949;
xor.b64 %rd951, %rd950, %rd2516;
mul.lo.s64 %rd952, %rd951, %rd2500;
xor.b64 %rd953, %rd948, %rd952;
cvt.u32.u64 %r97, %rd953;
xor.b32 %r98, %r318, %r97;
mul.lo.s32 %r99, %r98, %r319;
shr.u32 %r100, %r99, 9;
cvt.rn.f32.u32 %f62, %r100;
mul.rn.f32 %f63, %f62, 0f34000000;
cvt.rn.f16.f32 %h28, %f63;
mov.b16 %h29, 0x2E66;
setp.ge.f16 %p17, %h28, %h29;
ld.global.nc.b16 %h30, [%rd44+258];
ld.global.nc.f32 %f64, [%rd45+516];
cvt.rn.f16.f32 %h31, %f64;
add.rn.f16 %h32, %h30, %h31;
mov.b16 %h33, 0x3C72;
mul.rn.f16 %h34, %h32, %h33;
selp.b16 %h35, %h34, 0x0000, %p17;
cvt.f32.f16 %f65, %h35;
ld.global.nc.b16 %h36, [%rd46+258];
cvt.f32.f16 %f66, %h36;
ld.global.nc.f32 %f67, [%rd47+516];
mul.rn.f32 %f68, %f1, %f67;
mul.rn.f32 %f69, %f68, %f66;
ld.global.nc.f32 %f70, [%rd48+516];
mul.rn.f32 %f71, %f2, %f68;
sub.rn.f32 %f72, %f70, %f71;
add.rn.f32 %f73, %f69, %f72;
add.rn.f32 %f74, %f73, %f65;
add.rn.f32 %f6, %f5, %f74;
or.b32 %r102, %r73, 256;
shr.u32 %r103, %r102, 2;
cvt.u64.u32 %rd954, %r103;
add.s64 %rd130, %rd11, %rd954;
and.b64 %rd2426, %rd130, 4294967295;
setp.lt.u64 %p72, %rd130, %rd11;
@%p8 bra LBB54_16;
// Fall-through arm: runs only when %p8 is false (the "@%p8 bra LBB54_16"
// just above skips it otherwise).
// Builds the four 64-bit seed values %rd2517, %rd2521, %rd2524 and %rd2527
// from the counter %rd130 (low 32 bits in %rd2426), the wrap flag %p72 of the
// counter add, and %rd2461; then loads the constants read at LBB54_17.
// This arm puts 3528531795 in %rd2518 and 3449720151 in %rd2520; the
// LBB54_16 arm assigns the same two values the other way round.
// NOTE(review): 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57 and
// 2654435769 = 0x9E3779B9 equal the published Philox4x32 multipliers / key
// increment -- presumably an inlined Philox generator; confirm.
mul.lo.s64 %rd2521, %rd2426, 3528531795;
// %rd998 = %rd2461 + (1 if the counter add wrapped, else 0).
selp.u64 %rd997, 1, 0, %p72;
add.s64 %rd998, %rd2461, %rd997;
xor.b64 %rd999, %rd998, %rd2521;
shr.u64 %rd1000, %rd999, 32;
mul.lo.s64 %rd2524, %rd1000, 3449720151;
shr.u64 %rd1001, %rd2524, 32;
and.b64 %rd1002, %rd998, 4294967295;
mul.lo.s64 %rd1003, %rd1002, 3449720151;
and.b64 %rd1004, %rd1003, 4294967295;
xor.b64 %rd1005, %rd1004, %rd1001;
xor.b64 %rd1006, %rd1005, 2654435769;
mul.lo.s64 %rd2527, %rd1006, 3528531795;
xor.b64 %rd2517, %rd1003, %rd130;
// Constants for this arm; every register below is read at LBB54_17.
mov.u32 %r322, -1879881855;
mov.u32 %r321, -845247145;
mov.u32 %r320, 534103459;
mov.u64 %rd2535, 3678237736;
mov.u64 %rd2534, 3041712726;
mov.u64 %rd2533, 1401181199;
mov.u64 %rd2532, 2835769497;
mov.u64 %rd2531, 1684936478;
mov.u64 %rd2530, 2027808484;
mov.u64 %rd2529, 387276957;
mov.u64 %rd2528, 842468239;
mov.u64 %rd2526, 3986602516;
mov.u64 %rd2525, 1013904242;
mov.u64 %rd2523, 3668340011;
mov.u64 %rd2522, 3144134277;
mov.u64 %rd2520, 3449720151;
mov.u64 %rd2519, 1993301258;
mov.u64 %rd2518, 3528531795;
bra.uni LBB54_17;
// Arm taken when %p8 is true. Produces the same outputs as the fall-through
// arm before this label (%rd2517, %rd2521, %rd2524, %rd2527 plus the constant
// registers read at LBB54_17), but with the roles of the counter %rd130 /
// %rd2426 and of %rd2461 exchanged, and with 3449720151 in %rd2518 and
// 3528531795 in %rd2520.
// NOTE(review): 3144134277 = 0xBB67AE85 equals the second Philox4x32 key
// increment -- presumably the other word order of the same generator; confirm.
LBB54_16:
// %rd971 = %rd2461 + (1 if the counter add wrapped, flag %p72, else 0).
selp.u64 %rd970, 1, 0, %p72;
add.s64 %rd971, %rd2461, %rd970;
and.b64 %rd972, %rd971, 4294967295;
mul.lo.s64 %rd2521, %rd972, 3449720151;
xor.b64 %rd973, %rd2521, %rd130;
shr.u64 %rd974, %rd973, 32;
mul.lo.s64 %rd2524, %rd974, 3528531795;
shr.u64 %rd975, %rd2524, 32;
mul.lo.s64 %rd977, %rd2426, 3528531795;
and.b64 %rd978, %rd977, 4294967295;
xor.b64 %rd979, %rd978, %rd975;
xor.b64 %rd980, %rd979, 3144134277;
mul.lo.s64 %rd2527, %rd980, 3449720151;
xor.b64 %rd2517, %rd971, %rd977;
// Constants for this arm; every register below is read at LBB54_17.
mov.u32 %r322, -1767562579;
mov.u32 %r321, -766435501;
mov.u32 %r320, 1401181199;
mov.u64 %rd2535, 4055616968;
mov.u64 %rd2534, 1684936478;
mov.u64 %rd2533, 534103459;
mov.u64 %rd2532, 387276957;
mov.u64 %rd2531, 3041712726;
mov.u64 %rd2530, 3986602516;
mov.u64 %rd2529, 2835769497;
mov.u64 %rd2528, 3668340011;
mov.u64 %rd2526, 2027808484;
mov.u64 %rd2525, 1993301258;
mov.u64 %rd2523, 842468239;
mov.u64 %rd2522, 2654435769;
mov.u64 %rd2520, 3528531795;
mov.u64 %rd2519, 1013904242;
mov.u64 %rd2518, 3449720151;
// Join point of the two arms that set %rd2517..%rd2535 and %r320..%r322.
// 1) Straight-line chain of multiply/xor steps: each step multiplies by
//    %rd2518 or %rd2520, splits 64-bit products into high (shr 32) and low
//    (and 0xFFFFFFFF) halves, and xors halves with one of the constants.
// 2) The result is reduced to the 32-bit word %r115 and turned into a float
//    in [0, 1).
// 3) That value gates one fp16 term; the element's contribution is added to
//    the running sum (%f6 -> %f7).
// 4) The counter and branch predicate for the next element are prepared.
LBB54_17:
shr.u64 %rd1007, %rd2527, 32;
shr.u64 %rd1008, %rd2517, 32;
mul.lo.s64 %rd1009, %rd1008, %rd2518;
and.b64 %rd1010, %rd1009, 4294967295;
xor.b64 %rd1011, %rd1010, %rd1007;
xor.b64 %rd1012, %rd1011, %rd2519;
mul.lo.s64 %rd1013, %rd1012, %rd2520;
shr.u64 %rd1014, %rd1013, 32;
shr.u64 %rd1015, %rd1009, 32;
and.b64 %rd1016, %rd2521, 4294967295;
xor.b64 %rd1017, %rd1016, %rd1015;
xor.b64 %rd1018, %rd1017, %rd2522;
mul.lo.s64 %rd1019, %rd1018, %rd2520;
and.b64 %rd1020, %rd1019, 4294967295;
xor.b64 %rd1021, %rd1020, %rd1014;
xor.b64 %rd1022, %rd1021, %rd2523;
mul.lo.s64 %rd1023, %rd1022, %rd2518;
shr.u64 %rd1024, %rd1023, 32;
shr.u64 %rd1025, %rd1019, 32;
and.b64 %rd1026, %rd2524, 4294967295;
xor.b64 %rd1027, %rd1026, %rd1025;
xor.b64 %rd1028, %rd1027, %rd2525;
mul.lo.s64 %rd1029, %rd1028, %rd2518;
and.b64 %rd1030, %rd1029, 4294967295;
xor.b64 %rd1031, %rd1030, %rd1024;
xor.b64 %rd1032, %rd1031, %rd2526;
mul.lo.s64 %rd1033, %rd1032, %rd2520;
shr.u64 %rd1034, %rd1033, 32;
shr.u64 %rd1035, %rd1029, 32;
and.b64 %rd1036, %rd2527, 4294967295;
xor.b64 %rd1037, %rd1036, %rd1035;
xor.b64 %rd1038, %rd1037, %rd2528;
mul.lo.s64 %rd1039, %rd1038, %rd2520;
and.b64 %rd1040, %rd1039, 4294967295;
xor.b64 %rd1041, %rd1040, %rd1034;
xor.b64 %rd1042, %rd1041, %rd2529;
mul.lo.s64 %rd1043, %rd1042, %rd2518;
shr.u64 %rd1044, %rd1043, 32;
shr.u64 %rd1045, %rd1039, 32;
and.b64 %rd1046, %rd1013, 4294967295;
xor.b64 %rd1047, %rd1046, %rd1045;
xor.b64 %rd1048, %rd1047, %rd2530;
mul.lo.s64 %rd1049, %rd1048, %rd2518;
and.b64 %rd1050, %rd1049, 4294967295;
xor.b64 %rd1051, %rd1050, %rd1044;
xor.b64 %rd1052, %rd1051, %rd2531;
mul.lo.s64 %rd1053, %rd1052, %rd2520;
shr.u64 %rd1054, %rd1053, 32;
shr.u64 %rd1055, %rd1049, 32;
and.b64 %rd1056, %rd1023, 4294967295;
xor.b64 %rd1057, %rd1056, %rd1055;
xor.b64 %rd1058, %rd1057, %rd2532;
mul.lo.s64 %rd1059, %rd1058, %rd2520;
and.b64 %rd1060, %rd1059, 4294967295;
xor.b64 %rd1061, %rd1060, %rd1054;
xor.b64 %rd1062, %rd1061, %rd2533;
mul.lo.s64 %rd1063, %rd1062, %rd2518;
shr.u64 %rd1064, %rd1063, 32;
shr.u64 %rd1065, %rd1059, 32;
and.b64 %rd1066, %rd1033, 4294967295;
xor.b64 %rd1067, %rd1066, %rd1065;
xor.b64 %rd1068, %rd1067, %rd2534;
mul.lo.s64 %rd1069, %rd1068, %rd2518;
and.b64 %rd1070, %rd1069, 4294967295;
xor.b64 %rd1071, %rd1070, %rd1064;
xor.b64 %rd1072, %rd1071, %rd2535;
mul.lo.s64 %rd1073, %rd1072, %rd2520;
// Reduce the last products to one 32-bit word (%r115) using the arm-specific
// 32-bit constants %r320, %r321 and %r322.
shr.u64 %rd1074, %rd1073, 32;
cvt.u32.u64 %r110, %rd1074;
shr.u64 %rd1075, %rd1069, 32;
xor.b64 %rd1076, %rd1075, %rd1043;
cvt.u32.u64 %r111, %rd1076;
xor.b32 %r112, %r320, %r111;
mul.lo.s32 %r113, %r112, %r321;
xor.b32 %r114, %r113, %r110;
xor.b32 %r115, %r114, %r322;
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f76 is in [0, 1).
shr.u32 %r116, %r115, 9;
cvt.rn.f32.u32 %f75, %r116;
mul.rn.f32 %f76, %f75, 0f34000000;
// %p20 = (fp16(%f76) >= 0x2E66); 0x2E66 decodes to about 0.09998 as IEEE half.
cvt.rn.f16.f32 %h37, %f76;
mov.b16 %h38, 0x2E66;
setp.ge.f16 %p20, %h37, %h38;
// Gated fp16 term: %h44 = %p20 ? (%h39 + fp16(%f77)) * 0x3C72 : 0, where
// 0x3C72 decodes to about 1.1113. Byte offsets 512 (2-byte elements) and 1024
// (4-byte elements) both address element +256 from their base pointers.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout -- confirm.
ld.global.nc.b16 %h39, [%rd44+512];
ld.global.nc.f32 %f77, [%rd45+1024];
cvt.rn.f16.f32 %h40, %f77;
add.rn.f16 %h41, %h39, %h40;
mov.b16 %h42, 0x3C72;
mul.rn.f16 %h43, %h41, %h42;
selp.b16 %h44, %h43, 0x0000, %p20;
cvt.f32.f16 %f78, %h44;
// f32 term: %f87 = (%f1*%f80)*%f79 + (%f83 - %f2*(%f1*%f80)) + %f78.
ld.global.nc.b16 %h45, [%rd46+512];
cvt.f32.f16 %f79, %h45;
ld.global.nc.f32 %f80, [%rd47+1024];
mul.rn.f32 %f81, %f1, %f80;
mul.rn.f32 %f82, %f81, %f79;
ld.global.nc.f32 %f83, [%rd48+1024];
mul.rn.f32 %f84, %f2, %f81;
sub.rn.f32 %f85, %f83, %f84;
add.rn.f32 %f86, %f82, %f85;
add.rn.f32 %f87, %f86, %f78;
// Running sum: %f7 = %f6 + this element's contribution.
add.rn.f32 %f7, %f6, %f87;
// Next element: counter %rd158 = %rd11 + (((%r3 | 257) | %r4) >> 2), low half
// in %rd2423, %p71 = the 64-bit add wrapped. The arm is selected by
// %p21 = ((%r3 | 257) & 3) != 1.
or.b32 %r117, %r3, 257;
or.b32 %r118, %r117, %r4;
and.b32 %r119, %r117, 3;
shr.u32 %r120, %r118, 2;
setp.ne.s32 %p21, %r119, 1;
cvt.u64.u32 %rd1077, %r120;
add.s64 %rd158, %rd11, %rd1077;
and.b64 %rd2423, %rd158, 4294967295;
setp.lt.u64 %p71, %rd158, %rd11;
@%p21 bra LBB54_19;
// Fall-through arm: runs only when %p21 is false (the "@%p21 bra LBB54_19"
// just above skips it otherwise).
// Builds the seed values %rd2536, %rd2540, %rd2543 and %rd2546 from the
// counter %rd158 (low 32 bits in %rd2423), its wrap flag %p71 and %rd2461,
// then loads the constants read at LBB54_20. This arm puts 3528531795 in
// %rd2537 and 3449720151 in %rd2539; the LBB54_19 arm swaps them.
// NOTE(review): the multipliers (0xD2511F53, 0xCD9E8D57) and 0x9E3779B9 match
// the published Philox4x32 constants -- presumably an inlined Philox
// generator; confirm.
mul.lo.s64 %rd2540, %rd2423, 3528531795;
// %rd1119 = %rd2461 + (1 if the counter add wrapped, else 0).
selp.u64 %rd1118, 1, 0, %p71;
add.s64 %rd1119, %rd2461, %rd1118;
xor.b64 %rd1120, %rd1119, %rd2540;
shr.u64 %rd1121, %rd1120, 32;
mul.lo.s64 %rd2543, %rd1121, 3449720151;
shr.u64 %rd1122, %rd2543, 32;
and.b64 %rd1123, %rd1119, 4294967295;
mul.lo.s64 %rd1124, %rd1123, 3449720151;
and.b64 %rd1125, %rd1124, 4294967295;
xor.b64 %rd1126, %rd1125, %rd1122;
xor.b64 %rd1127, %rd1126, 2654435769;
mul.lo.s64 %rd2546, %rd1127, 3528531795;
xor.b64 %rd2536, %rd1124, %rd158;
// Constants for this arm; every register below is read at LBB54_20.
mov.u32 %r324, -845247145;
mov.u32 %r323, -616729560;
mov.u64 %rd2553, 3041712726;
mov.u64 %rd2552, 1401181199;
mov.u64 %rd2551, 2835769497;
mov.u64 %rd2550, 1684936478;
mov.u64 %rd2549, 2027808484;
mov.u64 %rd2548, 387276957;
mov.u64 %rd2547, 842468239;
mov.u64 %rd2545, 3986602516;
mov.u64 %rd2544, 1013904242;
mov.u64 %rd2542, 3668340011;
mov.u64 %rd2541, 3144134277;
mov.u64 %rd2539, 3449720151;
mov.u64 %rd2538, 1993301258;
mov.u64 %rd2537, 3528531795;
bra.uni LBB54_20;
// Arm taken when %p21 is true. Produces the same outputs as the fall-through
// arm before this label (%rd2536, %rd2540, %rd2543, %rd2546 plus the constant
// registers read at LBB54_20), with the roles of the counter %rd158 / %rd2423
// and of %rd2461 exchanged, and with 3449720151 in %rd2537 and 3528531795 in
// %rd2539.
LBB54_19:
// %rd1093 = %rd2461 + (1 if the counter add wrapped, flag %p71, else 0).
selp.u64 %rd1092, 1, 0, %p71;
add.s64 %rd1093, %rd2461, %rd1092;
and.b64 %rd1094, %rd1093, 4294967295;
mul.lo.s64 %rd2540, %rd1094, 3449720151;
xor.b64 %rd1095, %rd2540, %rd158;
shr.u64 %rd1096, %rd1095, 32;
mul.lo.s64 %rd2543, %rd1096, 3528531795;
shr.u64 %rd1097, %rd2543, 32;
mul.lo.s64 %rd1099, %rd2423, 3528531795;
and.b64 %rd1100, %rd1099, 4294967295;
xor.b64 %rd1101, %rd1100, %rd1097;
xor.b64 %rd1102, %rd1101, 3144134277;
mul.lo.s64 %rd2546, %rd1102, 3449720151;
xor.b64 %rd2536, %rd1093, %rd1099;
// Constants for this arm; every register below is read at LBB54_20.
mov.u32 %r324, -766435501;
mov.u32 %r323, -239350328;
mov.u64 %rd2553, 1684936478;
mov.u64 %rd2552, 534103459;
mov.u64 %rd2551, 387276957;
mov.u64 %rd2550, 3041712726;
mov.u64 %rd2549, 3986602516;
mov.u64 %rd2548, 2835769497;
mov.u64 %rd2547, 3668340011;
mov.u64 %rd2545, 2027808484;
mov.u64 %rd2544, 1993301258;
mov.u64 %rd2542, 842468239;
mov.u64 %rd2541, 2654435769;
mov.u64 %rd2539, 3528531795;
mov.u64 %rd2538, 1013904242;
mov.u64 %rd2537, 3449720151;
// Join point of the two arms that set %rd2536..%rd2553 and %r323/%r324.
// 1) Straight-line chain of multiply/xor steps: each step multiplies by
//    %rd2537 or %rd2539, splits 64-bit products into high (shr 32) and low
//    (and 0xFFFFFFFF) halves, and xors halves with one of the constants.
// 2) The result is reduced to the 32-bit word %r127 and turned into a float
//    in [0, 1).
// 3) That value gates one fp16 term; the element's contribution is added to
//    the running sum (%f7 -> %f8).
// 4) The counter for the next element is prepared; the arm is chosen by %p8,
//    which is computed before this block.
LBB54_20:
shr.u64 %rd1128, %rd2546, 32;
shr.u64 %rd1129, %rd2536, 32;
mul.lo.s64 %rd1130, %rd1129, %rd2537;
and.b64 %rd1131, %rd1130, 4294967295;
xor.b64 %rd1132, %rd1131, %rd1128;
xor.b64 %rd1133, %rd1132, %rd2538;
mul.lo.s64 %rd1134, %rd1133, %rd2539;
shr.u64 %rd1135, %rd1134, 32;
shr.u64 %rd1136, %rd1130, 32;
and.b64 %rd1137, %rd2540, 4294967295;
xor.b64 %rd1138, %rd1137, %rd1136;
xor.b64 %rd1139, %rd1138, %rd2541;
mul.lo.s64 %rd1140, %rd1139, %rd2539;
and.b64 %rd1141, %rd1140, 4294967295;
xor.b64 %rd1142, %rd1141, %rd1135;
xor.b64 %rd1143, %rd1142, %rd2542;
mul.lo.s64 %rd1144, %rd1143, %rd2537;
shr.u64 %rd1145, %rd1144, 32;
shr.u64 %rd1146, %rd1140, 32;
and.b64 %rd1147, %rd2543, 4294967295;
xor.b64 %rd1148, %rd1147, %rd1146;
xor.b64 %rd1149, %rd1148, %rd2544;
mul.lo.s64 %rd1150, %rd1149, %rd2537;
and.b64 %rd1151, %rd1150, 4294967295;
xor.b64 %rd1152, %rd1151, %rd1145;
xor.b64 %rd1153, %rd1152, %rd2545;
mul.lo.s64 %rd1154, %rd1153, %rd2539;
shr.u64 %rd1155, %rd1154, 32;
shr.u64 %rd1156, %rd1150, 32;
and.b64 %rd1157, %rd2546, 4294967295;
xor.b64 %rd1158, %rd1157, %rd1156;
xor.b64 %rd1159, %rd1158, %rd2547;
mul.lo.s64 %rd1160, %rd1159, %rd2539;
and.b64 %rd1161, %rd1160, 4294967295;
xor.b64 %rd1162, %rd1161, %rd1155;
xor.b64 %rd1163, %rd1162, %rd2548;
mul.lo.s64 %rd1164, %rd1163, %rd2537;
shr.u64 %rd1165, %rd1164, 32;
shr.u64 %rd1166, %rd1160, 32;
and.b64 %rd1167, %rd1134, 4294967295;
xor.b64 %rd1168, %rd1167, %rd1166;
xor.b64 %rd1169, %rd1168, %rd2549;
mul.lo.s64 %rd1170, %rd1169, %rd2537;
and.b64 %rd1171, %rd1170, 4294967295;
xor.b64 %rd1172, %rd1171, %rd1165;
xor.b64 %rd1173, %rd1172, %rd2550;
mul.lo.s64 %rd1174, %rd1173, %rd2539;
shr.u64 %rd1175, %rd1174, 32;
shr.u64 %rd1176, %rd1170, 32;
and.b64 %rd1177, %rd1144, 4294967295;
xor.b64 %rd1178, %rd1177, %rd1176;
xor.b64 %rd1179, %rd1178, %rd2551;
mul.lo.s64 %rd1180, %rd1179, %rd2539;
and.b64 %rd1181, %rd1180, 4294967295;
xor.b64 %rd1182, %rd1181, %rd1175;
xor.b64 %rd1183, %rd1182, %rd2552;
mul.lo.s64 %rd1184, %rd1183, %rd2537;
shr.u64 %rd1185, %rd1184, 32;
shr.u64 %rd1186, %rd1180, 32;
xor.b64 %rd1187, %rd1154, %rd1186;
xor.b64 %rd1188, %rd1187, %rd2553;
mul.lo.s64 %rd1189, %rd1188, %rd2537;
xor.b64 %rd1190, %rd1185, %rd1189;
// Reduce to one 32-bit word (%r127) using the arm-specific constants
// %r323 and %r324.
cvt.u32.u64 %r125, %rd1190;
xor.b32 %r126, %r323, %r125;
mul.lo.s32 %r127, %r126, %r324;
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f89 is in [0, 1).
shr.u32 %r128, %r127, 9;
cvt.rn.f32.u32 %f88, %r128;
mul.rn.f32 %f89, %f88, 0f34000000;
// %p25 = (fp16(%f89) >= 0x2E66); 0x2E66 decodes to about 0.09998 as IEEE half.
cvt.rn.f16.f32 %h46, %f89;
mov.b16 %h47, 0x2E66;
setp.ge.f16 %p25, %h46, %h47;
// Gated fp16 term: %h53 = %p25 ? (%h48 + fp16(%f90)) * 0x3C72 : 0, where
// 0x3C72 decodes to about 1.1113. Byte offsets 514 (2-byte elements) and 1028
// (4-byte elements) both address element +257 from their base pointers.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout -- confirm.
ld.global.nc.b16 %h48, [%rd44+514];
ld.global.nc.f32 %f90, [%rd45+1028];
cvt.rn.f16.f32 %h49, %f90;
add.rn.f16 %h50, %h48, %h49;
mov.b16 %h51, 0x3C72;
mul.rn.f16 %h52, %h50, %h51;
selp.b16 %h53, %h52, 0x0000, %p25;
cvt.f32.f16 %f91, %h53;
// f32 term: %f100 = (%f1*%f93)*%f92 + (%f96 - %f2*(%f1*%f93)) + %f91.
ld.global.nc.b16 %h54, [%rd46+514];
cvt.f32.f16 %f92, %h54;
ld.global.nc.f32 %f93, [%rd47+1028];
mul.rn.f32 %f94, %f1, %f93;
mul.rn.f32 %f95, %f94, %f92;
ld.global.nc.f32 %f96, [%rd48+1028];
mul.rn.f32 %f97, %f2, %f94;
sub.rn.f32 %f98, %f96, %f97;
add.rn.f32 %f99, %f95, %f98;
add.rn.f32 %f100, %f99, %f91;
// Running sum: %f8 = %f7 + this element's contribution.
add.rn.f32 %f8, %f7, %f100;
// Next element: counter %rd185 = %rd11 + ((%r73 | 384) >> 2), low half in
// %rd2419, %p70 = the 64-bit add wrapped.
or.b32 %r130, %r73, 384;
shr.u32 %r131, %r130, 2;
cvt.u64.u32 %rd1191, %r131;
add.s64 %rd185, %rd11, %rd1191;
and.b64 %rd2419, %rd185, 4294967295;
setp.lt.u64 %p70, %rd185, %rd11;
@%p8 bra LBB54_22;
// Fall-through arm: runs only when %p8 is false (the "@%p8 bra LBB54_22"
// just above skips it otherwise).
// Builds the seed values %rd2554, %rd2558, %rd2561 and %rd2564 from the
// counter %rd185 (low 32 bits in %rd2419), its wrap flag %p70 and %rd2461,
// then loads the constants read at LBB54_23. This arm puts 3528531795 in
// %rd2555 and 3449720151 in %rd2557; the LBB54_22 arm swaps them.
// NOTE(review): the multipliers (0xD2511F53, 0xCD9E8D57) and 0x9E3779B9 match
// the published Philox4x32 constants -- presumably an inlined Philox
// generator; confirm.
mul.lo.s64 %rd2558, %rd2419, 3528531795;
// %rd1235 = %rd2461 + (1 if the counter add wrapped, else 0).
selp.u64 %rd1234, 1, 0, %p70;
add.s64 %rd1235, %rd2461, %rd1234;
xor.b64 %rd1236, %rd1235, %rd2558;
shr.u64 %rd1237, %rd1236, 32;
mul.lo.s64 %rd2561, %rd1237, 3449720151;
shr.u64 %rd1238, %rd2561, 32;
and.b64 %rd1239, %rd1235, 4294967295;
mul.lo.s64 %rd1240, %rd1239, 3449720151;
and.b64 %rd1241, %rd1240, 4294967295;
xor.b64 %rd1242, %rd1241, %rd1238;
xor.b64 %rd1243, %rd1242, 2654435769;
mul.lo.s64 %rd2564, %rd1243, 3528531795;
xor.b64 %rd2554, %rd1240, %rd185;
// Constants for this arm; every register below is read at LBB54_23.
mov.u32 %r327, -1879881855;
mov.u32 %r326, -845247145;
mov.u32 %r325, 534103459;
mov.u64 %rd2572, 3678237736;
mov.u64 %rd2571, 3041712726;
mov.u64 %rd2570, 1401181199;
mov.u64 %rd2569, 2835769497;
mov.u64 %rd2568, 1684936478;
mov.u64 %rd2567, 2027808484;
mov.u64 %rd2566, 387276957;
mov.u64 %rd2565, 842468239;
mov.u64 %rd2563, 3986602516;
mov.u64 %rd2562, 1013904242;
mov.u64 %rd2560, 3668340011;
mov.u64 %rd2559, 3144134277;
mov.u64 %rd2557, 3449720151;
mov.u64 %rd2556, 1993301258;
mov.u64 %rd2555, 3528531795;
bra.uni LBB54_23;
// Arm taken when %p8 is true. Produces the same outputs as the fall-through
// arm before this label (%rd2554, %rd2558, %rd2561, %rd2564 plus the constant
// registers read at LBB54_23), with the roles of the counter %rd185 / %rd2419
// and of %rd2461 exchanged, and with 3449720151 in %rd2555 and 3528531795 in
// %rd2557.
LBB54_22:
// %rd1208 = %rd2461 + (1 if the counter add wrapped, flag %p70, else 0).
selp.u64 %rd1207, 1, 0, %p70;
add.s64 %rd1208, %rd2461, %rd1207;
and.b64 %rd1209, %rd1208, 4294967295;
mul.lo.s64 %rd2558, %rd1209, 3449720151;
xor.b64 %rd1210, %rd2558, %rd185;
shr.u64 %rd1211, %rd1210, 32;
mul.lo.s64 %rd2561, %rd1211, 3528531795;
shr.u64 %rd1212, %rd2561, 32;
mul.lo.s64 %rd1214, %rd2419, 3528531795;
and.b64 %rd1215, %rd1214, 4294967295;
xor.b64 %rd1216, %rd1215, %rd1212;
xor.b64 %rd1217, %rd1216, 3144134277;
mul.lo.s64 %rd2564, %rd1217, 3449720151;
xor.b64 %rd2554, %rd1208, %rd1214;
// Constants for this arm; every register below is read at LBB54_23.
mov.u32 %r327, -1767562579;
mov.u32 %r326, -766435501;
mov.u32 %r325, 1401181199;
mov.u64 %rd2572, 4055616968;
mov.u64 %rd2571, 1684936478;
mov.u64 %rd2570, 534103459;
mov.u64 %rd2569, 387276957;
mov.u64 %rd2568, 3041712726;
mov.u64 %rd2567, 3986602516;
mov.u64 %rd2566, 2835769497;
mov.u64 %rd2565, 3668340011;
mov.u64 %rd2563, 2027808484;
mov.u64 %rd2562, 1993301258;
mov.u64 %rd2560, 842468239;
mov.u64 %rd2559, 2654435769;
mov.u64 %rd2557, 3528531795;
mov.u64 %rd2556, 1013904242;
mov.u64 %rd2555, 3449720151;
// Join point of the two arms that set %rd2554..%rd2572 and %r325..%r327.
// 1) Straight-line chain of multiply/xor steps: each step multiplies by
//    %rd2555 or %rd2557, splits 64-bit products into high (shr 32) and low
//    (and 0xFFFFFFFF) halves, and xors halves with one of the constants.
// 2) The result is reduced to the 32-bit word %r143 and turned into a float
//    in [0, 1).
// 3) That value gates one fp16 term; the element's contribution is added to
//    the running sum (%f8 -> %f9).
// 4) The counter and branch predicate for the next element are prepared.
LBB54_23:
shr.u64 %rd1244, %rd2564, 32;
shr.u64 %rd1245, %rd2554, 32;
mul.lo.s64 %rd1246, %rd1245, %rd2555;
and.b64 %rd1247, %rd1246, 4294967295;
xor.b64 %rd1248, %rd1247, %rd1244;
xor.b64 %rd1249, %rd1248, %rd2556;
mul.lo.s64 %rd1250, %rd1249, %rd2557;
shr.u64 %rd1251, %rd1250, 32;
shr.u64 %rd1252, %rd1246, 32;
and.b64 %rd1253, %rd2558, 4294967295;
xor.b64 %rd1254, %rd1253, %rd1252;
xor.b64 %rd1255, %rd1254, %rd2559;
mul.lo.s64 %rd1256, %rd1255, %rd2557;
and.b64 %rd1257, %rd1256, 4294967295;
xor.b64 %rd1258, %rd1257, %rd1251;
xor.b64 %rd1259, %rd1258, %rd2560;
mul.lo.s64 %rd1260, %rd1259, %rd2555;
shr.u64 %rd1261, %rd1260, 32;
shr.u64 %rd1262, %rd1256, 32;
and.b64 %rd1263, %rd2561, 4294967295;
xor.b64 %rd1264, %rd1263, %rd1262;
xor.b64 %rd1265, %rd1264, %rd2562;
mul.lo.s64 %rd1266, %rd1265, %rd2555;
and.b64 %rd1267, %rd1266, 4294967295;
xor.b64 %rd1268, %rd1267, %rd1261;
xor.b64 %rd1269, %rd1268, %rd2563;
mul.lo.s64 %rd1270, %rd1269, %rd2557;
shr.u64 %rd1271, %rd1270, 32;
shr.u64 %rd1272, %rd1266, 32;
and.b64 %rd1273, %rd2564, 4294967295;
xor.b64 %rd1274, %rd1273, %rd1272;
xor.b64 %rd1275, %rd1274, %rd2565;
mul.lo.s64 %rd1276, %rd1275, %rd2557;
and.b64 %rd1277, %rd1276, 4294967295;
xor.b64 %rd1278, %rd1277, %rd1271;
xor.b64 %rd1279, %rd1278, %rd2566;
mul.lo.s64 %rd1280, %rd1279, %rd2555;
shr.u64 %rd1281, %rd1280, 32;
shr.u64 %rd1282, %rd1276, 32;
and.b64 %rd1283, %rd1250, 4294967295;
xor.b64 %rd1284, %rd1283, %rd1282;
xor.b64 %rd1285, %rd1284, %rd2567;
mul.lo.s64 %rd1286, %rd1285, %rd2555;
and.b64 %rd1287, %rd1286, 4294967295;
xor.b64 %rd1288, %rd1287, %rd1281;
xor.b64 %rd1289, %rd1288, %rd2568;
mul.lo.s64 %rd1290, %rd1289, %rd2557;
shr.u64 %rd1291, %rd1290, 32;
shr.u64 %rd1292, %rd1286, 32;
and.b64 %rd1293, %rd1260, 4294967295;
xor.b64 %rd1294, %rd1293, %rd1292;
xor.b64 %rd1295, %rd1294, %rd2569;
mul.lo.s64 %rd1296, %rd1295, %rd2557;
and.b64 %rd1297, %rd1296, 4294967295;
xor.b64 %rd1298, %rd1297, %rd1291;
xor.b64 %rd1299, %rd1298, %rd2570;
mul.lo.s64 %rd1300, %rd1299, %rd2555;
shr.u64 %rd1301, %rd1300, 32;
shr.u64 %rd1302, %rd1296, 32;
and.b64 %rd1303, %rd1270, 4294967295;
xor.b64 %rd1304, %rd1303, %rd1302;
xor.b64 %rd1305, %rd1304, %rd2571;
mul.lo.s64 %rd1306, %rd1305, %rd2555;
and.b64 %rd1307, %rd1306, 4294967295;
xor.b64 %rd1308, %rd1307, %rd1301;
xor.b64 %rd1309, %rd1308, %rd2572;
mul.lo.s64 %rd1310, %rd1309, %rd2557;
// Reduce the last products to one 32-bit word (%r143) using the arm-specific
// 32-bit constants %r325, %r326 and %r327.
shr.u64 %rd1311, %rd1310, 32;
cvt.u32.u64 %r138, %rd1311;
shr.u64 %rd1312, %rd1306, 32;
xor.b64 %rd1313, %rd1312, %rd1280;
cvt.u32.u64 %r139, %rd1313;
xor.b32 %r140, %r325, %r139;
mul.lo.s32 %r141, %r140, %r326;
xor.b32 %r142, %r141, %r138;
xor.b32 %r143, %r142, %r327;
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f102 is in [0, 1).
shr.u32 %r144, %r143, 9;
cvt.rn.f32.u32 %f101, %r144;
mul.rn.f32 %f102, %f101, 0f34000000;
// %p28 = (fp16(%f102) >= 0x2E66); 0x2E66 decodes to about 0.09998 as IEEE half.
cvt.rn.f16.f32 %h55, %f102;
mov.b16 %h56, 0x2E66;
setp.ge.f16 %p28, %h55, %h56;
// Gated fp16 term: %h62 = %p28 ? (%h57 + fp16(%f103)) * 0x3C72 : 0, where
// 0x3C72 decodes to about 1.1113. Byte offsets 768 (2-byte elements) and 1536
// (4-byte elements) both address element +384 from their base pointers.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout -- confirm.
ld.global.nc.b16 %h57, [%rd44+768];
ld.global.nc.f32 %f103, [%rd45+1536];
cvt.rn.f16.f32 %h58, %f103;
add.rn.f16 %h59, %h57, %h58;
mov.b16 %h60, 0x3C72;
mul.rn.f16 %h61, %h59, %h60;
selp.b16 %h62, %h61, 0x0000, %p28;
cvt.f32.f16 %f104, %h62;
// f32 term: %f113 = (%f1*%f106)*%f105 + (%f109 - %f2*(%f1*%f106)) + %f104.
ld.global.nc.b16 %h63, [%rd46+768];
cvt.f32.f16 %f105, %h63;
ld.global.nc.f32 %f106, [%rd47+1536];
mul.rn.f32 %f107, %f1, %f106;
mul.rn.f32 %f108, %f107, %f105;
ld.global.nc.f32 %f109, [%rd48+1536];
mul.rn.f32 %f110, %f2, %f107;
sub.rn.f32 %f111, %f109, %f110;
add.rn.f32 %f112, %f108, %f111;
add.rn.f32 %f113, %f112, %f104;
// Running sum: %f9 = %f8 + this element's contribution.
add.rn.f32 %f9, %f8, %f113;
// Next element: counter %rd213 = %rd11 + (((%r3 | 385) | %r4) >> 2). The arm
// is selected by %p29 = ((%r3 | 385) & 3) != 1; the wrap check of the add is
// done inside each arm.
or.b32 %r145, %r3, 385;
or.b32 %r146, %r145, %r4;
and.b32 %r147, %r145, 3;
shr.u32 %r148, %r146, 2;
setp.ne.s32 %p29, %r147, 1;
cvt.u64.u32 %rd1314, %r148;
add.s64 %rd213, %rd11, %rd1314;
@%p29 bra LBB54_25;
// Fall-through arm: runs only when %p29 is false (the "@%p29 bra LBB54_25"
// just above skips it otherwise).
// Builds the seed values %rd2573, %rd2577, %rd2580 and %rd2583 from the
// counter %rd213 and %rd2461, then loads the constants read at LBB54_26.
// This arm puts 3528531795 in %rd2574 and 3449720151 in %rd2576; the
// LBB54_25 arm swaps them.
// NOTE(review): the multipliers (0xD2511F53, 0xCD9E8D57) and 0x9E3779B9 match
// the published Philox4x32 constants -- presumably an inlined Philox
// generator; confirm.
and.b64 %rd1354, %rd213, 4294967295;
mul.lo.s64 %rd2577, %rd1354, 3528531795;
// %p31 is true when %rd213 < %rd11 (unsigned); %rd1356 = %rd2461 + that bit.
setp.lt.u64 %p31, %rd213, %rd11;
selp.u64 %rd1355, 1, 0, %p31;
add.s64 %rd1356, %rd2461, %rd1355;
xor.b64 %rd1357, %rd1356, %rd2577;
shr.u64 %rd1358, %rd1357, 32;
mul.lo.s64 %rd2580, %rd1358, 3449720151;
shr.u64 %rd1359, %rd2580, 32;
and.b64 %rd1360, %rd1356, 4294967295;
mul.lo.s64 %rd1361, %rd1360, 3449720151;
and.b64 %rd1362, %rd1361, 4294967295;
xor.b64 %rd1363, %rd1362, %rd1359;
xor.b64 %rd1364, %rd1363, 2654435769;
mul.lo.s64 %rd2583, %rd1364, 3528531795;
xor.b64 %rd2573, %rd1361, %rd213;
// Constants for this arm; every register below is read at LBB54_26.
mov.u32 %r329, -845247145;
mov.u32 %r328, -616729560;
mov.u64 %rd2590, 3041712726;
mov.u64 %rd2589, 1401181199;
mov.u64 %rd2588, 2835769497;
mov.u64 %rd2587, 1684936478;
mov.u64 %rd2586, 2027808484;
mov.u64 %rd2585, 387276957;
mov.u64 %rd2584, 842468239;
mov.u64 %rd2582, 3986602516;
mov.u64 %rd2581, 1013904242;
mov.u64 %rd2579, 3668340011;
mov.u64 %rd2578, 3144134277;
mov.u64 %rd2576, 3449720151;
mov.u64 %rd2575, 1993301258;
mov.u64 %rd2574, 3528531795;
bra.uni LBB54_26;
// Arm taken when %p29 is true. Produces the same outputs as the fall-through
// arm before this label (%rd2573, %rd2577, %rd2580, %rd2583 plus the constant
// registers read at LBB54_26), with the roles of the counter %rd213 and of
// %rd2461 exchanged, and with 3449720151 in %rd2574 and 3528531795 in
// %rd2576.
LBB54_25:
// %p30 is true when %rd213 < %rd11 (unsigned); %rd1330 = %rd2461 + that bit.
setp.lt.u64 %p30, %rd213, %rd11;
selp.u64 %rd1329, 1, 0, %p30;
add.s64 %rd1330, %rd2461, %rd1329;
and.b64 %rd1331, %rd1330, 4294967295;
mul.lo.s64 %rd2577, %rd1331, 3449720151;
xor.b64 %rd1332, %rd2577, %rd213;
shr.u64 %rd1333, %rd1332, 32;
mul.lo.s64 %rd2580, %rd1333, 3528531795;
shr.u64 %rd1334, %rd2580, 32;
and.b64 %rd1335, %rd213, 4294967295;
mul.lo.s64 %rd1336, %rd1335, 3528531795;
and.b64 %rd1337, %rd1336, 4294967295;
xor.b64 %rd1338, %rd1337, %rd1334;
xor.b64 %rd1339, %rd1338, 3144134277;
mul.lo.s64 %rd2583, %rd1339, 3449720151;
xor.b64 %rd2573, %rd1330, %rd1336;
// Constants for this arm; every register below is read at LBB54_26.
mov.u32 %r329, -766435501;
mov.u32 %r328, -239350328;
mov.u64 %rd2590, 1684936478;
mov.u64 %rd2589, 534103459;
mov.u64 %rd2588, 387276957;
mov.u64 %rd2587, 3041712726;
mov.u64 %rd2586, 3986602516;
mov.u64 %rd2585, 2835769497;
mov.u64 %rd2584, 3668340011;
mov.u64 %rd2582, 2027808484;
mov.u64 %rd2581, 1993301258;
mov.u64 %rd2579, 842468239;
mov.u64 %rd2578, 2654435769;
mov.u64 %rd2576, 3528531795;
mov.u64 %rd2575, 1013904242;
mov.u64 %rd2574, 3449720151;
// Join point of the two arms that set %rd2573..%rd2590 and %r328/%r329.
// 1) Straight-line chain of multiply/xor steps: each step multiplies by
//    %rd2574 or %rd2576, splits 64-bit products into high (shr 32) and low
//    (and 0xFFFFFFFF) halves, and xors halves with one of the constants.
// 2) The result is reduced to the 32-bit word %r155 and turned into a float
//    in [0, 1).
// 3) That value gates one fp16 term; the element's contribution is added to
//    the running sum (%f9 -> %f10).
// 4) The counter for the next element is prepared; the arm is chosen by %p8,
//    which is computed before this block.
LBB54_26:
shr.u64 %rd1365, %rd2583, 32;
shr.u64 %rd1366, %rd2573, 32;
mul.lo.s64 %rd1367, %rd1366, %rd2574;
and.b64 %rd1368, %rd1367, 4294967295;
xor.b64 %rd1369, %rd1368, %rd1365;
xor.b64 %rd1370, %rd1369, %rd2575;
mul.lo.s64 %rd1371, %rd1370, %rd2576;
shr.u64 %rd1372, %rd1371, 32;
shr.u64 %rd1373, %rd1367, 32;
and.b64 %rd1374, %rd2577, 4294967295;
xor.b64 %rd1375, %rd1374, %rd1373;
xor.b64 %rd1376, %rd1375, %rd2578;
mul.lo.s64 %rd1377, %rd1376, %rd2576;
and.b64 %rd1378, %rd1377, 4294967295;
xor.b64 %rd1379, %rd1378, %rd1372;
xor.b64 %rd1380, %rd1379, %rd2579;
mul.lo.s64 %rd1381, %rd1380, %rd2574;
shr.u64 %rd1382, %rd1381, 32;
shr.u64 %rd1383, %rd1377, 32;
and.b64 %rd1384, %rd2580, 4294967295;
xor.b64 %rd1385, %rd1384, %rd1383;
xor.b64 %rd1386, %rd1385, %rd2581;
mul.lo.s64 %rd1387, %rd1386, %rd2574;
and.b64 %rd1388, %rd1387, 4294967295;
xor.b64 %rd1389, %rd1388, %rd1382;
xor.b64 %rd1390, %rd1389, %rd2582;
mul.lo.s64 %rd1391, %rd1390, %rd2576;
shr.u64 %rd1392, %rd1391, 32;
shr.u64 %rd1393, %rd1387, 32;
and.b64 %rd1394, %rd2583, 4294967295;
xor.b64 %rd1395, %rd1394, %rd1393;
xor.b64 %rd1396, %rd1395, %rd2584;
mul.lo.s64 %rd1397, %rd1396, %rd2576;
and.b64 %rd1398, %rd1397, 4294967295;
xor.b64 %rd1399, %rd1398, %rd1392;
xor.b64 %rd1400, %rd1399, %rd2585;
mul.lo.s64 %rd1401, %rd1400, %rd2574;
shr.u64 %rd1402, %rd1401, 32;
shr.u64 %rd1403, %rd1397, 32;
and.b64 %rd1404, %rd1371, 4294967295;
xor.b64 %rd1405, %rd1404, %rd1403;
xor.b64 %rd1406, %rd1405, %rd2586;
mul.lo.s64 %rd1407, %rd1406, %rd2574;
and.b64 %rd1408, %rd1407, 4294967295;
xor.b64 %rd1409, %rd1408, %rd1402;
xor.b64 %rd1410, %rd1409, %rd2587;
mul.lo.s64 %rd1411, %rd1410, %rd2576;
shr.u64 %rd1412, %rd1411, 32;
shr.u64 %rd1413, %rd1407, 32;
and.b64 %rd1414, %rd1381, 4294967295;
xor.b64 %rd1415, %rd1414, %rd1413;
xor.b64 %rd1416, %rd1415, %rd2588;
mul.lo.s64 %rd1417, %rd1416, %rd2576;
and.b64 %rd1418, %rd1417, 4294967295;
xor.b64 %rd1419, %rd1418, %rd1412;
xor.b64 %rd1420, %rd1419, %rd2589;
mul.lo.s64 %rd1421, %rd1420, %rd2574;
shr.u64 %rd1422, %rd1421, 32;
shr.u64 %rd1423, %rd1417, 32;
xor.b64 %rd1424, %rd1391, %rd1423;
xor.b64 %rd1425, %rd1424, %rd2590;
mul.lo.s64 %rd1426, %rd1425, %rd2574;
xor.b64 %rd1427, %rd1422, %rd1426;
// Reduce to one 32-bit word (%r155) using the arm-specific constants
// %r328 and %r329.
cvt.u32.u64 %r153, %rd1427;
xor.b32 %r154, %r328, %r153;
mul.lo.s32 %r155, %r154, %r329;
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f115 is in [0, 1).
shr.u32 %r156, %r155, 9;
cvt.rn.f32.u32 %f114, %r156;
mul.rn.f32 %f115, %f114, 0f34000000;
// %p33 = (fp16(%f115) >= 0x2E66); 0x2E66 decodes to about 0.09998 as IEEE half.
cvt.rn.f16.f32 %h64, %f115;
mov.b16 %h65, 0x2E66;
setp.ge.f16 %p33, %h64, %h65;
// Gated fp16 term: %h71 = %p33 ? (%h66 + fp16(%f116)) * 0x3C72 : 0, where
// 0x3C72 decodes to about 1.1113. Byte offsets 770 (2-byte elements) and 1540
// (4-byte elements) both address element +385 from their base pointers.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout -- confirm.
ld.global.nc.b16 %h66, [%rd44+770];
ld.global.nc.f32 %f116, [%rd45+1540];
cvt.rn.f16.f32 %h67, %f116;
add.rn.f16 %h68, %h66, %h67;
mov.b16 %h69, 0x3C72;
mul.rn.f16 %h70, %h68, %h69;
selp.b16 %h71, %h70, 0x0000, %p33;
cvt.f32.f16 %f117, %h71;
// f32 term: %f126 = (%f1*%f119)*%f118 + (%f122 - %f2*(%f1*%f119)) + %f117.
ld.global.nc.b16 %h72, [%rd46+770];
cvt.f32.f16 %f118, %h72;
ld.global.nc.f32 %f119, [%rd47+1540];
mul.rn.f32 %f120, %f1, %f119;
mul.rn.f32 %f121, %f120, %f118;
ld.global.nc.f32 %f122, [%rd48+1540];
mul.rn.f32 %f123, %f2, %f120;
sub.rn.f32 %f124, %f122, %f123;
add.rn.f32 %f125, %f121, %f124;
add.rn.f32 %f126, %f125, %f117;
// Running sum: %f10 = %f9 + this element's contribution.
add.rn.f32 %f10, %f9, %f126;
// Next element: counter %rd240 = %rd11 + ((%r73 | 512) >> 2); the wrap check
// of the add is done inside each arm.
or.b32 %r158, %r73, 512;
shr.u32 %r159, %r158, 2;
cvt.u64.u32 %rd1428, %r159;
add.s64 %rd240, %rd11, %rd1428;
@%p8 bra LBB54_28;
// Fall-through arm: runs only when %p8 is false (the "@%p8 bra LBB54_28"
// just above skips it otherwise).
// Builds the seed values %rd2591, %rd2595, %rd2598 and %rd2601 from the
// counter %rd240 and %rd2461, then loads the constants read at LBB54_29.
// This arm puts 3528531795 in %rd2592 and 3449720151 in %rd2594; the
// LBB54_28 arm swaps them.
// NOTE(review): the multipliers (0xD2511F53, 0xCD9E8D57) and 0x9E3779B9 match
// the published Philox4x32 constants -- presumably an inlined Philox
// generator; confirm.
and.b64 %rd1470, %rd240, 4294967295;
mul.lo.s64 %rd2595, %rd1470, 3528531795;
// %p35 is true when %rd240 < %rd11 (unsigned); %rd1472 = %rd2461 + that bit.
setp.lt.u64 %p35, %rd240, %rd11;
selp.u64 %rd1471, 1, 0, %p35;
add.s64 %rd1472, %rd2461, %rd1471;
xor.b64 %rd1473, %rd1472, %rd2595;
shr.u64 %rd1474, %rd1473, 32;
mul.lo.s64 %rd2598, %rd1474, 3449720151;
shr.u64 %rd1475, %rd2598, 32;
and.b64 %rd1476, %rd1472, 4294967295;
mul.lo.s64 %rd1477, %rd1476, 3449720151;
and.b64 %rd1478, %rd1477, 4294967295;
xor.b64 %rd1479, %rd1478, %rd1475;
xor.b64 %rd1480, %rd1479, 2654435769;
mul.lo.s64 %rd2601, %rd1480, 3528531795;
xor.b64 %rd2591, %rd1477, %rd240;
// Constants for this arm; every register below is read at LBB54_29.
mov.u32 %r332, -1879881855;
mov.u32 %r331, -845247145;
mov.u32 %r330, 534103459;
mov.u64 %rd2609, 3678237736;
mov.u64 %rd2608, 3041712726;
mov.u64 %rd2607, 1401181199;
mov.u64 %rd2606, 2835769497;
mov.u64 %rd2605, 1684936478;
mov.u64 %rd2604, 2027808484;
mov.u64 %rd2603, 387276957;
mov.u64 %rd2602, 842468239;
mov.u64 %rd2600, 3986602516;
mov.u64 %rd2599, 1013904242;
mov.u64 %rd2597, 3668340011;
mov.u64 %rd2596, 3144134277;
mov.u64 %rd2594, 3449720151;
mov.u64 %rd2593, 1993301258;
mov.u64 %rd2592, 3528531795;
bra.uni LBB54_29;
// Arm taken when %p8 is true. Produces the same outputs as the fall-through
// arm before this label (%rd2591, %rd2595, %rd2598, %rd2601 plus the constant
// registers read at LBB54_29), with the roles of the counter %rd240 and of
// %rd2461 exchanged, and with 3449720151 in %rd2592 and 3528531795 in
// %rd2594.
LBB54_28:
// %p34 is true when %rd240 < %rd11 (unsigned); %rd1445 = %rd2461 + that bit.
setp.lt.u64 %p34, %rd240, %rd11;
selp.u64 %rd1444, 1, 0, %p34;
add.s64 %rd1445, %rd2461, %rd1444;
and.b64 %rd1446, %rd1445, 4294967295;
mul.lo.s64 %rd2595, %rd1446, 3449720151;
xor.b64 %rd1447, %rd2595, %rd240;
shr.u64 %rd1448, %rd1447, 32;
mul.lo.s64 %rd2598, %rd1448, 3528531795;
shr.u64 %rd1449, %rd2598, 32;
and.b64 %rd1450, %rd240, 4294967295;
mul.lo.s64 %rd1451, %rd1450, 3528531795;
and.b64 %rd1452, %rd1451, 4294967295;
xor.b64 %rd1453, %rd1452, %rd1449;
xor.b64 %rd1454, %rd1453, 3144134277;
mul.lo.s64 %rd2601, %rd1454, 3449720151;
xor.b64 %rd2591, %rd1445, %rd1451;
// Constants for this arm; every register below is read at LBB54_29.
mov.u32 %r332, -1767562579;
mov.u32 %r331, -766435501;
mov.u32 %r330, 1401181199;
mov.u64 %rd2609, 4055616968;
mov.u64 %rd2608, 1684936478;
mov.u64 %rd2607, 534103459;
mov.u64 %rd2606, 387276957;
mov.u64 %rd2605, 3041712726;
mov.u64 %rd2604, 3986602516;
mov.u64 %rd2603, 2835769497;
mov.u64 %rd2602, 3668340011;
mov.u64 %rd2600, 2027808484;
mov.u64 %rd2599, 1993301258;
mov.u64 %rd2597, 842468239;
mov.u64 %rd2596, 2654435769;
mov.u64 %rd2594, 3528531795;
mov.u64 %rd2593, 1013904242;
mov.u64 %rd2592, 3449720151;
// Join point of the two arms that set %rd2591..%rd2609 and %r330..%r332.
// 1) Straight-line chain of multiply/xor steps: each step multiplies by
//    %rd2592 or %rd2594, splits 64-bit products into high (shr 32) and low
//    (and 0xFFFFFFFF) halves, and xors halves with one of the constants.
// 2) The result is reduced to the 32-bit word %r171 and turned into a float
//    in [0, 1).
// 3) That value gates one fp16 term; the element's contribution is added to
//    the running sum (%f10 -> %f11).
// 4) The counter and branch predicate for the next element are prepared.
LBB54_29:
shr.u64 %rd1481, %rd2601, 32;
shr.u64 %rd1482, %rd2591, 32;
mul.lo.s64 %rd1483, %rd1482, %rd2592;
and.b64 %rd1484, %rd1483, 4294967295;
xor.b64 %rd1485, %rd1484, %rd1481;
xor.b64 %rd1486, %rd1485, %rd2593;
mul.lo.s64 %rd1487, %rd1486, %rd2594;
shr.u64 %rd1488, %rd1487, 32;
shr.u64 %rd1489, %rd1483, 32;
and.b64 %rd1490, %rd2595, 4294967295;
xor.b64 %rd1491, %rd1490, %rd1489;
xor.b64 %rd1492, %rd1491, %rd2596;
mul.lo.s64 %rd1493, %rd1492, %rd2594;
and.b64 %rd1494, %rd1493, 4294967295;
xor.b64 %rd1495, %rd1494, %rd1488;
xor.b64 %rd1496, %rd1495, %rd2597;
mul.lo.s64 %rd1497, %rd1496, %rd2592;
shr.u64 %rd1498, %rd1497, 32;
shr.u64 %rd1499, %rd1493, 32;
and.b64 %rd1500, %rd2598, 4294967295;
xor.b64 %rd1501, %rd1500, %rd1499;
xor.b64 %rd1502, %rd1501, %rd2599;
mul.lo.s64 %rd1503, %rd1502, %rd2592;
and.b64 %rd1504, %rd1503, 4294967295;
xor.b64 %rd1505, %rd1504, %rd1498;
xor.b64 %rd1506, %rd1505, %rd2600;
mul.lo.s64 %rd1507, %rd1506, %rd2594;
shr.u64 %rd1508, %rd1507, 32;
shr.u64 %rd1509, %rd1503, 32;
and.b64 %rd1510, %rd2601, 4294967295;
xor.b64 %rd1511, %rd1510, %rd1509;
xor.b64 %rd1512, %rd1511, %rd2602;
mul.lo.s64 %rd1513, %rd1512, %rd2594;
and.b64 %rd1514, %rd1513, 4294967295;
xor.b64 %rd1515, %rd1514, %rd1508;
xor.b64 %rd1516, %rd1515, %rd2603;
mul.lo.s64 %rd1517, %rd1516, %rd2592;
shr.u64 %rd1518, %rd1517, 32;
shr.u64 %rd1519, %rd1513, 32;
and.b64 %rd1520, %rd1487, 4294967295;
xor.b64 %rd1521, %rd1520, %rd1519;
xor.b64 %rd1522, %rd1521, %rd2604;
mul.lo.s64 %rd1523, %rd1522, %rd2592;
and.b64 %rd1524, %rd1523, 4294967295;
xor.b64 %rd1525, %rd1524, %rd1518;
xor.b64 %rd1526, %rd1525, %rd2605;
mul.lo.s64 %rd1527, %rd1526, %rd2594;
shr.u64 %rd1528, %rd1527, 32;
shr.u64 %rd1529, %rd1523, 32;
and.b64 %rd1530, %rd1497, 4294967295;
xor.b64 %rd1531, %rd1530, %rd1529;
xor.b64 %rd1532, %rd1531, %rd2606;
mul.lo.s64 %rd1533, %rd1532, %rd2594;
and.b64 %rd1534, %rd1533, 4294967295;
xor.b64 %rd1535, %rd1534, %rd1528;
xor.b64 %rd1536, %rd1535, %rd2607;
mul.lo.s64 %rd1537, %rd1536, %rd2592;
shr.u64 %rd1538, %rd1537, 32;
shr.u64 %rd1539, %rd1533, 32;
and.b64 %rd1540, %rd1507, 4294967295;
xor.b64 %rd1541, %rd1540, %rd1539;
xor.b64 %rd1542, %rd1541, %rd2608;
mul.lo.s64 %rd1543, %rd1542, %rd2592;
and.b64 %rd1544, %rd1543, 4294967295;
xor.b64 %rd1545, %rd1544, %rd1538;
xor.b64 %rd1546, %rd1545, %rd2609;
mul.lo.s64 %rd1547, %rd1546, %rd2594;
// Reduce the last products to one 32-bit word (%r171) using the arm-specific
// 32-bit constants %r330, %r331 and %r332.
shr.u64 %rd1548, %rd1547, 32;
cvt.u32.u64 %r166, %rd1548;
shr.u64 %rd1549, %rd1543, 32;
xor.b64 %rd1550, %rd1549, %rd1517;
cvt.u32.u64 %r167, %rd1550;
xor.b32 %r168, %r330, %r167;
mul.lo.s32 %r169, %r168, %r331;
xor.b32 %r170, %r169, %r166;
xor.b32 %r171, %r170, %r332;
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f128 is in [0, 1).
shr.u32 %r172, %r171, 9;
cvt.rn.f32.u32 %f127, %r172;
mul.rn.f32 %f128, %f127, 0f34000000;
// %p36 = (fp16(%f128) >= 0x2E66); 0x2E66 decodes to about 0.09998 as IEEE half.
cvt.rn.f16.f32 %h73, %f128;
mov.b16 %h74, 0x2E66;
setp.ge.f16 %p36, %h73, %h74;
// Gated fp16 term: %h80 = %p36 ? (%h75 + fp16(%f129)) * 0x3C72 : 0, where
// 0x3C72 decodes to about 1.1113. Byte offsets 1024 (2-byte elements) and
// 2048 (4-byte elements) both address element +512 from their base pointers.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout -- confirm.
ld.global.nc.b16 %h75, [%rd44+1024];
ld.global.nc.f32 %f129, [%rd45+2048];
cvt.rn.f16.f32 %h76, %f129;
add.rn.f16 %h77, %h75, %h76;
mov.b16 %h78, 0x3C72;
mul.rn.f16 %h79, %h77, %h78;
selp.b16 %h80, %h79, 0x0000, %p36;
cvt.f32.f16 %f130, %h80;
// f32 term: %f139 = (%f1*%f132)*%f131 + (%f135 - %f2*(%f1*%f132)) + %f130.
ld.global.nc.b16 %h81, [%rd46+1024];
cvt.f32.f16 %f131, %h81;
ld.global.nc.f32 %f132, [%rd47+2048];
mul.rn.f32 %f133, %f1, %f132;
mul.rn.f32 %f134, %f133, %f131;
ld.global.nc.f32 %f135, [%rd48+2048];
mul.rn.f32 %f136, %f2, %f133;
sub.rn.f32 %f137, %f135, %f136;
add.rn.f32 %f138, %f134, %f137;
add.rn.f32 %f139, %f138, %f130;
// Running sum: %f11 = %f10 + this element's contribution.
add.rn.f32 %f11, %f10, %f139;
// Next element: counter %rd268 = %rd11 + (((%r3 | 513) | %r4) >> 2). The arm
// is selected by %p37 = ((%r3 | 513) & 3) != 1; the wrap check of the add is
// done inside each arm.
or.b32 %r173, %r3, 513;
or.b32 %r174, %r173, %r4;
and.b32 %r175, %r173, 3;
shr.u32 %r176, %r174, 2;
setp.ne.s32 %p37, %r175, 1;
cvt.u64.u32 %rd1551, %r176;
add.s64 %rd268, %rd11, %rd1551;
@%p37 bra LBB54_31;
// Fall-through arm: runs only when %p37 is false (the "@%p37 bra LBB54_31"
// just above skips it otherwise).
// Builds the seed values %rd2610, %rd2614, %rd2617 and %rd2620 from the
// counter %rd268 and %rd2461, then loads the constants read at LBB54_32.
// This arm puts 3528531795 in %rd2611 and 3449720151 in %rd2613; the
// LBB54_31 arm swaps them.
// NOTE(review): the multipliers (0xD2511F53, 0xCD9E8D57) and 0x9E3779B9 match
// the published Philox4x32 constants -- presumably an inlined Philox
// generator; confirm.
and.b64 %rd1591, %rd268, 4294967295;
mul.lo.s64 %rd2614, %rd1591, 3528531795;
// %p39 is true when %rd268 < %rd11 (unsigned); %rd1593 = %rd2461 + that bit.
setp.lt.u64 %p39, %rd268, %rd11;
selp.u64 %rd1592, 1, 0, %p39;
add.s64 %rd1593, %rd2461, %rd1592;
xor.b64 %rd1594, %rd1593, %rd2614;
shr.u64 %rd1595, %rd1594, 32;
mul.lo.s64 %rd2617, %rd1595, 3449720151;
shr.u64 %rd1596, %rd2617, 32;
and.b64 %rd1597, %rd1593, 4294967295;
mul.lo.s64 %rd1598, %rd1597, 3449720151;
and.b64 %rd1599, %rd1598, 4294967295;
xor.b64 %rd1600, %rd1599, %rd1596;
xor.b64 %rd1601, %rd1600, 2654435769;
mul.lo.s64 %rd2620, %rd1601, 3528531795;
xor.b64 %rd2610, %rd1598, %rd268;
// Constants for this arm; every register below is read at LBB54_32.
mov.u32 %r334, -845247145;
mov.u32 %r333, -616729560;
mov.u64 %rd2627, 3041712726;
mov.u64 %rd2626, 1401181199;
mov.u64 %rd2625, 2835769497;
mov.u64 %rd2624, 1684936478;
mov.u64 %rd2623, 2027808484;
mov.u64 %rd2622, 387276957;
mov.u64 %rd2621, 842468239;
mov.u64 %rd2619, 3986602516;
mov.u64 %rd2618, 1013904242;
mov.u64 %rd2616, 3668340011;
mov.u64 %rd2615, 3144134277;
mov.u64 %rd2613, 3449720151;
mov.u64 %rd2612, 1993301258;
mov.u64 %rd2611, 3528531795;
bra.uni LBB54_32;
// Arm taken when %p37 is true. Produces the same outputs as the fall-through
// arm before this label (%rd2610, %rd2614, %rd2617, %rd2620 plus the constant
// registers read at LBB54_32), with the roles of the counter %rd268 and of
// %rd2461 exchanged, and with 3449720151 in %rd2611 and 3528531795 in
// %rd2613.
LBB54_31:
// %p38 is true when %rd268 < %rd11 (unsigned); %rd1567 = %rd2461 + that bit.
setp.lt.u64 %p38, %rd268, %rd11;
selp.u64 %rd1566, 1, 0, %p38;
add.s64 %rd1567, %rd2461, %rd1566;
and.b64 %rd1568, %rd1567, 4294967295;
mul.lo.s64 %rd2614, %rd1568, 3449720151;
xor.b64 %rd1569, %rd2614, %rd268;
shr.u64 %rd1570, %rd1569, 32;
mul.lo.s64 %rd2617, %rd1570, 3528531795;
shr.u64 %rd1571, %rd2617, 32;
and.b64 %rd1572, %rd268, 4294967295;
mul.lo.s64 %rd1573, %rd1572, 3528531795;
and.b64 %rd1574, %rd1573, 4294967295;
xor.b64 %rd1575, %rd1574, %rd1571;
xor.b64 %rd1576, %rd1575, 3144134277;
mul.lo.s64 %rd2620, %rd1576, 3449720151;
xor.b64 %rd2610, %rd1567, %rd1573;
// Constants for this arm; every register below is read at LBB54_32.
mov.u32 %r334, -766435501;
mov.u32 %r333, -239350328;
mov.u64 %rd2627, 1684936478;
mov.u64 %rd2626, 534103459;
mov.u64 %rd2625, 387276957;
mov.u64 %rd2624, 3041712726;
mov.u64 %rd2623, 3986602516;
mov.u64 %rd2622, 2835769497;
mov.u64 %rd2621, 3668340011;
mov.u64 %rd2619, 2027808484;
mov.u64 %rd2618, 1993301258;
mov.u64 %rd2616, 842468239;
mov.u64 %rd2615, 2654435769;
mov.u64 %rd2613, 3528531795;
mov.u64 %rd2612, 1013904242;
mov.u64 %rd2611, 3449720151;
// Join point of the two arms that set %rd2610..%rd2627 and %r333/%r334.
// 1) Straight-line chain of multiply/xor steps: each step multiplies by
//    %rd2611 or %rd2613, splits 64-bit products into high (shr 32) and low
//    (and 0xFFFFFFFF) halves, and xors halves with one of the constants.
// 2) The result is reduced to the 32-bit word %r183 and turned into a float
//    in [0, 1).
// 3) That value gates one fp16 term; the element's contribution is added to
//    the running sum (%f11 -> %f12).
// 4) The counter for the next element is prepared; the arm is chosen by %p8,
//    which is computed before this block.
LBB54_32:
shr.u64 %rd1602, %rd2620, 32;
shr.u64 %rd1603, %rd2610, 32;
mul.lo.s64 %rd1604, %rd1603, %rd2611;
and.b64 %rd1605, %rd1604, 4294967295;
xor.b64 %rd1606, %rd1605, %rd1602;
xor.b64 %rd1607, %rd1606, %rd2612;
mul.lo.s64 %rd1608, %rd1607, %rd2613;
shr.u64 %rd1609, %rd1608, 32;
shr.u64 %rd1610, %rd1604, 32;
and.b64 %rd1611, %rd2614, 4294967295;
xor.b64 %rd1612, %rd1611, %rd1610;
xor.b64 %rd1613, %rd1612, %rd2615;
mul.lo.s64 %rd1614, %rd1613, %rd2613;
and.b64 %rd1615, %rd1614, 4294967295;
xor.b64 %rd1616, %rd1615, %rd1609;
xor.b64 %rd1617, %rd1616, %rd2616;
mul.lo.s64 %rd1618, %rd1617, %rd2611;
shr.u64 %rd1619, %rd1618, 32;
shr.u64 %rd1620, %rd1614, 32;
and.b64 %rd1621, %rd2617, 4294967295;
xor.b64 %rd1622, %rd1621, %rd1620;
xor.b64 %rd1623, %rd1622, %rd2618;
mul.lo.s64 %rd1624, %rd1623, %rd2611;
and.b64 %rd1625, %rd1624, 4294967295;
xor.b64 %rd1626, %rd1625, %rd1619;
xor.b64 %rd1627, %rd1626, %rd2619;
mul.lo.s64 %rd1628, %rd1627, %rd2613;
shr.u64 %rd1629, %rd1628, 32;
shr.u64 %rd1630, %rd1624, 32;
and.b64 %rd1631, %rd2620, 4294967295;
xor.b64 %rd1632, %rd1631, %rd1630;
xor.b64 %rd1633, %rd1632, %rd2621;
mul.lo.s64 %rd1634, %rd1633, %rd2613;
and.b64 %rd1635, %rd1634, 4294967295;
xor.b64 %rd1636, %rd1635, %rd1629;
xor.b64 %rd1637, %rd1636, %rd2622;
mul.lo.s64 %rd1638, %rd1637, %rd2611;
shr.u64 %rd1639, %rd1638, 32;
shr.u64 %rd1640, %rd1634, 32;
and.b64 %rd1641, %rd1608, 4294967295;
xor.b64 %rd1642, %rd1641, %rd1640;
xor.b64 %rd1643, %rd1642, %rd2623;
mul.lo.s64 %rd1644, %rd1643, %rd2611;
and.b64 %rd1645, %rd1644, 4294967295;
xor.b64 %rd1646, %rd1645, %rd1639;
xor.b64 %rd1647, %rd1646, %rd2624;
mul.lo.s64 %rd1648, %rd1647, %rd2613;
shr.u64 %rd1649, %rd1648, 32;
shr.u64 %rd1650, %rd1644, 32;
and.b64 %rd1651, %rd1618, 4294967295;
xor.b64 %rd1652, %rd1651, %rd1650;
xor.b64 %rd1653, %rd1652, %rd2625;
mul.lo.s64 %rd1654, %rd1653, %rd2613;
and.b64 %rd1655, %rd1654, 4294967295;
xor.b64 %rd1656, %rd1655, %rd1649;
xor.b64 %rd1657, %rd1656, %rd2626;
mul.lo.s64 %rd1658, %rd1657, %rd2611;
shr.u64 %rd1659, %rd1658, 32;
shr.u64 %rd1660, %rd1654, 32;
xor.b64 %rd1661, %rd1628, %rd1660;
xor.b64 %rd1662, %rd1661, %rd2627;
mul.lo.s64 %rd1663, %rd1662, %rd2611;
xor.b64 %rd1664, %rd1659, %rd1663;
// Reduce to one 32-bit word (%r183) using the arm-specific constants
// %r333 and %r334.
cvt.u32.u64 %r181, %rd1664;
xor.b32 %r182, %r333, %r181;
mul.lo.s32 %r183, %r182, %r334;
// Keep the upper 23 bits and scale by 0f34000000 (= 2^-23): %f141 is in [0, 1).
shr.u32 %r184, %r183, 9;
cvt.rn.f32.u32 %f140, %r184;
mul.rn.f32 %f141, %f140, 0f34000000;
// %p41 = (fp16(%f141) >= 0x2E66); 0x2E66 decodes to about 0.09998 as IEEE half.
cvt.rn.f16.f32 %h82, %f141;
mov.b16 %h83, 0x2E66;
setp.ge.f16 %p41, %h82, %h83;
// Gated fp16 term: %h89 = %p41 ? (%h84 + fp16(%f142)) * 0x3C72 : 0, where
// 0x3C72 decodes to about 1.1113. Byte offsets 1026 (2-byte elements) and
// 2052 (4-byte elements) both address element +513 from their base pointers.
// NOTE(review): threshold ~0.1 with scale ~1/0.9 looks like dropout -- confirm.
ld.global.nc.b16 %h84, [%rd44+1026];
ld.global.nc.f32 %f142, [%rd45+2052];
cvt.rn.f16.f32 %h85, %f142;
add.rn.f16 %h86, %h84, %h85;
mov.b16 %h87, 0x3C72;
mul.rn.f16 %h88, %h86, %h87;
selp.b16 %h89, %h88, 0x0000, %p41;
cvt.f32.f16 %f143, %h89;
// f32 term: %f152 = (%f1*%f145)*%f144 + (%f148 - %f2*(%f1*%f145)) + %f143.
ld.global.nc.b16 %h90, [%rd46+1026];
cvt.f32.f16 %f144, %h90;
ld.global.nc.f32 %f145, [%rd47+2052];
mul.rn.f32 %f146, %f1, %f145;
mul.rn.f32 %f147, %f146, %f144;
ld.global.nc.f32 %f148, [%rd48+2052];
mul.rn.f32 %f149, %f2, %f146;
sub.rn.f32 %f150, %f148, %f149;
add.rn.f32 %f151, %f147, %f150;
add.rn.f32 %f152, %f151, %f143;
// Running sum: %f12 = %f11 + this element's contribution.
add.rn.f32 %f12, %f11, %f152;
// Next element: counter %rd295 = %rd11 + ((%r73 | 640) >> 2); the wrap check
// of the add is done inside each arm.
or.b32 %r186, %r73, 640;
shr.u32 %r187, %r186, 2;
cvt.u64.u32 %rd1665, %r187;
add.s64 %rd295, %rd11, %rd1665;
@%p8 bra LBB54_34;
and.b64 %rd1707, %rd295, 4294967295;
mul.lo.s64 %rd2632, %rd1707, 3528531795;
setp.lt.u64 %p43, %rd295, %rd11;
selp.u64 %rd1708, 1, 0, %p43;
add.s64 %rd1709, %rd2461, %rd1708;
xor.b64 %rd1710, %rd1709, %rd2632;
shr.u64 %rd1711, %rd1710, 32;
mul.lo.s64 %rd2635, %rd1711, 3449720151;
shr.u64 %rd1712, %rd2635, 32;
and.b64 %rd1713, %rd1709, 4294967295;
mul.lo.s64 %rd1714, %rd1713, 3449720151;
and.b64 %rd1715, %rd1714, 4294967295;
xor.b64 %rd1716, %rd1715, %rd1712;
xor.b64 %rd1717, %rd1716, 2654435769;
mul.lo.s64 %rd2638, %rd1717, 3528531795;
xor.b64 %rd2628, %rd1714, %rd295;
mov.u32 %r337, -1879881855;
mov.u32 %r336, -845247145;
mov.u32 %r335, 534103459;
mov.u64 %rd2646, 3678237736;
mov.u64 %rd2645, 3041712726;
mov.u64 %rd2644, 1401181199;
mov.u64 %rd2643, 2835769497;
mov.u64 %rd2642, 1684936478;
mov.u64 %rd2641, 2027808484;
mov.u64 %rd2640, 387276957;
mov.u64 %rd2639, 842468239;
mov.u64 %rd2637, 3986602516;
mov.u64 %rd2636, 1013904242;
mov.u64 %rd2634, 3668340011;
mov.u64 %rd2633, 3144134277;
mov.u64 %rd2631, 3449720151;
mov.u64 %rd2630, 1993301258;
mov.u64 %rd2629, 3528531795;
bra.uni LBB54_35;
LBB54_34:
setp.lt.u64 %p42, %rd295, %rd11;
selp.u64 %rd1681, 1, 0, %p42;
add.s64 %rd1682, %rd2461, %rd1681;
and.b64 %rd1683, %rd1682, 4294967295;
mul.lo.s64 %rd2632, %rd1683, 3449720151;
xor.b64 %rd1684, %rd2632, %rd295;
shr.u64 %rd1685, %rd1684, 32;
mul.lo.s64 %rd2635, %rd1685, 3528531795;
shr.u64 %rd1686, %rd2635, 32;
and.b64 %rd1687, %rd295, 4294967295;
mul.lo.s64 %rd1688, %rd1687, 3528531795;
and.b64 %rd1689, %rd1688, 4294967295;
xor.b64 %rd1690, %rd1689, %rd1686;
xor.b64 %rd1691, %rd1690, 3144134277;
mul.lo.s64 %rd2638, %rd1691, 3449720151;
xor.b64 %rd2628, %rd1682, %rd1688;
mov.u32 %r337, -1767562579;
mov.u32 %r336, -766435501;
mov.u32 %r335, 1401181199;
mov.u64 %rd2646, 4055616968;
mov.u64 %rd2645, 1684936478;
mov.u64 %rd2644, 534103459;
mov.u64 %rd2643, 387276957;
mov.u64 %rd2642, 3041712726;
mov.u64 %rd2641, 3986602516;
mov.u64 %rd2640, 2835769497;
mov.u64 %rd2639, 3668340011;
mov.u64 %rd2637, 2027808484;
mov.u64 %rd2636, 1993301258;
mov.u64 %rd2634, 842468239;
mov.u64 %rd2633, 2654435769;
mov.u64 %rd2631, 3528531795;
mov.u64 %rd2630, 1013904242;
mov.u64 %rd2629, 3449720151;
LBB54_35:
shr.u64 %rd1718, %rd2638, 32;
shr.u64 %rd1719, %rd2628, 32;
mul.lo.s64 %rd1720, %rd1719, %rd2629;
and.b64 %rd1721, %rd1720, 4294967295;
xor.b64 %rd1722, %rd1721, %rd1718;
xor.b64 %rd1723, %rd1722, %rd2630;
mul.lo.s64 %rd1724, %rd1723, %rd2631;
shr.u64 %rd1725, %rd1724, 32;
shr.u64 %rd1726, %rd1720, 32;
and.b64 %rd1727, %rd2632, 4294967295;
xor.b64 %rd1728, %rd1727, %rd1726;
xor.b64 %rd1729, %rd1728, %rd2633;
mul.lo.s64 %rd1730, %rd1729, %rd2631;
and.b64 %rd1731, %rd1730, 4294967295;
xor.b64 %rd1732, %rd1731, %rd1725;
xor.b64 %rd1733, %rd1732, %rd2634;
mul.lo.s64 %rd1734, %rd1733, %rd2629;
shr.u64 %rd1735, %rd1734, 32;
shr.u64 %rd1736, %rd1730, 32;
and.b64 %rd1737, %rd2635, 4294967295;
xor.b64 %rd1738, %rd1737, %rd1736;
xor.b64 %rd1739, %rd1738, %rd2636;
mul.lo.s64 %rd1740, %rd1739, %rd2629;
and.b64 %rd1741, %rd1740, 4294967295;
xor.b64 %rd1742, %rd1741, %rd1735;
xor.b64 %rd1743, %rd1742, %rd2637;
mul.lo.s64 %rd1744, %rd1743, %rd2631;
shr.u64 %rd1745, %rd1744, 32;
shr.u64 %rd1746, %rd1740, 32;
and.b64 %rd1747, %rd2638, 4294967295;
xor.b64 %rd1748, %rd1747, %rd1746;
xor.b64 %rd1749, %rd1748, %rd2639;
mul.lo.s64 %rd1750, %rd1749, %rd2631;
and.b64 %rd1751, %rd1750, 4294967295;
xor.b64 %rd1752, %rd1751, %rd1745;
xor.b64 %rd1753, %rd1752, %rd2640;
mul.lo.s64 %rd1754, %rd1753, %rd2629;
shr.u64 %rd1755, %rd1754, 32;
shr.u64 %rd1756, %rd1750, 32;
and.b64 %rd1757, %rd1724, 4294967295;
xor.b64 %rd1758, %rd1757, %rd1756;
xor.b64 %rd1759, %rd1758, %rd2641;
mul.lo.s64 %rd1760, %rd1759, %rd2629;
and.b64 %rd1761, %rd1760, 4294967295;
xor.b64 %rd1762, %rd1761, %rd1755;
xor.b64 %rd1763, %rd1762, %rd2642;
mul.lo.s64 %rd1764, %rd1763, %rd2631;
shr.u64 %rd1765, %rd1764, 32;
shr.u64 %rd1766, %rd1760, 32;
and.b64 %rd1767, %rd1734, 4294967295;
xor.b64 %rd1768, %rd1767, %rd1766;
xor.b64 %rd1769, %rd1768, %rd2643;
mul.lo.s64 %rd1770, %rd1769, %rd2631;
and.b64 %rd1771, %rd1770, 4294967295;
xor.b64 %rd1772, %rd1771, %rd1765;
xor.b64 %rd1773, %rd1772, %rd2644;
mul.lo.s64 %rd1774, %rd1773, %rd2629;
shr.u64 %rd1775, %rd1774, 32;
shr.u64 %rd1776, %rd1770, 32;
and.b64 %rd1777, %rd1744, 4294967295;
xor.b64 %rd1778, %rd1777, %rd1776;
xor.b64 %rd1779, %rd1778, %rd2645;
mul.lo.s64 %rd1780, %rd1779, %rd2629;
and.b64 %rd1781, %rd1780, 4294967295;
xor.b64 %rd1782, %rd1781, %rd1775;
xor.b64 %rd1783, %rd1782, %rd2646;
mul.lo.s64 %rd1784, %rd1783, %rd2631;
shr.u64 %rd1785, %rd1784, 32;
cvt.u32.u64 %r194, %rd1785;
shr.u64 %rd1786, %rd1780, 32;
xor.b64 %rd1787, %rd1786, %rd1754;
cvt.u32.u64 %r195, %rd1787;
xor.b32 %r196, %r335, %r195;
mul.lo.s32 %r197, %r196, %r336;
xor.b32 %r198, %r197, %r194;
xor.b32 %r199, %r198, %r337;
shr.u32 %r200, %r199, 9;
cvt.rn.f32.u32 %f153, %r200;
mul.rn.f32 %f154, %f153, 0f34000000;
cvt.rn.f16.f32 %h91, %f154;
mov.b16 %h92, 0x2E66;
setp.ge.f16 %p44, %h91, %h92;
ld.global.nc.b16 %h93, [%rd44+1280];
ld.global.nc.f32 %f155, [%rd45+2560];
cvt.rn.f16.f32 %h94, %f155;
add.rn.f16 %h95, %h93, %h94;
mov.b16 %h96, 0x3C72;
mul.rn.f16 %h97, %h95, %h96;
selp.b16 %h98, %h97, 0x0000, %p44;
cvt.f32.f16 %f156, %h98;
ld.global.nc.b16 %h99, [%rd46+1280];
cvt.f32.f16 %f157, %h99;
ld.global.nc.f32 %f158, [%rd47+2560];
mul.rn.f32 %f159, %f1, %f158;
mul.rn.f32 %f160, %f159, %f157;
ld.global.nc.f32 %f161, [%rd48+2560];
mul.rn.f32 %f162, %f2, %f159;
sub.rn.f32 %f163, %f161, %f162;
add.rn.f32 %f164, %f160, %f163;
add.rn.f32 %f165, %f164, %f156;
add.rn.f32 %f13, %f12, %f165;
or.b32 %r201, %r3, 641;
or.b32 %r202, %r201, %r4;
and.b32 %r203, %r201, 3;
shr.u32 %r204, %r202, 2;
setp.ne.s32 %p45, %r203, 1;
cvt.u64.u32 %rd1788, %r204;
add.s64 %rd323, %rd11, %rd1788;
@%p45 bra LBB54_37;
and.b64 %rd1828, %rd323, 4294967295;
mul.lo.s64 %rd2651, %rd1828, 3528531795;
setp.lt.u64 %p47, %rd323, %rd11;
selp.u64 %rd1829, 1, 0, %p47;
add.s64 %rd1830, %rd2461, %rd1829;
xor.b64 %rd1831, %rd1830, %rd2651;
shr.u64 %rd1832, %rd1831, 32;
mul.lo.s64 %rd2654, %rd1832, 3449720151;
shr.u64 %rd1833, %rd2654, 32;
and.b64 %rd1834, %rd1830, 4294967295;
mul.lo.s64 %rd1835, %rd1834, 3449720151;
and.b64 %rd1836, %rd1835, 4294967295;
xor.b64 %rd1837, %rd1836, %rd1833;
xor.b64 %rd1838, %rd1837, 2654435769;
mul.lo.s64 %rd2657, %rd1838, 3528531795;
xor.b64 %rd2647, %rd1835, %rd323;
mov.u32 %r339, -845247145;
mov.u32 %r338, -616729560;
mov.u64 %rd2664, 3041712726;
mov.u64 %rd2663, 1401181199;
mov.u64 %rd2662, 2835769497;
mov.u64 %rd2661, 1684936478;
mov.u64 %rd2660, 2027808484;
mov.u64 %rd2659, 387276957;
mov.u64 %rd2658, 842468239;
mov.u64 %rd2656, 3986602516;
mov.u64 %rd2655, 1013904242;
mov.u64 %rd2653, 3668340011;
mov.u64 %rd2652, 3144134277;
mov.u64 %rd2650, 3449720151;
mov.u64 %rd2649, 1993301258;
mov.u64 %rd2648, 3528531795;
bra.uni LBB54_38;
LBB54_37:
setp.lt.u64 %p46, %rd323, %rd11;
selp.u64 %rd1803, 1, 0, %p46;
add.s64 %rd1804, %rd2461, %rd1803;
and.b64 %rd1805, %rd1804, 4294967295;
mul.lo.s64 %rd2651, %rd1805, 3449720151;
xor.b64 %rd1806, %rd2651, %rd323;
shr.u64 %rd1807, %rd1806, 32;
mul.lo.s64 %rd2654, %rd1807, 3528531795;
shr.u64 %rd1808, %rd2654, 32;
and.b64 %rd1809, %rd323, 4294967295;
mul.lo.s64 %rd1810, %rd1809, 3528531795;
and.b64 %rd1811, %rd1810, 4294967295;
xor.b64 %rd1812, %rd1811, %rd1808;
xor.b64 %rd1813, %rd1812, 3144134277;
mul.lo.s64 %rd2657, %rd1813, 3449720151;
xor.b64 %rd2647, %rd1804, %rd1810;
mov.u32 %r339, -766435501;
mov.u32 %r338, -239350328;
mov.u64 %rd2664, 1684936478;
mov.u64 %rd2663, 534103459;
mov.u64 %rd2662, 387276957;
mov.u64 %rd2661, 3041712726;
mov.u64 %rd2660, 3986602516;
mov.u64 %rd2659, 2835769497;
mov.u64 %rd2658, 3668340011;
mov.u64 %rd2656, 2027808484;
mov.u64 %rd2655, 1993301258;
mov.u64 %rd2653, 842468239;
mov.u64 %rd2652, 2654435769;
mov.u64 %rd2650, 3528531795;
mov.u64 %rd2649, 1013904242;
mov.u64 %rd2648, 3449720151;
LBB54_38:
shr.u64 %rd1839, %rd2657, 32;
shr.u64 %rd1840, %rd2647, 32;
mul.lo.s64 %rd1841, %rd1840, %rd2648;
and.b64 %rd1842, %rd1841, 4294967295;
xor.b64 %rd1843, %rd1842, %rd1839;
xor.b64 %rd1844, %rd1843, %rd2649;
mul.lo.s64 %rd1845, %rd1844, %rd2650;
shr.u64 %rd1846, %rd1845, 32;
shr.u64 %rd1847, %rd1841, 32;
and.b64 %rd1848, %rd2651, 4294967295;
xor.b64 %rd1849, %rd1848, %rd1847;
xor.b64 %rd1850, %rd1849, %rd2652;
mul.lo.s64 %rd1851, %rd1850, %rd2650;
and.b64 %rd1852, %rd1851, 4294967295;
xor.b64 %rd1853, %rd1852, %rd1846;
xor.b64 %rd1854, %rd1853, %rd2653;
mul.lo.s64 %rd1855, %rd1854, %rd2648;
shr.u64 %rd1856, %rd1855, 32;
shr.u64 %rd1857, %rd1851, 32;
and.b64 %rd1858, %rd2654, 4294967295;
xor.b64 %rd1859, %rd1858, %rd1857;
xor.b64 %rd1860, %rd1859, %rd2655;
mul.lo.s64 %rd1861, %rd1860, %rd2648;
and.b64 %rd1862, %rd1861, 4294967295;
xor.b64 %rd1863, %rd1862, %rd1856;
xor.b64 %rd1864, %rd1863, %rd2656;
mul.lo.s64 %rd1865, %rd1864, %rd2650;
shr.u64 %rd1866, %rd1865, 32;
shr.u64 %rd1867, %rd1861, 32;
and.b64 %rd1868, %rd2657, 4294967295;
xor.b64 %rd1869, %rd1868, %rd1867;
xor.b64 %rd1870, %rd1869, %rd2658;
mul.lo.s64 %rd1871, %rd1870, %rd2650;
and.b64 %rd1872, %rd1871, 4294967295;
xor.b64 %rd1873, %rd1872, %rd1866;
xor.b64 %rd1874, %rd1873, %rd2659;
mul.lo.s64 %rd1875, %rd1874, %rd2648;
shr.u64 %rd1876, %rd1875, 32;
shr.u64 %rd1877, %rd1871, 32;
and.b64 %rd1878, %rd1845, 4294967295;
xor.b64 %rd1879, %rd1878, %rd1877;
xor.b64 %rd1880, %rd1879, %rd2660;
mul.lo.s64 %rd1881, %rd1880, %rd2648;
and.b64 %rd1882, %rd1881, 4294967295;
xor.b64 %rd1883, %rd1882, %rd1876;
xor.b64 %rd1884, %rd1883, %rd2661;
mul.lo.s64 %rd1885, %rd1884, %rd2650;
shr.u64 %rd1886, %rd1885, 32;
shr.u64 %rd1887, %rd1881, 32;
and.b64 %rd1888, %rd1855, 4294967295;
xor.b64 %rd1889, %rd1888, %rd1887;
xor.b64 %rd1890, %rd1889, %rd2662;
mul.lo.s64 %rd1891, %rd1890, %rd2650;
and.b64 %rd1892, %rd1891, 4294967295;
xor.b64 %rd1893, %rd1892, %rd1886;
xor.b64 %rd1894, %rd1893, %rd2663;
mul.lo.s64 %rd1895, %rd1894, %rd2648;
shr.u64 %rd1896, %rd1895, 32;
shr.u64 %rd1897, %rd1891, 32;
xor.b64 %rd1898, %rd1865, %rd1897;
xor.b64 %rd1899, %rd1898, %rd2664;
mul.lo.s64 %rd1900, %rd1899, %rd2648;
xor.b64 %rd1901, %rd1896, %rd1900;
cvt.u32.u64 %r209, %rd1901;
xor.b32 %r210, %r338, %r209;
mul.lo.s32 %r211, %r210, %r339;
shr.u32 %r212, %r211, 9;
cvt.rn.f32.u32 %f166, %r212;
mul.rn.f32 %f167, %f166, 0f34000000;
cvt.rn.f16.f32 %h100, %f167;
mov.b16 %h101, 0x2E66;
setp.ge.f16 %p49, %h100, %h101;
ld.global.nc.b16 %h102, [%rd44+1282];
ld.global.nc.f32 %f168, [%rd45+2564];
cvt.rn.f16.f32 %h103, %f168;
add.rn.f16 %h104, %h102, %h103;
mov.b16 %h105, 0x3C72;
mul.rn.f16 %h106, %h104, %h105;
selp.b16 %h107, %h106, 0x0000, %p49;
cvt.f32.f16 %f169, %h107;
ld.global.nc.b16 %h108, [%rd46+1282];
cvt.f32.f16 %f170, %h108;
ld.global.nc.f32 %f171, [%rd47+2564];
mul.rn.f32 %f172, %f1, %f171;
mul.rn.f32 %f173, %f172, %f170;
ld.global.nc.f32 %f174, [%rd48+2564];
mul.rn.f32 %f175, %f2, %f172;
sub.rn.f32 %f176, %f174, %f175;
add.rn.f32 %f177, %f173, %f176;
add.rn.f32 %f178, %f177, %f169;
add.rn.f32 %f14, %f13, %f178;
or.b32 %r214, %r73, 768;
shr.u32 %r215, %r214, 2;
cvt.u64.u32 %rd1902, %r215;
add.s64 %rd350, %rd11, %rd1902;
@%p8 bra LBB54_40;
and.b64 %rd1944, %rd350, 4294967295;
mul.lo.s64 %rd2669, %rd1944, 3528531795;
setp.lt.u64 %p51, %rd350, %rd11;
selp.u64 %rd1945, 1, 0, %p51;
add.s64 %rd1946, %rd2461, %rd1945;
xor.b64 %rd1947, %rd1946, %rd2669;
shr.u64 %rd1948, %rd1947, 32;
mul.lo.s64 %rd2672, %rd1948, 3449720151;
shr.u64 %rd1949, %rd2672, 32;
and.b64 %rd1950, %rd1946, 4294967295;
mul.lo.s64 %rd1951, %rd1950, 3449720151;
and.b64 %rd1952, %rd1951, 4294967295;
xor.b64 %rd1953, %rd1952, %rd1949;
xor.b64 %rd1954, %rd1953, 2654435769;
mul.lo.s64 %rd2675, %rd1954, 3528531795;
xor.b64 %rd2665, %rd1951, %rd350;
mov.u32 %r342, -1879881855;
mov.u32 %r341, -845247145;
mov.u32 %r340, 534103459;
mov.u64 %rd2683, 3678237736;
mov.u64 %rd2682, 3041712726;
mov.u64 %rd2681, 1401181199;
mov.u64 %rd2680, 2835769497;
mov.u64 %rd2679, 1684936478;
mov.u64 %rd2678, 2027808484;
mov.u64 %rd2677, 387276957;
mov.u64 %rd2676, 842468239;
mov.u64 %rd2674, 3986602516;
mov.u64 %rd2673, 1013904242;
mov.u64 %rd2671, 3668340011;
mov.u64 %rd2670, 3144134277;
mov.u64 %rd2668, 3449720151;
mov.u64 %rd2667, 1993301258;
mov.u64 %rd2666, 3528531795;
bra.uni LBB54_41;
LBB54_40:
setp.lt.u64 %p50, %rd350, %rd11;
selp.u64 %rd1918, 1, 0, %p50;
add.s64 %rd1919, %rd2461, %rd1918;
and.b64 %rd1920, %rd1919, 4294967295;
mul.lo.s64 %rd2669, %rd1920, 3449720151;
xor.b64 %rd1921, %rd2669, %rd350;
shr.u64 %rd1922, %rd1921, 32;
mul.lo.s64 %rd2672, %rd1922, 3528531795;
shr.u64 %rd1923, %rd2672, 32;
and.b64 %rd1924, %rd350, 4294967295;
mul.lo.s64 %rd1925, %rd1924, 3528531795;
and.b64 %rd1926, %rd1925, 4294967295;
xor.b64 %rd1927, %rd1926, %rd1923;
xor.b64 %rd1928, %rd1927, 3144134277;
mul.lo.s64 %rd2675, %rd1928, 3449720151;
xor.b64 %rd2665, %rd1919, %rd1925;
mov.u32 %r342, -1767562579;
mov.u32 %r341, -766435501;
mov.u32 %r340, 1401181199;
mov.u64 %rd2683, 4055616968;
mov.u64 %rd2682, 1684936478;
mov.u64 %rd2681, 534103459;
mov.u64 %rd2680, 387276957;
mov.u64 %rd2679, 3041712726;
mov.u64 %rd2678, 3986602516;
mov.u64 %rd2677, 2835769497;
mov.u64 %rd2676, 3668340011;
mov.u64 %rd2674, 2027808484;
mov.u64 %rd2673, 1993301258;
mov.u64 %rd2671, 842468239;
mov.u64 %rd2670, 2654435769;
mov.u64 %rd2668, 3528531795;
mov.u64 %rd2667, 1013904242;
mov.u64 %rd2666, 3449720151;
LBB54_41:
shr.u64 %rd1955, %rd2675, 32;
shr.u64 %rd1956, %rd2665, 32;
mul.lo.s64 %rd1957, %rd1956, %rd2666;
and.b64 %rd1958, %rd1957, 4294967295;
xor.b64 %rd1959, %rd1958, %rd1955;
xor.b64 %rd1960, %rd1959, %rd2667;
mul.lo.s64 %rd1961, %rd1960, %rd2668;
shr.u64 %rd1962, %rd1961, 32;
shr.u64 %rd1963, %rd1957, 32;
and.b64 %rd1964, %rd2669, 4294967295;
xor.b64 %rd1965, %rd1964, %rd1963;
xor.b64 %rd1966, %rd1965, %rd2670;
mul.lo.s64 %rd1967, %rd1966, %rd2668;
and.b64 %rd1968, %rd1967, 4294967295;
xor.b64 %rd1969, %rd1968, %rd1962;
xor.b64 %rd1970, %rd1969, %rd2671;
mul.lo.s64 %rd1971, %rd1970, %rd2666;
shr.u64 %rd1972, %rd1971, 32;
shr.u64 %rd1973, %rd1967, 32;
and.b64 %rd1974, %rd2672, 4294967295;
xor.b64 %rd1975, %rd1974, %rd1973;
xor.b64 %rd1976, %rd1975, %rd2673;
mul.lo.s64 %rd1977, %rd1976, %rd2666;
and.b64 %rd1978, %rd1977, 4294967295;
xor.b64 %rd1979, %rd1978, %rd1972;
xor.b64 %rd1980, %rd1979, %rd2674;
mul.lo.s64 %rd1981, %rd1980, %rd2668;
shr.u64 %rd1982, %rd1981, 32;
shr.u64 %rd1983, %rd1977, 32;
and.b64 %rd1984, %rd2675, 4294967295;
xor.b64 %rd1985, %rd1984, %rd1983;
xor.b64 %rd1986, %rd1985, %rd2676;
mul.lo.s64 %rd1987, %rd1986, %rd2668;
and.b64 %rd1988, %rd1987, 4294967295;
xor.b64 %rd1989, %rd1988, %rd1982;
xor.b64 %rd1990, %rd1989, %rd2677;
mul.lo.s64 %rd1991, %rd1990, %rd2666;
shr.u64 %rd1992, %rd1991, 32;
shr.u64 %rd1993, %rd1987, 32;
and.b64 %rd1994, %rd1961, 4294967295;
xor.b64 %rd1995, %rd1994, %rd1993;
xor.b64 %rd1996, %rd1995, %rd2678;
mul.lo.s64 %rd1997, %rd1996, %rd2666;
and.b64 %rd1998, %rd1997, 4294967295;
xor.b64 %rd1999, %rd1998, %rd1992;
xor.b64 %rd2000, %rd1999, %rd2679;
mul.lo.s64 %rd2001, %rd2000, %rd2668;
shr.u64 %rd2002, %rd2001, 32;
shr.u64 %rd2003, %rd1997, 32;
and.b64 %rd2004, %rd1971, 4294967295;
xor.b64 %rd2005, %rd2004, %rd2003;
xor.b64 %rd2006, %rd2005, %rd2680;
mul.lo.s64 %rd2007, %rd2006, %rd2668;
and.b64 %rd2008, %rd2007, 4294967295;
xor.b64 %rd2009, %rd2008, %rd2002;
xor.b64 %rd2010, %rd2009, %rd2681;
mul.lo.s64 %rd2011, %rd2010, %rd2666;
shr.u64 %rd2012, %rd2011, 32;
shr.u64 %rd2013, %rd2007, 32;
and.b64 %rd2014, %rd1981, 4294967295;
xor.b64 %rd2015, %rd2014, %rd2013;
xor.b64 %rd2016, %rd2015, %rd2682;
mul.lo.s64 %rd2017, %rd2016, %rd2666;
and.b64 %rd2018, %rd2017, 4294967295;
xor.b64 %rd2019, %rd2018, %rd2012;
xor.b64 %rd2020, %rd2019, %rd2683;
mul.lo.s64 %rd2021, %rd2020, %rd2668;
shr.u64 %rd2022, %rd2021, 32;
cvt.u32.u64 %r222, %rd2022;
shr.u64 %rd2023, %rd2017, 32;
xor.b64 %rd2024, %rd2023, %rd1991;
cvt.u32.u64 %r223, %rd2024;
xor.b32 %r224, %r340, %r223;
mul.lo.s32 %r225, %r224, %r341;
xor.b32 %r226, %r225, %r222;
xor.b32 %r227, %r226, %r342;
shr.u32 %r228, %r227, 9;
cvt.rn.f32.u32 %f179, %r228;
mul.rn.f32 %f180, %f179, 0f34000000;
cvt.rn.f16.f32 %h109, %f180;
mov.b16 %h110, 0x2E66;
setp.ge.f16 %p52, %h109, %h110;
ld.global.nc.b16 %h111, [%rd44+1536];
ld.global.nc.f32 %f181, [%rd45+3072];
cvt.rn.f16.f32 %h112, %f181;
add.rn.f16 %h113, %h111, %h112;
mov.b16 %h114, 0x3C72;
mul.rn.f16 %h115, %h113, %h114;
selp.b16 %h116, %h115, 0x0000, %p52;
cvt.f32.f16 %f182, %h116;
ld.global.nc.b16 %h117, [%rd46+1536];
cvt.f32.f16 %f183, %h117;
ld.global.nc.f32 %f184, [%rd47+3072];
mul.rn.f32 %f185, %f1, %f184;
mul.rn.f32 %f186, %f185, %f183;
ld.global.nc.f32 %f187, [%rd48+3072];
mul.rn.f32 %f188, %f2, %f185;
sub.rn.f32 %f189, %f187, %f188;
add.rn.f32 %f190, %f186, %f189;
add.rn.f32 %f191, %f190, %f182;
add.rn.f32 %f15, %f14, %f191;
or.b32 %r229, %r3, 769;
or.b32 %r230, %r229, %r4;
and.b32 %r231, %r229, 3;
shr.u32 %r232, %r230, 2;
setp.ne.s32 %p53, %r231, 1;
cvt.u64.u32 %rd2025, %r232;
add.s64 %rd378, %rd11, %rd2025;
@%p53 bra LBB54_43;
and.b64 %rd2065, %rd378, 4294967295;
mul.lo.s64 %rd2688, %rd2065, 3528531795;
setp.lt.u64 %p55, %rd378, %rd11;
selp.u64 %rd2066, 1, 0, %p55;
add.s64 %rd2067, %rd2461, %rd2066;
xor.b64 %rd2068, %rd2067, %rd2688;
shr.u64 %rd2069, %rd2068, 32;
mul.lo.s64 %rd2691, %rd2069, 3449720151;
shr.u64 %rd2070, %rd2691, 32;
and.b64 %rd2071, %rd2067, 4294967295;
mul.lo.s64 %rd2072, %rd2071, 3449720151;
and.b64 %rd2073, %rd2072, 4294967295;
xor.b64 %rd2074, %rd2073, %rd2070;
xor.b64 %rd2075, %rd2074, 2654435769;
mul.lo.s64 %rd2694, %rd2075, 3528531795;
xor.b64 %rd2684, %rd2072, %rd378;
mov.u32 %r344, -845247145;
mov.u32 %r343, -616729560;
mov.u64 %rd2701, 3041712726;
mov.u64 %rd2700, 1401181199;
mov.u64 %rd2699, 2835769497;
mov.u64 %rd2698, 1684936478;
mov.u64 %rd2697, 2027808484;
mov.u64 %rd2696, 387276957;
mov.u64 %rd2695, 842468239;
mov.u64 %rd2693, 3986602516;
mov.u64 %rd2692, 1013904242;
mov.u64 %rd2690, 3668340011;
mov.u64 %rd2689, 3144134277;
mov.u64 %rd2687, 3449720151;
mov.u64 %rd2686, 1993301258;
mov.u64 %rd2685, 3528531795;
bra.uni LBB54_44;
LBB54_43:
setp.lt.u64 %p54, %rd378, %rd11;
selp.u64 %rd2040, 1, 0, %p54;
add.s64 %rd2041, %rd2461, %rd2040;
and.b64 %rd2042, %rd2041, 4294967295;
mul.lo.s64 %rd2688, %rd2042, 3449720151;
xor.b64 %rd2043, %rd2688, %rd378;
shr.u64 %rd2044, %rd2043, 32;
mul.lo.s64 %rd2691, %rd2044, 3528531795;
shr.u64 %rd2045, %rd2691, 32;
and.b64 %rd2046, %rd378, 4294967295;
mul.lo.s64 %rd2047, %rd2046, 3528531795;
and.b64 %rd2048, %rd2047, 4294967295;
xor.b64 %rd2049, %rd2048, %rd2045;
xor.b64 %rd2050, %rd2049, 3144134277;
mul.lo.s64 %rd2694, %rd2050, 3449720151;
xor.b64 %rd2684, %rd2041, %rd2047;
mov.u32 %r344, -766435501;
mov.u32 %r343, -239350328;
mov.u64 %rd2701, 1684936478;
mov.u64 %rd2700, 534103459;
mov.u64 %rd2699, 387276957;
mov.u64 %rd2698, 3041712726;
mov.u64 %rd2697, 3986602516;
mov.u64 %rd2696, 2835769497;
mov.u64 %rd2695, 3668340011;
mov.u64 %rd2693, 2027808484;
mov.u64 %rd2692, 1993301258;
mov.u64 %rd2690, 842468239;
mov.u64 %rd2689, 2654435769;
mov.u64 %rd2687, 3528531795;
mov.u64 %rd2686, 1013904242;
mov.u64 %rd2685, 3449720151;
LBB54_44:
shr.u64 %rd2076, %rd2694, 32;
shr.u64 %rd2077, %rd2684, 32;
mul.lo.s64 %rd2078, %rd2077, %rd2685;
and.b64 %rd2079, %rd2078, 4294967295;
xor.b64 %rd2080, %rd2079, %rd2076;
xor.b64 %rd2081, %rd2080, %rd2686;
mul.lo.s64 %rd2082, %rd2081, %rd2687;
shr.u64 %rd2083, %rd2082, 32;
shr.u64 %rd2084, %rd2078, 32;
and.b64 %rd2085, %rd2688, 4294967295;
xor.b64 %rd2086, %rd2085, %rd2084;
xor.b64 %rd2087, %rd2086, %rd2689;
mul.lo.s64 %rd2088, %rd2087, %rd2687;
and.b64 %rd2089, %rd2088, 4294967295;
xor.b64 %rd2090, %rd2089, %rd2083;
xor.b64 %rd2091, %rd2090, %rd2690;
mul.lo.s64 %rd2092, %rd2091, %rd2685;
shr.u64 %rd2093, %rd2092, 32;
shr.u64 %rd2094, %rd2088, 32;
and.b64 %rd2095, %rd2691, 4294967295;
xor.b64 %rd2096, %rd2095, %rd2094;
xor.b64 %rd2097, %rd2096, %rd2692;
mul.lo.s64 %rd2098, %rd2097, %rd2685;
and.b64 %rd2099, %rd2098, 4294967295;
xor.b64 %rd2100, %rd2099, %rd2093;
xor.b64 %rd2101, %rd2100, %rd2693;
mul.lo.s64 %rd2102, %rd2101, %rd2687;
shr.u64 %rd2103, %rd2102, 32;
shr.u64 %rd2104, %rd2098, 32;
and.b64 %rd2105, %rd2694, 4294967295;
xor.b64 %rd2106, %rd2105, %rd2104;
xor.b64 %rd2107, %rd2106, %rd2695;
mul.lo.s64 %rd2108, %rd2107, %rd2687;
and.b64 %rd2109, %rd2108, 4294967295;
xor.b64 %rd2110, %rd2109, %rd2103;
xor.b64 %rd2111, %rd2110, %rd2696;
mul.lo.s64 %rd2112, %rd2111, %rd2685;
shr.u64 %rd2113, %rd2112, 32;
shr.u64 %rd2114, %rd2108, 32;
and.b64 %rd2115, %rd2082, 4294967295;
xor.b64 %rd2116, %rd2115, %rd2114;
xor.b64 %rd2117, %rd2116, %rd2697;
mul.lo.s64 %rd2118, %rd2117, %rd2685;
and.b64 %rd2119, %rd2118, 4294967295;
xor.b64 %rd2120, %rd2119, %rd2113;
xor.b64 %rd2121, %rd2120, %rd2698;
mul.lo.s64 %rd2122, %rd2121, %rd2687;
shr.u64 %rd2123, %rd2122, 32;
shr.u64 %rd2124, %rd2118, 32;
and.b64 %rd2125, %rd2092, 4294967295;
xor.b64 %rd2126, %rd2125, %rd2124;
xor.b64 %rd2127, %rd2126, %rd2699;
mul.lo.s64 %rd2128, %rd2127, %rd2687;
and.b64 %rd2129, %rd2128, 4294967295;
xor.b64 %rd2130, %rd2129, %rd2123;
xor.b64 %rd2131, %rd2130, %rd2700;
mul.lo.s64 %rd2132, %rd2131, %rd2685;
shr.u64 %rd2133, %rd2132, 32;
shr.u64 %rd2134, %rd2128, 32;
xor.b64 %rd2135, %rd2102, %rd2134;
xor.b64 %rd2136, %rd2135, %rd2701;
mul.lo.s64 %rd2137, %rd2136, %rd2685;
xor.b64 %rd2138, %rd2133, %rd2137;
cvt.u32.u64 %r237, %rd2138;
xor.b32 %r238, %r343, %r237;
mul.lo.s32 %r239, %r238, %r344;
shr.u32 %r240, %r239, 9;
cvt.rn.f32.u32 %f192, %r240;
mul.rn.f32 %f193, %f192, 0f34000000;
cvt.rn.f16.f32 %h118, %f193;
mov.b16 %h119, 0x2E66;
setp.ge.f16 %p57, %h118, %h119;
ld.global.nc.b16 %h120, [%rd44+1538];
ld.global.nc.f32 %f194, [%rd45+3076];
cvt.rn.f16.f32 %h121, %f194;
add.rn.f16 %h122, %h120, %h121;
mov.b16 %h123, 0x3C72;
mul.rn.f16 %h124, %h122, %h123;
selp.b16 %h125, %h124, 0x0000, %p57;
cvt.f32.f16 %f195, %h125;
ld.global.nc.b16 %h126, [%rd46+1538];
cvt.f32.f16 %f196, %h126;
ld.global.nc.f32 %f197, [%rd47+3076];
mul.rn.f32 %f198, %f1, %f197;
mul.rn.f32 %f199, %f198, %f196;
ld.global.nc.f32 %f200, [%rd48+3076];
mul.rn.f32 %f201, %f2, %f198;
sub.rn.f32 %f202, %f200, %f201;
add.rn.f32 %f203, %f199, %f202;
add.rn.f32 %f204, %f203, %f195;
add.rn.f32 %f16, %f15, %f204;
or.b32 %r242, %r73, 896;
shr.u32 %r243, %r242, 2;
cvt.u64.u32 %rd2139, %r243;
add.s64 %rd405, %rd11, %rd2139;
@%p8 bra LBB54_46;
mov.u32 %r347, -1879881855;
mov.u32 %r345, 534103459;
mov.u64 %rd2720, 3678237736;
and.b64 %rd2181, %rd405, 4294967295;
mul.lo.s64 %rd2706, %rd2181, 3528531795;
setp.lt.u64 %p59, %rd405, %rd11;
selp.u64 %rd2182, 1, 0, %p59;
add.s64 %rd2183, %rd2461, %rd2182;
xor.b64 %rd2184, %rd2183, %rd2706;
shr.u64 %rd2185, %rd2184, 32;
mul.lo.s64 %rd2709, %rd2185, 3449720151;
shr.u64 %rd2186, %rd2709, 32;
and.b64 %rd2187, %rd2183, 4294967295;
mul.lo.s64 %rd2188, %rd2187, 3449720151;
and.b64 %rd2189, %rd2188, 4294967295;
xor.b64 %rd2190, %rd2189, %rd2186;
xor.b64 %rd2191, %rd2190, 2654435769;
mul.lo.s64 %rd2712, %rd2191, 3528531795;
xor.b64 %rd2702, %rd2188, %rd405;
mov.u32 %r346, -845247145;
mov.u64 %rd2719, 3041712726;
mov.u64 %rd2718, 1401181199;
mov.u64 %rd2717, 2835769497;
mov.u64 %rd2716, 1684936478;
mov.u64 %rd2715, 2027808484;
mov.u64 %rd2714, 387276957;
mov.u64 %rd2713, 842468239;
mov.u64 %rd2711, 3986602516;
mov.u64 %rd2710, 1013904242;
mov.u64 %rd2708, 3668340011;
mov.u64 %rd2707, 3144134277;
mov.u64 %rd2705, 3449720151;
mov.u64 %rd2704, 1993301258;
mov.u64 %rd2703, 3528531795;
bra.uni LBB54_47;
LBB54_46:
setp.lt.u64 %p58, %rd405, %rd11;
selp.u64 %rd2155, 1, 0, %p58;
add.s64 %rd2156, %rd2461, %rd2155;
and.b64 %rd2157, %rd2156, 4294967295;
mul.lo.s64 %rd2706, %rd2157, 3449720151;
xor.b64 %rd2158, %rd2706, %rd405;
shr.u64 %rd2159, %rd2158, 32;
mul.lo.s64 %rd2709, %rd2159, 3528531795;
shr.u64 %rd2160, %rd2709, 32;
and.b64 %rd2161, %rd405, 4294967295;
mul.lo.s64 %rd2162, %rd2161, 3528531795;
and.b64 %rd2163, %rd2162, 4294967295;
xor.b64 %rd2164, %rd2163, %rd2160;
xor.b64 %rd2165, %rd2164, 3144134277;
mul.lo.s64 %rd2712, %rd2165, 3449720151;
xor.b64 %rd2702, %rd2156, %rd2162;
mov.u32 %r347, -1767562579;
mov.u32 %r346, -766435501;
mov.u32 %r345, 1401181199;
mov.u64 %rd2720, 4055616968;
mov.u64 %rd2719, 1684936478;
mov.u64 %rd2718, 534103459;
mov.u64 %rd2717, 387276957;
mov.u64 %rd2716, 3041712726;
mov.u64 %rd2715, 3986602516;
mov.u64 %rd2714, 2835769497;
mov.u64 %rd2713, 3668340011;
mov.u64 %rd2711, 2027808484;
mov.u64 %rd2710, 1993301258;
mov.u64 %rd2708, 842468239;
mov.u64 %rd2707, 2654435769;
mov.u64 %rd2705, 3528531795;
mov.u64 %rd2704, 1013904242;
mov.u64 %rd2703, 3449720151;
LBB54_47:
shr.u64 %rd2192, %rd2712, 32;
shr.u64 %rd2193, %rd2702, 32;
mul.lo.s64 %rd2194, %rd2193, %rd2703;
and.b64 %rd2195, %rd2194, 4294967295;
xor.b64 %rd2196, %rd2195, %rd2192;
xor.b64 %rd2197, %rd2196, %rd2704;
mul.lo.s64 %rd2198, %rd2197, %rd2705;
shr.u64 %rd2199, %rd2198, 32;
shr.u64 %rd2200, %rd2194, 32;
and.b64 %rd2201, %rd2706, 4294967295;
xor.b64 %rd2202, %rd2201, %rd2200;
xor.b64 %rd2203, %rd2202, %rd2707;
mul.lo.s64 %rd2204, %rd2203, %rd2705;
and.b64 %rd2205, %rd2204, 4294967295;
xor.b64 %rd2206, %rd2205, %rd2199;
xor.b64 %rd2207, %rd2206, %rd2708;
mul.lo.s64 %rd2208, %rd2207, %rd2703;
shr.u64 %rd2209, %rd2208, 32;
shr.u64 %rd2210, %rd2204, 32;
and.b64 %rd2211, %rd2709, 4294967295;
xor.b64 %rd2212, %rd2211, %rd2210;
xor.b64 %rd2213, %rd2212, %rd2710;
mul.lo.s64 %rd2214, %rd2213, %rd2703;
and.b64 %rd2215, %rd2214, 4294967295;
xor.b64 %rd2216, %rd2215, %rd2209;
xor.b64 %rd2217, %rd2216, %rd2711;
mul.lo.s64 %rd2218, %rd2217, %rd2705;
shr.u64 %rd2219, %rd2218, 32;
shr.u64 %rd2220, %rd2214, 32;
and.b64 %rd2221, %rd2712, 4294967295;
xor.b64 %rd2222, %rd2221, %rd2220;
xor.b64 %rd2223, %rd2222, %rd2713;
mul.lo.s64 %rd2224, %rd2223, %rd2705;
and.b64 %rd2225, %rd2224, 4294967295;
xor.b64 %rd2226, %rd2225, %rd2219;
xor.b64 %rd2227, %rd2226, %rd2714;
mul.lo.s64 %rd2228, %rd2227, %rd2703;
shr.u64 %rd2229, %rd2228, 32;
shr.u64 %rd2230, %rd2224, 32;
and.b64 %rd2231, %rd2198, 4294967295;
xor.b64 %rd2232, %rd2231, %rd2230;
xor.b64 %rd2233, %rd2232, %rd2715;
mul.lo.s64 %rd2234, %rd2233, %rd2703;
and.b64 %rd2235, %rd2234, 4294967295;
xor.b64 %rd2236, %rd2235, %rd2229;
xor.b64 %rd2237, %rd2236, %rd2716;
mul.lo.s64 %rd2238, %rd2237, %rd2705;
shr.u64 %rd2239, %rd2238, 32;
shr.u64 %rd2240, %rd2234, 32;
and.b64 %rd2241, %rd2208, 4294967295;
xor.b64 %rd2242, %rd2241, %rd2240;
xor.b64 %rd2243, %rd2242, %rd2717;
mul.lo.s64 %rd2244, %rd2243, %rd2705;
and.b64 %rd2245, %rd2244, 4294967295;
xor.b64 %rd2246, %rd2245, %rd2239;
xor.b64 %rd2247, %rd2246, %rd2718;
mul.lo.s64 %rd2248, %rd2247, %rd2703;
shr.u64 %rd2249, %rd2248, 32;
shr.u64 %rd2250, %rd2244, 32;
and.b64 %rd2251, %rd2218, 4294967295;
xor.b64 %rd2252, %rd2251, %rd2250;
xor.b64 %rd2253, %rd2252, %rd2719;
mul.lo.s64 %rd2254, %rd2253, %rd2703;
and.b64 %rd2255, %rd2254, 4294967295;
xor.b64 %rd2256, %rd2255, %rd2249;
xor.b64 %rd2257, %rd2256, %rd2720;
mul.lo.s64 %rd2258, %rd2257, %rd2705;
shr.u64 %rd2259, %rd2258, 32;
cvt.u32.u64 %r250, %rd2259;
shr.u64 %rd2260, %rd2254, 32;
xor.b64 %rd2261, %rd2260, %rd2228;
cvt.u32.u64 %r251, %rd2261;
xor.b32 %r252, %r345, %r251;
mul.lo.s32 %r253, %r252, %r346;
xor.b32 %r254, %r253, %r250;
xor.b32 %r255, %r254, %r347;
shr.u32 %r256, %r255, 9;
cvt.rn.f32.u32 %f205, %r256;
mul.rn.f32 %f206, %f205, 0f34000000;
cvt.rn.f16.f32 %h127, %f206;
mov.b16 %h128, 0x2E66;
setp.ge.f16 %p60, %h127, %h128;
ld.global.nc.b16 %h129, [%rd44+1792];
ld.global.nc.f32 %f207, [%rd45+3584];
cvt.rn.f16.f32 %h130, %f207;
add.rn.f16 %h131, %h129, %h130;
mov.b16 %h132, 0x3C72;
mul.rn.f16 %h133, %h131, %h132;
selp.b16 %h134, %h133, 0x0000, %p60;
cvt.f32.f16 %f208, %h134;
ld.global.nc.b16 %h135, [%rd46+1792];
cvt.f32.f16 %f209, %h135;
ld.global.nc.f32 %f210, [%rd47+3584];
mul.rn.f32 %f211, %f1, %f210;
mul.rn.f32 %f212, %f211, %f209;
ld.global.nc.f32 %f213, [%rd48+3584];
mul.rn.f32 %f214, %f2, %f211;
sub.rn.f32 %f215, %f213, %f214;
add.rn.f32 %f216, %f212, %f215;
add.rn.f32 %f217, %f216, %f208;
add.rn.f32 %f17, %f16, %f217;
or.b32 %r257, %r3, 897;
or.b32 %r258, %r257, %r4;
and.b32 %r259, %r257, 3;
shr.u32 %r260, %r258, 2;
setp.ne.s32 %p61, %r259, 1;
cvt.u64.u32 %rd2262, %r260;
add.s64 %rd433, %rd11, %rd2262;
@%p61 bra LBB54_49;
mov.u32 %r349, -845247145;
mov.u64 %rd2737, 1401181199;
mov.u64 %rd2726, 3144134277;
mov.u32 %r348, -616729560;
and.b64 %rd2302, %rd433, 4294967295;
mul.lo.s64 %rd2725, %rd2302, 3528531795;
setp.lt.u64 %p63, %rd433, %rd11;
selp.u64 %rd2303, 1, 0, %p63;
add.s64 %rd2304, %rd2461, %rd2303;
xor.b64 %rd2305, %rd2304, %rd2725;
shr.u64 %rd2306, %rd2305, 32;
mul.lo.s64 %rd2728, %rd2306, 3449720151;
shr.u64 %rd2307, %rd2728, 32;
and.b64 %rd2308, %rd2304, 4294967295;
mul.lo.s64 %rd2309, %rd2308, 3449720151;
and.b64 %rd2310, %rd2309, 4294967295;
xor.b64 %rd2311, %rd2310, %rd2307;
xor.b64 %rd2312, %rd2311, 2654435769;
mul.lo.s64 %rd2731, %rd2312, 3528531795;
xor.b64 %rd2721, %rd2309, %rd433;
mov.u64 %rd2738, 3041712726;
mov.u64 %rd2736, 2835769497;
mov.u64 %rd2735, 1684936478;
mov.u64 %rd2734, 2027808484;
mov.u64 %rd2733, 387276957;
mov.u64 %rd2732, 842468239;
mov.u64 %rd2730, 3986602516;
mov.u64 %rd2729, 1013904242;
mov.u64 %rd2727, 3668340011;
mov.u64 %rd2724, 3449720151;
mov.u64 %rd2723, 1993301258;
mov.u64 %rd2722, 3528531795;
bra.uni LBB54_50;
LBB54_49:
setp.lt.u64 %p62, %rd433, %rd11;
selp.u64 %rd2277, 1, 0, %p62;
add.s64 %rd2278, %rd2461, %rd2277;
and.b64 %rd2279, %rd2278, 4294967295;
mul.lo.s64 %rd2725, %rd2279, 3449720151;
xor.b64 %rd2280, %rd2725, %rd433;
shr.u64 %rd2281, %rd2280, 32;
mul.lo.s64 %rd2728, %rd2281, 3528531795;
shr.u64 %rd2282, %rd2728, 32;
and.b64 %rd2283, %rd433, 4294967295;
mul.lo.s64 %rd2284, %rd2283, 3528531795;
and.b64 %rd2285, %rd2284, 4294967295;
xor.b64 %rd2286, %rd2285, %rd2282;
xor.b64 %rd2287, %rd2286, 3144134277;
mul.lo.s64 %rd2731, %rd2287, 3449720151;
xor.b64 %rd2721, %rd2278, %rd2284;
mov.u32 %r349, -766435501;
mov.u32 %r348, -239350328;
mov.u64 %rd2738, 1684936478;
mov.u64 %rd2737, 534103459;
mov.u64 %rd2736, 387276957;
mov.u64 %rd2735, 3041712726;
mov.u64 %rd2734, 3986602516;
mov.u64 %rd2733, 2835769497;
mov.u64 %rd2732, 3668340011;
mov.u64 %rd2730, 2027808484;
mov.u64 %rd2729, 1993301258;
mov.u64 %rd2727, 842468239;
mov.u64 %rd2726, 2654435769;
mov.u64 %rd2724, 3528531795;
mov.u64 %rd2723, 1013904242;
mov.u64 %rd2722, 3449720151;
LBB54_50:
shr.u64 %rd2313, %rd2731, 32;
shr.u64 %rd2314, %rd2721, 32;
mul.lo.s64 %rd2315, %rd2314, %rd2722;
and.b64 %rd2316, %rd2315, 4294967295;
xor.b64 %rd2317, %rd2316, %rd2313;
xor.b64 %rd2318, %rd2317, %rd2723;
mul.lo.s64 %rd2319, %rd2318, %rd2724;
shr.u64 %rd2320, %rd2319, 32;
shr.u64 %rd2321, %rd2315, 32;
and.b64 %rd2322, %rd2725, 4294967295;
xor.b64 %rd2323, %rd2322, %rd2321;
xor.b64 %rd2324, %rd2323, %rd2726;
mul.lo.s64 %rd2325, %rd2324, %rd2724;
and.b64 %rd2326, %rd2325, 4294967295;
xor.b64 %rd2327, %rd2326, %rd2320;
xor.b64 %rd2328, %rd2327, %rd2727;
mul.lo.s64 %rd2329, %rd2328, %rd2722;
shr.u64 %rd2330, %rd2329, 32;
shr.u64 %rd2331, %rd2325, 32;
and.b64 %rd2332, %rd2728, 4294967295;
xor.b64 %rd2333, %rd2332, %rd2331;
xor.b64 %rd2334, %rd2333, %rd2729;
mul.lo.s64 %rd2335, %rd2334, %rd2722;
and.b64 %rd2336, %rd2335, 4294967295;
xor.b64 %rd2337, %rd2336, %rd2330;
xor.b64 %rd2338, %rd2337, %rd2730;
mul.lo.s64 %rd2339, %rd2338, %rd2724;
shr.u64 %rd2340, %rd2339, 32;
shr.u64 %rd2341, %rd2335, 32;
and.b64 %rd2342, %rd2731, 4294967295;
xor.b64 %rd2343, %rd2342, %rd2341;
xor.b64 %rd2344, %rd2343, %rd2732;
mul.lo.s64 %rd2345, %rd2344, %rd2724;
and.b64 %rd2346, %rd2345, 4294967295;
xor.b64 %rd2347, %rd2346, %rd2340;
xor.b64 %rd2348, %rd2347, %rd2733;
mul.lo.s64 %rd2349, %rd2348, %rd2722;
shr.u64 %rd2350, %rd2349, 32;
shr.u64 %rd2351, %rd2345, 32;
and.b64 %rd2352, %rd2319, 4294967295;
xor.b64 %rd2353, %rd2352, %rd2351;
xor.b64 %rd2354, %rd2353, %rd2734;
mul.lo.s64 %rd2355, %rd2354, %rd2722;
and.b64 %rd2356, %rd2355, 4294967295;
xor.b64 %rd2357, %rd2356, %rd2350;
xor.b64 %rd2358, %rd2357, %rd2735;
mul.lo.s64 %rd2359, %rd2358, %rd2724;
shr.u64 %rd2360, %rd2359, 32;
shr.u64 %rd2361, %rd2355, 32;
and.b64 %rd2362, %rd2329, 4294967295;
xor.b64 %rd2363, %rd2362, %rd2361;
xor.b64 %rd2364, %rd2363, %rd2736;
mul.lo.s64 %rd2365, %rd2364, %rd2724;
and.b64 %rd2366, %rd2365, 4294967295;
xor.b64 %rd2367, %rd2366, %rd2360;
xor.b64 %rd2368, %rd2367, %rd2737;
mul.lo.s64 %rd2369, %rd2368, %rd2722;
shr.u64 %rd2370, %rd2369, 32;
shr.u64 %rd2371, %rd2365, 32;
xor.b64 %rd2372, %rd2339, %rd2371;
xor.b64 %rd2373, %rd2372, %rd2738;
mul.lo.s64 %rd2374, %rd2373, %rd2722;
xor.b64 %rd2375, %rd2370, %rd2374;
cvt.u32.u64 %r265, %rd2375;
xor.b32 %r266, %r348, %r265;
mul.lo.s32 %r267, %r266, %r349;
shr.u32 %r268, %r267, 9;
cvt.rn.f32.u32 %f218, %r268;
mul.rn.f32 %f219, %f218, 0f34000000;
cvt.rn.f16.f32 %h136, %f219;
mov.b16 %h137, 0x2E66;
setp.ge.f16 %p64, %h136, %h137;
ld.global.nc.b16 %h138, [%rd44+1794];
ld.global.nc.f32 %f220, [%rd45+3588];
cvt.rn.f16.f32 %h139, %f220;
add.rn.f16 %h140, %h138, %h139;
mov.b16 %h141, 0x3C72;
mul.rn.f16 %h142, %h140, %h141;
selp.b16 %h143, %h142, 0x0000, %p64;
cvt.f32.f16 %f221, %h143;
ld.global.nc.b16 %h144, [%rd46+1794];
cvt.f32.f16 %f222, %h144;
ld.global.nc.f32 %f223, [%rd47+3588];
mul.rn.f32 %f224, %f1, %f223;
mul.rn.f32 %f225, %f224, %f222;
ld.global.nc.f32 %f226, [%rd48+3588];
mul.rn.f32 %f227, %f2, %f224;
sub.rn.f32 %f228, %f226, %f227;
add.rn.f32 %f229, %f225, %f228;
add.rn.f32 %f230, %f229, %f221;
add.rn.f32 %f231, %f17, %f230;
and.b32 %r46, %r1, 31;
shfl.sync.down.b32 %f232, %f231, 16, 31, -1;
add.rn.f32 %f233, %f232, %f231;
shfl.sync.down.b32 %f234, %f233, 8, 31, -1;
add.rn.f32 %f235, %f234, %f233;
shfl.sync.down.b32 %f236, %f235, 4, 31, -1;
add.rn.f32 %f237, %f236, %f235;
shfl.sync.down.b32 %f238, %f237, 2, 31, -1;
add.rn.f32 %f239, %f238, %f237;
shfl.sync.down.b32 %f240, %f239, 1, 31, -1;
shr.u32 %r47, %r1, 5;
setp.ne.s32 %p65, %r46, 0;
mov.u64 %rd2378, shared_cache_012;
@%p65 bra LBB54_2;
mul.wide.u32 %rd2377, %r47, 4;
add.s64 %rd461, %rd2378, %rd2377;
add.rn.f32 %f18, %f240, %f239;
st.shared.f32 [%rd461], %f18;
LBB54_2:
bar.sync 0;
setp.eq.s32 %p66, %r47, 0;
@%p66 bra LBB54_52;
bra.uni LBB54_3;
LBB54_52:
add.u64 %rd472, %SP, 0;
add.u64 %rd10, %SPL, 0;
mul.wide.u32 %rd2379, %r46, 4;
add.s64 %rd462, %rd2378, %rd2379;
cvta.shared.u64 %rd2381, %rd462;
mov.u32 %r269, 0;
st.local.u32 [%rd10], %r269;
setp.lt.u32 %p67, %r1, 2;
selp.b64 %rd2383, %rd2381, %rd472, %p67;
ld.f32 %f241, [%rd2383];
shfl.sync.down.b32 %f242, %f241, 16, 31, -1;
add.rn.f32 %f243, %f241, %f242;
shfl.sync.down.b32 %f244, %f243, 8, 31, -1;
add.rn.f32 %f245, %f243, %f244;
shfl.sync.down.b32 %f246, %f245, 4, 31, -1;
add.rn.f32 %f247, %f245, %f246;
shfl.sync.down.b32 %f248, %f247, 2, 31, -1;
add.rn.f32 %f249, %f247, %f248;
shfl.sync.down.b32 %f250, %f249, 1, 31, -1;
add.rn.f32 %f251, %f249, %f250;
st.f32 [%rd2383], %f251;
setp.ne.s32 %p68, %r1, 0;
@%p68 bra LBB54_3;
ld.param.u64 %rd469, [fusion_2214_param_3];
cvt.u64.u32 %rd43, %r2;
cvta.to.global.u64 %rd6, %rd469;
shl.b64 %rd2376, %rd43, 2;
add.s64 %rd460, %rd6, %rd2376;
ld.shared.f32 %f252, [%rd462];
atom.global.add.f32 %f253, [%rd460], %f252;
LBB54_3:
ret;
}
// .globl fusion_2212
.visible .entry fusion_2212(
.param .u64 fusion_2212_param_0,
.param .u64 fusion_2212_param_1,
.param .u64 fusion_2212_param_2,
.param .u64 fusion_2212_param_3,
.param .u64 fusion_2212_param_4,
.param .u64 fusion_2212_param_5,
.param .u64 fusion_2212_param_6,
.param .u64 fusion_2212_param_7,
.param .u64 fusion_2212_param_8,
.param .u64 fusion_2212_param_9,
.param .u64 fusion_2212_param_10
)
.reqntid 64, 1, 1
{
.local .align 4 .b8 __local_depot55[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<75>;
.reg .b16 %h<145>;
.reg .f32 %f<288>;
.reg .b32 %r<350>;
.reg .b64 %rd<2742>;
mov.u64 %SPL, __local_depot55;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd464, [fusion_2212_param_0];
ld.param.u64 %rd465, [fusion_2212_param_9];
cvta.to.global.u64 %rd1, %rd465;
ld.param.u64 %rd466, [fusion_2212_param_1];
ld.param.u64 %rd467, [fusion_2212_param_8];
cvta.to.global.u64 %rd2, %rd467;
ld.param.u64 %rd468, [fusion_2212_param_2];
ld.param.u64 %rd469, [fusion_2212_param_7];
cvta.to.global.u64 %rd3, %rd469;
ld.param.u64 %rd471, [fusion_2212_param_6];
cvta.to.global.u64 %rd4, %rd471;
ld.param.u64 %rd472, [fusion_2212_param_4];
ld.param.u64 %rd473, [fusion_2212_param_5];
cvta.to.global.u64 %rd5, %rd473;
cvta.to.global.u64 %rd6, %rd472;
cvta.to.global.u64 %rd8, %rd468;
cvta.to.global.u64 %rd9, %rd466;
cvta.to.global.u64 %rd10, %rd464;
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
shl.b32 %r3, %r1, 1;
shl.b32 %r4, %r2, 10;
or.b32 %r48, %r4, %r3;
shr.u32 %r49, %r48, 2;
and.b32 %r5, %r1, 1;
setp.eq.s32 %p1, %r5, 0;
ld.global.nc.u64 %rd12, [%rd8];
cvt.u64.u32 %rd475, %r49;
add.s64 %rd13, %rd12, %rd475;
setp.lt.u64 %p69, %rd13, %rd12;
and.b64 %rd2387, %rd13, 4294967295;
@%p1 bra LBB55_1;
bra.uni LBB55_4;
LBB55_1:
mul.lo.s64 %rd2449, %rd2387, 3528531795;
ld.global.nc.u64 %rd2464, [%rd8+8];
selp.u64 %rd518, 1, 0, %p69;
add.s64 %rd519, %rd2464, %rd518;
xor.b64 %rd520, %rd519, %rd2449;
shr.u64 %rd521, %rd520, 32;
mul.lo.s64 %rd2452, %rd521, 3449720151;
shr.u64 %rd522, %rd2452, 32;
and.b64 %rd523, %rd519, 4294967295;
mul.lo.s64 %rd524, %rd523, 3449720151;
and.b64 %rd525, %rd524, 4294967295;
xor.b64 %rd526, %rd525, %rd522;
xor.b64 %rd527, %rd526, 2654435769;
mul.lo.s64 %rd2455, %rd527, 3528531795;
xor.b64 %rd2445, %rd524, %rd13;
mov.u32 %r312, -1879881855;
mov.u32 %r311, -845247145;
mov.u32 %r310, 534103459;
mov.u64 %rd2463, 3678237736;
mov.u64 %rd2462, 3041712726;
mov.u64 %rd2461, 1401181199;
mov.u64 %rd2460, 2835769497;
mov.u64 %rd2459, 1684936478;
mov.u64 %rd2458, 2027808484;
mov.u64 %rd2457, 387276957;
mov.u64 %rd2456, 842468239;
mov.u64 %rd2454, 3986602516;
mov.u64 %rd2453, 1013904242;
mov.u64 %rd2451, 3668340011;
mov.u64 %rd2450, 3144134277;
mov.u64 %rd2448, 3449720151;
mov.u64 %rd2447, 1993301258;
mov.u64 %rd2446, 3528531795;
bra.uni LBB55_5;
LBB55_4:
mov.u32 %r311, -766435501;
mov.u64 %rd2462, 1684936478;
mov.u64 %rd2461, 534103459;
mov.u64 %rd2460, 387276957;
mov.u64 %rd2459, 3041712726;
mov.u64 %rd2458, 3986602516;
mov.u64 %rd2457, 2835769497;
mov.u64 %rd2456, 3668340011;
mov.u64 %rd2454, 2027808484;
mov.u64 %rd2453, 1993301258;
mov.u64 %rd2451, 842468239;
mov.u64 %rd2450, 2654435769;
mov.u64 %rd2448, 3528531795;
mov.u64 %rd2447, 1013904242;
mov.u64 %rd2446, 3449720151;
mov.u32 %r312, -1767562579;
mov.u32 %r310, 1401181199;
mov.u64 %rd2463, 4055616968;
ld.global.nc.u64 %rd2464, [%rd8+8];
selp.u64 %rd491, 1, 0, %p69;
add.s64 %rd492, %rd2464, %rd491;
and.b64 %rd493, %rd492, 4294967295;
mul.lo.s64 %rd2449, %rd493, 3449720151;
xor.b64 %rd494, %rd2449, %rd13;
shr.u64 %rd495, %rd494, 32;
mul.lo.s64 %rd2452, %rd495, 3528531795;
shr.u64 %rd496, %rd2452, 32;
mul.lo.s64 %rd498, %rd2387, 3528531795;
and.b64 %rd499, %rd498, 4294967295;
xor.b64 %rd500, %rd499, %rd496;
xor.b64 %rd501, %rd500, 3144134277;
mul.lo.s64 %rd2455, %rd501, 3449720151;
xor.b64 %rd2445, %rd492, %rd498;
LBB55_5:
shr.u64 %rd528, %rd2455, 32;
shr.u64 %rd529, %rd2445, 32;
mul.lo.s64 %rd530, %rd529, %rd2446;
and.b64 %rd531, %rd530, 4294967295;
xor.b64 %rd532, %rd531, %rd528;
xor.b64 %rd533, %rd532, %rd2447;
mul.lo.s64 %rd534, %rd533, %rd2448;
shr.u64 %rd535, %rd534, 32;
shr.u64 %rd536, %rd530, 32;
and.b64 %rd537, %rd2449, 4294967295;
xor.b64 %rd538, %rd537, %rd536;
xor.b64 %rd539, %rd538, %rd2450;
mul.lo.s64 %rd540, %rd539, %rd2448;
and.b64 %rd541, %rd540, 4294967295;
xor.b64 %rd542, %rd541, %rd535;
xor.b64 %rd543, %rd542, %rd2451;
mul.lo.s64 %rd544, %rd543, %rd2446;
shr.u64 %rd545, %rd544, 32;
shr.u64 %rd546, %rd540, 32;
and.b64 %rd547, %rd2452, 4294967295;
xor.b64 %rd548, %rd547, %rd546;
xor.b64 %rd549, %rd548, %rd2453;
mul.lo.s64 %rd550, %rd549, %rd2446;
and.b64 %rd551, %rd550, 4294967295;
xor.b64 %rd552, %rd551, %rd545;
xor.b64 %rd553, %rd552, %rd2454;
mul.lo.s64 %rd554, %rd553, %rd2448;
shr.u64 %rd555, %rd554, 32;
shr.u64 %rd556, %rd550, 32;
and.b64 %rd557, %rd2455, 4294967295;
xor.b64 %rd558, %rd557, %rd556;
xor.b64 %rd559, %rd558, %rd2456;
mul.lo.s64 %rd560, %rd559, %rd2448;
and.b64 %rd561, %rd560, 4294967295;
xor.b64 %rd562, %rd561, %rd555;
xor.b64 %rd563, %rd562, %rd2457;
mul.lo.s64 %rd564, %rd563, %rd2446;
shr.u64 %rd565, %rd564, 32;
shr.u64 %rd566, %rd560, 32;
and.b64 %rd567, %rd534, 4294967295;
xor.b64 %rd568, %rd567, %rd566;
xor.b64 %rd569, %rd568, %rd2458;
mul.lo.s64 %rd570, %rd569, %rd2446;
and.b64 %rd571, %rd570, 4294967295;
xor.b64 %rd572, %rd571, %rd565;
xor.b64 %rd573, %rd572, %rd2459;
mul.lo.s64 %rd574, %rd573, %rd2448;
shr.u64 %rd575, %rd574, 32;
shr.u64 %rd576, %rd570, 32;
and.b64 %rd577, %rd544, 4294967295;
xor.b64 %rd578, %rd577, %rd576;
xor.b64 %rd579, %rd578, %rd2460;
mul.lo.s64 %rd580, %rd579, %rd2448;
and.b64 %rd581, %rd580, 4294967295;
xor.b64 %rd582, %rd581, %rd575;
xor.b64 %rd583, %rd582, %rd2461;
mul.lo.s64 %rd584, %rd583, %rd2446;
shr.u64 %rd585, %rd584, 32;
shr.u64 %rd586, %rd580, 32;
and.b64 %rd587, %rd554, 4294967295;
xor.b64 %rd588, %rd587, %rd586;
xor.b64 %rd589, %rd588, %rd2462;
mul.lo.s64 %rd590, %rd589, %rd2446;
and.b64 %rd591, %rd590, 4294967295;
xor.b64 %rd592, %rd591, %rd585;
xor.b64 %rd593, %rd592, %rd2463;
mul.lo.s64 %rd594, %rd593, %rd2448;
shr.u64 %rd595, %rd594, 32;
cvt.u32.u64 %r56, %rd595;
shr.u64 %rd596, %rd590, 32;
xor.b64 %rd597, %rd596, %rd564;
cvt.u32.u64 %r57, %rd597;
xor.b32 %r58, %r310, %r57;
mul.lo.s32 %r59, %r58, %r311;
xor.b32 %r60, %r59, %r56;
xor.b32 %r61, %r60, %r312;
shr.u32 %r62, %r61, 9;
cvt.rn.f32.u32 %f20, %r62;
mul.rn.f32 %f21, %f20, 0f34000000;
cvt.rn.f16.f32 %h1, %f21;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p4, %h1, %h2;
mul.wide.u32 %rd598, %r2, 2048;
add.s64 %rd599, %rd10, %rd598;
mul.wide.u32 %rd600, %r3, 2;
add.s64 %rd45, %rd599, %rd600;
ld.global.nc.b16 %h3, [%rd45];
mul.wide.u32 %rd601, %r3, 4;
add.s64 %rd46, %rd1, %rd601;
ld.global.nc.f32 %f22, [%rd46];
cvt.rn.f16.f32 %h4, %f22;
add.rn.f16 %h5, %h3, %h4;
mov.b16 %h6, 0x3C72;
mul.rn.f16 %h7, %h5, %h6;
selp.b16 %h8, %h7, 0x0000, %p4;
cvt.f32.f16 %f23, %h8;
add.s64 %rd602, %rd9, %rd598;
add.s64 %rd47, %rd602, %rd600;
ld.global.nc.b16 %h9, [%rd47];
cvt.f32.f16 %f24, %h9;
mul.wide.u32 %rd603, %r2, 4;
add.s64 %rd604, %rd5, %rd603;
ld.global.nc.f32 %f25, [%rd604];
mul.rn.f32 %f26, %f25, 0f3A800000;
add.rn.f32 %f27, %f26, 0f2B8CBCCC;
rsqrt.approx.f32 %f1, %f27;
add.s64 %rd48, %rd2, %rd601;
ld.global.nc.f32 %f28, [%rd48];
mul.rn.f32 %f29, %f1, %f28;
mul.rn.f32 %f30, %f29, %f24;
add.s64 %rd49, %rd3, %rd601;
ld.global.nc.f32 %f31, [%rd49];
add.s64 %rd605, %rd4, %rd603;
ld.global.nc.f32 %f32, [%rd605];
mul.rn.f32 %f2, %f32, 0f3A800000;
mul.rn.f32 %f33, %f29, %f2;
sub.rn.f32 %f34, %f31, %f33;
add.rn.f32 %f35, %f30, %f34;
add.rn.f32 %f36, %f35, %f23;
add.s64 %rd606, %rd6, %rd603;
ld.global.nc.f32 %f37, [%rd606];
mul.rn.f32 %f3, %f37, 0f3A800000;
sub.rn.f32 %f38, %f36, %f3;
mul.rn.f32 %f39, %f38, %f38;
add.rn.f32 %f4, %f39, 0f00000000;
or.b32 %r63, %r3, 1;
and.b32 %r64, %r63, 3;
setp.ne.s32 %p5, %r64, 1;
@%p5 bra LBB55_7;
mul.lo.s64 %rd2469, %rd2387, 3528531795;
selp.u64 %rd647, 1, 0, %p69;
add.s64 %rd648, %rd2464, %rd647;
xor.b64 %rd649, %rd648, %rd2469;
shr.u64 %rd650, %rd649, 32;
mul.lo.s64 %rd2472, %rd650, 3449720151;
shr.u64 %rd651, %rd2472, 32;
and.b64 %rd652, %rd648, 4294967295;
mul.lo.s64 %rd653, %rd652, 3449720151;
and.b64 %rd654, %rd653, 4294967295;
xor.b64 %rd655, %rd654, %rd651;
xor.b64 %rd656, %rd655, 2654435769;
mul.lo.s64 %rd2475, %rd656, 3528531795;
xor.b64 %rd2465, %rd653, %rd13;
mov.u32 %r314, -845247145;
mov.u32 %r313, -616729560;
mov.u64 %rd2482, 3041712726;
mov.u64 %rd2481, 1401181199;
mov.u64 %rd2480, 2835769497;
mov.u64 %rd2479, 1684936478;
mov.u64 %rd2478, 2027808484;
mov.u64 %rd2477, 387276957;
mov.u64 %rd2476, 842468239;
mov.u64 %rd2474, 3986602516;
mov.u64 %rd2473, 1013904242;
mov.u64 %rd2471, 3668340011;
mov.u64 %rd2470, 3144134277;
mov.u64 %rd2468, 3449720151;
mov.u64 %rd2467, 1993301258;
mov.u64 %rd2466, 3528531795;
bra.uni LBB55_8;
LBB55_7:
mov.u32 %r313, -239350328;
selp.u64 %rd621, 1, 0, %p69;
add.s64 %rd622, %rd2464, %rd621;
and.b64 %rd623, %rd622, 4294967295;
mul.lo.s64 %rd2469, %rd623, 3449720151;
xor.b64 %rd624, %rd2469, %rd13;
shr.u64 %rd625, %rd624, 32;
mul.lo.s64 %rd2472, %rd625, 3528531795;
shr.u64 %rd626, %rd2472, 32;
mul.lo.s64 %rd628, %rd2387, 3528531795;
and.b64 %rd629, %rd628, 4294967295;
xor.b64 %rd630, %rd629, %rd626;
xor.b64 %rd631, %rd630, 3144134277;
mul.lo.s64 %rd2475, %rd631, 3449720151;
xor.b64 %rd2465, %rd622, %rd628;
mov.u32 %r314, -766435501;
mov.u64 %rd2482, 1684936478;
mov.u64 %rd2481, 534103459;
mov.u64 %rd2480, 387276957;
mov.u64 %rd2479, 3041712726;
mov.u64 %rd2478, 3986602516;
mov.u64 %rd2477, 2835769497;
mov.u64 %rd2476, 3668340011;
mov.u64 %rd2474, 2027808484;
mov.u64 %rd2473, 1993301258;
mov.u64 %rd2471, 842468239;
mov.u64 %rd2470, 2654435769;
mov.u64 %rd2468, 3528531795;
mov.u64 %rd2467, 1013904242;
mov.u64 %rd2466, 3449720151;
LBB55_8:
setp.ne.s32 %p8, %r5, 0;
shr.u64 %rd657, %rd2475, 32;
shr.u64 %rd658, %rd2465, 32;
mul.lo.s64 %rd659, %rd658, %rd2466;
and.b64 %rd660, %rd659, 4294967295;
xor.b64 %rd661, %rd660, %rd657;
xor.b64 %rd662, %rd661, %rd2467;
mul.lo.s64 %rd663, %rd662, %rd2468;
shr.u64 %rd664, %rd663, 32;
shr.u64 %rd665, %rd659, 32;
and.b64 %rd666, %rd2469, 4294967295;
xor.b64 %rd667, %rd666, %rd665;
xor.b64 %rd668, %rd667, %rd2470;
mul.lo.s64 %rd669, %rd668, %rd2468;
and.b64 %rd670, %rd669, 4294967295;
xor.b64 %rd671, %rd670, %rd664;
xor.b64 %rd672, %rd671, %rd2471;
mul.lo.s64 %rd673, %rd672, %rd2466;
shr.u64 %rd674, %rd673, 32;
shr.u64 %rd675, %rd669, 32;
and.b64 %rd676, %rd2472, 4294967295;
xor.b64 %rd677, %rd676, %rd675;
xor.b64 %rd678, %rd677, %rd2473;
mul.lo.s64 %rd679, %rd678, %rd2466;
and.b64 %rd680, %rd679, 4294967295;
xor.b64 %rd681, %rd680, %rd674;
xor.b64 %rd682, %rd681, %rd2474;
mul.lo.s64 %rd683, %rd682, %rd2468;
shr.u64 %rd684, %rd683, 32;
shr.u64 %rd685, %rd679, 32;
and.b64 %rd686, %rd2475, 4294967295;
xor.b64 %rd687, %rd686, %rd685;
xor.b64 %rd688, %rd687, %rd2476;
mul.lo.s64 %rd689, %rd688, %rd2468;
and.b64 %rd690, %rd689, 4294967295;
xor.b64 %rd691, %rd690, %rd684;
xor.b64 %rd692, %rd691, %rd2477;
mul.lo.s64 %rd693, %rd692, %rd2466;
shr.u64 %rd694, %rd693, 32;
shr.u64 %rd695, %rd689, 32;
and.b64 %rd696, %rd663, 4294967295;
xor.b64 %rd697, %rd696, %rd695;
xor.b64 %rd698, %rd697, %rd2478;
mul.lo.s64 %rd699, %rd698, %rd2466;
and.b64 %rd700, %rd699, 4294967295;
xor.b64 %rd701, %rd700, %rd694;
xor.b64 %rd702, %rd701, %rd2479;
mul.lo.s64 %rd703, %rd702, %rd2468;
shr.u64 %rd704, %rd703, 32;
shr.u64 %rd705, %rd699, 32;
and.b64 %rd706, %rd673, 4294967295;
xor.b64 %rd707, %rd706, %rd705;
xor.b64 %rd708, %rd707, %rd2480;
mul.lo.s64 %rd709, %rd708, %rd2468;
and.b64 %rd710, %rd709, 4294967295;
xor.b64 %rd711, %rd710, %rd704;
xor.b64 %rd712, %rd711, %rd2481;
mul.lo.s64 %rd713, %rd712, %rd2466;
shr.u64 %rd714, %rd713, 32;
shr.u64 %rd715, %rd709, 32;
xor.b64 %rd716, %rd683, %rd715;
xor.b64 %rd717, %rd716, %rd2482;
mul.lo.s64 %rd718, %rd717, %rd2466;
xor.b64 %rd719, %rd714, %rd718;
cvt.u32.u64 %r69, %rd719;
xor.b32 %r70, %r313, %r69;
mul.lo.s32 %r71, %r70, %r314;
shr.u32 %r72, %r71, 9;
cvt.rn.f32.u32 %f40, %r72;
mul.rn.f32 %f41, %f40, 0f34000000;
cvt.rn.f16.f32 %h10, %f41;
mov.b16 %h11, 0x2E66;
setp.ge.f16 %p9, %h10, %h11;
ld.global.nc.b16 %h12, [%rd45+2];
ld.global.nc.f32 %f42, [%rd46+4];
cvt.rn.f16.f32 %h13, %f42;
add.rn.f16 %h14, %h12, %h13;
mov.b16 %h15, 0x3C72;
mul.rn.f16 %h16, %h14, %h15;
selp.b16 %h17, %h16, 0x0000, %p9;
cvt.f32.f16 %f43, %h17;
ld.global.nc.b16 %h18, [%rd47+2];
cvt.f32.f16 %f44, %h18;
ld.global.nc.f32 %f45, [%rd48+4];
mul.rn.f32 %f46, %f1, %f45;
mul.rn.f32 %f47, %f46, %f44;
ld.global.nc.f32 %f48, [%rd49+4];
mul.rn.f32 %f49, %f2, %f46;
sub.rn.f32 %f50, %f48, %f49;
add.rn.f32 %f51, %f47, %f50;
add.rn.f32 %f52, %f51, %f43;
sub.rn.f32 %f53, %f52, %f3;
mul.rn.f32 %f54, %f53, %f53;
add.rn.f32 %f5, %f4, %f54;
or.b32 %r73, %r3, %r4;
or.b32 %r74, %r73, 128;
shr.u32 %r75, %r74, 2;
cvt.u64.u32 %rd720, %r75;
add.s64 %rd76, %rd12, %rd720;
and.b64 %rd2436, %rd76, 4294967295;
setp.lt.u64 %p74, %rd76, %rd12;
@%p8 bra LBB55_10;
mul.lo.s64 %rd2487, %rd2436, 3528531795;
selp.u64 %rd763, 1, 0, %p74;
add.s64 %rd764, %rd2464, %rd763;
xor.b64 %rd765, %rd764, %rd2487;
shr.u64 %rd766, %rd765, 32;
mul.lo.s64 %rd2490, %rd766, 3449720151;
shr.u64 %rd767, %rd2490, 32;
and.b64 %rd768, %rd764, 4294967295;
mul.lo.s64 %rd769, %rd768, 3449720151;
and.b64 %rd770, %rd769, 4294967295;
xor.b64 %rd771, %rd770, %rd767;
xor.b64 %rd772, %rd771, 2654435769;
mul.lo.s64 %rd2493, %rd772, 3528531795;
xor.b64 %rd2483, %rd769, %rd76;
mov.u32 %r317, -1879881855;
mov.u32 %r316, -845247145;
mov.u32 %r315, 534103459;
mov.u64 %rd2501, 3678237736;
mov.u64 %rd2500, 3041712726;
mov.u64 %rd2499, 1401181199;
mov.u64 %rd2498, 2835769497;
mov.u64 %rd2497, 1684936478;
mov.u64 %rd2496, 2027808484;
mov.u64 %rd2495, 387276957;
mov.u64 %rd2494, 842468239;
mov.u64 %rd2492, 3986602516;
mov.u64 %rd2491, 1013904242;
mov.u64 %rd2489, 3668340011;
mov.u64 %rd2488, 3144134277;
mov.u64 %rd2486, 3449720151;
mov.u64 %rd2485, 1993301258;
mov.u64 %rd2484, 3528531795;
bra.uni LBB55_11;
LBB55_10:
selp.u64 %rd736, 1, 0, %p74;
add.s64 %rd737, %rd2464, %rd736;
and.b64 %rd738, %rd737, 4294967295;
mul.lo.s64 %rd2487, %rd738, 3449720151;
xor.b64 %rd739, %rd2487, %rd76;
shr.u64 %rd740, %rd739, 32;
mul.lo.s64 %rd2490, %rd740, 3528531795;
shr.u64 %rd741, %rd2490, 32;
mul.lo.s64 %rd743, %rd2436, 3528531795;
and.b64 %rd744, %rd743, 4294967295;
xor.b64 %rd745, %rd744, %rd741;
xor.b64 %rd746, %rd745, 3144134277;
mul.lo.s64 %rd2493, %rd746, 3449720151;
xor.b64 %rd2483, %rd737, %rd743;
mov.u32 %r317, -1767562579;
mov.u32 %r316, -766435501;
mov.u32 %r315, 1401181199;
mov.u64 %rd2501, 4055616968;
mov.u64 %rd2500, 1684936478;
mov.u64 %rd2499, 534103459;
mov.u64 %rd2498, 387276957;
mov.u64 %rd2497, 3041712726;
mov.u64 %rd2496, 3986602516;
mov.u64 %rd2495, 2835769497;
mov.u64 %rd2494, 3668340011;
mov.u64 %rd2492, 2027808484;
mov.u64 %rd2491, 1993301258;
mov.u64 %rd2489, 842468239;
mov.u64 %rd2488, 2654435769;
mov.u64 %rd2486, 3528531795;
mov.u64 %rd2485, 1013904242;
mov.u64 %rd2484, 3449720151;
LBB55_11:
shr.u64 %rd773, %rd2493, 32;
shr.u64 %rd774, %rd2483, 32;
mul.lo.s64 %rd775, %rd774, %rd2484;
and.b64 %rd776, %rd775, 4294967295;
xor.b64 %rd777, %rd776, %rd773;
xor.b64 %rd778, %rd777, %rd2485;
mul.lo.s64 %rd779, %rd778, %rd2486;
shr.u64 %rd780, %rd779, 32;
shr.u64 %rd781, %rd775, 32;
and.b64 %rd782, %rd2487, 4294967295;
xor.b64 %rd783, %rd782, %rd781;
xor.b64 %rd784, %rd783, %rd2488;
mul.lo.s64 %rd785, %rd784, %rd2486;
and.b64 %rd786, %rd785, 4294967295;
xor.b64 %rd787, %rd786, %rd780;
xor.b64 %rd788, %rd787, %rd2489;
mul.lo.s64 %rd789, %rd788, %rd2484;
shr.u64 %rd790, %rd789, 32;
shr.u64 %rd791, %rd785, 32;
and.b64 %rd792, %rd2490, 4294967295;
xor.b64 %rd793, %rd792, %rd791;
xor.b64 %rd794, %rd793, %rd2491;
mul.lo.s64 %rd795, %rd794, %rd2484;
and.b64 %rd796, %rd795, 4294967295;
xor.b64 %rd797, %rd796, %rd790;
xor.b64 %rd798, %rd797, %rd2492;
mul.lo.s64 %rd799, %rd798, %rd2486;
shr.u64 %rd800, %rd799, 32;
shr.u64 %rd801, %rd795, 32;
and.b64 %rd802, %rd2493, 4294967295;
xor.b64 %rd803, %rd802, %rd801;
xor.b64 %rd804, %rd803, %rd2494;
mul.lo.s64 %rd805, %rd804, %rd2486;
and.b64 %rd806, %rd805, 4294967295;
xor.b64 %rd807, %rd806, %rd800;
xor.b64 %rd808, %rd807, %rd2495;
mul.lo.s64 %rd809, %rd808, %rd2484;
shr.u64 %rd810, %rd809, 32;
shr.u64 %rd811, %rd805, 32;
and.b64 %rd812, %rd779, 4294967295;
xor.b64 %rd813, %rd812, %rd811;
xor.b64 %rd814, %rd813, %rd2496;
mul.lo.s64 %rd815, %rd814, %rd2484;
and.b64 %rd816, %rd815, 4294967295;
xor.b64 %rd817, %rd816, %rd810;
xor.b64 %rd818, %rd817, %rd2497;
mul.lo.s64 %rd819, %rd818, %rd2486;
shr.u64 %rd820, %rd819, 32;
shr.u64 %rd821, %rd815, 32;
and.b64 %rd822, %rd789, 4294967295;
xor.b64 %rd823, %rd822, %rd821;
xor.b64 %rd824, %rd823, %rd2498;
mul.lo.s64 %rd825, %rd824, %rd2486;
and.b64 %rd826, %rd825, 4294967295;
xor.b64 %rd827, %rd826, %rd820;
xor.b64 %rd828, %rd827, %rd2499;
mul.lo.s64 %rd829, %rd828, %rd2484;
shr.u64 %rd830, %rd829, 32;
shr.u64 %rd831, %rd825, 32;
and.b64 %rd832, %rd799, 4294967295;
xor.b64 %rd833, %rd832, %rd831;
xor.b64 %rd834, %rd833, %rd2500;
mul.lo.s64 %rd835, %rd834, %rd2484;
and.b64 %rd836, %rd835, 4294967295;
xor.b64 %rd837, %rd836, %rd830;
xor.b64 %rd838, %rd837, %rd2501;
mul.lo.s64 %rd839, %rd838, %rd2486;
shr.u64 %rd840, %rd839, 32;
cvt.u32.u64 %r82, %rd840;
shr.u64 %rd841, %rd835, 32;
xor.b64 %rd842, %rd841, %rd809;
cvt.u32.u64 %r83, %rd842;
xor.b32 %r84, %r315, %r83;
mul.lo.s32 %r85, %r84, %r316;
xor.b32 %r86, %r85, %r82;
xor.b32 %r87, %r86, %r317;
shr.u32 %r88, %r87, 9;
cvt.rn.f32.u32 %f55, %r88;
mul.rn.f32 %f56, %f55, 0f34000000;
cvt.rn.f16.f32 %h19, %f56;
mov.b16 %h20, 0x2E66;
setp.ge.f16 %p12, %h19, %h20;
ld.global.nc.b16 %h21, [%rd45+256];
ld.global.nc.f32 %f57, [%rd46+512];
cvt.rn.f16.f32 %h22, %f57;
add.rn.f16 %h23, %h21, %h22;
mov.b16 %h24, 0x3C72;
mul.rn.f16 %h25, %h23, %h24;
selp.b16 %h26, %h25, 0x0000, %p12;
cvt.f32.f16 %f58, %h26;
ld.global.nc.b16 %h27, [%rd47+256];
cvt.f32.f16 %f59, %h27;
ld.global.nc.f32 %f60, [%rd48+512];
mul.rn.f32 %f61, %f1, %f60;
mul.rn.f32 %f62, %f61, %f59;
ld.global.nc.f32 %f63, [%rd49+512];
mul.rn.f32 %f64, %f2, %f61;
sub.rn.f32 %f65, %f63, %f64;
add.rn.f32 %f66, %f62, %f65;
add.rn.f32 %f67, %f66, %f58;
sub.rn.f32 %f68, %f67, %f3;
mul.rn.f32 %f69, %f68, %f68;
add.rn.f32 %f6, %f5, %f69;
or.b32 %r89, %r3, 129;
or.b32 %r90, %r89, %r4;
and.b32 %r91, %r89, 3;
shr.u32 %r92, %r90, 2;
setp.ne.s32 %p13, %r91, 1;
cvt.u64.u32 %rd843, %r92;
add.s64 %rd104, %rd12, %rd843;
and.b64 %rd2433, %rd104, 4294967295;
setp.lt.u64 %p73, %rd104, %rd12;
@%p13 bra LBB55_13;
mul.lo.s64 %rd2506, %rd2433, 3528531795;
selp.u64 %rd884, 1, 0, %p73;
add.s64 %rd885, %rd2464, %rd884;
xor.b64 %rd886, %rd885, %rd2506;
shr.u64 %rd887, %rd886, 32;
mul.lo.s64 %rd2509, %rd887, 3449720151;
shr.u64 %rd888, %rd2509, 32;
and.b64 %rd889, %rd885, 4294967295;
mul.lo.s64 %rd890, %rd889, 3449720151;
and.b64 %rd891, %rd890, 4294967295;
xor.b64 %rd892, %rd891, %rd888;
xor.b64 %rd893, %rd892, 2654435769;
mul.lo.s64 %rd2512, %rd893, 3528531795;
xor.b64 %rd2502, %rd890, %rd104;
mov.u32 %r319, -845247145;
mov.u32 %r318, -616729560;
mov.u64 %rd2519, 3041712726;
mov.u64 %rd2518, 1401181199;
mov.u64 %rd2517, 2835769497;
mov.u64 %rd2516, 1684936478;
mov.u64 %rd2515, 2027808484;
mov.u64 %rd2514, 387276957;
mov.u64 %rd2513, 842468239;
mov.u64 %rd2511, 3986602516;
mov.u64 %rd2510, 1013904242;
mov.u64 %rd2508, 3668340011;
mov.u64 %rd2507, 3144134277;
mov.u64 %rd2505, 3449720151;
mov.u64 %rd2504, 1993301258;
mov.u64 %rd2503, 3528531795;
bra.uni LBB55_14;
LBB55_13:
selp.u64 %rd858, 1, 0, %p73;
add.s64 %rd859, %rd2464, %rd858;
and.b64 %rd860, %rd859, 4294967295;
mul.lo.s64 %rd2506, %rd860, 3449720151;
xor.b64 %rd861, %rd2506, %rd104;
shr.u64 %rd862, %rd861, 32;
mul.lo.s64 %rd2509, %rd862, 3528531795;
shr.u64 %rd863, %rd2509, 32;
mul.lo.s64 %rd865, %rd2433, 3528531795;
and.b64 %rd866, %rd865, 4294967295;
xor.b64 %rd867, %rd866, %rd863;
xor.b64 %rd868, %rd867, 3144134277;
mul.lo.s64 %rd2512, %rd868, 3449720151;
xor.b64 %rd2502, %rd859, %rd865;
mov.u32 %r319, -766435501;
mov.u32 %r318, -239350328;
mov.u64 %rd2519, 1684936478;
mov.u64 %rd2518, 534103459;
mov.u64 %rd2517, 387276957;
mov.u64 %rd2516, 3041712726;
mov.u64 %rd2515, 3986602516;
mov.u64 %rd2514, 2835769497;
mov.u64 %rd2513, 3668340011;
mov.u64 %rd2511, 2027808484;
mov.u64 %rd2510, 1993301258;
mov.u64 %rd2508, 842468239;
mov.u64 %rd2507, 2654435769;
mov.u64 %rd2505, 3528531795;
mov.u64 %rd2504, 1013904242;
mov.u64 %rd2503, 3449720151;
LBB55_14:
shr.u64 %rd894, %rd2512, 32;
shr.u64 %rd895, %rd2502, 32;
mul.lo.s64 %rd896, %rd895, %rd2503;
and.b64 %rd897, %rd896, 4294967295;
xor.b64 %rd898, %rd897, %rd894;
xor.b64 %rd899, %rd898, %rd2504;
mul.lo.s64 %rd900, %rd899, %rd2505;
shr.u64 %rd901, %rd900, 32;
shr.u64 %rd902, %rd896, 32;
and.b64 %rd903, %rd2506, 4294967295;
xor.b64 %rd904, %rd903, %rd902;
xor.b64 %rd905, %rd904, %rd2507;
mul.lo.s64 %rd906, %rd905, %rd2505;
and.b64 %rd907, %rd906, 4294967295;
xor.b64 %rd908, %rd907, %rd901;
xor.b64 %rd909, %rd908, %rd2508;
mul.lo.s64 %rd910, %rd909, %rd2503;
shr.u64 %rd911, %rd910, 32;
shr.u64 %rd912, %rd906, 32;
and.b64 %rd913, %rd2509, 4294967295;
xor.b64 %rd914, %rd913, %rd912;
xor.b64 %rd915, %rd914, %rd2510;
mul.lo.s64 %rd916, %rd915, %rd2503;
and.b64 %rd917, %rd916, 4294967295;
xor.b64 %rd918, %rd917, %rd911;
xor.b64 %rd919, %rd918, %rd2511;
mul.lo.s64 %rd920, %rd919, %rd2505;
shr.u64 %rd921, %rd920, 32;
shr.u64 %rd922, %rd916, 32;
and.b64 %rd923, %rd2512, 4294967295;
xor.b64 %rd924, %rd923, %rd922;
xor.b64 %rd925, %rd924, %rd2513;
mul.lo.s64 %rd926, %rd925, %rd2505;
and.b64 %rd927, %rd926, 4294967295;
xor.b64 %rd928, %rd927, %rd921;
xor.b64 %rd929, %rd928, %rd2514;
mul.lo.s64 %rd930, %rd929, %rd2503;
shr.u64 %rd931, %rd930, 32;
shr.u64 %rd932, %rd926, 32;
and.b64 %rd933, %rd900, 4294967295;
xor.b64 %rd934, %rd933, %rd932;
xor.b64 %rd935, %rd934, %rd2515;
mul.lo.s64 %rd936, %rd935, %rd2503;
and.b64 %rd937, %rd936, 4294967295;
xor.b64 %rd938, %rd937, %rd931;
xor.b64 %rd939, %rd938, %rd2516;
mul.lo.s64 %rd940, %rd939, %rd2505;
shr.u64 %rd941, %rd940, 32;
shr.u64 %rd942, %rd936, 32;
and.b64 %rd943, %rd910, 4294967295;
xor.b64 %rd944, %rd943, %rd942;
xor.b64 %rd945, %rd944, %rd2517;
mul.lo.s64 %rd946, %rd945, %rd2505;
and.b64 %rd947, %rd946, 4294967295;
xor.b64 %rd948, %rd947, %rd941;
xor.b64 %rd949, %rd948, %rd2518;
mul.lo.s64 %rd950, %rd949, %rd2503;
shr.u64 %rd951, %rd950, 32;
shr.u64 %rd952, %rd946, 32;
xor.b64 %rd953, %rd920, %rd952;
xor.b64 %rd954, %rd953, %rd2519;
mul.lo.s64 %rd955, %rd954, %rd2503;
xor.b64 %rd956, %rd951, %rd955;
cvt.u32.u64 %r97, %rd956;
xor.b32 %r98, %r318, %r97;
mul.lo.s32 %r99, %r98, %r319;
shr.u32 %r100, %r99, 9;
cvt.rn.f32.u32 %f70, %r100;
mul.rn.f32 %f71, %f70, 0f34000000;
cvt.rn.f16.f32 %h28, %f71;
mov.b16 %h29, 0x2E66;
setp.ge.f16 %p17, %h28, %h29;
ld.global.nc.b16 %h30, [%rd45+258];
ld.global.nc.f32 %f72, [%rd46+516];
cvt.rn.f16.f32 %h31, %f72;
add.rn.f16 %h32, %h30, %h31;
mov.b16 %h33, 0x3C72;
mul.rn.f16 %h34, %h32, %h33;
selp.b16 %h35, %h34, 0x0000, %p17;
cvt.f32.f16 %f73, %h35;
ld.global.nc.b16 %h36, [%rd47+258];
cvt.f32.f16 %f74, %h36;
ld.global.nc.f32 %f75, [%rd48+516];
mul.rn.f32 %f76, %f1, %f75;
mul.rn.f32 %f77, %f76, %f74;
ld.global.nc.f32 %f78, [%rd49+516];
mul.rn.f32 %f79, %f2, %f76;
sub.rn.f32 %f80, %f78, %f79;
add.rn.f32 %f81, %f77, %f80;
add.rn.f32 %f82, %f81, %f73;
sub.rn.f32 %f83, %f82, %f3;
mul.rn.f32 %f84, %f83, %f83;
add.rn.f32 %f7, %f6, %f84;
or.b32 %r102, %r73, 256;
shr.u32 %r103, %r102, 2;
cvt.u64.u32 %rd957, %r103;
add.s64 %rd131, %rd12, %rd957;
and.b64 %rd2429, %rd131, 4294967295;
setp.lt.u64 %p72, %rd131, %rd12;
@%p8 bra LBB55_16;
mul.lo.s64 %rd2524, %rd2429, 3528531795;
selp.u64 %rd1000, 1, 0, %p72;
add.s64 %rd1001, %rd2464, %rd1000;
xor.b64 %rd1002, %rd1001, %rd2524;
shr.u64 %rd1003, %rd1002, 32;
mul.lo.s64 %rd2527, %rd1003, 3449720151;
shr.u64 %rd1004, %rd2527, 32;
and.b64 %rd1005, %rd1001, 4294967295;
mul.lo.s64 %rd1006, %rd1005, 3449720151;
and.b64 %rd1007, %rd1006, 4294967295;
xor.b64 %rd1008, %rd1007, %rd1004;
xor.b64 %rd1009, %rd1008, 2654435769;
mul.lo.s64 %rd2530, %rd1009, 3528531795;
xor.b64 %rd2520, %rd1006, %rd131;
mov.u32 %r322, -1879881855;
mov.u32 %r321, -845247145;
mov.u32 %r320, 534103459;
mov.u64 %rd2538, 3678237736;
mov.u64 %rd2537, 3041712726;
mov.u64 %rd2536, 1401181199;
mov.u64 %rd2535, 2835769497;
mov.u64 %rd2534, 1684936478;
mov.u64 %rd2533, 2027808484;
mov.u64 %rd2532, 387276957;
mov.u64 %rd2531, 842468239;
mov.u64 %rd2529, 3986602516;
mov.u64 %rd2528, 1013904242;
mov.u64 %rd2526, 3668340011;
mov.u64 %rd2525, 3144134277;
mov.u64 %rd2523, 3449720151;
mov.u64 %rd2522, 1993301258;
mov.u64 %rd2521, 3528531795;
bra.uni LBB55_17;
LBB55_16:
selp.u64 %rd973, 1, 0, %p72;
add.s64 %rd974, %rd2464, %rd973;
and.b64 %rd975, %rd974, 4294967295;
mul.lo.s64 %rd2524, %rd975, 3449720151;
xor.b64 %rd976, %rd2524, %rd131;
shr.u64 %rd977, %rd976, 32;
mul.lo.s64 %rd2527, %rd977, 3528531795;
shr.u64 %rd978, %rd2527, 32;
mul.lo.s64 %rd980, %rd2429, 3528531795;
and.b64 %rd981, %rd980, 4294967295;
xor.b64 %rd982, %rd981, %rd978;
xor.b64 %rd983, %rd982, 3144134277;
mul.lo.s64 %rd2530, %rd983, 3449720151;
xor.b64 %rd2520, %rd974, %rd980;
mov.u32 %r322, -1767562579;
mov.u32 %r321, -766435501;
mov.u32 %r320, 1401181199;
mov.u64 %rd2538, 4055616968;
mov.u64 %rd2537, 1684936478;
mov.u64 %rd2536, 534103459;
mov.u64 %rd2535, 387276957;
mov.u64 %rd2534, 3041712726;
mov.u64 %rd2533, 3986602516;
mov.u64 %rd2532, 2835769497;
mov.u64 %rd2531, 3668340011;
mov.u64 %rd2529, 2027808484;
mov.u64 %rd2528, 1993301258;
mov.u64 %rd2526, 842468239;
mov.u64 %rd2525, 2654435769;
mov.u64 %rd2523, 3528531795;
mov.u64 %rd2522, 1013904242;
mov.u64 %rd2521, 3449720151;
LBB55_17:
shr.u64 %rd1010, %rd2530, 32;
shr.u64 %rd1011, %rd2520, 32;
mul.lo.s64 %rd1012, %rd1011, %rd2521;
and.b64 %rd1013, %rd1012, 4294967295;
xor.b64 %rd1014, %rd1013, %rd1010;
xor.b64 %rd1015, %rd1014, %rd2522;
mul.lo.s64 %rd1016, %rd1015, %rd2523;
shr.u64 %rd1017, %rd1016, 32;
shr.u64 %rd1018, %rd1012, 32;
and.b64 %rd1019, %rd2524, 4294967295;
xor.b64 %rd1020, %rd1019, %rd1018;
xor.b64 %rd1021, %rd1020, %rd2525;
mul.lo.s64 %rd1022, %rd1021, %rd2523;
and.b64 %rd1023, %rd1022, 4294967295;
xor.b64 %rd1024, %rd1023, %rd1017;
xor.b64 %rd1025, %rd1024, %rd2526;
mul.lo.s64 %rd1026, %rd1025, %rd2521;
shr.u64 %rd1027, %rd1026, 32;
shr.u64 %rd1028, %rd1022, 32;
and.b64 %rd1029, %rd2527, 4294967295;
xor.b64 %rd1030, %rd1029, %rd1028;
xor.b64 %rd1031, %rd1030, %rd2528;
mul.lo.s64 %rd1032, %rd1031, %rd2521;
and.b64 %rd1033, %rd1032, 4294967295;
xor.b64 %rd1034, %rd1033, %rd1027;
xor.b64 %rd1035, %rd1034, %rd2529;
mul.lo.s64 %rd1036, %rd1035, %rd2523;
shr.u64 %rd1037, %rd1036, 32;
shr.u64 %rd1038, %rd1032, 32;
and.b64 %rd1039, %rd2530, 4294967295;
xor.b64 %rd1040, %rd1039, %rd1038;
xor.b64 %rd1041, %rd1040, %rd2531;
mul.lo.s64 %rd1042, %rd1041, %rd2523;
and.b64 %rd1043, %rd1042, 4294967295;
xor.b64 %rd1044, %rd1043, %rd1037;
xor.b64 %rd1045, %rd1044, %rd2532;
mul.lo.s64 %rd1046, %rd1045, %rd2521;
shr.u64 %rd1047, %rd1046, 32;
shr.u64 %rd1048, %rd1042, 32;
and.b64 %rd1049, %rd1016, 4294967295;
xor.b64 %rd1050, %rd1049, %rd1048;
xor.b64 %rd1051, %rd1050, %rd2533;
mul.lo.s64 %rd1052, %rd1051, %rd2521;
and.b64 %rd1053, %rd1052, 4294967295;
xor.b64 %rd1054, %rd1053, %rd1047;
xor.b64 %rd1055, %rd1054, %rd2534;
mul.lo.s64 %rd1056, %rd1055, %rd2523;
shr.u64 %rd1057, %rd1056, 32;
shr.u64 %rd1058, %rd1052, 32;
and.b64 %rd1059, %rd1026, 4294967295;
xor.b64 %rd1060, %rd1059, %rd1058;
xor.b64 %rd1061, %rd1060, %rd2535;
mul.lo.s64 %rd1062, %rd1061, %rd2523;
and.b64 %rd1063, %rd1062, 4294967295;
xor.b64 %rd1064, %rd1063, %rd1057;
xor.b64 %rd1065, %rd1064, %rd2536;
mul.lo.s64 %rd1066, %rd1065, %rd2521;
shr.u64 %rd1067, %rd1066, 32;
shr.u64 %rd1068, %rd1062, 32;
and.b64 %rd1069, %rd1036, 4294967295;
xor.b64 %rd1070, %rd1069, %rd1068;
xor.b64 %rd1071, %rd1070, %rd2537;
mul.lo.s64 %rd1072, %rd1071, %rd2521;
and.b64 %rd1073, %rd1072, 4294967295;
xor.b64 %rd1074, %rd1073, %rd1067;
xor.b64 %rd1075, %rd1074, %rd2538;
mul.lo.s64 %rd1076, %rd1075, %rd2523;
shr.u64 %rd1077, %rd1076, 32;
cvt.u32.u64 %r110, %rd1077;
shr.u64 %rd1078, %rd1072, 32;
xor.b64 %rd1079, %rd1078, %rd1046;
cvt.u32.u64 %r111, %rd1079;
xor.b32 %r112, %r320, %r111;
mul.lo.s32 %r113, %r112, %r321;
xor.b32 %r114, %r113, %r110;
xor.b32 %r115, %r114, %r322;
shr.u32 %r116, %r115, 9;
cvt.rn.f32.u32 %f85, %r116;
mul.rn.f32 %f86, %f85, 0f34000000;
cvt.rn.f16.f32 %h37, %f86;
mov.b16 %h38, 0x2E66;
setp.ge.f16 %p20, %h37, %h38;
ld.global.nc.b16 %h39, [%rd45+512];
ld.global.nc.f32 %f87, [%rd46+1024];
cvt.rn.f16.f32 %h40, %f87;
add.rn.f16 %h41, %h39, %h40;
mov.b16 %h42, 0x3C72;
mul.rn.f16 %h43, %h41, %h42;
selp.b16 %h44, %h43, 0x0000, %p20;
cvt.f32.f16 %f88, %h44;
ld.global.nc.b16 %h45, [%rd47+512];
cvt.f32.f16 %f89, %h45;
ld.global.nc.f32 %f90, [%rd48+1024];
mul.rn.f32 %f91, %f1, %f90;
mul.rn.f32 %f92, %f91, %f89;
ld.global.nc.f32 %f93, [%rd49+1024];
mul.rn.f32 %f94, %f2, %f91;
sub.rn.f32 %f95, %f93, %f94;
add.rn.f32 %f96, %f92, %f95;
add.rn.f32 %f97, %f96, %f88;
sub.rn.f32 %f98, %f97, %f3;
mul.rn.f32 %f99, %f98, %f98;
add.rn.f32 %f8, %f7, %f99;
or.b32 %r117, %r3, 257;
or.b32 %r118, %r117, %r4;
and.b32 %r119, %r117, 3;
shr.u32 %r120, %r118, 2;
setp.ne.s32 %p21, %r119, 1;
cvt.u64.u32 %rd1080, %r120;
add.s64 %rd159, %rd12, %rd1080;
and.b64 %rd2426, %rd159, 4294967295;
setp.lt.u64 %p71, %rd159, %rd12;
@%p21 bra LBB55_19;
// Fall-through arm: reached when the `@%p21 bra LBB55_19` above is not taken.
// Seeds the mixing state (%rd2539, %rd2543, %rd2546, %rd2549) from the 64-bit
// counter %rd159, its low 32 bits %rd2426, and %rd2464 (+1 when the add that
// produced %rd159 wrapped, per %p71), then loads this arm's multipliers and
// xor constants for the shared code at LBB55_20.
// NOTE(review): 3528531795 (0xD2511F53), 3449720151 (0xCD9E8D57),
// 2654435769 (0x9E3779B9) and 3144134277 (0xBB67AE85) match the Philox4x32
// constants -- presumably an inlined Philox RNG; confirm against the emitter.
mul.lo.s64 %rd2543, %rd2426, 3528531795;
selp.u64 %rd1121, 1, 0, %p71;
add.s64 %rd1122, %rd2464, %rd1121;
xor.b64 %rd1123, %rd1122, %rd2543;
shr.u64 %rd1124, %rd1123, 32;
mul.lo.s64 %rd2546, %rd1124, 3449720151;
shr.u64 %rd1125, %rd2546, 32;
and.b64 %rd1126, %rd1122, 4294967295;
mul.lo.s64 %rd1127, %rd1126, 3449720151;
and.b64 %rd1128, %rd1127, 4294967295;
xor.b64 %rd1129, %rd1128, %rd1125;
xor.b64 %rd1130, %rd1129, 2654435769;
mul.lo.s64 %rd2549, %rd1130, 3528531795;
xor.b64 %rd2539, %rd1127, %rd159;
// Per-arm constants read at LBB55_20; %rd2540 and %rd2542 are the two
// multipliers there, the rest are xor operands.
mov.u32 %r324, -845247145;
mov.u32 %r323, -616729560;
mov.u64 %rd2556, 3041712726;
mov.u64 %rd2555, 1401181199;
mov.u64 %rd2554, 2835769497;
mov.u64 %rd2553, 1684936478;
mov.u64 %rd2552, 2027808484;
mov.u64 %rd2551, 387276957;
mov.u64 %rd2550, 842468239;
mov.u64 %rd2548, 3986602516;
mov.u64 %rd2547, 1013904242;
mov.u64 %rd2545, 3668340011;
mov.u64 %rd2544, 3144134277;
mov.u64 %rd2542, 3449720151;
mov.u64 %rd2541, 1993301258;
mov.u64 %rd2540, 3528531795;
bra.uni LBB55_20;
// Arm taken when %p21 is true. Produces the same four state registers as the
// fall-through arm (%rd2539, %rd2543, %rd2546, %rd2549) but with the counter
// word and the %rd2464 word, and the two multipliers 3449720151 / 3528531795,
// in exchanged roles, and with a different constant set. Falls through into
// LBB55_20.
LBB55_19:
// %rd1096 = %rd2464 + 1 if the add producing %rd159 wrapped (%p71), else + 0.
selp.u64 %rd1095, 1, 0, %p71;
add.s64 %rd1096, %rd2464, %rd1095;
and.b64 %rd1097, %rd1096, 4294967295;
mul.lo.s64 %rd2543, %rd1097, 3449720151;
xor.b64 %rd1098, %rd2543, %rd159;
shr.u64 %rd1099, %rd1098, 32;
mul.lo.s64 %rd2546, %rd1099, 3528531795;
shr.u64 %rd1100, %rd2546, 32;
mul.lo.s64 %rd1102, %rd2426, 3528531795;
and.b64 %rd1103, %rd1102, 4294967295;
xor.b64 %rd1104, %rd1103, %rd1100;
xor.b64 %rd1105, %rd1104, 3144134277;
mul.lo.s64 %rd2549, %rd1105, 3449720151;
xor.b64 %rd2539, %rd1096, %rd1102;
// Per-arm constants read at LBB55_20 (%rd2540 / %rd2542 are the multipliers).
mov.u32 %r324, -766435501;
mov.u32 %r323, -239350328;
mov.u64 %rd2556, 1684936478;
mov.u64 %rd2555, 534103459;
mov.u64 %rd2554, 387276957;
mov.u64 %rd2553, 3041712726;
mov.u64 %rd2552, 3986602516;
mov.u64 %rd2551, 2835769497;
mov.u64 %rd2550, 3668340011;
mov.u64 %rd2548, 2027808484;
mov.u64 %rd2547, 1993301258;
mov.u64 %rd2545, 842468239;
mov.u64 %rd2544, 2654435769;
mov.u64 %rd2542, 3528531795;
mov.u64 %rd2541, 1013904242;
mov.u64 %rd2540, 3449720151;
// Join point for both arms. Runs a fixed chain of shift / mask / xor /
// multiply steps over the state seeded by the arms, alternating the
// multipliers %rd2540 and %rd2542 and xoring in the per-arm constants. The
// result becomes a random f16 that keeps or zeroes one element, and that
// element's squared deviation is added to the running sum (%f8 -> %f9).
// %f1, %f2, %f3 and %rd45..%rd49 are defined outside this excerpt.
LBB55_20:
shr.u64 %rd1131, %rd2549, 32;
shr.u64 %rd1132, %rd2539, 32;
mul.lo.s64 %rd1133, %rd1132, %rd2540;
and.b64 %rd1134, %rd1133, 4294967295;
xor.b64 %rd1135, %rd1134, %rd1131;
xor.b64 %rd1136, %rd1135, %rd2541;
mul.lo.s64 %rd1137, %rd1136, %rd2542;
shr.u64 %rd1138, %rd1137, 32;
shr.u64 %rd1139, %rd1133, 32;
and.b64 %rd1140, %rd2543, 4294967295;
xor.b64 %rd1141, %rd1140, %rd1139;
xor.b64 %rd1142, %rd1141, %rd2544;
mul.lo.s64 %rd1143, %rd1142, %rd2542;
and.b64 %rd1144, %rd1143, 4294967295;
xor.b64 %rd1145, %rd1144, %rd1138;
xor.b64 %rd1146, %rd1145, %rd2545;
mul.lo.s64 %rd1147, %rd1146, %rd2540;
shr.u64 %rd1148, %rd1147, 32;
shr.u64 %rd1149, %rd1143, 32;
and.b64 %rd1150, %rd2546, 4294967295;
xor.b64 %rd1151, %rd1150, %rd1149;
xor.b64 %rd1152, %rd1151, %rd2547;
mul.lo.s64 %rd1153, %rd1152, %rd2540;
and.b64 %rd1154, %rd1153, 4294967295;
xor.b64 %rd1155, %rd1154, %rd1148;
xor.b64 %rd1156, %rd1155, %rd2548;
mul.lo.s64 %rd1157, %rd1156, %rd2542;
shr.u64 %rd1158, %rd1157, 32;
shr.u64 %rd1159, %rd1153, 32;
and.b64 %rd1160, %rd2549, 4294967295;
xor.b64 %rd1161, %rd1160, %rd1159;
xor.b64 %rd1162, %rd1161, %rd2550;
mul.lo.s64 %rd1163, %rd1162, %rd2542;
and.b64 %rd1164, %rd1163, 4294967295;
xor.b64 %rd1165, %rd1164, %rd1158;
xor.b64 %rd1166, %rd1165, %rd2551;
mul.lo.s64 %rd1167, %rd1166, %rd2540;
shr.u64 %rd1168, %rd1167, 32;
shr.u64 %rd1169, %rd1163, 32;
and.b64 %rd1170, %rd1137, 4294967295;
xor.b64 %rd1171, %rd1170, %rd1169;
xor.b64 %rd1172, %rd1171, %rd2552;
mul.lo.s64 %rd1173, %rd1172, %rd2540;
and.b64 %rd1174, %rd1173, 4294967295;
xor.b64 %rd1175, %rd1174, %rd1168;
xor.b64 %rd1176, %rd1175, %rd2553;
mul.lo.s64 %rd1177, %rd1176, %rd2542;
shr.u64 %rd1178, %rd1177, 32;
shr.u64 %rd1179, %rd1173, 32;
and.b64 %rd1180, %rd1147, 4294967295;
xor.b64 %rd1181, %rd1180, %rd1179;
xor.b64 %rd1182, %rd1181, %rd2554;
mul.lo.s64 %rd1183, %rd1182, %rd2542;
and.b64 %rd1184, %rd1183, 4294967295;
xor.b64 %rd1185, %rd1184, %rd1178;
xor.b64 %rd1186, %rd1185, %rd2555;
mul.lo.s64 %rd1187, %rd1186, %rd2540;
shr.u64 %rd1188, %rd1187, 32;
shr.u64 %rd1189, %rd1183, 32;
xor.b64 %rd1190, %rd1157, %rd1189;
xor.b64 %rd1191, %rd1190, %rd2556;
mul.lo.s64 %rd1192, %rd1191, %rd2540;
xor.b64 %rd1193, %rd1188, %rd1192;
// Reduce to one 32-bit word: %r127 = (%r323 ^ lo32(%rd1193)) * %r324.
cvt.u32.u64 %r125, %rd1193;
xor.b32 %r126, %r323, %r125;
mul.lo.s32 %r127, %r126, %r324;
// Top 23 bits times 0f34000000 (2^-23) gives an f32 in [0, 1).
shr.u32 %r128, %r127, 9;
cvt.rn.f32.u32 %f100, %r128;
mul.rn.f32 %f101, %f100, 0f34000000;
// %p25 = (random as f16) >= 0x2E66 (about 0.1).
// NOTE(review): together with the 0x3C72 (about 1.111) scale below this looks
// like dropout with rate 0.1 -- confirm against the source HLO.
cvt.rn.f16.f32 %h46, %f101;
mov.b16 %h47, 0x2E66;
setp.ge.f16 %p25, %h46, %h47;
// %h53 = %p25 ? (%h48 + f16(%f102)) * 0x3C72 : 0
ld.global.nc.b16 %h48, [%rd45+514];
ld.global.nc.f32 %f102, [%rd46+1028];
cvt.rn.f16.f32 %h49, %f102;
add.rn.f16 %h50, %h48, %h49;
mov.b16 %h51, 0x3C72;
mul.rn.f16 %h52, %h50, %h51;
selp.b16 %h53, %h52, 0x0000, %p25;
cvt.f32.f16 %f103, %h53;
// %f112 = (%f1*%f105)*%f104 + (%f108 - %f2*(%f1*%f105)) + %f103
ld.global.nc.b16 %h54, [%rd47+514];
cvt.f32.f16 %f104, %h54;
ld.global.nc.f32 %f105, [%rd48+1028];
mul.rn.f32 %f106, %f1, %f105;
mul.rn.f32 %f107, %f106, %f104;
ld.global.nc.f32 %f108, [%rd49+1028];
mul.rn.f32 %f109, %f2, %f106;
sub.rn.f32 %f110, %f108, %f109;
add.rn.f32 %f111, %f107, %f110;
add.rn.f32 %f112, %f111, %f103;
// Running sum: %f9 = %f8 + (%f112 - %f3)^2.
sub.rn.f32 %f113, %f112, %f3;
mul.rn.f32 %f114, %f113, %f113;
add.rn.f32 %f9, %f8, %f114;
// Counter for the next element: %rd186 = %rd12 + ((%r73 | 384) >> 2);
// %rd2422 is its low 32 bits and %p70 records unsigned wrap of the add.
// %p8 (set outside this excerpt) selects the arm.
or.b32 %r130, %r73, 384;
shr.u32 %r131, %r130, 2;
cvt.u64.u32 %rd1194, %r131;
add.s64 %rd186, %rd12, %rd1194;
and.b64 %rd2422, %rd186, 4294967295;
setp.lt.u64 %p70, %rd186, %rd12;
@%p8 bra LBB55_22;
// Fall-through arm: reached when the `@%p8 bra LBB55_22` above is not taken.
// Seeds the mixing state (%rd2557, %rd2561, %rd2564, %rd2567) from the
// counter %rd186, its low 32 bits %rd2422, and %rd2464 (+1 when the add that
// produced %rd186 wrapped, per %p70), then loads this arm's multipliers and
// xor constants for the shared code at LBB55_23.
// NOTE(review): 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) match the
// Philox4x32 multipliers -- presumably an inlined Philox RNG; confirm.
mul.lo.s64 %rd2561, %rd2422, 3528531795;
selp.u64 %rd1237, 1, 0, %p70;
add.s64 %rd1238, %rd2464, %rd1237;
xor.b64 %rd1239, %rd1238, %rd2561;
shr.u64 %rd1240, %rd1239, 32;
mul.lo.s64 %rd2564, %rd1240, 3449720151;
shr.u64 %rd1241, %rd2564, 32;
and.b64 %rd1242, %rd1238, 4294967295;
mul.lo.s64 %rd1243, %rd1242, 3449720151;
and.b64 %rd1244, %rd1243, 4294967295;
xor.b64 %rd1245, %rd1244, %rd1241;
xor.b64 %rd1246, %rd1245, 2654435769;
mul.lo.s64 %rd2567, %rd1246, 3528531795;
xor.b64 %rd2557, %rd1243, %rd186;
// Per-arm constants read at LBB55_23; %rd2558 and %rd2560 are the two
// multipliers there, the rest are xor operands.
mov.u32 %r327, -1879881855;
mov.u32 %r326, -845247145;
mov.u32 %r325, 534103459;
mov.u64 %rd2575, 3678237736;
mov.u64 %rd2574, 3041712726;
mov.u64 %rd2573, 1401181199;
mov.u64 %rd2572, 2835769497;
mov.u64 %rd2571, 1684936478;
mov.u64 %rd2570, 2027808484;
mov.u64 %rd2569, 387276957;
mov.u64 %rd2568, 842468239;
mov.u64 %rd2566, 3986602516;
mov.u64 %rd2565, 1013904242;
mov.u64 %rd2563, 3668340011;
mov.u64 %rd2562, 3144134277;
mov.u64 %rd2560, 3449720151;
mov.u64 %rd2559, 1993301258;
mov.u64 %rd2558, 3528531795;
bra.uni LBB55_23;
// Arm taken when %p8 is true. Produces the same four state registers as the
// fall-through arm (%rd2557, %rd2561, %rd2564, %rd2567) but with the counter
// word and the %rd2464 word, and the two multipliers, in exchanged roles, and
// with a different constant set. Falls through into LBB55_23.
LBB55_22:
// %rd1211 = %rd2464 + 1 if the add producing %rd186 wrapped (%p70), else + 0.
selp.u64 %rd1210, 1, 0, %p70;
add.s64 %rd1211, %rd2464, %rd1210;
and.b64 %rd1212, %rd1211, 4294967295;
mul.lo.s64 %rd2561, %rd1212, 3449720151;
xor.b64 %rd1213, %rd2561, %rd186;
shr.u64 %rd1214, %rd1213, 32;
mul.lo.s64 %rd2564, %rd1214, 3528531795;
shr.u64 %rd1215, %rd2564, 32;
mul.lo.s64 %rd1217, %rd2422, 3528531795;
and.b64 %rd1218, %rd1217, 4294967295;
xor.b64 %rd1219, %rd1218, %rd1215;
xor.b64 %rd1220, %rd1219, 3144134277;
mul.lo.s64 %rd2567, %rd1220, 3449720151;
xor.b64 %rd2557, %rd1211, %rd1217;
// Per-arm constants read at LBB55_23 (%rd2558 / %rd2560 are the multipliers).
mov.u32 %r327, -1767562579;
mov.u32 %r326, -766435501;
mov.u32 %r325, 1401181199;
mov.u64 %rd2575, 4055616968;
mov.u64 %rd2574, 1684936478;
mov.u64 %rd2573, 534103459;
mov.u64 %rd2572, 387276957;
mov.u64 %rd2571, 3041712726;
mov.u64 %rd2570, 3986602516;
mov.u64 %rd2569, 2835769497;
mov.u64 %rd2568, 3668340011;
mov.u64 %rd2566, 2027808484;
mov.u64 %rd2565, 1993301258;
mov.u64 %rd2563, 842468239;
mov.u64 %rd2562, 2654435769;
mov.u64 %rd2560, 3528531795;
mov.u64 %rd2559, 1013904242;
mov.u64 %rd2558, 3449720151;
// Join point for both arms. Runs a fixed chain of shift / mask / xor /
// multiply steps over the state seeded by the arms, alternating the
// multipliers %rd2558 and %rd2560 and xoring in the per-arm constants. The
// result becomes a random f16 that keeps or zeroes one element, and that
// element's squared deviation is added to the running sum (%f9 -> %f10).
// %f1, %f2, %f3 and %rd45..%rd49 are defined outside this excerpt.
LBB55_23:
shr.u64 %rd1247, %rd2567, 32;
shr.u64 %rd1248, %rd2557, 32;
mul.lo.s64 %rd1249, %rd1248, %rd2558;
and.b64 %rd1250, %rd1249, 4294967295;
xor.b64 %rd1251, %rd1250, %rd1247;
xor.b64 %rd1252, %rd1251, %rd2559;
mul.lo.s64 %rd1253, %rd1252, %rd2560;
shr.u64 %rd1254, %rd1253, 32;
shr.u64 %rd1255, %rd1249, 32;
and.b64 %rd1256, %rd2561, 4294967295;
xor.b64 %rd1257, %rd1256, %rd1255;
xor.b64 %rd1258, %rd1257, %rd2562;
mul.lo.s64 %rd1259, %rd1258, %rd2560;
and.b64 %rd1260, %rd1259, 4294967295;
xor.b64 %rd1261, %rd1260, %rd1254;
xor.b64 %rd1262, %rd1261, %rd2563;
mul.lo.s64 %rd1263, %rd1262, %rd2558;
shr.u64 %rd1264, %rd1263, 32;
shr.u64 %rd1265, %rd1259, 32;
and.b64 %rd1266, %rd2564, 4294967295;
xor.b64 %rd1267, %rd1266, %rd1265;
xor.b64 %rd1268, %rd1267, %rd2565;
mul.lo.s64 %rd1269, %rd1268, %rd2558;
and.b64 %rd1270, %rd1269, 4294967295;
xor.b64 %rd1271, %rd1270, %rd1264;
xor.b64 %rd1272, %rd1271, %rd2566;
mul.lo.s64 %rd1273, %rd1272, %rd2560;
shr.u64 %rd1274, %rd1273, 32;
shr.u64 %rd1275, %rd1269, 32;
and.b64 %rd1276, %rd2567, 4294967295;
xor.b64 %rd1277, %rd1276, %rd1275;
xor.b64 %rd1278, %rd1277, %rd2568;
mul.lo.s64 %rd1279, %rd1278, %rd2560;
and.b64 %rd1280, %rd1279, 4294967295;
xor.b64 %rd1281, %rd1280, %rd1274;
xor.b64 %rd1282, %rd1281, %rd2569;
mul.lo.s64 %rd1283, %rd1282, %rd2558;
shr.u64 %rd1284, %rd1283, 32;
shr.u64 %rd1285, %rd1279, 32;
and.b64 %rd1286, %rd1253, 4294967295;
xor.b64 %rd1287, %rd1286, %rd1285;
xor.b64 %rd1288, %rd1287, %rd2570;
mul.lo.s64 %rd1289, %rd1288, %rd2558;
and.b64 %rd1290, %rd1289, 4294967295;
xor.b64 %rd1291, %rd1290, %rd1284;
xor.b64 %rd1292, %rd1291, %rd2571;
mul.lo.s64 %rd1293, %rd1292, %rd2560;
shr.u64 %rd1294, %rd1293, 32;
shr.u64 %rd1295, %rd1289, 32;
and.b64 %rd1296, %rd1263, 4294967295;
xor.b64 %rd1297, %rd1296, %rd1295;
xor.b64 %rd1298, %rd1297, %rd2572;
mul.lo.s64 %rd1299, %rd1298, %rd2560;
and.b64 %rd1300, %rd1299, 4294967295;
xor.b64 %rd1301, %rd1300, %rd1294;
xor.b64 %rd1302, %rd1301, %rd2573;
mul.lo.s64 %rd1303, %rd1302, %rd2558;
shr.u64 %rd1304, %rd1303, 32;
shr.u64 %rd1305, %rd1299, 32;
and.b64 %rd1306, %rd1273, 4294967295;
xor.b64 %rd1307, %rd1306, %rd1305;
xor.b64 %rd1308, %rd1307, %rd2574;
mul.lo.s64 %rd1309, %rd1308, %rd2558;
and.b64 %rd1310, %rd1309, 4294967295;
xor.b64 %rd1311, %rd1310, %rd1304;
xor.b64 %rd1312, %rd1311, %rd2575;
mul.lo.s64 %rd1313, %rd1312, %rd2560;
// Reduce to one 32-bit word:
// %r143 = ((%r325 ^ lo32(%rd1316)) * %r326) ^ hi32(%rd1313) ^ %r327.
shr.u64 %rd1314, %rd1313, 32;
cvt.u32.u64 %r138, %rd1314;
shr.u64 %rd1315, %rd1309, 32;
xor.b64 %rd1316, %rd1315, %rd1283;
cvt.u32.u64 %r139, %rd1316;
xor.b32 %r140, %r325, %r139;
mul.lo.s32 %r141, %r140, %r326;
xor.b32 %r142, %r141, %r138;
xor.b32 %r143, %r142, %r327;
// Top 23 bits times 0f34000000 (2^-23) gives an f32 in [0, 1).
shr.u32 %r144, %r143, 9;
cvt.rn.f32.u32 %f115, %r144;
mul.rn.f32 %f116, %f115, 0f34000000;
// %p28 = (random as f16) >= 0x2E66 (about 0.1).
// NOTE(review): together with the 0x3C72 (about 1.111) scale below this looks
// like dropout with rate 0.1 -- confirm against the source HLO.
cvt.rn.f16.f32 %h55, %f116;
mov.b16 %h56, 0x2E66;
setp.ge.f16 %p28, %h55, %h56;
// %h62 = %p28 ? (%h57 + f16(%f117)) * 0x3C72 : 0
ld.global.nc.b16 %h57, [%rd45+768];
ld.global.nc.f32 %f117, [%rd46+1536];
cvt.rn.f16.f32 %h58, %f117;
add.rn.f16 %h59, %h57, %h58;
mov.b16 %h60, 0x3C72;
mul.rn.f16 %h61, %h59, %h60;
selp.b16 %h62, %h61, 0x0000, %p28;
cvt.f32.f16 %f118, %h62;
// %f127 = (%f1*%f120)*%f119 + (%f123 - %f2*(%f1*%f120)) + %f118
ld.global.nc.b16 %h63, [%rd47+768];
cvt.f32.f16 %f119, %h63;
ld.global.nc.f32 %f120, [%rd48+1536];
mul.rn.f32 %f121, %f1, %f120;
mul.rn.f32 %f122, %f121, %f119;
ld.global.nc.f32 %f123, [%rd49+1536];
mul.rn.f32 %f124, %f2, %f121;
sub.rn.f32 %f125, %f123, %f124;
add.rn.f32 %f126, %f122, %f125;
add.rn.f32 %f127, %f126, %f118;
// Running sum: %f10 = %f9 + (%f127 - %f3)^2.
sub.rn.f32 %f128, %f127, %f3;
mul.rn.f32 %f129, %f128, %f128;
add.rn.f32 %f10, %f9, %f129;
// Counter for the next element: %rd214 = %rd12 + (((%r3 | 385) | %r4) >> 2).
// %p29 = ((%r3 | 385) & 3) != 1 selects the arm.
or.b32 %r145, %r3, 385;
or.b32 %r146, %r145, %r4;
and.b32 %r147, %r145, 3;
shr.u32 %r148, %r146, 2;
setp.ne.s32 %p29, %r147, 1;
cvt.u64.u32 %rd1317, %r148;
add.s64 %rd214, %rd12, %rd1317;
@%p29 bra LBB55_25;
// Fall-through arm: reached when the `@%p29 bra LBB55_25` above is not taken.
// Seeds the mixing state (%rd2576, %rd2580, %rd2583, %rd2586) from the
// counter %rd214 and %rd2464, then loads this arm's multipliers and xor
// constants for the shared code at LBB55_26.
// NOTE(review): 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) match the
// Philox4x32 multipliers -- presumably an inlined Philox RNG; confirm.
and.b64 %rd1357, %rd214, 4294967295;
mul.lo.s64 %rd2580, %rd1357, 3528531795;
// %p31 is true when %rd214 = %rd12 + offset wrapped; the carry goes to %rd2464.
setp.lt.u64 %p31, %rd214, %rd12;
selp.u64 %rd1358, 1, 0, %p31;
add.s64 %rd1359, %rd2464, %rd1358;
xor.b64 %rd1360, %rd1359, %rd2580;
shr.u64 %rd1361, %rd1360, 32;
mul.lo.s64 %rd2583, %rd1361, 3449720151;
shr.u64 %rd1362, %rd2583, 32;
and.b64 %rd1363, %rd1359, 4294967295;
mul.lo.s64 %rd1364, %rd1363, 3449720151;
and.b64 %rd1365, %rd1364, 4294967295;
xor.b64 %rd1366, %rd1365, %rd1362;
xor.b64 %rd1367, %rd1366, 2654435769;
mul.lo.s64 %rd2586, %rd1367, 3528531795;
xor.b64 %rd2576, %rd1364, %rd214;
// Per-arm constants read at LBB55_26; %rd2577 and %rd2579 are the two
// multipliers there, the rest are xor operands.
mov.u32 %r329, -845247145;
mov.u32 %r328, -616729560;
mov.u64 %rd2593, 3041712726;
mov.u64 %rd2592, 1401181199;
mov.u64 %rd2591, 2835769497;
mov.u64 %rd2590, 1684936478;
mov.u64 %rd2589, 2027808484;
mov.u64 %rd2588, 387276957;
mov.u64 %rd2587, 842468239;
mov.u64 %rd2585, 3986602516;
mov.u64 %rd2584, 1013904242;
mov.u64 %rd2582, 3668340011;
mov.u64 %rd2581, 3144134277;
mov.u64 %rd2579, 3449720151;
mov.u64 %rd2578, 1993301258;
mov.u64 %rd2577, 3528531795;
bra.uni LBB55_26;
// Arm taken when %p29 is true. Produces the same four state registers as the
// fall-through arm (%rd2576, %rd2580, %rd2583, %rd2586) but with the counter
// word and the %rd2464 word, and the two multipliers, in exchanged roles, and
// with a different constant set. Falls through into LBB55_26.
LBB55_25:
// %p30 is true when %rd214 = %rd12 + offset wrapped; the carry goes to %rd2464.
setp.lt.u64 %p30, %rd214, %rd12;
selp.u64 %rd1332, 1, 0, %p30;
add.s64 %rd1333, %rd2464, %rd1332;
and.b64 %rd1334, %rd1333, 4294967295;
mul.lo.s64 %rd2580, %rd1334, 3449720151;
xor.b64 %rd1335, %rd2580, %rd214;
shr.u64 %rd1336, %rd1335, 32;
mul.lo.s64 %rd2583, %rd1336, 3528531795;
shr.u64 %rd1337, %rd2583, 32;
and.b64 %rd1338, %rd214, 4294967295;
mul.lo.s64 %rd1339, %rd1338, 3528531795;
and.b64 %rd1340, %rd1339, 4294967295;
xor.b64 %rd1341, %rd1340, %rd1337;
xor.b64 %rd1342, %rd1341, 3144134277;
mul.lo.s64 %rd2586, %rd1342, 3449720151;
xor.b64 %rd2576, %rd1333, %rd1339;
// Per-arm constants read at LBB55_26 (%rd2577 / %rd2579 are the multipliers).
mov.u32 %r329, -766435501;
mov.u32 %r328, -239350328;
mov.u64 %rd2593, 1684936478;
mov.u64 %rd2592, 534103459;
mov.u64 %rd2591, 387276957;
mov.u64 %rd2590, 3041712726;
mov.u64 %rd2589, 3986602516;
mov.u64 %rd2588, 2835769497;
mov.u64 %rd2587, 3668340011;
mov.u64 %rd2585, 2027808484;
mov.u64 %rd2584, 1993301258;
mov.u64 %rd2582, 842468239;
mov.u64 %rd2581, 2654435769;
mov.u64 %rd2579, 3528531795;
mov.u64 %rd2578, 1013904242;
mov.u64 %rd2577, 3449720151;
// Join point for both arms. Runs a fixed chain of shift / mask / xor /
// multiply steps over the state seeded by the arms, alternating the
// multipliers %rd2577 and %rd2579 and xoring in the per-arm constants. The
// result becomes a random f16 that keeps or zeroes one element, and that
// element's squared deviation is added to the running sum (%f10 -> %f11).
// %f1, %f2, %f3 and %rd45..%rd49 are defined outside this excerpt.
LBB55_26:
shr.u64 %rd1368, %rd2586, 32;
shr.u64 %rd1369, %rd2576, 32;
mul.lo.s64 %rd1370, %rd1369, %rd2577;
and.b64 %rd1371, %rd1370, 4294967295;
xor.b64 %rd1372, %rd1371, %rd1368;
xor.b64 %rd1373, %rd1372, %rd2578;
mul.lo.s64 %rd1374, %rd1373, %rd2579;
shr.u64 %rd1375, %rd1374, 32;
shr.u64 %rd1376, %rd1370, 32;
and.b64 %rd1377, %rd2580, 4294967295;
xor.b64 %rd1378, %rd1377, %rd1376;
xor.b64 %rd1379, %rd1378, %rd2581;
mul.lo.s64 %rd1380, %rd1379, %rd2579;
and.b64 %rd1381, %rd1380, 4294967295;
xor.b64 %rd1382, %rd1381, %rd1375;
xor.b64 %rd1383, %rd1382, %rd2582;
mul.lo.s64 %rd1384, %rd1383, %rd2577;
shr.u64 %rd1385, %rd1384, 32;
shr.u64 %rd1386, %rd1380, 32;
and.b64 %rd1387, %rd2583, 4294967295;
xor.b64 %rd1388, %rd1387, %rd1386;
xor.b64 %rd1389, %rd1388, %rd2584;
mul.lo.s64 %rd1390, %rd1389, %rd2577;
and.b64 %rd1391, %rd1390, 4294967295;
xor.b64 %rd1392, %rd1391, %rd1385;
xor.b64 %rd1393, %rd1392, %rd2585;
mul.lo.s64 %rd1394, %rd1393, %rd2579;
shr.u64 %rd1395, %rd1394, 32;
shr.u64 %rd1396, %rd1390, 32;
and.b64 %rd1397, %rd2586, 4294967295;
xor.b64 %rd1398, %rd1397, %rd1396;
xor.b64 %rd1399, %rd1398, %rd2587;
mul.lo.s64 %rd1400, %rd1399, %rd2579;
and.b64 %rd1401, %rd1400, 4294967295;
xor.b64 %rd1402, %rd1401, %rd1395;
xor.b64 %rd1403, %rd1402, %rd2588;
mul.lo.s64 %rd1404, %rd1403, %rd2577;
shr.u64 %rd1405, %rd1404, 32;
shr.u64 %rd1406, %rd1400, 32;
and.b64 %rd1407, %rd1374, 4294967295;
xor.b64 %rd1408, %rd1407, %rd1406;
xor.b64 %rd1409, %rd1408, %rd2589;
mul.lo.s64 %rd1410, %rd1409, %rd2577;
and.b64 %rd1411, %rd1410, 4294967295;
xor.b64 %rd1412, %rd1411, %rd1405;
xor.b64 %rd1413, %rd1412, %rd2590;
mul.lo.s64 %rd1414, %rd1413, %rd2579;
shr.u64 %rd1415, %rd1414, 32;
shr.u64 %rd1416, %rd1410, 32;
and.b64 %rd1417, %rd1384, 4294967295;
xor.b64 %rd1418, %rd1417, %rd1416;
xor.b64 %rd1419, %rd1418, %rd2591;
mul.lo.s64 %rd1420, %rd1419, %rd2579;
and.b64 %rd1421, %rd1420, 4294967295;
xor.b64 %rd1422, %rd1421, %rd1415;
xor.b64 %rd1423, %rd1422, %rd2592;
mul.lo.s64 %rd1424, %rd1423, %rd2577;
shr.u64 %rd1425, %rd1424, 32;
shr.u64 %rd1426, %rd1420, 32;
xor.b64 %rd1427, %rd1394, %rd1426;
xor.b64 %rd1428, %rd1427, %rd2593;
mul.lo.s64 %rd1429, %rd1428, %rd2577;
xor.b64 %rd1430, %rd1425, %rd1429;
// Reduce to one 32-bit word: %r155 = (%r328 ^ lo32(%rd1430)) * %r329.
cvt.u32.u64 %r153, %rd1430;
xor.b32 %r154, %r328, %r153;
mul.lo.s32 %r155, %r154, %r329;
// Top 23 bits times 0f34000000 (2^-23) gives an f32 in [0, 1).
shr.u32 %r156, %r155, 9;
cvt.rn.f32.u32 %f130, %r156;
mul.rn.f32 %f131, %f130, 0f34000000;
// %p33 = (random as f16) >= 0x2E66 (about 0.1).
// NOTE(review): together with the 0x3C72 (about 1.111) scale below this looks
// like dropout with rate 0.1 -- confirm against the source HLO.
cvt.rn.f16.f32 %h64, %f131;
mov.b16 %h65, 0x2E66;
setp.ge.f16 %p33, %h64, %h65;
// %h71 = %p33 ? (%h66 + f16(%f132)) * 0x3C72 : 0
ld.global.nc.b16 %h66, [%rd45+770];
ld.global.nc.f32 %f132, [%rd46+1540];
cvt.rn.f16.f32 %h67, %f132;
add.rn.f16 %h68, %h66, %h67;
mov.b16 %h69, 0x3C72;
mul.rn.f16 %h70, %h68, %h69;
selp.b16 %h71, %h70, 0x0000, %p33;
cvt.f32.f16 %f133, %h71;
// %f142 = (%f1*%f135)*%f134 + (%f138 - %f2*(%f1*%f135)) + %f133
ld.global.nc.b16 %h72, [%rd47+770];
cvt.f32.f16 %f134, %h72;
ld.global.nc.f32 %f135, [%rd48+1540];
mul.rn.f32 %f136, %f1, %f135;
mul.rn.f32 %f137, %f136, %f134;
ld.global.nc.f32 %f138, [%rd49+1540];
mul.rn.f32 %f139, %f2, %f136;
sub.rn.f32 %f140, %f138, %f139;
add.rn.f32 %f141, %f137, %f140;
add.rn.f32 %f142, %f141, %f133;
// Running sum: %f11 = %f10 + (%f142 - %f3)^2.
sub.rn.f32 %f143, %f142, %f3;
mul.rn.f32 %f144, %f143, %f143;
add.rn.f32 %f11, %f10, %f144;
// Counter for the next element: %rd241 = %rd12 + ((%r73 | 512) >> 2).
// %p8 (set outside this excerpt) selects the arm.
or.b32 %r158, %r73, 512;
shr.u32 %r159, %r158, 2;
cvt.u64.u32 %rd1431, %r159;
add.s64 %rd241, %rd12, %rd1431;
@%p8 bra LBB55_28;
// Fall-through arm: reached when the `@%p8 bra LBB55_28` above is not taken.
// Seeds the mixing state (%rd2594, %rd2598, %rd2601, %rd2604) from the
// counter %rd241 and %rd2464, then loads this arm's multipliers and xor
// constants for the shared code at LBB55_29.
// NOTE(review): 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) match the
// Philox4x32 multipliers -- presumably an inlined Philox RNG; confirm.
and.b64 %rd1473, %rd241, 4294967295;
mul.lo.s64 %rd2598, %rd1473, 3528531795;
// %p35 is true when %rd241 = %rd12 + offset wrapped; the carry goes to %rd2464.
setp.lt.u64 %p35, %rd241, %rd12;
selp.u64 %rd1474, 1, 0, %p35;
add.s64 %rd1475, %rd2464, %rd1474;
xor.b64 %rd1476, %rd1475, %rd2598;
shr.u64 %rd1477, %rd1476, 32;
mul.lo.s64 %rd2601, %rd1477, 3449720151;
shr.u64 %rd1478, %rd2601, 32;
and.b64 %rd1479, %rd1475, 4294967295;
mul.lo.s64 %rd1480, %rd1479, 3449720151;
and.b64 %rd1481, %rd1480, 4294967295;
xor.b64 %rd1482, %rd1481, %rd1478;
xor.b64 %rd1483, %rd1482, 2654435769;
mul.lo.s64 %rd2604, %rd1483, 3528531795;
xor.b64 %rd2594, %rd1480, %rd241;
// Per-arm constants read at LBB55_29; %rd2595 and %rd2597 are the two
// multipliers there, the rest are xor operands.
mov.u32 %r332, -1879881855;
mov.u32 %r331, -845247145;
mov.u32 %r330, 534103459;
mov.u64 %rd2612, 3678237736;
mov.u64 %rd2611, 3041712726;
mov.u64 %rd2610, 1401181199;
mov.u64 %rd2609, 2835769497;
mov.u64 %rd2608, 1684936478;
mov.u64 %rd2607, 2027808484;
mov.u64 %rd2606, 387276957;
mov.u64 %rd2605, 842468239;
mov.u64 %rd2603, 3986602516;
mov.u64 %rd2602, 1013904242;
mov.u64 %rd2600, 3668340011;
mov.u64 %rd2599, 3144134277;
mov.u64 %rd2597, 3449720151;
mov.u64 %rd2596, 1993301258;
mov.u64 %rd2595, 3528531795;
bra.uni LBB55_29;
// Arm taken when %p8 is true. Produces the same four state registers as the
// fall-through arm (%rd2594, %rd2598, %rd2601, %rd2604) but with the counter
// word and the %rd2464 word, and the two multipliers, in exchanged roles, and
// with a different constant set. Falls through into LBB55_29.
LBB55_28:
// %p34 is true when %rd241 = %rd12 + offset wrapped; the carry goes to %rd2464.
setp.lt.u64 %p34, %rd241, %rd12;
selp.u64 %rd1447, 1, 0, %p34;
add.s64 %rd1448, %rd2464, %rd1447;
and.b64 %rd1449, %rd1448, 4294967295;
mul.lo.s64 %rd2598, %rd1449, 3449720151;
xor.b64 %rd1450, %rd2598, %rd241;
shr.u64 %rd1451, %rd1450, 32;
mul.lo.s64 %rd2601, %rd1451, 3528531795;
shr.u64 %rd1452, %rd2601, 32;
and.b64 %rd1453, %rd241, 4294967295;
mul.lo.s64 %rd1454, %rd1453, 3528531795;
and.b64 %rd1455, %rd1454, 4294967295;
xor.b64 %rd1456, %rd1455, %rd1452;
xor.b64 %rd1457, %rd1456, 3144134277;
mul.lo.s64 %rd2604, %rd1457, 3449720151;
xor.b64 %rd2594, %rd1448, %rd1454;
// Per-arm constants read at LBB55_29 (%rd2595 / %rd2597 are the multipliers).
mov.u32 %r332, -1767562579;
mov.u32 %r331, -766435501;
mov.u32 %r330, 1401181199;
mov.u64 %rd2612, 4055616968;
mov.u64 %rd2611, 1684936478;
mov.u64 %rd2610, 534103459;
mov.u64 %rd2609, 387276957;
mov.u64 %rd2608, 3041712726;
mov.u64 %rd2607, 3986602516;
mov.u64 %rd2606, 2835769497;
mov.u64 %rd2605, 3668340011;
mov.u64 %rd2603, 2027808484;
mov.u64 %rd2602, 1993301258;
mov.u64 %rd2600, 842468239;
mov.u64 %rd2599, 2654435769;
mov.u64 %rd2597, 3528531795;
mov.u64 %rd2596, 1013904242;
mov.u64 %rd2595, 3449720151;
// Join point for both arms. Runs a fixed chain of shift / mask / xor /
// multiply steps over the state seeded by the arms, alternating the
// multipliers %rd2595 and %rd2597 and xoring in the per-arm constants. The
// result becomes a random f16 that keeps or zeroes one element, and that
// element's squared deviation is added to the running sum (%f11 -> %f12).
// %f1, %f2, %f3 and %rd45..%rd49 are defined outside this excerpt.
LBB55_29:
shr.u64 %rd1484, %rd2604, 32;
shr.u64 %rd1485, %rd2594, 32;
mul.lo.s64 %rd1486, %rd1485, %rd2595;
and.b64 %rd1487, %rd1486, 4294967295;
xor.b64 %rd1488, %rd1487, %rd1484;
xor.b64 %rd1489, %rd1488, %rd2596;
mul.lo.s64 %rd1490, %rd1489, %rd2597;
shr.u64 %rd1491, %rd1490, 32;
shr.u64 %rd1492, %rd1486, 32;
and.b64 %rd1493, %rd2598, 4294967295;
xor.b64 %rd1494, %rd1493, %rd1492;
xor.b64 %rd1495, %rd1494, %rd2599;
mul.lo.s64 %rd1496, %rd1495, %rd2597;
and.b64 %rd1497, %rd1496, 4294967295;
xor.b64 %rd1498, %rd1497, %rd1491;
xor.b64 %rd1499, %rd1498, %rd2600;
mul.lo.s64 %rd1500, %rd1499, %rd2595;
shr.u64 %rd1501, %rd1500, 32;
shr.u64 %rd1502, %rd1496, 32;
and.b64 %rd1503, %rd2601, 4294967295;
xor.b64 %rd1504, %rd1503, %rd1502;
xor.b64 %rd1505, %rd1504, %rd2602;
mul.lo.s64 %rd1506, %rd1505, %rd2595;
and.b64 %rd1507, %rd1506, 4294967295;
xor.b64 %rd1508, %rd1507, %rd1501;
xor.b64 %rd1509, %rd1508, %rd2603;
mul.lo.s64 %rd1510, %rd1509, %rd2597;
shr.u64 %rd1511, %rd1510, 32;
shr.u64 %rd1512, %rd1506, 32;
and.b64 %rd1513, %rd2604, 4294967295;
xor.b64 %rd1514, %rd1513, %rd1512;
xor.b64 %rd1515, %rd1514, %rd2605;
mul.lo.s64 %rd1516, %rd1515, %rd2597;
and.b64 %rd1517, %rd1516, 4294967295;
xor.b64 %rd1518, %rd1517, %rd1511;
xor.b64 %rd1519, %rd1518, %rd2606;
mul.lo.s64 %rd1520, %rd1519, %rd2595;
shr.u64 %rd1521, %rd1520, 32;
shr.u64 %rd1522, %rd1516, 32;
and.b64 %rd1523, %rd1490, 4294967295;
xor.b64 %rd1524, %rd1523, %rd1522;
xor.b64 %rd1525, %rd1524, %rd2607;
mul.lo.s64 %rd1526, %rd1525, %rd2595;
and.b64 %rd1527, %rd1526, 4294967295;
xor.b64 %rd1528, %rd1527, %rd1521;
xor.b64 %rd1529, %rd1528, %rd2608;
mul.lo.s64 %rd1530, %rd1529, %rd2597;
shr.u64 %rd1531, %rd1530, 32;
shr.u64 %rd1532, %rd1526, 32;
and.b64 %rd1533, %rd1500, 4294967295;
xor.b64 %rd1534, %rd1533, %rd1532;
xor.b64 %rd1535, %rd1534, %rd2609;
mul.lo.s64 %rd1536, %rd1535, %rd2597;
and.b64 %rd1537, %rd1536, 4294967295;
xor.b64 %rd1538, %rd1537, %rd1531;
xor.b64 %rd1539, %rd1538, %rd2610;
mul.lo.s64 %rd1540, %rd1539, %rd2595;
shr.u64 %rd1541, %rd1540, 32;
shr.u64 %rd1542, %rd1536, 32;
and.b64 %rd1543, %rd1510, 4294967295;
xor.b64 %rd1544, %rd1543, %rd1542;
xor.b64 %rd1545, %rd1544, %rd2611;
mul.lo.s64 %rd1546, %rd1545, %rd2595;
and.b64 %rd1547, %rd1546, 4294967295;
xor.b64 %rd1548, %rd1547, %rd1541;
xor.b64 %rd1549, %rd1548, %rd2612;
mul.lo.s64 %rd1550, %rd1549, %rd2597;
// Reduce to one 32-bit word:
// %r171 = ((%r330 ^ lo32(%rd1553)) * %r331) ^ hi32(%rd1550) ^ %r332.
shr.u64 %rd1551, %rd1550, 32;
cvt.u32.u64 %r166, %rd1551;
shr.u64 %rd1552, %rd1546, 32;
xor.b64 %rd1553, %rd1552, %rd1520;
cvt.u32.u64 %r167, %rd1553;
xor.b32 %r168, %r330, %r167;
mul.lo.s32 %r169, %r168, %r331;
xor.b32 %r170, %r169, %r166;
xor.b32 %r171, %r170, %r332;
// Top 23 bits times 0f34000000 (2^-23) gives an f32 in [0, 1).
shr.u32 %r172, %r171, 9;
cvt.rn.f32.u32 %f145, %r172;
mul.rn.f32 %f146, %f145, 0f34000000;
// %p36 = (random as f16) >= 0x2E66 (about 0.1).
// NOTE(review): together with the 0x3C72 (about 1.111) scale below this looks
// like dropout with rate 0.1 -- confirm against the source HLO.
cvt.rn.f16.f32 %h73, %f146;
mov.b16 %h74, 0x2E66;
setp.ge.f16 %p36, %h73, %h74;
// %h80 = %p36 ? (%h75 + f16(%f147)) * 0x3C72 : 0
ld.global.nc.b16 %h75, [%rd45+1024];
ld.global.nc.f32 %f147, [%rd46+2048];
cvt.rn.f16.f32 %h76, %f147;
add.rn.f16 %h77, %h75, %h76;
mov.b16 %h78, 0x3C72;
mul.rn.f16 %h79, %h77, %h78;
selp.b16 %h80, %h79, 0x0000, %p36;
cvt.f32.f16 %f148, %h80;
// %f157 = (%f1*%f150)*%f149 + (%f153 - %f2*(%f1*%f150)) + %f148
ld.global.nc.b16 %h81, [%rd47+1024];
cvt.f32.f16 %f149, %h81;
ld.global.nc.f32 %f150, [%rd48+2048];
mul.rn.f32 %f151, %f1, %f150;
mul.rn.f32 %f152, %f151, %f149;
ld.global.nc.f32 %f153, [%rd49+2048];
mul.rn.f32 %f154, %f2, %f151;
sub.rn.f32 %f155, %f153, %f154;
add.rn.f32 %f156, %f152, %f155;
add.rn.f32 %f157, %f156, %f148;
// Running sum: %f12 = %f11 + (%f157 - %f3)^2.
sub.rn.f32 %f158, %f157, %f3;
mul.rn.f32 %f159, %f158, %f158;
add.rn.f32 %f12, %f11, %f159;
// Counter for the next element: %rd269 = %rd12 + (((%r3 | 513) | %r4) >> 2).
// %p37 = ((%r3 | 513) & 3) != 1 selects the arm.
or.b32 %r173, %r3, 513;
or.b32 %r174, %r173, %r4;
and.b32 %r175, %r173, 3;
shr.u32 %r176, %r174, 2;
setp.ne.s32 %p37, %r175, 1;
cvt.u64.u32 %rd1554, %r176;
add.s64 %rd269, %rd12, %rd1554;
@%p37 bra LBB55_31;
// Fall-through arm: reached when the `@%p37 bra LBB55_31` above is not taken.
// Seeds the mixing state (%rd2613, %rd2617, %rd2620, %rd2623) from the
// counter %rd269 and %rd2464, then loads this arm's multipliers and xor
// constants for the shared code at LBB55_32.
// NOTE(review): 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) match the
// Philox4x32 multipliers -- presumably an inlined Philox RNG; confirm.
and.b64 %rd1594, %rd269, 4294967295;
mul.lo.s64 %rd2617, %rd1594, 3528531795;
// %p39 is true when %rd269 = %rd12 + offset wrapped; the carry goes to %rd2464.
setp.lt.u64 %p39, %rd269, %rd12;
selp.u64 %rd1595, 1, 0, %p39;
add.s64 %rd1596, %rd2464, %rd1595;
xor.b64 %rd1597, %rd1596, %rd2617;
shr.u64 %rd1598, %rd1597, 32;
mul.lo.s64 %rd2620, %rd1598, 3449720151;
shr.u64 %rd1599, %rd2620, 32;
and.b64 %rd1600, %rd1596, 4294967295;
mul.lo.s64 %rd1601, %rd1600, 3449720151;
and.b64 %rd1602, %rd1601, 4294967295;
xor.b64 %rd1603, %rd1602, %rd1599;
xor.b64 %rd1604, %rd1603, 2654435769;
mul.lo.s64 %rd2623, %rd1604, 3528531795;
xor.b64 %rd2613, %rd1601, %rd269;
// Per-arm constants read at LBB55_32; %rd2614 and %rd2616 are the two
// multipliers there, the rest are xor operands.
mov.u32 %r334, -845247145;
mov.u32 %r333, -616729560;
mov.u64 %rd2630, 3041712726;
mov.u64 %rd2629, 1401181199;
mov.u64 %rd2628, 2835769497;
mov.u64 %rd2627, 1684936478;
mov.u64 %rd2626, 2027808484;
mov.u64 %rd2625, 387276957;
mov.u64 %rd2624, 842468239;
mov.u64 %rd2622, 3986602516;
mov.u64 %rd2621, 1013904242;
mov.u64 %rd2619, 3668340011;
mov.u64 %rd2618, 3144134277;
mov.u64 %rd2616, 3449720151;
mov.u64 %rd2615, 1993301258;
mov.u64 %rd2614, 3528531795;
bra.uni LBB55_32;
// Arm taken when %p37 is true. Produces the same four state registers as the
// fall-through arm (%rd2613, %rd2617, %rd2620, %rd2623) but with the counter
// word and the %rd2464 word, and the two multipliers, in exchanged roles, and
// with a different constant set. Falls through into LBB55_32.
LBB55_31:
// %p38 is true when %rd269 = %rd12 + offset wrapped; the carry goes to %rd2464.
setp.lt.u64 %p38, %rd269, %rd12;
selp.u64 %rd1569, 1, 0, %p38;
add.s64 %rd1570, %rd2464, %rd1569;
and.b64 %rd1571, %rd1570, 4294967295;
mul.lo.s64 %rd2617, %rd1571, 3449720151;
xor.b64 %rd1572, %rd2617, %rd269;
shr.u64 %rd1573, %rd1572, 32;
mul.lo.s64 %rd2620, %rd1573, 3528531795;
shr.u64 %rd1574, %rd2620, 32;
and.b64 %rd1575, %rd269, 4294967295;
mul.lo.s64 %rd1576, %rd1575, 3528531795;
and.b64 %rd1577, %rd1576, 4294967295;
xor.b64 %rd1578, %rd1577, %rd1574;
xor.b64 %rd1579, %rd1578, 3144134277;
mul.lo.s64 %rd2623, %rd1579, 3449720151;
xor.b64 %rd2613, %rd1570, %rd1576;
// Per-arm constants read at LBB55_32 (%rd2614 / %rd2616 are the multipliers).
mov.u32 %r334, -766435501;
mov.u32 %r333, -239350328;
mov.u64 %rd2630, 1684936478;
mov.u64 %rd2629, 534103459;
mov.u64 %rd2628, 387276957;
mov.u64 %rd2627, 3041712726;
mov.u64 %rd2626, 3986602516;
mov.u64 %rd2625, 2835769497;
mov.u64 %rd2624, 3668340011;
mov.u64 %rd2622, 2027808484;
mov.u64 %rd2621, 1993301258;
mov.u64 %rd2619, 842468239;
mov.u64 %rd2618, 2654435769;
mov.u64 %rd2616, 3528531795;
mov.u64 %rd2615, 1013904242;
mov.u64 %rd2614, 3449720151;
// Join point for both arms. Runs a fixed chain of shift / mask / xor /
// multiply steps over the state seeded by the arms, alternating the
// multipliers %rd2614 and %rd2616 and xoring in the per-arm constants. The
// result becomes a random f16 that keeps or zeroes one element, and that
// element's squared deviation is added to the running sum (%f12 -> %f13).
// %f1, %f2, %f3 and %rd45..%rd49 are defined outside this excerpt.
LBB55_32:
shr.u64 %rd1605, %rd2623, 32;
shr.u64 %rd1606, %rd2613, 32;
mul.lo.s64 %rd1607, %rd1606, %rd2614;
and.b64 %rd1608, %rd1607, 4294967295;
xor.b64 %rd1609, %rd1608, %rd1605;
xor.b64 %rd1610, %rd1609, %rd2615;
mul.lo.s64 %rd1611, %rd1610, %rd2616;
shr.u64 %rd1612, %rd1611, 32;
shr.u64 %rd1613, %rd1607, 32;
and.b64 %rd1614, %rd2617, 4294967295;
xor.b64 %rd1615, %rd1614, %rd1613;
xor.b64 %rd1616, %rd1615, %rd2618;
mul.lo.s64 %rd1617, %rd1616, %rd2616;
and.b64 %rd1618, %rd1617, 4294967295;
xor.b64 %rd1619, %rd1618, %rd1612;
xor.b64 %rd1620, %rd1619, %rd2619;
mul.lo.s64 %rd1621, %rd1620, %rd2614;
shr.u64 %rd1622, %rd1621, 32;
shr.u64 %rd1623, %rd1617, 32;
and.b64 %rd1624, %rd2620, 4294967295;
xor.b64 %rd1625, %rd1624, %rd1623;
xor.b64 %rd1626, %rd1625, %rd2621;
mul.lo.s64 %rd1627, %rd1626, %rd2614;
and.b64 %rd1628, %rd1627, 4294967295;
xor.b64 %rd1629, %rd1628, %rd1622;
xor.b64 %rd1630, %rd1629, %rd2622;
mul.lo.s64 %rd1631, %rd1630, %rd2616;
shr.u64 %rd1632, %rd1631, 32;
shr.u64 %rd1633, %rd1627, 32;
and.b64 %rd1634, %rd2623, 4294967295;
xor.b64 %rd1635, %rd1634, %rd1633;
xor.b64 %rd1636, %rd1635, %rd2624;
mul.lo.s64 %rd1637, %rd1636, %rd2616;
and.b64 %rd1638, %rd1637, 4294967295;
xor.b64 %rd1639, %rd1638, %rd1632;
xor.b64 %rd1640, %rd1639, %rd2625;
mul.lo.s64 %rd1641, %rd1640, %rd2614;
shr.u64 %rd1642, %rd1641, 32;
shr.u64 %rd1643, %rd1637, 32;
and.b64 %rd1644, %rd1611, 4294967295;
xor.b64 %rd1645, %rd1644, %rd1643;
xor.b64 %rd1646, %rd1645, %rd2626;
mul.lo.s64 %rd1647, %rd1646, %rd2614;
and.b64 %rd1648, %rd1647, 4294967295;
xor.b64 %rd1649, %rd1648, %rd1642;
xor.b64 %rd1650, %rd1649, %rd2627;
mul.lo.s64 %rd1651, %rd1650, %rd2616;
shr.u64 %rd1652, %rd1651, 32;
shr.u64 %rd1653, %rd1647, 32;
and.b64 %rd1654, %rd1621, 4294967295;
xor.b64 %rd1655, %rd1654, %rd1653;
xor.b64 %rd1656, %rd1655, %rd2628;
mul.lo.s64 %rd1657, %rd1656, %rd2616;
and.b64 %rd1658, %rd1657, 4294967295;
xor.b64 %rd1659, %rd1658, %rd1652;
xor.b64 %rd1660, %rd1659, %rd2629;
mul.lo.s64 %rd1661, %rd1660, %rd2614;
shr.u64 %rd1662, %rd1661, 32;
shr.u64 %rd1663, %rd1657, 32;
xor.b64 %rd1664, %rd1631, %rd1663;
xor.b64 %rd1665, %rd1664, %rd2630;
mul.lo.s64 %rd1666, %rd1665, %rd2614;
xor.b64 %rd1667, %rd1662, %rd1666;
// Reduce to one 32-bit word: %r183 = (%r333 ^ lo32(%rd1667)) * %r334.
cvt.u32.u64 %r181, %rd1667;
xor.b32 %r182, %r333, %r181;
mul.lo.s32 %r183, %r182, %r334;
// Top 23 bits times 0f34000000 (2^-23) gives an f32 in [0, 1).
shr.u32 %r184, %r183, 9;
cvt.rn.f32.u32 %f160, %r184;
mul.rn.f32 %f161, %f160, 0f34000000;
// %p41 = (random as f16) >= 0x2E66 (about 0.1).
// NOTE(review): together with the 0x3C72 (about 1.111) scale below this looks
// like dropout with rate 0.1 -- confirm against the source HLO.
cvt.rn.f16.f32 %h82, %f161;
mov.b16 %h83, 0x2E66;
setp.ge.f16 %p41, %h82, %h83;
// %h89 = %p41 ? (%h84 + f16(%f162)) * 0x3C72 : 0
ld.global.nc.b16 %h84, [%rd45+1026];
ld.global.nc.f32 %f162, [%rd46+2052];
cvt.rn.f16.f32 %h85, %f162;
add.rn.f16 %h86, %h84, %h85;
mov.b16 %h87, 0x3C72;
mul.rn.f16 %h88, %h86, %h87;
selp.b16 %h89, %h88, 0x0000, %p41;
cvt.f32.f16 %f163, %h89;
// %f172 = (%f1*%f165)*%f164 + (%f168 - %f2*(%f1*%f165)) + %f163
ld.global.nc.b16 %h90, [%rd47+1026];
cvt.f32.f16 %f164, %h90;
ld.global.nc.f32 %f165, [%rd48+2052];
mul.rn.f32 %f166, %f1, %f165;
mul.rn.f32 %f167, %f166, %f164;
ld.global.nc.f32 %f168, [%rd49+2052];
mul.rn.f32 %f169, %f2, %f166;
sub.rn.f32 %f170, %f168, %f169;
add.rn.f32 %f171, %f167, %f170;
add.rn.f32 %f172, %f171, %f163;
// Running sum: %f13 = %f12 + (%f172 - %f3)^2.
sub.rn.f32 %f173, %f172, %f3;
mul.rn.f32 %f174, %f173, %f173;
add.rn.f32 %f13, %f12, %f174;
// Counter for the next element: %rd296 = %rd12 + ((%r73 | 640) >> 2).
// %p8 (set outside this excerpt) selects the arm.
or.b32 %r186, %r73, 640;
shr.u32 %r187, %r186, 2;
cvt.u64.u32 %rd1668, %r187;
add.s64 %rd296, %rd12, %rd1668;
@%p8 bra LBB55_34;
// Fall-through arm: reached when the `@%p8 bra LBB55_34` above is not taken.
// Seeds the mixing state (%rd2631, %rd2635, %rd2638, %rd2641) from the
// counter %rd296 and %rd2464, then loads this arm's multipliers and xor
// constants for the shared code at LBB55_35.
// NOTE(review): 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) match the
// Philox4x32 multipliers -- presumably an inlined Philox RNG; confirm.
and.b64 %rd1710, %rd296, 4294967295;
mul.lo.s64 %rd2635, %rd1710, 3528531795;
// %p43 is true when %rd296 = %rd12 + offset wrapped; the carry goes to %rd2464.
setp.lt.u64 %p43, %rd296, %rd12;
selp.u64 %rd1711, 1, 0, %p43;
add.s64 %rd1712, %rd2464, %rd1711;
xor.b64 %rd1713, %rd1712, %rd2635;
shr.u64 %rd1714, %rd1713, 32;
mul.lo.s64 %rd2638, %rd1714, 3449720151;
shr.u64 %rd1715, %rd2638, 32;
and.b64 %rd1716, %rd1712, 4294967295;
mul.lo.s64 %rd1717, %rd1716, 3449720151;
and.b64 %rd1718, %rd1717, 4294967295;
xor.b64 %rd1719, %rd1718, %rd1715;
xor.b64 %rd1720, %rd1719, 2654435769;
mul.lo.s64 %rd2641, %rd1720, 3528531795;
xor.b64 %rd2631, %rd1717, %rd296;
// Per-arm constants read at LBB55_35; %rd2632 and %rd2634 are the two
// multipliers there, the rest are xor operands.
mov.u32 %r337, -1879881855;
mov.u32 %r336, -845247145;
mov.u32 %r335, 534103459;
mov.u64 %rd2649, 3678237736;
mov.u64 %rd2648, 3041712726;
mov.u64 %rd2647, 1401181199;
mov.u64 %rd2646, 2835769497;
mov.u64 %rd2645, 1684936478;
mov.u64 %rd2644, 2027808484;
mov.u64 %rd2643, 387276957;
mov.u64 %rd2642, 842468239;
mov.u64 %rd2640, 3986602516;
mov.u64 %rd2639, 1013904242;
mov.u64 %rd2637, 3668340011;
mov.u64 %rd2636, 3144134277;
mov.u64 %rd2634, 3449720151;
mov.u64 %rd2633, 1993301258;
mov.u64 %rd2632, 3528531795;
bra.uni LBB55_35;
// Arm taken when %p8 is true. Produces the same four state registers as the
// fall-through arm (%rd2631, %rd2635, %rd2638, %rd2641) but with the counter
// word and the %rd2464 word, and the two multipliers, in exchanged roles, and
// with a different constant set. Falls through into LBB55_35.
LBB55_34:
// %p42 is true when %rd296 = %rd12 + offset wrapped; the carry goes to %rd2464.
setp.lt.u64 %p42, %rd296, %rd12;
selp.u64 %rd1684, 1, 0, %p42;
add.s64 %rd1685, %rd2464, %rd1684;
and.b64 %rd1686, %rd1685, 4294967295;
mul.lo.s64 %rd2635, %rd1686, 3449720151;
xor.b64 %rd1687, %rd2635, %rd296;
shr.u64 %rd1688, %rd1687, 32;
mul.lo.s64 %rd2638, %rd1688, 3528531795;
shr.u64 %rd1689, %rd2638, 32;
and.b64 %rd1690, %rd296, 4294967295;
mul.lo.s64 %rd1691, %rd1690, 3528531795;
and.b64 %rd1692, %rd1691, 4294967295;
xor.b64 %rd1693, %rd1692, %rd1689;
xor.b64 %rd1694, %rd1693, 3144134277;
mul.lo.s64 %rd2641, %rd1694, 3449720151;
xor.b64 %rd2631, %rd1685, %rd1691;
// Per-arm constants read at LBB55_35 (%rd2632 / %rd2634 are the multipliers).
mov.u32 %r337, -1767562579;
mov.u32 %r336, -766435501;
mov.u32 %r335, 1401181199;
mov.u64 %rd2649, 4055616968;
mov.u64 %rd2648, 1684936478;
mov.u64 %rd2647, 534103459;
mov.u64 %rd2646, 387276957;
mov.u64 %rd2645, 3041712726;
mov.u64 %rd2644, 3986602516;
mov.u64 %rd2643, 2835769497;
mov.u64 %rd2642, 3668340011;
mov.u64 %rd2640, 2027808484;
mov.u64 %rd2639, 1993301258;
mov.u64 %rd2637, 842468239;
mov.u64 %rd2636, 2654435769;
mov.u64 %rd2634, 3528531795;
mov.u64 %rd2633, 1013904242;
mov.u64 %rd2632, 3449720151;
// Join point for both arms. Runs a fixed chain of shift / mask / xor /
// multiply steps over the state seeded by the arms, alternating the
// multipliers %rd2632 and %rd2634 and xoring in the per-arm constants. The
// result becomes a random f16 that keeps or zeroes one element, and that
// element's squared deviation is added to the running sum (%f13 -> %f14).
// %f1, %f2, %f3 and %rd45..%rd49 are defined outside this excerpt.
LBB55_35:
shr.u64 %rd1721, %rd2641, 32;
shr.u64 %rd1722, %rd2631, 32;
mul.lo.s64 %rd1723, %rd1722, %rd2632;
and.b64 %rd1724, %rd1723, 4294967295;
xor.b64 %rd1725, %rd1724, %rd1721;
xor.b64 %rd1726, %rd1725, %rd2633;
mul.lo.s64 %rd1727, %rd1726, %rd2634;
shr.u64 %rd1728, %rd1727, 32;
shr.u64 %rd1729, %rd1723, 32;
and.b64 %rd1730, %rd2635, 4294967295;
xor.b64 %rd1731, %rd1730, %rd1729;
xor.b64 %rd1732, %rd1731, %rd2636;
mul.lo.s64 %rd1733, %rd1732, %rd2634;
and.b64 %rd1734, %rd1733, 4294967295;
xor.b64 %rd1735, %rd1734, %rd1728;
xor.b64 %rd1736, %rd1735, %rd2637;
mul.lo.s64 %rd1737, %rd1736, %rd2632;
shr.u64 %rd1738, %rd1737, 32;
shr.u64 %rd1739, %rd1733, 32;
and.b64 %rd1740, %rd2638, 4294967295;
xor.b64 %rd1741, %rd1740, %rd1739;
xor.b64 %rd1742, %rd1741, %rd2639;
mul.lo.s64 %rd1743, %rd1742, %rd2632;
and.b64 %rd1744, %rd1743, 4294967295;
xor.b64 %rd1745, %rd1744, %rd1738;
xor.b64 %rd1746, %rd1745, %rd2640;
mul.lo.s64 %rd1747, %rd1746, %rd2634;
shr.u64 %rd1748, %rd1747, 32;
shr.u64 %rd1749, %rd1743, 32;
and.b64 %rd1750, %rd2641, 4294967295;
xor.b64 %rd1751, %rd1750, %rd1749;
xor.b64 %rd1752, %rd1751, %rd2642;
mul.lo.s64 %rd1753, %rd1752, %rd2634;
and.b64 %rd1754, %rd1753, 4294967295;
xor.b64 %rd1755, %rd1754, %rd1748;
xor.b64 %rd1756, %rd1755, %rd2643;
mul.lo.s64 %rd1757, %rd1756, %rd2632;
shr.u64 %rd1758, %rd1757, 32;
shr.u64 %rd1759, %rd1753, 32;
and.b64 %rd1760, %rd1727, 4294967295;
xor.b64 %rd1761, %rd1760, %rd1759;
xor.b64 %rd1762, %rd1761, %rd2644;
mul.lo.s64 %rd1763, %rd1762, %rd2632;
and.b64 %rd1764, %rd1763, 4294967295;
xor.b64 %rd1765, %rd1764, %rd1758;
xor.b64 %rd1766, %rd1765, %rd2645;
mul.lo.s64 %rd1767, %rd1766, %rd2634;
shr.u64 %rd1768, %rd1767, 32;
shr.u64 %rd1769, %rd1763, 32;
and.b64 %rd1770, %rd1737, 4294967295;
xor.b64 %rd1771, %rd1770, %rd1769;
xor.b64 %rd1772, %rd1771, %rd2646;
mul.lo.s64 %rd1773, %rd1772, %rd2634;
and.b64 %rd1774, %rd1773, 4294967295;
xor.b64 %rd1775, %rd1774, %rd1768;
xor.b64 %rd1776, %rd1775, %rd2647;
mul.lo.s64 %rd1777, %rd1776, %rd2632;
shr.u64 %rd1778, %rd1777, 32;
shr.u64 %rd1779, %rd1773, 32;
and.b64 %rd1780, %rd1747, 4294967295;
xor.b64 %rd1781, %rd1780, %rd1779;
xor.b64 %rd1782, %rd1781, %rd2648;
mul.lo.s64 %rd1783, %rd1782, %rd2632;
and.b64 %rd1784, %rd1783, 4294967295;
xor.b64 %rd1785, %rd1784, %rd1778;
xor.b64 %rd1786, %rd1785, %rd2649;
mul.lo.s64 %rd1787, %rd1786, %rd2634;
// Reduce to one 32-bit word:
// %r199 = ((%r335 ^ lo32(%rd1790)) * %r336) ^ hi32(%rd1787) ^ %r337.
shr.u64 %rd1788, %rd1787, 32;
cvt.u32.u64 %r194, %rd1788;
shr.u64 %rd1789, %rd1783, 32;
xor.b64 %rd1790, %rd1789, %rd1757;
cvt.u32.u64 %r195, %rd1790;
xor.b32 %r196, %r335, %r195;
mul.lo.s32 %r197, %r196, %r336;
xor.b32 %r198, %r197, %r194;
xor.b32 %r199, %r198, %r337;
// Top 23 bits times 0f34000000 (2^-23) gives an f32 in [0, 1).
shr.u32 %r200, %r199, 9;
cvt.rn.f32.u32 %f175, %r200;
mul.rn.f32 %f176, %f175, 0f34000000;
// %p44 = (random as f16) >= 0x2E66 (about 0.1).
// NOTE(review): together with the 0x3C72 (about 1.111) scale below this looks
// like dropout with rate 0.1 -- confirm against the source HLO.
cvt.rn.f16.f32 %h91, %f176;
mov.b16 %h92, 0x2E66;
setp.ge.f16 %p44, %h91, %h92;
// %h98 = %p44 ? (%h93 + f16(%f177)) * 0x3C72 : 0
ld.global.nc.b16 %h93, [%rd45+1280];
ld.global.nc.f32 %f177, [%rd46+2560];
cvt.rn.f16.f32 %h94, %f177;
add.rn.f16 %h95, %h93, %h94;
mov.b16 %h96, 0x3C72;
mul.rn.f16 %h97, %h95, %h96;
selp.b16 %h98, %h97, 0x0000, %p44;
cvt.f32.f16 %f178, %h98;
// %f187 = (%f1*%f180)*%f179 + (%f183 - %f2*(%f1*%f180)) + %f178
ld.global.nc.b16 %h99, [%rd47+1280];
cvt.f32.f16 %f179, %h99;
ld.global.nc.f32 %f180, [%rd48+2560];
mul.rn.f32 %f181, %f1, %f180;
mul.rn.f32 %f182, %f181, %f179;
ld.global.nc.f32 %f183, [%rd49+2560];
mul.rn.f32 %f184, %f2, %f181;
sub.rn.f32 %f185, %f183, %f184;
add.rn.f32 %f186, %f182, %f185;
add.rn.f32 %f187, %f186, %f178;
// Running sum: %f14 = %f13 + (%f187 - %f3)^2.
sub.rn.f32 %f188, %f187, %f3;
mul.rn.f32 %f189, %f188, %f188;
add.rn.f32 %f14, %f13, %f189;
// Counter for the next element: %rd324 = %rd12 + (((%r3 | 641) | %r4) >> 2).
// %p45 = ((%r3 | 641) & 3) != 1 selects the arm.
or.b32 %r201, %r3, 641;
or.b32 %r202, %r201, %r4;
and.b32 %r203, %r201, 3;
shr.u32 %r204, %r202, 2;
setp.ne.s32 %p45, %r203, 1;
cvt.u64.u32 %rd1791, %r204;
add.s64 %rd324, %rd12, %rd1791;
@%p45 bra LBB55_37;
and.b64 %rd1831, %rd324, 4294967295;
mul.lo.s64 %rd2654, %rd1831, 3528531795;
setp.lt.u64 %p47, %rd324, %rd12;
selp.u64 %rd1832, 1, 0, %p47;
add.s64 %rd1833, %rd2464, %rd1832;
xor.b64 %rd1834, %rd1833, %rd2654;
shr.u64 %rd1835, %rd1834, 32;
mul.lo.s64 %rd2657, %rd1835, 3449720151;
shr.u64 %rd1836, %rd2657, 32;
and.b64 %rd1837, %rd1833, 4294967295;
mul.lo.s64 %rd1838, %rd1837, 3449720151;
and.b64 %rd1839, %rd1838, 4294967295;
xor.b64 %rd1840, %rd1839, %rd1836;
xor.b64 %rd1841, %rd1840, 2654435769;
mul.lo.s64 %rd2660, %rd1841, 3528531795;
xor.b64 %rd2650, %rd1838, %rd324;
mov.u32 %r339, -845247145;
mov.u32 %r338, -616729560;
mov.u64 %rd2667, 3041712726;
mov.u64 %rd2666, 1401181199;
mov.u64 %rd2665, 2835769497;
mov.u64 %rd2664, 1684936478;
mov.u64 %rd2663, 2027808484;
mov.u64 %rd2662, 387276957;
mov.u64 %rd2661, 842468239;
mov.u64 %rd2659, 3986602516;
mov.u64 %rd2658, 1013904242;
mov.u64 %rd2656, 3668340011;
mov.u64 %rd2655, 3144134277;
mov.u64 %rd2653, 3449720151;
mov.u64 %rd2652, 1993301258;
mov.u64 %rd2651, 3528531795;
bra.uni LBB55_38;
LBB55_37:
setp.lt.u64 %p46, %rd324, %rd12;
selp.u64 %rd1806, 1, 0, %p46;
add.s64 %rd1807, %rd2464, %rd1806;
and.b64 %rd1808, %rd1807, 4294967295;
mul.lo.s64 %rd2654, %rd1808, 3449720151;
xor.b64 %rd1809, %rd2654, %rd324;
shr.u64 %rd1810, %rd1809, 32;
mul.lo.s64 %rd2657, %rd1810, 3528531795;
shr.u64 %rd1811, %rd2657, 32;
and.b64 %rd1812, %rd324, 4294967295;
mul.lo.s64 %rd1813, %rd1812, 3528531795;
and.b64 %rd1814, %rd1813, 4294967295;
xor.b64 %rd1815, %rd1814, %rd1811;
xor.b64 %rd1816, %rd1815, 3144134277;
mul.lo.s64 %rd2660, %rd1816, 3449720151;
xor.b64 %rd2650, %rd1807, %rd1813;
mov.u32 %r339, -766435501;
mov.u32 %r338, -239350328;
mov.u64 %rd2667, 1684936478;
mov.u64 %rd2666, 534103459;
mov.u64 %rd2665, 387276957;
mov.u64 %rd2664, 3041712726;
mov.u64 %rd2663, 3986602516;
mov.u64 %rd2662, 2835769497;
mov.u64 %rd2661, 3668340011;
mov.u64 %rd2659, 2027808484;
mov.u64 %rd2658, 1993301258;
mov.u64 %rd2656, 842468239;
mov.u64 %rd2655, 2654435769;
mov.u64 %rd2653, 3528531795;
mov.u64 %rd2652, 1013904242;
mov.u64 %rd2651, 3449720151;
LBB55_38:
shr.u64 %rd1842, %rd2660, 32;
shr.u64 %rd1843, %rd2650, 32;
mul.lo.s64 %rd1844, %rd1843, %rd2651;
and.b64 %rd1845, %rd1844, 4294967295;
xor.b64 %rd1846, %rd1845, %rd1842;
xor.b64 %rd1847, %rd1846, %rd2652;
mul.lo.s64 %rd1848, %rd1847, %rd2653;
shr.u64 %rd1849, %rd1848, 32;
shr.u64 %rd1850, %rd1844, 32;
and.b64 %rd1851, %rd2654, 4294967295;
xor.b64 %rd1852, %rd1851, %rd1850;
xor.b64 %rd1853, %rd1852, %rd2655;
mul.lo.s64 %rd1854, %rd1853, %rd2653;
and.b64 %rd1855, %rd1854, 4294967295;
xor.b64 %rd1856, %rd1855, %rd1849;
xor.b64 %rd1857, %rd1856, %rd2656;
mul.lo.s64 %rd1858, %rd1857, %rd2651;
shr.u64 %rd1859, %rd1858, 32;
shr.u64 %rd1860, %rd1854, 32;
and.b64 %rd1861, %rd2657, 4294967295;
xor.b64 %rd1862, %rd1861, %rd1860;
xor.b64 %rd1863, %rd1862, %rd2658;
mul.lo.s64 %rd1864, %rd1863, %rd2651;
and.b64 %rd1865, %rd1864, 4294967295;
xor.b64 %rd1866, %rd1865, %rd1859;
xor.b64 %rd1867, %rd1866, %rd2659;
mul.lo.s64 %rd1868, %rd1867, %rd2653;
shr.u64 %rd1869, %rd1868, 32;
shr.u64 %rd1870, %rd1864, 32;
and.b64 %rd1871, %rd2660, 4294967295;
xor.b64 %rd1872, %rd1871, %rd1870;
xor.b64 %rd1873, %rd1872, %rd2661;
mul.lo.s64 %rd1874, %rd1873, %rd2653;
and.b64 %rd1875, %rd1874, 4294967295;
xor.b64 %rd1876, %rd1875, %rd1869;
xor.b64 %rd1877, %rd1876, %rd2662;
mul.lo.s64 %rd1878, %rd1877, %rd2651;
shr.u64 %rd1879, %rd1878, 32;
shr.u64 %rd1880, %rd1874, 32;
and.b64 %rd1881, %rd1848, 4294967295;
xor.b64 %rd1882, %rd1881, %rd1880;
xor.b64 %rd1883, %rd1882, %rd2663;
mul.lo.s64 %rd1884, %rd1883, %rd2651;
and.b64 %rd1885, %rd1884, 4294967295;
xor.b64 %rd1886, %rd1885, %rd1879;
xor.b64 %rd1887, %rd1886, %rd2664;
mul.lo.s64 %rd1888, %rd1887, %rd2653;
shr.u64 %rd1889, %rd1888, 32;
shr.u64 %rd1890, %rd1884, 32;
and.b64 %rd1891, %rd1858, 4294967295;
xor.b64 %rd1892, %rd1891, %rd1890;
xor.b64 %rd1893, %rd1892, %rd2665;
mul.lo.s64 %rd1894, %rd1893, %rd2653;
and.b64 %rd1895, %rd1894, 4294967295;
xor.b64 %rd1896, %rd1895, %rd1889;
xor.b64 %rd1897, %rd1896, %rd2666;
mul.lo.s64 %rd1898, %rd1897, %rd2651;
shr.u64 %rd1899, %rd1898, 32;
shr.u64 %rd1900, %rd1894, 32;
xor.b64 %rd1901, %rd1868, %rd1900;
xor.b64 %rd1902, %rd1901, %rd2667;
mul.lo.s64 %rd1903, %rd1902, %rd2651;
xor.b64 %rd1904, %rd1899, %rd1903;
cvt.u32.u64 %r209, %rd1904;
xor.b32 %r210, %r338, %r209;
mul.lo.s32 %r211, %r210, %r339;
shr.u32 %r212, %r211, 9;
cvt.rn.f32.u32 %f190, %r212;
mul.rn.f32 %f191, %f190, 0f34000000;
cvt.rn.f16.f32 %h100, %f191;
mov.b16 %h101, 0x2E66;
setp.ge.f16 %p49, %h100, %h101;
ld.global.nc.b16 %h102, [%rd45+1282];
ld.global.nc.f32 %f192, [%rd46+2564];
cvt.rn.f16.f32 %h103, %f192;
add.rn.f16 %h104, %h102, %h103;
mov.b16 %h105, 0x3C72;
mul.rn.f16 %h106, %h104, %h105;
selp.b16 %h107, %h106, 0x0000, %p49;
cvt.f32.f16 %f193, %h107;
ld.global.nc.b16 %h108, [%rd47+1282];
cvt.f32.f16 %f194, %h108;
ld.global.nc.f32 %f195, [%rd48+2564];
mul.rn.f32 %f196, %f1, %f195;
mul.rn.f32 %f197, %f196, %f194;
ld.global.nc.f32 %f198, [%rd49+2564];
mul.rn.f32 %f199, %f2, %f196;
sub.rn.f32 %f200, %f198, %f199;
add.rn.f32 %f201, %f197, %f200;
add.rn.f32 %f202, %f201, %f193;
sub.rn.f32 %f203, %f202, %f3;
mul.rn.f32 %f204, %f203, %f203;
add.rn.f32 %f15, %f14, %f204;
or.b32 %r214, %r73, 768;
shr.u32 %r215, %r214, 2;
cvt.u64.u32 %rd1905, %r215;
add.s64 %rd351, %rd12, %rd1905;
@%p8 bra LBB55_40;
and.b64 %rd1947, %rd351, 4294967295;
mul.lo.s64 %rd2672, %rd1947, 3528531795;
setp.lt.u64 %p51, %rd351, %rd12;
selp.u64 %rd1948, 1, 0, %p51;
add.s64 %rd1949, %rd2464, %rd1948;
xor.b64 %rd1950, %rd1949, %rd2672;
shr.u64 %rd1951, %rd1950, 32;
mul.lo.s64 %rd2675, %rd1951, 3449720151;
shr.u64 %rd1952, %rd2675, 32;
and.b64 %rd1953, %rd1949, 4294967295;
mul.lo.s64 %rd1954, %rd1953, 3449720151;
and.b64 %rd1955, %rd1954, 4294967295;
xor.b64 %rd1956, %rd1955, %rd1952;
xor.b64 %rd1957, %rd1956, 2654435769;
mul.lo.s64 %rd2678, %rd1957, 3528531795;
xor.b64 %rd2668, %rd1954, %rd351;
mov.u32 %r342, -1879881855;
mov.u32 %r341, -845247145;
mov.u32 %r340, 534103459;
mov.u64 %rd2686, 3678237736;
mov.u64 %rd2685, 3041712726;
mov.u64 %rd2684, 1401181199;
mov.u64 %rd2683, 2835769497;
mov.u64 %rd2682, 1684936478;
mov.u64 %rd2681, 2027808484;
mov.u64 %rd2680, 387276957;
mov.u64 %rd2679, 842468239;
mov.u64 %rd2677, 3986602516;
mov.u64 %rd2676, 1013904242;
mov.u64 %rd2674, 3668340011;
mov.u64 %rd2673, 3144134277;
mov.u64 %rd2671, 3449720151;
mov.u64 %rd2670, 1993301258;
mov.u64 %rd2669, 3528531795;
bra.uni LBB55_41;
LBB55_40:
setp.lt.u64 %p50, %rd351, %rd12;
selp.u64 %rd1921, 1, 0, %p50;
add.s64 %rd1922, %rd2464, %rd1921;
and.b64 %rd1923, %rd1922, 4294967295;
mul.lo.s64 %rd2672, %rd1923, 3449720151;
xor.b64 %rd1924, %rd2672, %rd351;
shr.u64 %rd1925, %rd1924, 32;
mul.lo.s64 %rd2675, %rd1925, 3528531795;
shr.u64 %rd1926, %rd2675, 32;
and.b64 %rd1927, %rd351, 4294967295;
mul.lo.s64 %rd1928, %rd1927, 3528531795;
and.b64 %rd1929, %rd1928, 4294967295;
xor.b64 %rd1930, %rd1929, %rd1926;
xor.b64 %rd1931, %rd1930, 3144134277;
mul.lo.s64 %rd2678, %rd1931, 3449720151;
xor.b64 %rd2668, %rd1922, %rd1928;
mov.u32 %r342, -1767562579;
mov.u32 %r341, -766435501;
mov.u32 %r340, 1401181199;
mov.u64 %rd2686, 4055616968;
mov.u64 %rd2685, 1684936478;
mov.u64 %rd2684, 534103459;
mov.u64 %rd2683, 387276957;
mov.u64 %rd2682, 3041712726;
mov.u64 %rd2681, 3986602516;
mov.u64 %rd2680, 2835769497;
mov.u64 %rd2679, 3668340011;
mov.u64 %rd2677, 2027808484;
mov.u64 %rd2676, 1993301258;
mov.u64 %rd2674, 842468239;
mov.u64 %rd2673, 2654435769;
mov.u64 %rd2671, 3528531795;
mov.u64 %rd2670, 1013904242;
mov.u64 %rd2669, 3449720151;
LBB55_41:
shr.u64 %rd1958, %rd2678, 32;
shr.u64 %rd1959, %rd2668, 32;
mul.lo.s64 %rd1960, %rd1959, %rd2669;
and.b64 %rd1961, %rd1960, 4294967295;
xor.b64 %rd1962, %rd1961, %rd1958;
xor.b64 %rd1963, %rd1962, %rd2670;
mul.lo.s64 %rd1964, %rd1963, %rd2671;
shr.u64 %rd1965, %rd1964, 32;
shr.u64 %rd1966, %rd1960, 32;
and.b64 %rd1967, %rd2672, 4294967295;
xor.b64 %rd1968, %rd1967, %rd1966;
xor.b64 %rd1969, %rd1968, %rd2673;
mul.lo.s64 %rd1970, %rd1969, %rd2671;
and.b64 %rd1971, %rd1970, 4294967295;
xor.b64 %rd1972, %rd1971, %rd1965;
xor.b64 %rd1973, %rd1972, %rd2674;
mul.lo.s64 %rd1974, %rd1973, %rd2669;
shr.u64 %rd1975, %rd1974, 32;
shr.u64 %rd1976, %rd1970, 32;
and.b64 %rd1977, %rd2675, 4294967295;
xor.b64 %rd1978, %rd1977, %rd1976;
xor.b64 %rd1979, %rd1978, %rd2676;
mul.lo.s64 %rd1980, %rd1979, %rd2669;
and.b64 %rd1981, %rd1980, 4294967295;
xor.b64 %rd1982, %rd1981, %rd1975;
xor.b64 %rd1983, %rd1982, %rd2677;
mul.lo.s64 %rd1984, %rd1983, %rd2671;
shr.u64 %rd1985, %rd1984, 32;
shr.u64 %rd1986, %rd1980, 32;
and.b64 %rd1987, %rd2678, 4294967295;
xor.b64 %rd1988, %rd1987, %rd1986;
xor.b64 %rd1989, %rd1988, %rd2679;
mul.lo.s64 %rd1990, %rd1989, %rd2671;
and.b64 %rd1991, %rd1990, 4294967295;
xor.b64 %rd1992, %rd1991, %rd1985;
xor.b64 %rd1993, %rd1992, %rd2680;
mul.lo.s64 %rd1994, %rd1993, %rd2669;
shr.u64 %rd1995, %rd1994, 32;
shr.u64 %rd1996, %rd1990, 32;
and.b64 %rd1997, %rd1964, 4294967295;
xor.b64 %rd1998, %rd1997, %rd1996;
xor.b64 %rd1999, %rd1998, %rd2681;
mul.lo.s64 %rd2000, %rd1999, %rd2669;
and.b64 %rd2001, %rd2000, 4294967295;
xor.b64 %rd2002, %rd2001, %rd1995;
xor.b64 %rd2003, %rd2002, %rd2682;
mul.lo.s64 %rd2004, %rd2003, %rd2671;
shr.u64 %rd2005, %rd2004, 32;
shr.u64 %rd2006, %rd2000, 32;
and.b64 %rd2007, %rd1974, 4294967295;
xor.b64 %rd2008, %rd2007, %rd2006;
xor.b64 %rd2009, %rd2008, %rd2683;
mul.lo.s64 %rd2010, %rd2009, %rd2671;
and.b64 %rd2011, %rd2010, 4294967295;
xor.b64 %rd2012, %rd2011, %rd2005;
xor.b64 %rd2013, %rd2012, %rd2684;
mul.lo.s64 %rd2014, %rd2013, %rd2669;
shr.u64 %rd2015, %rd2014, 32;
shr.u64 %rd2016, %rd2010, 32;
and.b64 %rd2017, %rd1984, 4294967295;
xor.b64 %rd2018, %rd2017, %rd2016;
xor.b64 %rd2019, %rd2018, %rd2685;
mul.lo.s64 %rd2020, %rd2019, %rd2669;
and.b64 %rd2021, %rd2020, 4294967295;
xor.b64 %rd2022, %rd2021, %rd2015;
xor.b64 %rd2023, %rd2022, %rd2686;
mul.lo.s64 %rd2024, %rd2023, %rd2671;
shr.u64 %rd2025, %rd2024, 32;
cvt.u32.u64 %r222, %rd2025;
shr.u64 %rd2026, %rd2020, 32;
xor.b64 %rd2027, %rd2026, %rd1994;
cvt.u32.u64 %r223, %rd2027;
xor.b32 %r224, %r340, %r223;
mul.lo.s32 %r225, %r224, %r341;
xor.b32 %r226, %r225, %r222;
xor.b32 %r227, %r226, %r342;
shr.u32 %r228, %r227, 9;
cvt.rn.f32.u32 %f205, %r228;
mul.rn.f32 %f206, %f205, 0f34000000;
cvt.rn.f16.f32 %h109, %f206;
mov.b16 %h110, 0x2E66;
setp.ge.f16 %p52, %h109, %h110;
ld.global.nc.b16 %h111, [%rd45+1536];
ld.global.nc.f32 %f207, [%rd46+3072];
cvt.rn.f16.f32 %h112, %f207;
add.rn.f16 %h113, %h111, %h112;
mov.b16 %h114, 0x3C72;
mul.rn.f16 %h115, %h113, %h114;
selp.b16 %h116, %h115, 0x0000, %p52;
cvt.f32.f16 %f208, %h116;
ld.global.nc.b16 %h117, [%rd47+1536];
cvt.f32.f16 %f209, %h117;
ld.global.nc.f32 %f210, [%rd48+3072];
mul.rn.f32 %f211, %f1, %f210;
mul.rn.f32 %f212, %f211, %f209;
ld.global.nc.f32 %f213, [%rd49+3072];
mul.rn.f32 %f214, %f2, %f211;
sub.rn.f32 %f215, %f213, %f214;
add.rn.f32 %f216, %f212, %f215;
add.rn.f32 %f217, %f216, %f208;
sub.rn.f32 %f218, %f217, %f3;
mul.rn.f32 %f219, %f218, %f218;
add.rn.f32 %f16, %f15, %f219;
or.b32 %r229, %r3, 769;
or.b32 %r230, %r229, %r4;
and.b32 %r231, %r229, 3;
shr.u32 %r232, %r230, 2;
setp.ne.s32 %p53, %r231, 1;
cvt.u64.u32 %rd2028, %r232;
add.s64 %rd379, %rd12, %rd2028;
@%p53 bra LBB55_43;
and.b64 %rd2068, %rd379, 4294967295;
mul.lo.s64 %rd2691, %rd2068, 3528531795;
setp.lt.u64 %p55, %rd379, %rd12;
selp.u64 %rd2069, 1, 0, %p55;
add.s64 %rd2070, %rd2464, %rd2069;
xor.b64 %rd2071, %rd2070, %rd2691;
shr.u64 %rd2072, %rd2071, 32;
mul.lo.s64 %rd2694, %rd2072, 3449720151;
shr.u64 %rd2073, %rd2694, 32;
and.b64 %rd2074, %rd2070, 4294967295;
mul.lo.s64 %rd2075, %rd2074, 3449720151;
and.b64 %rd2076, %rd2075, 4294967295;
xor.b64 %rd2077, %rd2076, %rd2073;
xor.b64 %rd2078, %rd2077, 2654435769;
mul.lo.s64 %rd2697, %rd2078, 3528531795;
xor.b64 %rd2687, %rd2075, %rd379;
mov.u32 %r344, -845247145;
mov.u32 %r343, -616729560;
mov.u64 %rd2704, 3041712726;
mov.u64 %rd2703, 1401181199;
mov.u64 %rd2702, 2835769497;
mov.u64 %rd2701, 1684936478;
mov.u64 %rd2700, 2027808484;
mov.u64 %rd2699, 387276957;
mov.u64 %rd2698, 842468239;
mov.u64 %rd2696, 3986602516;
mov.u64 %rd2695, 1013904242;
mov.u64 %rd2693, 3668340011;
mov.u64 %rd2692, 3144134277;
mov.u64 %rd2690, 3449720151;
mov.u64 %rd2689, 1993301258;
mov.u64 %rd2688, 3528531795;
bra.uni LBB55_44;
LBB55_43:
setp.lt.u64 %p54, %rd379, %rd12;
selp.u64 %rd2043, 1, 0, %p54;
add.s64 %rd2044, %rd2464, %rd2043;
and.b64 %rd2045, %rd2044, 4294967295;
mul.lo.s64 %rd2691, %rd2045, 3449720151;
xor.b64 %rd2046, %rd2691, %rd379;
shr.u64 %rd2047, %rd2046, 32;
mul.lo.s64 %rd2694, %rd2047, 3528531795;
shr.u64 %rd2048, %rd2694, 32;
and.b64 %rd2049, %rd379, 4294967295;
mul.lo.s64 %rd2050, %rd2049, 3528531795;
and.b64 %rd2051, %rd2050, 4294967295;
xor.b64 %rd2052, %rd2051, %rd2048;
xor.b64 %rd2053, %rd2052, 3144134277;
mul.lo.s64 %rd2697, %rd2053, 3449720151;
xor.b64 %rd2687, %rd2044, %rd2050;
mov.u32 %r344, -766435501;
mov.u32 %r343, -239350328;
mov.u64 %rd2704, 1684936478;
mov.u64 %rd2703, 534103459;
mov.u64 %rd2702, 387276957;
mov.u64 %rd2701, 3041712726;
mov.u64 %rd2700, 3986602516;
mov.u64 %rd2699, 2835769497;
mov.u64 %rd2698, 3668340011;
mov.u64 %rd2696, 2027808484;
mov.u64 %rd2695, 1993301258;
mov.u64 %rd2693, 842468239;
mov.u64 %rd2692, 2654435769;
mov.u64 %rd2690, 3528531795;
mov.u64 %rd2689, 1013904242;
mov.u64 %rd2688, 3449720151;
LBB55_44:
shr.u64 %rd2079, %rd2697, 32;
shr.u64 %rd2080, %rd2687, 32;
mul.lo.s64 %rd2081, %rd2080, %rd2688;
and.b64 %rd2082, %rd2081, 4294967295;
xor.b64 %rd2083, %rd2082, %rd2079;
xor.b64 %rd2084, %rd2083, %rd2689;
mul.lo.s64 %rd2085, %rd2084, %rd2690;
shr.u64 %rd2086, %rd2085, 32;
shr.u64 %rd2087, %rd2081, 32;
and.b64 %rd2088, %rd2691, 4294967295;
xor.b64 %rd2089, %rd2088, %rd2087;
xor.b64 %rd2090, %rd2089, %rd2692;
mul.lo.s64 %rd2091, %rd2090, %rd2690;
and.b64 %rd2092, %rd2091, 4294967295;
xor.b64 %rd2093, %rd2092, %rd2086;
xor.b64 %rd2094, %rd2093, %rd2693;
mul.lo.s64 %rd2095, %rd2094, %rd2688;
shr.u64 %rd2096, %rd2095, 32;
shr.u64 %rd2097, %rd2091, 32;
and.b64 %rd2098, %rd2694, 4294967295;
xor.b64 %rd2099, %rd2098, %rd2097;
xor.b64 %rd2100, %rd2099, %rd2695;
mul.lo.s64 %rd2101, %rd2100, %rd2688;
and.b64 %rd2102, %rd2101, 4294967295;
xor.b64 %rd2103, %rd2102, %rd2096;
xor.b64 %rd2104, %rd2103, %rd2696;
mul.lo.s64 %rd2105, %rd2104, %rd2690;
shr.u64 %rd2106, %rd2105, 32;
shr.u64 %rd2107, %rd2101, 32;
and.b64 %rd2108, %rd2697, 4294967295;
xor.b64 %rd2109, %rd2108, %rd2107;
xor.b64 %rd2110, %rd2109, %rd2698;
mul.lo.s64 %rd2111, %rd2110, %rd2690;
and.b64 %rd2112, %rd2111, 4294967295;
xor.b64 %rd2113, %rd2112, %rd2106;
xor.b64 %rd2114, %rd2113, %rd2699;
mul.lo.s64 %rd2115, %rd2114, %rd2688;
shr.u64 %rd2116, %rd2115, 32;
shr.u64 %rd2117, %rd2111, 32;
and.b64 %rd2118, %rd2085, 4294967295;
xor.b64 %rd2119, %rd2118, %rd2117;
xor.b64 %rd2120, %rd2119, %rd2700;
mul.lo.s64 %rd2121, %rd2120, %rd2688;
and.b64 %rd2122, %rd2121, 4294967295;
xor.b64 %rd2123, %rd2122, %rd2116;
xor.b64 %rd2124, %rd2123, %rd2701;
mul.lo.s64 %rd2125, %rd2124, %rd2690;
shr.u64 %rd2126, %rd2125, 32;
shr.u64 %rd2127, %rd2121, 32;
and.b64 %rd2128, %rd2095, 4294967295;
xor.b64 %rd2129, %rd2128, %rd2127;
xor.b64 %rd2130, %rd2129, %rd2702;
mul.lo.s64 %rd2131, %rd2130, %rd2690;
and.b64 %rd2132, %rd2131, 4294967295;
xor.b64 %rd2133, %rd2132, %rd2126;
xor.b64 %rd2134, %rd2133, %rd2703;
mul.lo.s64 %rd2135, %rd2134, %rd2688;
shr.u64 %rd2136, %rd2135, 32;
shr.u64 %rd2137, %rd2131, 32;
xor.b64 %rd2138, %rd2105, %rd2137;
xor.b64 %rd2139, %rd2138, %rd2704;
mul.lo.s64 %rd2140, %rd2139, %rd2688;
xor.b64 %rd2141, %rd2136, %rd2140;
cvt.u32.u64 %r237, %rd2141;
xor.b32 %r238, %r343, %r237;
mul.lo.s32 %r239, %r238, %r344;
shr.u32 %r240, %r239, 9;
cvt.rn.f32.u32 %f220, %r240;
mul.rn.f32 %f221, %f220, 0f34000000;
cvt.rn.f16.f32 %h118, %f221;
mov.b16 %h119, 0x2E66;
setp.ge.f16 %p57, %h118, %h119;
ld.global.nc.b16 %h120, [%rd45+1538];
ld.global.nc.f32 %f222, [%rd46+3076];
cvt.rn.f16.f32 %h121, %f222;
add.rn.f16 %h122, %h120, %h121;
mov.b16 %h123, 0x3C72;
mul.rn.f16 %h124, %h122, %h123;
selp.b16 %h125, %h124, 0x0000, %p57;
cvt.f32.f16 %f223, %h125;
ld.global.nc.b16 %h126, [%rd47+1538];
cvt.f32.f16 %f224, %h126;
ld.global.nc.f32 %f225, [%rd48+3076];
mul.rn.f32 %f226, %f1, %f225;
mul.rn.f32 %f227, %f226, %f224;
ld.global.nc.f32 %f228, [%rd49+3076];
mul.rn.f32 %f229, %f2, %f226;
sub.rn.f32 %f230, %f228, %f229;
add.rn.f32 %f231, %f227, %f230;
add.rn.f32 %f232, %f231, %f223;
sub.rn.f32 %f233, %f232, %f3;
mul.rn.f32 %f234, %f233, %f233;
add.rn.f32 %f17, %f16, %f234;
or.b32 %r242, %r73, 896;
shr.u32 %r243, %r242, 2;
cvt.u64.u32 %rd2142, %r243;
add.s64 %rd406, %rd12, %rd2142;
@%p8 bra LBB55_46;
mov.u32 %r347, -1879881855;
mov.u32 %r345, 534103459;
mov.u64 %rd2723, 3678237736;
and.b64 %rd2184, %rd406, 4294967295;
mul.lo.s64 %rd2709, %rd2184, 3528531795;
setp.lt.u64 %p59, %rd406, %rd12;
selp.u64 %rd2185, 1, 0, %p59;
add.s64 %rd2186, %rd2464, %rd2185;
xor.b64 %rd2187, %rd2186, %rd2709;
shr.u64 %rd2188, %rd2187, 32;
mul.lo.s64 %rd2712, %rd2188, 3449720151;
shr.u64 %rd2189, %rd2712, 32;
and.b64 %rd2190, %rd2186, 4294967295;
mul.lo.s64 %rd2191, %rd2190, 3449720151;
and.b64 %rd2192, %rd2191, 4294967295;
xor.b64 %rd2193, %rd2192, %rd2189;
xor.b64 %rd2194, %rd2193, 2654435769;
mul.lo.s64 %rd2715, %rd2194, 3528531795;
xor.b64 %rd2705, %rd2191, %rd406;
mov.u32 %r346, -845247145;
mov.u64 %rd2722, 3041712726;
mov.u64 %rd2721, 1401181199;
mov.u64 %rd2720, 2835769497;
mov.u64 %rd2719, 1684936478;
mov.u64 %rd2718, 2027808484;
mov.u64 %rd2717, 387276957;
mov.u64 %rd2716, 842468239;
mov.u64 %rd2714, 3986602516;
mov.u64 %rd2713, 1013904242;
mov.u64 %rd2711, 3668340011;
mov.u64 %rd2710, 3144134277;
mov.u64 %rd2708, 3449720151;
mov.u64 %rd2707, 1993301258;
mov.u64 %rd2706, 3528531795;
bra.uni LBB55_47;
LBB55_46:
setp.lt.u64 %p58, %rd406, %rd12;
selp.u64 %rd2158, 1, 0, %p58;
add.s64 %rd2159, %rd2464, %rd2158;
and.b64 %rd2160, %rd2159, 4294967295;
mul.lo.s64 %rd2709, %rd2160, 3449720151;
xor.b64 %rd2161, %rd2709, %rd406;
shr.u64 %rd2162, %rd2161, 32;
mul.lo.s64 %rd2712, %rd2162, 3528531795;
shr.u64 %rd2163, %rd2712, 32;
and.b64 %rd2164, %rd406, 4294967295;
mul.lo.s64 %rd2165, %rd2164, 3528531795;
and.b64 %rd2166, %rd2165, 4294967295;
xor.b64 %rd2167, %rd2166, %rd2163;
xor.b64 %rd2168, %rd2167, 3144134277;
mul.lo.s64 %rd2715, %rd2168, 3449720151;
xor.b64 %rd2705, %rd2159, %rd2165;
mov.u32 %r347, -1767562579;
mov.u32 %r346, -766435501;
mov.u32 %r345, 1401181199;
mov.u64 %rd2723, 4055616968;
mov.u64 %rd2722, 1684936478;
mov.u64 %rd2721, 534103459;
mov.u64 %rd2720, 387276957;
mov.u64 %rd2719, 3041712726;
mov.u64 %rd2718, 3986602516;
mov.u64 %rd2717, 2835769497;
mov.u64 %rd2716, 3668340011;
mov.u64 %rd2714, 2027808484;
mov.u64 %rd2713, 1993301258;
mov.u64 %rd2711, 842468239;
mov.u64 %rd2710, 2654435769;
mov.u64 %rd2708, 3528531795;
mov.u64 %rd2707, 1013904242;
mov.u64 %rd2706, 3449720151;
LBB55_47:
shr.u64 %rd2195, %rd2715, 32;
shr.u64 %rd2196, %rd2705, 32;
mul.lo.s64 %rd2197, %rd2196, %rd2706;
and.b64 %rd2198, %rd2197, 4294967295;
xor.b64 %rd2199, %rd2198, %rd2195;
xor.b64 %rd2200, %rd2199, %rd2707;
mul.lo.s64 %rd2201, %rd2200, %rd2708;
shr.u64 %rd2202, %rd2201, 32;
shr.u64 %rd2203, %rd2197, 32;
and.b64 %rd2204, %rd2709, 4294967295;
xor.b64 %rd2205, %rd2204, %rd2203;
xor.b64 %rd2206, %rd2205, %rd2710;
mul.lo.s64 %rd2207, %rd2206, %rd2708;
and.b64 %rd2208, %rd2207, 4294967295;
xor.b64 %rd2209, %rd2208, %rd2202;
xor.b64 %rd2210, %rd2209, %rd2711;
mul.lo.s64 %rd2211, %rd2210, %rd2706;
shr.u64 %rd2212, %rd2211, 32;
shr.u64 %rd2213, %rd2207, 32;
and.b64 %rd2214, %rd2712, 4294967295;
xor.b64 %rd2215, %rd2214, %rd2213;
xor.b64 %rd2216, %rd2215, %rd2713;
mul.lo.s64 %rd2217, %rd2216, %rd2706;
and.b64 %rd2218, %rd2217, 4294967295;
xor.b64 %rd2219, %rd2218, %rd2212;
xor.b64 %rd2220, %rd2219, %rd2714;
mul.lo.s64 %rd2221, %rd2220, %rd2708;
shr.u64 %rd2222, %rd2221, 32;
shr.u64 %rd2223, %rd2217, 32;
and.b64 %rd2224, %rd2715, 4294967295;
xor.b64 %rd2225, %rd2224, %rd2223;
xor.b64 %rd2226, %rd2225, %rd2716;
mul.lo.s64 %rd2227, %rd2226, %rd2708;
and.b64 %rd2228, %rd2227, 4294967295;
xor.b64 %rd2229, %rd2228, %rd2222;
xor.b64 %rd2230, %rd2229, %rd2717;
mul.lo.s64 %rd2231, %rd2230, %rd2706;
shr.u64 %rd2232, %rd2231, 32;
shr.u64 %rd2233, %rd2227, 32;
and.b64 %rd2234, %rd2201, 4294967295;
xor.b64 %rd2235, %rd2234, %rd2233;
xor.b64 %rd2236, %rd2235, %rd2718;
mul.lo.s64 %rd2237, %rd2236, %rd2706;
and.b64 %rd2238, %rd2237, 4294967295;
xor.b64 %rd2239, %rd2238, %rd2232;
xor.b64 %rd2240, %rd2239, %rd2719;
mul.lo.s64 %rd2241, %rd2240, %rd2708;
shr.u64 %rd2242, %rd2241, 32;
shr.u64 %rd2243, %rd2237, 32;
and.b64 %rd2244, %rd2211, 4294967295;
xor.b64 %rd2245, %rd2244, %rd2243;
xor.b64 %rd2246, %rd2245, %rd2720;
mul.lo.s64 %rd2247, %rd2246, %rd2708;
and.b64 %rd2248, %rd2247, 4294967295;
xor.b64 %rd2249, %rd2248, %rd2242;
xor.b64 %rd2250, %rd2249, %rd2721;
mul.lo.s64 %rd2251, %rd2250, %rd2706;
shr.u64 %rd2252, %rd2251, 32;
shr.u64 %rd2253, %rd2247, 32;
and.b64 %rd2254, %rd2221, 4294967295;
xor.b64 %rd2255, %rd2254, %rd2253;
xor.b64 %rd2256, %rd2255, %rd2722;
mul.lo.s64 %rd2257, %rd2256, %rd2706;
and.b64 %rd2258, %rd2257, 4294967295;
xor.b64 %rd2259, %rd2258, %rd2252;
xor.b64 %rd2260, %rd2259, %rd2723;
mul.lo.s64 %rd2261, %rd2260, %rd2708;
shr.u64 %rd2262, %rd2261, 32;
cvt.u32.u64 %r250, %rd2262;
shr.u64 %rd2263, %rd2257, 32;
xor.b64 %rd2264, %rd2263, %rd2231;
cvt.u32.u64 %r251, %rd2264;
xor.b32 %r252, %r345, %r251;
mul.lo.s32 %r253, %r252, %r346;
xor.b32 %r254, %r253, %r250;
xor.b32 %r255, %r254, %r347;
shr.u32 %r256, %r255, 9;
cvt.rn.f32.u32 %f235, %r256;
mul.rn.f32 %f236, %f235, 0f34000000;
cvt.rn.f16.f32 %h127, %f236;
mov.b16 %h128, 0x2E66;
setp.ge.f16 %p60, %h127, %h128;
ld.global.nc.b16 %h129, [%rd45+1792];
ld.global.nc.f32 %f237, [%rd46+3584];
cvt.rn.f16.f32 %h130, %f237;
add.rn.f16 %h131, %h129, %h130;
mov.b16 %h132, 0x3C72;
mul.rn.f16 %h133, %h131, %h132;
selp.b16 %h134, %h133, 0x0000, %p60;
cvt.f32.f16 %f238, %h134;
ld.global.nc.b16 %h135, [%rd47+1792];
cvt.f32.f16 %f239, %h135;
ld.global.nc.f32 %f240, [%rd48+3584];
mul.rn.f32 %f241, %f1, %f240;
mul.rn.f32 %f242, %f241, %f239;
ld.global.nc.f32 %f243, [%rd49+3584];
mul.rn.f32 %f244, %f2, %f241;
sub.rn.f32 %f245, %f243, %f244;
add.rn.f32 %f246, %f242, %f245;
add.rn.f32 %f247, %f246, %f238;
sub.rn.f32 %f248, %f247, %f3;
mul.rn.f32 %f249, %f248, %f248;
add.rn.f32 %f18, %f17, %f249;
or.b32 %r257, %r3, 897;
or.b32 %r258, %r257, %r4;
and.b32 %r259, %r257, 3;
shr.u32 %r260, %r258, 2;
setp.ne.s32 %p61, %r259, 1;
cvt.u64.u32 %rd2265, %r260;
add.s64 %rd434, %rd12, %rd2265;
@%p61 bra LBB55_49;
mov.u32 %r349, -845247145;
mov.u64 %rd2740, 1401181199;
mov.u64 %rd2729, 3144134277;
mov.u32 %r348, -616729560;
and.b64 %rd2305, %rd434, 4294967295;
mul.lo.s64 %rd2728, %rd2305, 3528531795;
setp.lt.u64 %p63, %rd434, %rd12;
selp.u64 %rd2306, 1, 0, %p63;
add.s64 %rd2307, %rd2464, %rd2306;
xor.b64 %rd2308, %rd2307, %rd2728;
shr.u64 %rd2309, %rd2308, 32;
mul.lo.s64 %rd2731, %rd2309, 3449720151;
shr.u64 %rd2310, %rd2731, 32;
and.b64 %rd2311, %rd2307, 4294967295;
mul.lo.s64 %rd2312, %rd2311, 3449720151;
and.b64 %rd2313, %rd2312, 4294967295;
xor.b64 %rd2314, %rd2313, %rd2310;
xor.b64 %rd2315, %rd2314, 2654435769;
mul.lo.s64 %rd2734, %rd2315, 3528531795;
xor.b64 %rd2724, %rd2312, %rd434;
mov.u64 %rd2741, 3041712726;
mov.u64 %rd2739, 2835769497;
mov.u64 %rd2738, 1684936478;
mov.u64 %rd2737, 2027808484;
mov.u64 %rd2736, 387276957;
mov.u64 %rd2735, 842468239;
mov.u64 %rd2733, 3986602516;
mov.u64 %rd2732, 1013904242;
mov.u64 %rd2730, 3668340011;
mov.u64 %rd2727, 3449720151;
mov.u64 %rd2726, 1993301258;
mov.u64 %rd2725, 3528531795;
bra.uni LBB55_50;
LBB55_49:
setp.lt.u64 %p62, %rd434, %rd12;
selp.u64 %rd2280, 1, 0, %p62;
add.s64 %rd2281, %rd2464, %rd2280;
and.b64 %rd2282, %rd2281, 4294967295;
mul.lo.s64 %rd2728, %rd2282, 3449720151;
xor.b64 %rd2283, %rd2728, %rd434;
shr.u64 %rd2284, %rd2283, 32;
mul.lo.s64 %rd2731, %rd2284, 3528531795;
shr.u64 %rd2285, %rd2731, 32;
and.b64 %rd2286, %rd434, 4294967295;
mul.lo.s64 %rd2287, %rd2286, 3528531795;
and.b64 %rd2288, %rd2287, 4294967295;
xor.b64 %rd2289, %rd2288, %rd2285;
xor.b64 %rd2290, %rd2289, 3144134277;
mul.lo.s64 %rd2734, %rd2290, 3449720151;
xor.b64 %rd2724, %rd2281, %rd2287;
mov.u32 %r349, -766435501;
mov.u32 %r348, -239350328;
mov.u64 %rd2741, 1684936478;
mov.u64 %rd2740, 534103459;
mov.u64 %rd2739, 387276957;
mov.u64 %rd2738, 3041712726;
mov.u64 %rd2737, 3986602516;
mov.u64 %rd2736, 2835769497;
mov.u64 %rd2735, 3668340011;
mov.u64 %rd2733, 2027808484;
mov.u64 %rd2732, 1993301258;
mov.u64 %rd2730, 842468239;
mov.u64 %rd2729, 2654435769;
mov.u64 %rd2727, 3528531795;
mov.u64 %rd2726, 1013904242;
mov.u64 %rd2725, 3449720151;
LBB55_50:
shr.u64 %rd2316, %rd2734, 32;
shr.u64 %rd2317, %rd2724, 32;
mul.lo.s64 %rd2318, %rd2317, %rd2725;
and.b64 %rd2319, %rd2318, 4294967295;
xor.b64 %rd2320, %rd2319, %rd2316;
xor.b64 %rd2321, %rd2320, %rd2726;
mul.lo.s64 %rd2322, %rd2321, %rd2727;
shr.u64 %rd2323, %rd2322, 32;
shr.u64 %rd2324, %rd2318, 32;
and.b64 %rd2325, %rd2728, 4294967295;
xor.b64 %rd2326, %rd2325, %rd2324;
xor.b64 %rd2327, %rd2326, %rd2729;
mul.lo.s64 %rd2328, %rd2327, %rd2727;
and.b64 %rd2329, %rd2328, 4294967295;
xor.b64 %rd2330, %rd2329, %rd2323;
xor.b64 %rd2331, %rd2330, %rd2730;
mul.lo.s64 %rd2332, %rd2331, %rd2725;
shr.u64 %rd2333, %rd2332, 32;
shr.u64 %rd2334, %rd2328, 32;
and.b64 %rd2335, %rd2731, 4294967295;
xor.b64 %rd2336, %rd2335, %rd2334;
xor.b64 %rd2337, %rd2336, %rd2732;
mul.lo.s64 %rd2338, %rd2337, %rd2725;
and.b64 %rd2339, %rd2338, 4294967295;
xor.b64 %rd2340, %rd2339, %rd2333;
xor.b64 %rd2341, %rd2340, %rd2733;
mul.lo.s64 %rd2342, %rd2341, %rd2727;
shr.u64 %rd2343, %rd2342, 32;
shr.u64 %rd2344, %rd2338, 32;
and.b64 %rd2345, %rd2734, 4294967295;
xor.b64 %rd2346, %rd2345, %rd2344;
xor.b64 %rd2347, %rd2346, %rd2735;
mul.lo.s64 %rd2348, %rd2347, %rd2727;
and.b64 %rd2349, %rd2348, 4294967295;
xor.b64 %rd2350, %rd2349, %rd2343;
xor.b64 %rd2351, %rd2350, %rd2736;
mul.lo.s64 %rd2352, %rd2351, %rd2725;
shr.u64 %rd2353, %rd2352, 32;
shr.u64 %rd2354, %rd2348, 32;
and.b64 %rd2355, %rd2322, 4294967295;
xor.b64 %rd2356, %rd2355, %rd2354;
xor.b64 %rd2357, %rd2356, %rd2737;
mul.lo.s64 %rd2358, %rd2357, %rd2725;
and.b64 %rd2359, %rd2358, 4294967295;
xor.b64 %rd2360, %rd2359, %rd2353;
xor.b64 %rd2361, %rd2360, %rd2738;
mul.lo.s64 %rd2362, %rd2361, %rd2727;
shr.u64 %rd2363, %rd2362, 32;
shr.u64 %rd2364, %rd2358, 32;
and.b64 %rd2365, %rd2332, 4294967295;
xor.b64 %rd2366, %rd2365, %rd2364;
xor.b64 %rd2367, %rd2366, %rd2739;
mul.lo.s64 %rd2368, %rd2367, %rd2727;
and.b64 %rd2369, %rd2368, 4294967295;
xor.b64 %rd2370, %rd2369, %rd2363;
xor.b64 %rd2371, %rd2370, %rd2740;
mul.lo.s64 %rd2372, %rd2371, %rd2725;
shr.u64 %rd2373, %rd2372, 32;
shr.u64 %rd2374, %rd2368, 32;
xor.b64 %rd2375, %rd2342, %rd2374;
xor.b64 %rd2376, %rd2375, %rd2741;
mul.lo.s64 %rd2377, %rd2376, %rd2725;
xor.b64 %rd2378, %rd2373, %rd2377;
cvt.u32.u64 %r265, %rd2378;
xor.b32 %r266, %r348, %r265;
mul.lo.s32 %r267, %r266, %r349;
shr.u32 %r268, %r267, 9;
cvt.rn.f32.u32 %f250, %r268;
mul.rn.f32 %f251, %f250, 0f34000000;
cvt.rn.f16.f32 %h136, %f251;
mov.b16 %h137, 0x2E66;
setp.ge.f16 %p64, %h136, %h137;
ld.global.nc.b16 %h138, [%rd45+1794];
ld.global.nc.f32 %f252, [%rd46+3588];
cvt.rn.f16.f32 %h139, %f252;
add.rn.f16 %h140, %h138, %h139;
mov.b16 %h141, 0x3C72;
mul.rn.f16 %h142, %h140, %h141;
selp.b16 %h143, %h142, 0x0000, %p64;
cvt.f32.f16 %f253, %h143;
ld.global.nc.b16 %h144, [%rd47+1794];
cvt.f32.f16 %f254, %h144;
ld.global.nc.f32 %f255, [%rd48+3588];
mul.rn.f32 %f256, %f1, %f255;
mul.rn.f32 %f257, %f256, %f254;
ld.global.nc.f32 %f258, [%rd49+3588];
mul.rn.f32 %f259, %f2, %f256;
sub.rn.f32 %f260, %f258, %f259;
add.rn.f32 %f261, %f257, %f260;
add.rn.f32 %f262, %f261, %f253;
sub.rn.f32 %f263, %f262, %f3;
mul.rn.f32 %f264, %f263, %f263;
add.rn.f32 %f265, %f18, %f264;
and.b32 %r46, %r1, 31;
shfl.sync.down.b32 %f266, %f265, 16, 31, -1;
add.rn.f32 %f267, %f266, %f265;
shfl.sync.down.b32 %f268, %f267, 8, 31, -1;
add.rn.f32 %f269, %f268, %f267;
shfl.sync.down.b32 %f270, %f269, 4, 31, -1;
add.rn.f32 %f271, %f270, %f269;
shfl.sync.down.b32 %f272, %f271, 2, 31, -1;
add.rn.f32 %f273, %f272, %f271;
shfl.sync.down.b32 %f274, %f273, 1, 31, -1;
shr.u32 %r47, %r1, 5;
setp.ne.s32 %p65, %r46, 0;
mov.u64 %rd2381, shared_cache_013;
@%p65 bra LBB55_2;
mul.wide.u32 %rd2380, %r47, 4;
add.s64 %rd462, %rd2381, %rd2380;
add.rn.f32 %f19, %f274, %f273;
st.shared.f32 [%rd462], %f19;
LBB55_2:
bar.sync 0;
setp.eq.s32 %p66, %r47, 0;
@%p66 bra LBB55_52;
bra.uni LBB55_3;
LBB55_52:
add.u64 %rd474, %SP, 0;
add.u64 %rd11, %SPL, 0;
mul.wide.u32 %rd2382, %r46, 4;
add.s64 %rd463, %rd2381, %rd2382;
cvta.shared.u64 %rd2384, %rd463;
mov.u32 %r269, 0;
st.local.u32 [%rd11], %r269;
setp.lt.u32 %p67, %r1, 2;
selp.b64 %rd2386, %rd2384, %rd474, %p67;
ld.f32 %f275, [%rd2386];
shfl.sync.down.b32 %f276, %f275, 16, 31, -1;
add.rn.f32 %f277, %f275, %f276;
shfl.sync.down.b32 %f278, %f277, 8, 31, -1;
add.rn.f32 %f279, %f277, %f278;
shfl.sync.down.b32 %f280, %f279, 4, 31, -1;
add.rn.f32 %f281, %f279, %f280;
shfl.sync.down.b32 %f282, %f281, 2, 31, -1;
add.rn.f32 %f283, %f281, %f282;
shfl.sync.down.b32 %f284, %f283, 1, 31, -1;
add.rn.f32 %f285, %f283, %f284;
st.f32 [%rd2386], %f285;
setp.ne.s32 %p68, %r1, 0;
@%p68 bra LBB55_3;
ld.param.u64 %rd470, [fusion_2212_param_3];
cvt.u64.u32 %rd44, %r2;
cvta.to.global.u64 %rd7, %rd470;
shl.b64 %rd2379, %rd44, 2;
add.s64 %rd461, %rd7, %rd2379;
ld.shared.f32 %f286, [%rd463];
atom.global.add.f32 %f287, [%rd461], %f286;
LBB55_3:
ret;
}
// .globl fusion_2209
// fusion_2209: elementwise kernel, 256 threads per CTA, four consecutive f16 outputs per thread.
// Element index = (%ctaid.x << 10) | (%tid.x << 2). Values fetched per CTA are indexed by
// %ctaid.x; values fetched per column are indexed by (%tid.x << 2) + lane, lane = 0..3.
// Pointer roles, as they are used in the body:
//   param_0          : f16 output, written once with a v4.b16 store
//   param_1          : f16 input; added to param_11 (rounded to f16) and scaled before the select
//   param_2          : f16 input; fed to the first scale-and-shift stage
//   param_3          : 16 bytes read as two u64 words; starting value of the integer mixing chain
//   param_4..param_7 : f32, indexed by %ctaid.x
//   param_8..param_12: f32, indexed by column
//   param_13         : never read in this kernel
// NOTE(review): the arithmetic looks like dropout (keep when u >= ~0.1, scale by ~1/0.9) followed
// by two chained layer-norm style normalisations over rows of 1024 - confirm against the HLO.
.visible .entry fusion_2209(
.param .u64 fusion_2209_param_0,
.param .u64 fusion_2209_param_1,
.param .u64 fusion_2209_param_2,
.param .u64 fusion_2209_param_3,
.param .u64 fusion_2209_param_4,
.param .u64 fusion_2209_param_5,
.param .u64 fusion_2209_param_6,
.param .u64 fusion_2209_param_7,
.param .u64 fusion_2209_param_8,
.param .u64 fusion_2209_param_9,
.param .u64 fusion_2209_param_10,
.param .u64 fusion_2209_param_11,
.param .u64 fusion_2209_param_12,
.param .u64 fusion_2209_param_13
)
.reqntid 256, 1, 1
{
// Virtual register pools (%hh holds two packed f16 values in one 32-bit register).
.reg .pred %p<6>;
.reg .b16 %h<39>;
.reg .b32 %hh<5>;
.reg .f32 %f<97>;
.reg .b32 %r<31>;
.reg .b64 %rd<162>;
// Fetch the pointer arguments and convert each generic address to a global-space address.
// Resulting bases: %rd26=param_0, %rd25=param_1, %rd24=param_2, %rd23=param_3, %rd22=param_4,
// %rd21=param_5, %rd20=param_6, %rd18=param_7, %rd15=param_8, %rd12=param_9, %rd9=param_10,
// %rd6=param_11, %rd3=param_12.
ld.param.u64 %rd1, [fusion_2209_param_0];
ld.param.u64 %rd2, [fusion_2209_param_12];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2209_param_1];
ld.param.u64 %rd5, [fusion_2209_param_11];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2209_param_2];
ld.param.u64 %rd8, [fusion_2209_param_10];
cvta.to.global.u64 %rd9, %rd8;
ld.param.u64 %rd10, [fusion_2209_param_3];
ld.param.u64 %rd11, [fusion_2209_param_9];
cvta.to.global.u64 %rd12, %rd11;
ld.param.u64 %rd13, [fusion_2209_param_4];
ld.param.u64 %rd14, [fusion_2209_param_8];
cvta.to.global.u64 %rd15, %rd14;
ld.param.u64 %rd16, [fusion_2209_param_5];
ld.param.u64 %rd17, [fusion_2209_param_7];
cvta.to.global.u64 %rd18, %rd17;
ld.param.u64 %rd19, [fusion_2209_param_6];
cvta.to.global.u64 %rd20, %rd19;
cvta.to.global.u64 %rd21, %rd16;
cvta.to.global.u64 %rd22, %rd13;
cvta.to.global.u64 %rd23, %rd10;
cvta.to.global.u64 %rd24, %rd7;
cvta.to.global.u64 %rd25, %rd4;
cvta.to.global.u64 %rd26, %rd1;
// Indexing: %r4 = column of lane 0, %r5 = linear element index, %r6/%r7 = columns of lanes 1/2,
// %r8 = %r5 / 4 = one distinct counter value per thread.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
shr.u32 %r8, %r5, 2;
// Read the 128-bit value behind param_3 and add the per-thread counter to its low word,
// propagating the carry into the high word (%p1 detects the unsigned wrap-around).
ld.global.nc.v2.u64 {%rd27, %rd28}, [%rd23];
cvt.u64.u32 %rd29, %r8;
add.s64 %rd30, %rd27, %rd29;
setp.lt.u64 %p1, %rd30, %rd27;
// Integer mixing chain: repeated 32x32->64-bit multiplies by 3528531795 (0xD2511F53) and
// 3449720151 (0xCD9E8D57), splitting each product into high/low halves and xoring in a
// different constant per step.
// NOTE(review): these multipliers and the first xor constants (0x9E3779B9, 0xBB67AE85) match
// the published Philox-4x32 round/key constants - presumably an inlined Philox generator; confirm.
and.b64 %rd31, %rd30, 4294967295;
mul.lo.s64 %rd32, %rd31, 3528531795;
selp.u64 %rd33, 1, 0, %p1;
add.s64 %rd34, %rd28, %rd33;
xor.b64 %rd35, %rd34, %rd32;
shr.u64 %rd36, %rd35, 32;
mul.lo.s64 %rd37, %rd36, 3449720151;
shr.u64 %rd38, %rd37, 32;
and.b64 %rd39, %rd34, 4294967295;
mul.lo.s64 %rd40, %rd39, 3449720151;
and.b64 %rd41, %rd40, 4294967295;
xor.b64 %rd42, %rd41, %rd38;
xor.b64 %rd43, %rd42, 2654435769;
mul.lo.s64 %rd44, %rd43, 3528531795;
shr.u64 %rd45, %rd44, 32;
xor.b64 %rd46, %rd40, %rd30;
shr.u64 %rd47, %rd46, 32;
mul.lo.s64 %rd48, %rd47, 3528531795;
and.b64 %rd49, %rd48, 4294967295;
xor.b64 %rd50, %rd49, %rd45;
xor.b64 %rd51, %rd50, 1993301258;
mul.lo.s64 %rd52, %rd51, 3449720151;
shr.u64 %rd53, %rd52, 32;
shr.u64 %rd54, %rd48, 32;
and.b64 %rd55, %rd32, 4294967295;
xor.b64 %rd56, %rd55, %rd54;
xor.b64 %rd57, %rd56, 3144134277;
mul.lo.s64 %rd58, %rd57, 3449720151;
and.b64 %rd59, %rd58, 4294967295;
xor.b64 %rd60, %rd59, %rd53;
xor.b64 %rd61, %rd60, 3668340011;
mul.lo.s64 %rd62, %rd61, 3528531795;
shr.u64 %rd63, %rd62, 32;
shr.u64 %rd64, %rd58, 32;
and.b64 %rd65, %rd37, 4294967295;
xor.b64 %rd66, %rd65, %rd64;
xor.b64 %rd67, %rd66, 1013904242;
mul.lo.s64 %rd68, %rd67, 3528531795;
and.b64 %rd69, %rd68, 4294967295;
xor.b64 %rd70, %rd69, %rd63;
xor.b64 %rd71, %rd70, 3986602516;
mul.lo.s64 %rd72, %rd71, 3449720151;
shr.u64 %rd73, %rd72, 32;
shr.u64 %rd74, %rd68, 32;
and.b64 %rd75, %rd44, 4294967295;
xor.b64 %rd76, %rd75, %rd74;
xor.b64 %rd77, %rd76, 842468239;
mul.lo.s64 %rd78, %rd77, 3449720151;
and.b64 %rd79, %rd78, 4294967295;
xor.b64 %rd80, %rd79, %rd73;
xor.b64 %rd81, %rd80, 387276957;
mul.lo.s64 %rd82, %rd81, 3528531795;
shr.u64 %rd83, %rd82, 32;
shr.u64 %rd84, %rd78, 32;
and.b64 %rd85, %rd52, 4294967295;
xor.b64 %rd86, %rd85, %rd84;
xor.b64 %rd87, %rd86, 2027808484;
mul.lo.s64 %rd88, %rd87, 3528531795;
and.b64 %rd89, %rd88, 4294967295;
shr.u64 %rd90, %rd88, 32;
and.b64 %rd91, %rd62, 4294967295;
xor.b64 %rd92, %rd91, %rd90;
xor.b64 %rd93, %rd92, 2835769497;
mul.lo.s64 %rd94, %rd93, 3449720151;
and.b64 %rd95, %rd94, 4294967295;
shr.u64 %rd96, %rd94, 32;
and.b64 %rd97, %rd72, 4294967295;
xor.b64 %rd98, %rd97, %rd96;
xor.b64 %rd99, %rd98, 3041712726;
mul.lo.s64 %rd100, %rd99, 3528531795;
and.b64 %rd101, %rd100, 4294967295;
xor.b64 %rd102, %rd89, %rd83;
xor.b64 %rd103, %rd102, 1684936478;
mul.lo.s64 %rd104, %rd103, 3449720151;
shr.u64 %rd105, %rd104, 32;
xor.b64 %rd106, %rd95, %rd105;
xor.b64 %rd107, %rd106, 1401181199;
mul.lo.s64 %rd108, %rd107, 3528531795;
shr.u64 %rd109, %rd108, 32;
xor.b64 %rd110, %rd101, %rd109;
xor.b64 %rd111, %rd110, 3678237736;
mul.lo.s64 %rd112, %rd111, 3449720151;
shr.u64 %rd113, %rd112, 32;
// ---- Lane 0 ----
// Final 32-bit word for lane 0, finished in 32-bit arithmetic (the negative multiplier is the
// same 0xCD9E8D57 bit pattern written as a signed immediate).
cvt.u32.u64 %r9, %rd113;
shr.u64 %rd114, %rd100, 32;
xor.b64 %rd115, %rd114, %rd82;
cvt.u32.u64 %r10, %rd115;
xor.b32 %r11, %r10, 534103459;
mul.lo.s32 %r12, %r11, -845247145;
xor.b32 %r13, %r12, %r9;
// Keep the top 23 bits and scale by 0f34000000 (2^-23) to get a value in [0, 1), then round to f16.
shr.u32 %r14, %r13, 9;
xor.b32 %r15, %r14, 4716963;
cvt.rn.f32.u32 %f1, %r15;
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h1, %f2;
// %h2 = 0x2E66, about 0.09998 in f16; %p2 is true when the random value is >= that threshold.
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p2, %h1, %h2;
// Load four f16 values from param_1 at the element index (%rd116 = byte offset for f16 arrays).
// The pack/unpack through %hh leaves %h7..%h10 equal to the four loaded values in order.
mul.wide.u32 %rd116, %r5, 2;
add.s64 %rd117, %rd25, %rd116;
ld.global.nc.v4.b16 {%h3, %h4, %h5, %h6}, [%rd117];
mov.b32 %hh1, {%h5, %h6};
mov.b32 %hh2, {%h3, %h4};
mov.b32 {%h7, %h8}, %hh2;
mov.b32 {%h9, %h10}, %hh1;
// %rd118 = byte offset of lane 0's column in the f32 per-column arrays.
mul.wide.u32 %rd118, %r4, 4;
add.s64 %rd119, %rd6, %rd118;
ld.global.nc.f32 %f3, [%rd119];
cvt.rn.f16.f32 %h11, %f3;
// f16 sum of param_1 and param_11, multiplied by %h13 = 0x3C72 (about 1.1113 in f16).
add.rn.f16 %h12, %h7, %h11;
mov.b16 %h13, 0x3C72;
mul.rn.f16 %h14, %h12, %h13;
cvt.f32.f16 %f4, %h14;
// Keep the scaled value when %p2 holds, otherwise contribute 0.
selp.f32 %f5, %f4, 0f00000000, %p2;
// Load four f16 values from param_2 at the element index; %h19..%h22 are lanes 0..3.
add.s64 %rd120, %rd24, %rd116;
ld.global.nc.v4.b16 {%h15, %h16, %h17, %h18}, [%rd120];
mov.b32 %hh3, {%h17, %h18};
mov.b32 %hh4, {%h15, %h16};
mov.b32 {%h19, %h20}, %hh4;
mov.b32 {%h21, %h22}, %hh3;
cvt.f32.f16 %f6, %h19;
// First stage, per-CTA part (%rd121 = byte offset of this CTA in the per-CTA f32 arrays):
//   %f10 = rsqrt(param_6[cta] * 2^-10 + 0f2B8CBCCC)   (0f3A800000 is 2^-10; the addend is ~1e-12)
//   %f16 = param_7[cta] * 2^-10
// Both are reused unchanged by lanes 1..3.
mul.wide.u32 %rd121, %r1, 4;
add.s64 %rd122, %rd20, %rd121;
ld.global.nc.f32 %f7, [%rd122];
mul.rn.f32 %f8, %f7, 0f3A800000;
add.rn.f32 %f9, %f8, 0f2B8CBCCC;
rsqrt.approx.f32 %f10, %f9;
// First stage, lane 0: %f20 = s*x + (param_9[col] - s*%f16) + selected term, s = %f10 * param_10[col].
add.s64 %rd123, %rd9, %rd118;
ld.global.nc.f32 %f11, [%rd123];
mul.rn.f32 %f12, %f10, %f11;
mul.rn.f32 %f13, %f12, %f6;
add.s64 %rd124, %rd12, %rd118;
ld.global.nc.f32 %f14, [%rd124];
add.s64 %rd125, %rd18, %rd121;
ld.global.nc.f32 %f15, [%rd125];
mul.rn.f32 %f16, %f15, 0f3A800000;
mul.rn.f32 %f17, %f12, %f16;
sub.rn.f32 %f18, %f14, %f17;
add.rn.f32 %f19, %f13, %f18;
add.rn.f32 %f20, %f19, %f5;
// Second stage, per-CTA part:
//   %f24 = rsqrt(param_4[cta] * 2^-10 + 0f2B8CBCCC),  %f30 = param_5[cta] * 2^-10
// Both are reused unchanged by lanes 1..3.
add.s64 %rd126, %rd22, %rd121;
ld.global.nc.f32 %f21, [%rd126];
mul.rn.f32 %f22, %f21, 0f3A800000;
add.rn.f32 %f23, %f22, 0f2B8CBCCC;
rsqrt.approx.f32 %f24, %f23;
// Second stage, lane 0: result = (param_12[col] - t*%f30) + t*%f20, t = %f24 * param_8[col].
add.s64 %rd127, %rd15, %rd118;
ld.global.nc.f32 %f25, [%rd127];
mul.rn.f32 %f26, %f24, %f25;
mul.rn.f32 %f27, %f26, %f20;
add.s64 %rd128, %rd3, %rd118;
ld.global.nc.f32 %f28, [%rd128];
add.s64 %rd129, %rd21, %rd121;
ld.global.nc.f32 %f29, [%rd129];
mul.rn.f32 %f30, %f29, 0f3A800000;
mul.rn.f32 %f31, %f26, %f30;
sub.rn.f32 %f32, %f28, %f31;
add.rn.f32 %f33, %f32, %f27;
cvt.rn.f16.f32 %h23, %f33;
// Output address for this thread's four f16 results.
add.s64 %rd130, %rd26, %rd116;
// ---- Lane 1 ----
// Second random word, derived from intermediates of the chain above; threshold test as in lane 0.
xor.b64 %rd131, %rd72, %rd96;
xor.b64 %rd132, %rd131, 3041712726;
mul.lo.s64 %rd133, %rd132, 3528531795;
xor.b64 %rd134, %rd109, %rd133;
cvt.u32.u64 %r16, %rd134;
xor.b32 %r17, %r16, -616729560;
mul.lo.s32 %r18, %r17, -845247145;
shr.u32 %r19, %r18, 9;
cvt.rn.f32.u32 %f34, %r19;
mul.rn.f32 %f35, %f34, 0f34000000;
cvt.rn.f16.f32 %h24, %f35;
setp.ge.f16 %p3, %h24, %h2;
// Same arithmetic as lane 0 using column %r6 (%rd135 = its byte offset) and inputs %h8 / %h20.
mul.wide.u32 %rd135, %r6, 4;
add.s64 %rd136, %rd6, %rd135;
ld.global.nc.f32 %f36, [%rd136];
cvt.rn.f16.f32 %h25, %f36;
add.rn.f16 %h26, %h8, %h25;
mul.rn.f16 %h27, %h26, %h13;
cvt.f32.f16 %f37, %h27;
selp.f32 %f38, %f37, 0f00000000, %p3;
cvt.f32.f16 %f39, %h20;
add.s64 %rd137, %rd9, %rd135;
ld.global.nc.f32 %f40, [%rd137];
mul.rn.f32 %f41, %f10, %f40;
mul.rn.f32 %f42, %f41, %f39;
add.s64 %rd138, %rd12, %rd135;
ld.global.nc.f32 %f43, [%rd138];
mul.rn.f32 %f44, %f16, %f41;
sub.rn.f32 %f45, %f43, %f44;
add.rn.f32 %f46, %f42, %f45;
add.rn.f32 %f47, %f46, %f38;
add.s64 %rd139, %rd15, %rd135;
ld.global.nc.f32 %f48, [%rd139];
mul.rn.f32 %f49, %f24, %f48;
mul.rn.f32 %f50, %f49, %f47;
add.s64 %rd140, %rd3, %rd135;
ld.global.nc.f32 %f51, [%rd140];
mul.rn.f32 %f52, %f30, %f49;
sub.rn.f32 %f53, %f51, %f52;
add.rn.f32 %f54, %f53, %f50;
cvt.rn.f16.f32 %h28, %f54;
// ---- Lane 2 ----
// Third random word (multiplier -766435501 is the 0xD2511F53 bit pattern as a signed immediate).
and.b64 %rd141, %rd104, 4294967295;
and.b64 %rd142, %rd82, 4294967295;
xor.b64 %rd143, %rd142, %rd114;
xor.b64 %rd144, %rd143, 534103459;
mul.lo.s64 %rd145, %rd144, 3449720151;
shr.u64 %rd146, %rd145, 32;
xor.b64 %rd147, %rd141, %rd146;
xor.b64 %rd148, %rd147, 4055616968;
mul.lo.s64 %rd149, %rd148, 3528531795;
shr.u64 %rd150, %rd149, 32;
cvt.u32.u64 %r20, %rd150;
xor.b64 %rd151, %rd105, %rd94;
cvt.u32.u64 %r21, %rd151;
xor.b32 %r22, %r21, 1401181199;
mul.lo.s32 %r23, %r22, -766435501;
xor.b32 %r24, %r23, %r20;
shr.u32 %r25, %r24, 9;
xor.b32 %r26, %r25, 4936337;
cvt.rn.f32.u32 %f55, %r26;
mul.rn.f32 %f56, %f55, 0f34000000;
cvt.rn.f16.f32 %h29, %f56;
setp.ge.f16 %p4, %h29, %h2;
// Same arithmetic as lane 0 using column %r7 (%rd152 = its byte offset) and inputs %h9 / %h21.
mul.wide.u32 %rd152, %r7, 4;
add.s64 %rd153, %rd6, %rd152;
ld.global.nc.f32 %f57, [%rd153];
cvt.rn.f16.f32 %h30, %f57;
add.rn.f16 %h31, %h9, %h30;
mul.rn.f16 %h32, %h31, %h13;
cvt.f32.f16 %f58, %h32;
selp.f32 %f59, %f58, 0f00000000, %p4;
cvt.f32.f16 %f60, %h21;
add.s64 %rd154, %rd9, %rd152;
ld.global.nc.f32 %f61, [%rd154];
mul.rn.f32 %f62, %f10, %f61;
mul.rn.f32 %f63, %f62, %f60;
add.s64 %rd155, %rd12, %rd152;
ld.global.nc.f32 %f64, [%rd155];
mul.rn.f32 %f65, %f16, %f62;
sub.rn.f32 %f66, %f64, %f65;
add.rn.f32 %f67, %f63, %f66;
add.rn.f32 %f68, %f67, %f59;
add.s64 %rd156, %rd15, %rd152;
ld.global.nc.f32 %f69, [%rd156];
mul.rn.f32 %f70, %f24, %f69;
mul.rn.f32 %f71, %f70, %f68;
add.s64 %rd157, %rd3, %rd152;
ld.global.nc.f32 %f72, [%rd157];
mul.rn.f32 %f73, %f30, %f70;
sub.rn.f32 %f74, %f72, %f73;
add.rn.f32 %f75, %f74, %f71;
cvt.rn.f16.f32 %h33, %f75;
// ---- Lane 3 ----
// Fourth random word and threshold test.
xor.b64 %rd158, %rd83, %rd88;
xor.b64 %rd159, %rd158, 1684936478;
mul.lo.s64 %rd160, %rd159, 3449720151;
xor.b64 %rd161, %rd146, %rd160;
cvt.u32.u64 %r27, %rd161;
xor.b32 %r28, %r27, -239350328;
mul.lo.s32 %r29, %r28, -766435501;
shr.u32 %r30, %r29, 9;
cvt.rn.f32.u32 %f76, %r30;
mul.rn.f32 %f77, %f76, 0f34000000;
cvt.rn.f16.f32 %h34, %f77;
setp.ge.f16 %p5, %h34, %h2;
// Same arithmetic as lane 0; the column is addressed as lane 0's pointers plus 12 bytes
// (three f32 elements further), with inputs %h10 / %h22.
ld.global.nc.f32 %f78, [%rd119+12];
cvt.rn.f16.f32 %h35, %f78;
add.rn.f16 %h36, %h10, %h35;
mul.rn.f16 %h37, %h36, %h13;
cvt.f32.f16 %f79, %h37;
selp.f32 %f80, %f79, 0f00000000, %p5;
cvt.f32.f16 %f81, %h22;
ld.global.nc.f32 %f82, [%rd123+12];
mul.rn.f32 %f83, %f10, %f82;
mul.rn.f32 %f84, %f83, %f81;
ld.global.nc.f32 %f85, [%rd124+12];
mul.rn.f32 %f86, %f16, %f83;
sub.rn.f32 %f87, %f85, %f86;
add.rn.f32 %f88, %f84, %f87;
add.rn.f32 %f89, %f88, %f80;
ld.global.nc.f32 %f90, [%rd127+12];
mul.rn.f32 %f91, %f24, %f90;
mul.rn.f32 %f92, %f91, %f89;
ld.global.nc.f32 %f93, [%rd128+12];
mul.rn.f32 %f94, %f30, %f91;
sub.rn.f32 %f95, %f93, %f94;
add.rn.f32 %f96, %f95, %f92;
cvt.rn.f16.f32 %h38, %f96;
// Write the four f16 results for lanes 0..3 with a single 8-byte store.
st.global.v4.b16 [%rd130], {%h23, %h28, %h33, %h38};
ret;
}
// .globl fusion_2702
// fusion_2702: narrows f32 data to f16, four consecutive elements per thread.
//   param_0 : f32 source, read with one 16-byte non-coherent vector load
//   param_1 : f16 destination, written with one 8-byte vector store
//   param_2 : not read by this kernel
// Element index handled by a thread: (%ctaid.x << 10) | (%tid.x << 2).
.visible .entry fusion_2702(
.param .u64 fusion_2702_param_0,
.param .u64 fusion_2702_param_1,
.param .u64 fusion_2702_param_2
)
.reqntid 256, 1, 1
{
.reg .f32 %wide<4>;
.reg .b16 %narrow<4>;
.reg .b32 %cta_idx, %thr_idx, %cta_first, %thr_first, %elem;
.reg .b64 %src_gen, %src_base, %src_off, %src_ptr;
.reg .b64 %dst_gen, %dst_base, %dst_off, %dst_ptr;

// First element owned by this thread: 1024 elements per CTA, 4 per thread.
mov.u32 %thr_idx, %tid.x;
mov.u32 %cta_idx, %ctaid.x;
shl.b32 %thr_first, %thr_idx, 2;
shl.b32 %cta_first, %cta_idx, 10;
or.b32 %elem, %thr_first, %cta_first;

// Source: 4 bytes per element.
ld.param.u64 %src_gen, [fusion_2702_param_0];
cvta.to.global.u64 %src_base, %src_gen;
mul.wide.u32 %src_off, %elem, 4;
add.s64 %src_ptr, %src_base, %src_off;
ld.global.nc.v4.f32 {%wide0, %wide1, %wide2, %wide3}, [%src_ptr];

// Round each value to f16 (round-to-nearest-even).
cvt.rn.f16.f32 %narrow0, %wide0;
cvt.rn.f16.f32 %narrow1, %wide1;
cvt.rn.f16.f32 %narrow2, %wide2;
cvt.rn.f16.f32 %narrow3, %wide3;

// Destination: 2 bytes per element.
ld.param.u64 %dst_gen, [fusion_2702_param_1];
cvta.to.global.u64 %dst_base, %dst_gen;
mul.wide.u32 %dst_off, %elem, 2;
add.s64 %dst_ptr, %dst_base, %dst_off;
st.global.v4.b16 [%dst_ptr], {%narrow0, %narrow1, %narrow2, %narrow3};
ret;
}
// .globl fusion_2205
// fusion_2205: gathers f16 values from param_1, adds an f32 value from param_2 (rounded to f16
// first), and writes four consecutive f16 sums per thread to param_0. param_3 is not read.
// With e = (%ctaid.x << 10) | (%tid.x << 2) and g = %ctaid.x >> 5, lane j (0..3) computes
//   c   = ((%tid.x << 2) | j) & (60 + j)
//   out[e + j] = lhs[bits 6..14 of e][(g << 6) | c] + f16(rhs[g * 64 + c])
// where a row of lhs is 2048 bytes and each group of rhs is 256 bytes.
.visible .entry fusion_2205(
.param .u64 fusion_2205_param_0,
.param .u64 fusion_2205_param_1,
.param .u64 fusion_2205_param_2,
.param .u64 fusion_2205_param_3
)
.reqntid 256, 1, 1
{
.reg .b16 %lhs<4>;
.reg .b16 %rhs<4>;
.reg .b16 %sum<4>;
.reg .f32 %addend<4>;
.reg .b32 %vec<4>;
.reg .b32 %col<4>;
.reg .b32 %pos<4>;
.reg .b32 %cta_idx, %thr_idx, %cta_first, %thr_first, %elem, %row, %grp, %grp_first;
.reg .b64 %lhs_off<4>;
.reg .b64 %lhs_ptr<4>;
.reg .b64 %rhs_off<4>;
.reg .b64 %rhs_ptr<4>;
.reg .b64 %out_gen, %out_base, %out_off, %out_ptr;
.reg .b64 %lhs_gen, %lhs_base, %row_off, %lhs_row;
.reg .b64 %rhs_gen, %rhs_base, %grp_off, %rhs_grp;

// Linear element index and the pieces derived from it.
mov.u32 %cta_idx, %ctaid.x;
mov.u32 %thr_idx, %tid.x;
shl.b32 %cta_first, %cta_idx, 10;
shl.b32 %thr_first, %thr_idx, 2;
or.b32 %elem, %thr_first, %cta_first;
bfe.u32 %row, %elem, 6, 9;
shr.u32 %grp, %cta_idx, 5;
shl.b32 %grp_first, %grp, 6;

// Column inside the 64-wide group for each of the four lanes.
and.b32 %col0, %thr_first, 60;
or.b32 %vec1, %thr_first, 1;
and.b32 %col1, %vec1, 61;
or.b32 %vec2, %thr_first, 2;
and.b32 %col2, %vec2, 62;
or.b32 %vec3, %thr_first, 3;
and.b32 %col3, %vec3, 63;

// Base addresses shared by all lanes.
ld.param.u64 %out_gen, [fusion_2205_param_0];
cvta.to.global.u64 %out_base, %out_gen;
mul.wide.u32 %out_off, %elem, 2;
add.s64 %out_ptr, %out_base, %out_off;
ld.param.u64 %lhs_gen, [fusion_2205_param_1];
cvta.to.global.u64 %lhs_base, %lhs_gen;
mul.wide.u32 %row_off, %row, 2048;
add.s64 %lhs_row, %lhs_base, %row_off;
ld.param.u64 %rhs_gen, [fusion_2205_param_2];
cvta.to.global.u64 %rhs_base, %rhs_gen;
mul.wide.u32 %grp_off, %grp, 256;
add.s64 %rhs_grp, %rhs_base, %grp_off;

// Lane 0.
or.b32 %pos0, %col0, %grp_first;
mul.wide.u32 %lhs_off0, %pos0, 2;
add.s64 %lhs_ptr0, %lhs_row, %lhs_off0;
ld.global.nc.b16 %lhs0, [%lhs_ptr0];
mul.wide.u32 %rhs_off0, %col0, 4;
add.s64 %rhs_ptr0, %rhs_grp, %rhs_off0;
ld.global.nc.f32 %addend0, [%rhs_ptr0];
cvt.rn.f16.f32 %rhs0, %addend0;
add.rn.f16 %sum0, %lhs0, %rhs0;

// Lane 1.
or.b32 %pos1, %col1, %grp_first;
mul.wide.u32 %lhs_off1, %pos1, 2;
add.s64 %lhs_ptr1, %lhs_row, %lhs_off1;
ld.global.nc.b16 %lhs1, [%lhs_ptr1];
mul.wide.u32 %rhs_off1, %col1, 4;
add.s64 %rhs_ptr1, %rhs_grp, %rhs_off1;
ld.global.nc.f32 %addend1, [%rhs_ptr1];
cvt.rn.f16.f32 %rhs1, %addend1;
add.rn.f16 %sum1, %lhs1, %rhs1;

// Lane 2.
or.b32 %pos2, %col2, %grp_first;
mul.wide.u32 %lhs_off2, %pos2, 2;
add.s64 %lhs_ptr2, %lhs_row, %lhs_off2;
ld.global.nc.b16 %lhs2, [%lhs_ptr2];
mul.wide.u32 %rhs_off2, %col2, 4;
add.s64 %rhs_ptr2, %rhs_grp, %rhs_off2;
ld.global.nc.f32 %addend2, [%rhs_ptr2];
cvt.rn.f16.f32 %rhs2, %addend2;
add.rn.f16 %sum2, %lhs2, %rhs2;

// Lane 3.
or.b32 %pos3, %col3, %grp_first;
mul.wide.u32 %lhs_off3, %pos3, 2;
add.s64 %lhs_ptr3, %lhs_row, %lhs_off3;
ld.global.nc.b16 %lhs3, [%lhs_ptr3];
mul.wide.u32 %rhs_off3, %col3, 4;
add.s64 %rhs_ptr3, %rhs_grp, %rhs_off3;
ld.global.nc.f32 %addend3, [%rhs_ptr3];
cvt.rn.f16.f32 %rhs3, %addend3;
add.rn.f16 %sum3, %lhs3, %rhs3;

// One 8-byte store for the four sums.
st.global.v4.b16 [%out_ptr], {%sum0, %sum1, %sum2, %sum3};
ret;
}
// .globl fusion_2703
// fusion_2703: narrows f32 data to f16, four consecutive elements per thread.
//   param_0 : f32 source, read with one 16-byte non-coherent vector load
//   param_1 : f16 destination, written with one 8-byte vector store
//   param_2 : not read by this kernel
// Element index handled by a thread: (%ctaid.x << 10) | (%tid.x << 2).
.visible .entry fusion_2703(
.param .u64 fusion_2703_param_0,
.param .u64 fusion_2703_param_1,
.param .u64 fusion_2703_param_2
)
.reqntid 256, 1, 1
{
.reg .f32 %wide<4>;
.reg .b16 %narrow<4>;
.reg .b32 %cta_idx, %thr_idx, %cta_first, %thr_first, %elem;
.reg .b64 %src_gen, %src_base, %src_off, %src_ptr;
.reg .b64 %dst_gen, %dst_base, %dst_off, %dst_ptr;

// First element owned by this thread: 1024 elements per CTA, 4 per thread.
mov.u32 %thr_idx, %tid.x;
mov.u32 %cta_idx, %ctaid.x;
shl.b32 %thr_first, %thr_idx, 2;
shl.b32 %cta_first, %cta_idx, 10;
or.b32 %elem, %thr_first, %cta_first;

// Source: 4 bytes per element.
ld.param.u64 %src_gen, [fusion_2703_param_0];
cvta.to.global.u64 %src_base, %src_gen;
mul.wide.u32 %src_off, %elem, 4;
add.s64 %src_ptr, %src_base, %src_off;
ld.global.nc.v4.f32 {%wide0, %wide1, %wide2, %wide3}, [%src_ptr];

// Round each value to f16 (round-to-nearest-even).
cvt.rn.f16.f32 %narrow0, %wide0;
cvt.rn.f16.f32 %narrow1, %wide1;
cvt.rn.f16.f32 %narrow2, %wide2;
cvt.rn.f16.f32 %narrow3, %wide3;

// Destination: 2 bytes per element.
ld.param.u64 %dst_gen, [fusion_2703_param_1];
cvta.to.global.u64 %dst_base, %dst_gen;
mul.wide.u32 %dst_off, %elem, 2;
add.s64 %dst_ptr, %dst_base, %dst_off;
st.global.v4.b16 [%dst_ptr], {%narrow0, %narrow1, %narrow2, %narrow3};
ret;
}
// .globl fusion_2206
// fusion_2206: gathers f16 values from param_1, adds an f32 value from param_2 (rounded to f16
// first), and writes four consecutive f16 sums per thread to param_0. param_3 is not read.
// With e = (%ctaid.x << 10) | (%tid.x << 2) and g = %ctaid.x >> 5, lane j (0..3) computes
//   c   = ((%tid.x << 2) | j) & (60 + j)
//   out[e + j] = lhs[bits 6..14 of e][(g << 6) | c] + f16(rhs[g * 64 + c])
// where a row of lhs is 2048 bytes and each group of rhs is 256 bytes.
.visible .entry fusion_2206(
.param .u64 fusion_2206_param_0,
.param .u64 fusion_2206_param_1,
.param .u64 fusion_2206_param_2,
.param .u64 fusion_2206_param_3
)
.reqntid 256, 1, 1
{
.reg .b16 %lhs<4>;
.reg .b16 %rhs<4>;
.reg .b16 %sum<4>;
.reg .f32 %addend<4>;
.reg .b32 %vec<4>;
.reg .b32 %col<4>;
.reg .b32 %pos<4>;
.reg .b32 %cta_idx, %thr_idx, %cta_first, %thr_first, %elem, %row, %grp, %grp_first;
.reg .b64 %lhs_off<4>;
.reg .b64 %lhs_ptr<4>;
.reg .b64 %rhs_off<4>;
.reg .b64 %rhs_ptr<4>;
.reg .b64 %out_gen, %out_base, %out_off, %out_ptr;
.reg .b64 %lhs_gen, %lhs_base, %row_off, %lhs_row;
.reg .b64 %rhs_gen, %rhs_base, %grp_off, %rhs_grp;

// Linear element index and the pieces derived from it.
mov.u32 %cta_idx, %ctaid.x;
mov.u32 %thr_idx, %tid.x;
shl.b32 %cta_first, %cta_idx, 10;
shl.b32 %thr_first, %thr_idx, 2;
or.b32 %elem, %thr_first, %cta_first;
bfe.u32 %row, %elem, 6, 9;
shr.u32 %grp, %cta_idx, 5;
shl.b32 %grp_first, %grp, 6;

// Column inside the 64-wide group for each of the four lanes.
and.b32 %col0, %thr_first, 60;
or.b32 %vec1, %thr_first, 1;
and.b32 %col1, %vec1, 61;
or.b32 %vec2, %thr_first, 2;
and.b32 %col2, %vec2, 62;
or.b32 %vec3, %thr_first, 3;
and.b32 %col3, %vec3, 63;

// Base addresses shared by all lanes.
ld.param.u64 %out_gen, [fusion_2206_param_0];
cvta.to.global.u64 %out_base, %out_gen;
mul.wide.u32 %out_off, %elem, 2;
add.s64 %out_ptr, %out_base, %out_off;
ld.param.u64 %lhs_gen, [fusion_2206_param_1];
cvta.to.global.u64 %lhs_base, %lhs_gen;
mul.wide.u32 %row_off, %row, 2048;
add.s64 %lhs_row, %lhs_base, %row_off;
ld.param.u64 %rhs_gen, [fusion_2206_param_2];
cvta.to.global.u64 %rhs_base, %rhs_gen;
mul.wide.u32 %grp_off, %grp, 256;
add.s64 %rhs_grp, %rhs_base, %grp_off;

// Lane 0.
or.b32 %pos0, %col0, %grp_first;
mul.wide.u32 %lhs_off0, %pos0, 2;
add.s64 %lhs_ptr0, %lhs_row, %lhs_off0;
ld.global.nc.b16 %lhs0, [%lhs_ptr0];
mul.wide.u32 %rhs_off0, %col0, 4;
add.s64 %rhs_ptr0, %rhs_grp, %rhs_off0;
ld.global.nc.f32 %addend0, [%rhs_ptr0];
cvt.rn.f16.f32 %rhs0, %addend0;
add.rn.f16 %sum0, %lhs0, %rhs0;

// Lane 1.
or.b32 %pos1, %col1, %grp_first;
mul.wide.u32 %lhs_off1, %pos1, 2;
add.s64 %lhs_ptr1, %lhs_row, %lhs_off1;
ld.global.nc.b16 %lhs1, [%lhs_ptr1];
mul.wide.u32 %rhs_off1, %col1, 4;
add.s64 %rhs_ptr1, %rhs_grp, %rhs_off1;
ld.global.nc.f32 %addend1, [%rhs_ptr1];
cvt.rn.f16.f32 %rhs1, %addend1;
add.rn.f16 %sum1, %lhs1, %rhs1;

// Lane 2.
or.b32 %pos2, %col2, %grp_first;
mul.wide.u32 %lhs_off2, %pos2, 2;
add.s64 %lhs_ptr2, %lhs_row, %lhs_off2;
ld.global.nc.b16 %lhs2, [%lhs_ptr2];
mul.wide.u32 %rhs_off2, %col2, 4;
add.s64 %rhs_ptr2, %rhs_grp, %rhs_off2;
ld.global.nc.f32 %addend2, [%rhs_ptr2];
cvt.rn.f16.f32 %rhs2, %addend2;
add.rn.f16 %sum2, %lhs2, %rhs2;

// Lane 3.
or.b32 %pos3, %col3, %grp_first;
mul.wide.u32 %lhs_off3, %pos3, 2;
add.s64 %lhs_ptr3, %lhs_row, %lhs_off3;
ld.global.nc.b16 %lhs3, [%lhs_ptr3];
mul.wide.u32 %rhs_off3, %col3, 4;
add.s64 %rhs_ptr3, %rhs_grp, %rhs_off3;
ld.global.nc.f32 %addend3, [%rhs_ptr3];
cvt.rn.f16.f32 %rhs3, %addend3;
add.rn.f16 %sum3, %lhs3, %rhs3;

// One 8-byte store for the four sums.
st.global.v4.b16 [%out_ptr], {%sum0, %sum1, %sum2, %sum3};
ret;
}
// .globl fusion_2203
// fusion_2203: per-CTA maximum over 512 adjusted f16 values, merged into an f32 slot in global
// memory with an atomic max built from compare-and-swap. One warp (32 threads) per CTA.
//   param_0 : f16 input; CTA c reads elements 512c .. 512c+511
//   param_1 : f32 output; thread 0 updates the word at byte offset
//             (%ctaid.x >> 9) * 2048 + (%ctaid.x & 511) * 4, i.e. 4 * %ctaid.x
//   param_2 : 32-bit integers indexed by position inside the 512-wide row
//   param_3 : never read in this kernel
// Thread t handles the element pairs (2t + 64k, 2t + 64k + 1) for k = 0..7. Each term is
//   x - (1.0 - f16(m)) * 10000     evaluated in f16 (0x3C00 = 1.0, 0x70E2 = 10000), then widened.
// NOTE(review): this looks like an additive attention mask followed by the row max of a softmax
// - confirm against the HLO.
.visible .entry fusion_2203(
.param .u64 fusion_2203_param_0,
.param .u64 fusion_2203_param_1,
.param .u64 fusion_2203_param_2,
.param .u64 fusion_2203_param_3
)
.reqntid 32, 1, 1
{
// 4-byte per-thread local slot, used as the fallback operand in the second reduction stage.
.local .align 4 .b8 __local_depot61[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<4>;
.reg .b16 %h<83>;
.reg .b32 %hh<9>;
.reg .f32 %f<57>;
.reg .b32 %r<37>;
.reg .b64 %rd<37>;
// %SPL = local-space address of the slot, %SP = the same slot as a generic address.
mov.u64 %SPL, __local_depot61;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd4, [fusion_2203_param_0];
ld.param.u64 %rd5, [fusion_2203_param_2];
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd9, %rd4;
add.u64 %rd10, %SP, 0;
add.u64 %rd1, %SPL, 0;
// %r6 = 2 * tid (position of this thread's first element inside the row),
// %r8 = (ctaid << 9) | %r6 (its index in the f16 input).
mov.u32 %r1, %tid.x;
mov.u32 %r5, %ctaid.x;
shl.b32 %r6, %r1, 1;
shl.b32 %r7, %r5, 9;
or.b32 %r8, %r7, %r6;
mul.wide.u32 %rd11, %r8, 2;
add.s64 %rd12, %rd9, %rd11;
// Pair 0: two f16 inputs in one 32-bit load, two integers in one 64-bit load.
ld.global.nc.b32 %hh1, [%rd12];
mov.b32 {%h1, %h2}, %hh1;
mul.wide.u32 %rd13, %r6, 4;
add.s64 %rd14, %rd6, %rd13;
ld.global.nc.v2.u32 {%r9, %r10}, [%rd14];
cvt.rn.f16.s32 %h3, %r9;
mov.b16 %h4, 0x3C00;
sub.rn.f16 %h5, %h4, %h3;
mov.b16 %h6, 0x70E2;
mul.rn.f16 %h7, %h5, %h6;
sub.rn.f16 %h8, %h1, %h7;
cvt.f32.f16 %f2, %h8;
// The running maximum starts from 0fFF800000 (-infinity).
max.f32 %f3, %f2, 0fFF800000;
cvt.rn.f16.s32 %h9, %r10;
sub.rn.f16 %h10, %h4, %h9;
mul.rn.f16 %h11, %h10, %h6;
sub.rn.f16 %h12, %h2, %h11;
cvt.f32.f16 %f4, %h12;
max.f32 %f5, %f3, %f4;
// Pair 1: inputs 128 bytes (64 f16) further; integers at positions %r6|64 and, via the +260
// byte offset from pair 0's pointer, %r6 + 65. Pairs 2..7 repeat this pattern at strides of
// 128 bytes (input) and 256 bytes (integers).
or.b32 %r11, %r6, 64;
ld.global.nc.b32 %hh2, [%rd12+128];
mov.b32 {%h13, %h14}, %hh2;
mul.wide.u32 %rd15, %r11, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.u32 %r12, [%rd16];
cvt.rn.f16.s32 %h15, %r12;
sub.rn.f16 %h16, %h4, %h15;
mul.rn.f16 %h17, %h16, %h6;
sub.rn.f16 %h18, %h13, %h17;
cvt.f32.f16 %f6, %h18;
max.f32 %f7, %f5, %f6;
ld.global.nc.u32 %r13, [%rd14+260];
cvt.rn.f16.s32 %h19, %r13;
sub.rn.f16 %h20, %h4, %h19;
mul.rn.f16 %h21, %h20, %h6;
sub.rn.f16 %h22, %h14, %h21;
cvt.f32.f16 %f8, %h22;
max.f32 %f9, %f7, %f8;
// Pair 2.
or.b32 %r14, %r6, 128;
ld.global.nc.b32 %hh3, [%rd12+256];
mov.b32 {%h23, %h24}, %hh3;
mul.wide.u32 %rd17, %r14, 4;
add.s64 %rd18, %rd6, %rd17;
ld.global.nc.u32 %r15, [%rd18];
cvt.rn.f16.s32 %h25, %r15;
sub.rn.f16 %h26, %h4, %h25;
mul.rn.f16 %h27, %h26, %h6;
sub.rn.f16 %h28, %h23, %h27;
cvt.f32.f16 %f10, %h28;
max.f32 %f11, %f9, %f10;
ld.global.nc.u32 %r16, [%rd14+516];
cvt.rn.f16.s32 %h29, %r16;
sub.rn.f16 %h30, %h4, %h29;
mul.rn.f16 %h31, %h30, %h6;
sub.rn.f16 %h32, %h24, %h31;
cvt.f32.f16 %f12, %h32;
max.f32 %f13, %f11, %f12;
// Pair 3.
or.b32 %r17, %r6, 192;
ld.global.nc.b32 %hh4, [%rd12+384];
mov.b32 {%h33, %h34}, %hh4;
mul.wide.u32 %rd19, %r17, 4;
add.s64 %rd20, %rd6, %rd19;
ld.global.nc.u32 %r18, [%rd20];
cvt.rn.f16.s32 %h35, %r18;
sub.rn.f16 %h36, %h4, %h35;
mul.rn.f16 %h37, %h36, %h6;
sub.rn.f16 %h38, %h33, %h37;
cvt.f32.f16 %f14, %h38;
max.f32 %f15, %f13, %f14;
ld.global.nc.u32 %r19, [%rd14+772];
cvt.rn.f16.s32 %h39, %r19;
sub.rn.f16 %h40, %h4, %h39;
mul.rn.f16 %h41, %h40, %h6;
sub.rn.f16 %h42, %h34, %h41;
cvt.f32.f16 %f16, %h42;
max.f32 %f17, %f15, %f16;
// Pair 4.
or.b32 %r20, %r6, 256;
ld.global.nc.b32 %hh5, [%rd12+512];
mov.b32 {%h43, %h44}, %hh5;
mul.wide.u32 %rd21, %r20, 4;
add.s64 %rd22, %rd6, %rd21;
ld.global.nc.u32 %r21, [%rd22];
cvt.rn.f16.s32 %h45, %r21;
sub.rn.f16 %h46, %h4, %h45;
mul.rn.f16 %h47, %h46, %h6;
sub.rn.f16 %h48, %h43, %h47;
cvt.f32.f16 %f18, %h48;
max.f32 %f19, %f17, %f18;
ld.global.nc.u32 %r22, [%rd14+1028];
cvt.rn.f16.s32 %h49, %r22;
sub.rn.f16 %h50, %h4, %h49;
mul.rn.f16 %h51, %h50, %h6;
sub.rn.f16 %h52, %h44, %h51;
cvt.f32.f16 %f20, %h52;
max.f32 %f21, %f19, %f20;
// Pair 5.
or.b32 %r23, %r6, 320;
ld.global.nc.b32 %hh6, [%rd12+640];
mov.b32 {%h53, %h54}, %hh6;
mul.wide.u32 %rd23, %r23, 4;
add.s64 %rd24, %rd6, %rd23;
ld.global.nc.u32 %r24, [%rd24];
cvt.rn.f16.s32 %h55, %r24;
sub.rn.f16 %h56, %h4, %h55;
mul.rn.f16 %h57, %h56, %h6;
sub.rn.f16 %h58, %h53, %h57;
cvt.f32.f16 %f22, %h58;
max.f32 %f23, %f21, %f22;
ld.global.nc.u32 %r25, [%rd14+1284];
cvt.rn.f16.s32 %h59, %r25;
sub.rn.f16 %h60, %h4, %h59;
mul.rn.f16 %h61, %h60, %h6;
sub.rn.f16 %h62, %h54, %h61;
cvt.f32.f16 %f24, %h62;
max.f32 %f25, %f23, %f24;
// Pair 6.
or.b32 %r26, %r6, 384;
ld.global.nc.b32 %hh7, [%rd12+768];
mov.b32 {%h63, %h64}, %hh7;
mul.wide.u32 %rd25, %r26, 4;
add.s64 %rd26, %rd6, %rd25;
ld.global.nc.u32 %r27, [%rd26];
cvt.rn.f16.s32 %h65, %r27;
sub.rn.f16 %h66, %h4, %h65;
mul.rn.f16 %h67, %h66, %h6;
sub.rn.f16 %h68, %h63, %h67;
cvt.f32.f16 %f26, %h68;
max.f32 %f27, %f25, %f26;
ld.global.nc.u32 %r28, [%rd14+1540];
cvt.rn.f16.s32 %h69, %r28;
sub.rn.f16 %h70, %h4, %h69;
mul.rn.f16 %h71, %h70, %h6;
sub.rn.f16 %h72, %h64, %h71;
cvt.f32.f16 %f28, %h72;
max.f32 %f29, %f27, %f28;
// Pair 7.
or.b32 %r29, %r6, 448;
ld.global.nc.b32 %hh8, [%rd12+896];
mov.b32 {%h73, %h74}, %hh8;
mul.wide.u32 %rd27, %r29, 4;
add.s64 %rd28, %rd6, %rd27;
ld.global.nc.u32 %r30, [%rd28];
cvt.rn.f16.s32 %h75, %r30;
sub.rn.f16 %h76, %h4, %h75;
mul.rn.f16 %h77, %h76, %h6;
sub.rn.f16 %h78, %h73, %h77;
cvt.f32.f16 %f30, %h78;
max.f32 %f31, %f29, %f30;
ld.global.nc.u32 %r31, [%rd14+1796];
cvt.rn.f16.s32 %h79, %r31;
sub.rn.f16 %h80, %h4, %h79;
mul.rn.f16 %h81, %h80, %h6;
sub.rn.f16 %h82, %h74, %h81;
cvt.f32.f16 %f32, %h82;
max.f32 %f33, %f31, %f32;
// Stage 1: shuffle-down max reduction with offsets 16, 8, 4, 2, 1 over the full warp
// (member mask -1). The last max is applied only on the thread-0 path below.
shfl.sync.down.b32 %f34, %f33, 16, 31, -1;
max.f32 %f35, %f33, %f34;
shfl.sync.down.b32 %f36, %f35, 8, 31, -1;
max.f32 %f37, %f35, %f36;
shfl.sync.down.b32 %f38, %f37, 4, 31, -1;
max.f32 %f39, %f37, %f38;
shfl.sync.down.b32 %f40, %f39, 2, 31, -1;
max.f32 %f41, %f39, %f40;
shfl.sync.down.b32 %f42, %f41, 1, 31, -1;
// %p1 = (tid == 0); it is reused for the pointer select and the final branch below.
setp.eq.s32 %p1, %r1, 0;
@%p1 bra LBB61_3;
bra.uni LBB61_1;
LBB61_3:
// Thread 0 publishes the warp maximum in the first word of shared_cache_014.
max.f32 %f1, %f41, %f42;
st.shared.f32 [shared_cache_014], %f1;
LBB61_1:
bar.sync 0;
// Stage 2: thread 0 reads shared_cache_014[tid] through a generic pointer; every other thread
// reads its own local slot, which is first set to -8388608 (bit pattern 0xFF800000, -infinity).
mul.wide.u32 %rd32, %r1, 4;
mov.u64 %rd33, shared_cache_014;
add.s64 %rd3, %rd33, %rd32;
cvta.shared.u64 %rd34, %rd3;
mov.u32 %r34, -8388608;
st.local.u32 [%rd1], %r34;
selp.b64 %rd36, %rd34, %rd10, %p1;
ld.f32 %f43, [%rd36];
shfl.sync.down.b32 %f44, %f43, 16, 31, -1;
max.f32 %f45, %f43, %f44;
shfl.sync.down.b32 %f46, %f45, 8, 31, -1;
max.f32 %f47, %f45, %f46;
shfl.sync.down.b32 %f48, %f47, 4, 31, -1;
max.f32 %f49, %f47, %f48;
shfl.sync.down.b32 %f50, %f49, 2, 31, -1;
max.f32 %f51, %f49, %f50;
shfl.sync.down.b32 %f52, %f51, 1, 31, -1;
max.f32 %f53, %f51, %f52;
// Each thread writes its result back through the same pointer it loaded from
// (shared memory for thread 0, the private local slot otherwise).
st.f32 [%rd36], %f53;
@%p1 bra LBB61_4;
bra.uni LBB61_2;
LBB61_4:
// Thread 0 only: compute the output address and read the value currently stored there.
ld.param.u64 %rd7, [fusion_2203_param_1];
shr.u32 %r33, %r5, 9;
cvta.to.global.u64 %rd8, %rd7;
and.b32 %r32, %r5, 511;
mul.wide.u32 %rd29, %r33, 2048;
add.s64 %rd30, %rd8, %rd29;
mul.wide.u32 %rd31, %r32, 4;
add.s64 %rd2, %rd30, %rd31;
ld.global.u32 %r36, [%rd2];
LBB61_5:
// Compare-and-swap loop: propose max(observed, CTA maximum) and retry with the value returned
// by the atomic until it matches the value the proposal was based on.
mov.b32 %f54, %r36;
ld.shared.f32 %f55, [%rd3];
max.f32 %f56, %f54, %f55;
mov.b32 %r35, %f56;
atom.global.cas.b32 %r4, [%rd2], %r36, %r35;
setp.eq.s32 %p3, %r4, %r36;
mov.u32 %r36, %r4;
@%p3 bra LBB61_2;
bra.uni LBB61_5;
LBB61_2:
ret;
}
// .globl fusion_2201
.visible .entry fusion_2201(
.param .u64 fusion_2201_param_0,
.param .u64 fusion_2201_param_1,
.param .u64 fusion_2201_param_2,
.param .u64 fusion_2201_param_3,
.param .u64 fusion_2201_param_4
)
.reqntid 32, 1, 1
{
.local .align 4 .b8 __local_depot62[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<35>;
.reg .b16 %h<83>;
.reg .b32 %hh<9>;
.reg .f32 %f<249>;
.reg .b32 %r<32>;
.reg .b64 %rd<41>;
mov.u64 %SPL, __local_depot62;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd4, [fusion_2201_param_0];
ld.param.u64 %rd5, [fusion_2201_param_3];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd8, [fusion_2201_param_2];
cvta.to.global.u64 %rd9, %rd8;
cvta.to.global.u64 %rd11, %rd4;
add.u64 %rd12, %SP, 0;
add.u64 %rd1, %SPL, 0;
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
shl.b32 %r3, %r1, 1;
shl.b32 %r4, %r2, 9;
or.b32 %r5, %r4, %r3;
mul.wide.u32 %rd13, %r5, 2;
add.s64 %rd14, %rd11, %rd13;
ld.global.nc.b32 %hh1, [%rd14];
mov.b32 {%h1, %h2}, %hh1;
mul.wide.u32 %rd15, %r3, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.v2.u32 {%r6, %r7}, [%rd16];
cvt.rn.f16.s32 %h3, %r6;
mov.b16 %h4, 0x3C00;
sub.rn.f16 %h5, %h4, %h3;
mov.b16 %h6, 0x70E2;
mul.rn.f16 %h7, %h5, %h6;
sub.rn.f16 %h8, %h1, %h7;
cvt.f32.f16 %f2, %h8;
mul.wide.u32 %rd17, %r2, 4;
add.s64 %rd18, %rd9, %rd17;
ld.global.nc.f32 %f3, [%rd18];
sub.rn.f32 %f4, %f2, %f3;
mul.rn.f32 %f5, %f4, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f6, %f5;
add.rn.f32 %f7, %f6, 0f00000000;
ex2.approx.f32 %f8, %f7;
fma.rn.f32 %f9, %f6, 0fBF317200, %f4;
fma.rn.f32 %f10, %f6, 0fB5BFBE8E, %f9;
mul.rn.f32 %f11, %f10, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f12, %f11;
mul.rn.f32 %f13, %f8, %f12;
setp.lt.f32 %p1, %f4, 0fC2D20000;
setp.gt.f32 %p2, %f4, 0f42D20000;
add.rn.f32 %f14, %f13, 0f00000000;
selp.f32 %f15, 0f00000000, %f14, %p1;
selp.f32 %f16, 0f7F800000, %f15, %p2;
cvt.rn.f16.s32 %h9, %r7;
sub.rn.f16 %h10, %h4, %h9;
mul.rn.f16 %h11, %h10, %h6;
sub.rn.f16 %h12, %h2, %h11;
cvt.f32.f16 %f17, %h12;
sub.rn.f32 %f18, %f17, %f3;
mul.rn.f32 %f19, %f18, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f20, %f19;
add.rn.f32 %f21, %f20, 0f00000000;
ex2.approx.f32 %f22, %f21;
fma.rn.f32 %f23, %f20, 0fBF317200, %f18;
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23;
mul.rn.f32 %f25, %f24, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f26, %f25;
mul.rn.f32 %f27, %f22, %f26;
setp.lt.f32 %p3, %f18, 0fC2D20000;
selp.f32 %f28, 0f00000000, %f27, %p3;
setp.gt.f32 %p4, %f18, 0f42D20000;
selp.f32 %f29, 0f7F800000, %f28, %p4;
add.rn.f32 %f30, %f16, %f29;
or.b32 %r8, %r3, 64;
ld.global.nc.b32 %hh2, [%rd14+128];
mov.b32 {%h13, %h14}, %hh2;
mul.wide.u32 %rd19, %r8, 4;
add.s64 %rd20, %rd6, %rd19;
ld.global.nc.u32 %r9, [%rd20];
cvt.rn.f16.s32 %h15, %r9;
sub.rn.f16 %h16, %h4, %h15;
mul.rn.f16 %h17, %h16, %h6;
sub.rn.f16 %h18, %h13, %h17;
cvt.f32.f16 %f31, %h18;
sub.rn.f32 %f32, %f31, %f3;
mul.rn.f32 %f33, %f32, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f34, %f33;
add.rn.f32 %f35, %f34, 0f00000000;
ex2.approx.f32 %f36, %f35;
fma.rn.f32 %f37, %f34, 0fBF317200, %f32;
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37;
mul.rn.f32 %f39, %f38, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f40, %f39;
mul.rn.f32 %f41, %f36, %f40;
setp.lt.f32 %p5, %f32, 0fC2D20000;
selp.f32 %f42, 0f00000000, %f41, %p5;
setp.gt.f32 %p6, %f32, 0f42D20000;
selp.f32 %f43, 0f7F800000, %f42, %p6;
add.rn.f32 %f44, %f30, %f43;
ld.global.nc.u32 %r10, [%rd16+260];
cvt.rn.f16.s32 %h19, %r10;
sub.rn.f16 %h20, %h4, %h19;
mul.rn.f16 %h21, %h20, %h6;
sub.rn.f16 %h22, %h14, %h21;
cvt.f32.f16 %f45, %h22;
sub.rn.f32 %f46, %f45, %f3;
mul.rn.f32 %f47, %f46, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f48, %f47;
add.rn.f32 %f49, %f48, 0f00000000;
ex2.approx.f32 %f50, %f49;
fma.rn.f32 %f51, %f48, 0fBF317200, %f46;
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51;
mul.rn.f32 %f53, %f52, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f54, %f53;
mul.rn.f32 %f55, %f50, %f54;
setp.lt.f32 %p7, %f46, 0fC2D20000;
selp.f32 %f56, 0f00000000, %f55, %p7;
setp.gt.f32 %p8, %f46, 0f42D20000;
selp.f32 %f57, 0f7F800000, %f56, %p8;
add.rn.f32 %f58, %f44, %f57;
or.b32 %r11, %r3, 128;
ld.global.nc.b32 %hh3, [%rd14+256];
mov.b32 {%h23, %h24}, %hh3;
mul.wide.u32 %rd21, %r11, 4;
add.s64 %rd22, %rd6, %rd21;
ld.global.nc.u32 %r12, [%rd22];
cvt.rn.f16.s32 %h25, %r12;
sub.rn.f16 %h26, %h4, %h25;
mul.rn.f16 %h27, %h26, %h6;
sub.rn.f16 %h28, %h23, %h27;
cvt.f32.f16 %f59, %h28;
sub.rn.f32 %f60, %f59, %f3;
mul.rn.f32 %f61, %f60, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f62, %f61;
add.rn.f32 %f63, %f62, 0f00000000;
ex2.approx.f32 %f64, %f63;
fma.rn.f32 %f65, %f62, 0fBF317200, %f60;
fma.rn.f32 %f66, %f62, 0fB5BFBE8E, %f65;
mul.rn.f32 %f67, %f66, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f68, %f67;
mul.rn.f32 %f69, %f64, %f68;
setp.lt.f32 %p9, %f60, 0fC2D20000;
selp.f32 %f70, 0f00000000, %f69, %p9;
setp.gt.f32 %p10, %f60, 0f42D20000;
selp.f32 %f71, 0f7F800000, %f70, %p10;
add.rn.f32 %f72, %f58, %f71;
ld.global.nc.u32 %r13, [%rd16+516];
cvt.rn.f16.s32 %h29, %r13;
sub.rn.f16 %h30, %h4, %h29;
mul.rn.f16 %h31, %h30, %h6;
sub.rn.f16 %h32, %h24, %h31;
cvt.f32.f16 %f73, %h32;
sub.rn.f32 %f74, %f73, %f3;
mul.rn.f32 %f75, %f74, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f76, %f75;
add.rn.f32 %f77, %f76, 0f00000000;
ex2.approx.f32 %f78, %f77;
fma.rn.f32 %f79, %f76, 0fBF317200, %f74;
fma.rn.f32 %f80, %f76, 0fB5BFBE8E, %f79;
mul.rn.f32 %f81, %f80, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f82, %f81;
mul.rn.f32 %f83, %f78, %f82;
setp.lt.f32 %p11, %f74, 0fC2D20000;
selp.f32 %f84, 0f00000000, %f83, %p11;
setp.gt.f32 %p12, %f74, 0f42D20000;
selp.f32 %f85, 0f7F800000, %f84, %p12;
add.rn.f32 %f86, %f72, %f85;
or.b32 %r14, %r3, 192;
ld.global.nc.b32 %hh4, [%rd14+384];
mov.b32 {%h33, %h34}, %hh4;
mul.wide.u32 %rd23, %r14, 4;
add.s64 %rd24, %rd6, %rd23;
ld.global.nc.u32 %r15, [%rd24];
cvt.rn.f16.s32 %h35, %r15;
sub.rn.f16 %h36, %h4, %h35;
mul.rn.f16 %h37, %h36, %h6;
sub.rn.f16 %h38, %h33, %h37;
cvt.f32.f16 %f87, %h38;
sub.rn.f32 %f88, %f87, %f3;
mul.rn.f32 %f89, %f88, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f90, %f89;
add.rn.f32 %f91, %f90, 0f00000000;
ex2.approx.f32 %f92, %f91;
fma.rn.f32 %f93, %f90, 0fBF317200, %f88;
fma.rn.f32 %f94, %f90, 0fB5BFBE8E, %f93;
mul.rn.f32 %f95, %f94, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f96, %f95;
mul.rn.f32 %f97, %f92, %f96;
setp.lt.f32 %p13, %f88, 0fC2D20000;
selp.f32 %f98, 0f00000000, %f97, %p13;
setp.gt.f32 %p14, %f88, 0f42D20000;
selp.f32 %f99, 0f7F800000, %f98, %p14;
add.rn.f32 %f100, %f86, %f99;
ld.global.nc.u32 %r16, [%rd16+772];
cvt.rn.f16.s32 %h39, %r16;
sub.rn.f16 %h40, %h4, %h39;
mul.rn.f16 %h41, %h40, %h6;
sub.rn.f16 %h42, %h34, %h41;
cvt.f32.f16 %f101, %h42;
sub.rn.f32 %f102, %f101, %f3;
mul.rn.f32 %f103, %f102, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f104, %f103;
add.rn.f32 %f105, %f104, 0f00000000;
ex2.approx.f32 %f106, %f105;
fma.rn.f32 %f107, %f104, 0fBF317200, %f102;
fma.rn.f32 %f108, %f104, 0fB5BFBE8E, %f107;
mul.rn.f32 %f109, %f108, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f110, %f109;
mul.rn.f32 %f111, %f106, %f110;
setp.lt.f32 %p15, %f102, 0fC2D20000;
selp.f32 %f112, 0f00000000, %f111, %p15;
setp.gt.f32 %p16, %f102, 0f42D20000;
selp.f32 %f113, 0f7F800000, %f112, %p16;
add.rn.f32 %f114, %f100, %f113;
or.b32 %r17, %r3, 256;
ld.global.nc.b32 %hh5, [%rd14+512];
mov.b32 {%h43, %h44}, %hh5;
mul.wide.u32 %rd25, %r17, 4;
add.s64 %rd26, %rd6, %rd25;
ld.global.nc.u32 %r18, [%rd26];
cvt.rn.f16.s32 %h45, %r18;
sub.rn.f16 %h46, %h4, %h45;
mul.rn.f16 %h47, %h46, %h6;
sub.rn.f16 %h48, %h43, %h47;
cvt.f32.f16 %f115, %h48;
sub.rn.f32 %f116, %f115, %f3;
mul.rn.f32 %f117, %f116, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f118, %f117;
add.rn.f32 %f119, %f118, 0f00000000;
ex2.approx.f32 %f120, %f119;
fma.rn.f32 %f121, %f118, 0fBF317200, %f116;
fma.rn.f32 %f122, %f118, 0fB5BFBE8E, %f121;
mul.rn.f32 %f123, %f122, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f124, %f123;
mul.rn.f32 %f125, %f120, %f124;
setp.lt.f32 %p17, %f116, 0fC2D20000;
selp.f32 %f126, 0f00000000, %f125, %p17;
setp.gt.f32 %p18, %f116, 0f42D20000;
selp.f32 %f127, 0f7F800000, %f126, %p18;
add.rn.f32 %f128, %f114, %f127;
ld.global.nc.u32 %r19, [%rd16+1028];
cvt.rn.f16.s32 %h49, %r19;
sub.rn.f16 %h50, %h4, %h49;
mul.rn.f16 %h51, %h50, %h6;
sub.rn.f16 %h52, %h44, %h51;
cvt.f32.f16 %f129, %h52;
sub.rn.f32 %f130, %f129, %f3;
mul.rn.f32 %f131, %f130, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f132, %f131;
add.rn.f32 %f133, %f132, 0f00000000;
ex2.approx.f32 %f134, %f133;
fma.rn.f32 %f135, %f132, 0fBF317200, %f130;
fma.rn.f32 %f136, %f132, 0fB5BFBE8E, %f135;
mul.rn.f32 %f137, %f136, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f138, %f137;
mul.rn.f32 %f139, %f134, %f138;
setp.lt.f32 %p19, %f130, 0fC2D20000;
selp.f32 %f140, 0f00000000, %f139, %p19;
setp.gt.f32 %p20, %f130, 0f42D20000;
selp.f32 %f141, 0f7F800000, %f140, %p20;
add.rn.f32 %f142, %f128, %f141;
or.b32 %r20, %r3, 320;
ld.global.nc.b32 %hh6, [%rd14+640];
mov.b32 {%h53, %h54}, %hh6;
mul.wide.u32 %rd27, %r20, 4;
add.s64 %rd28, %rd6, %rd27;
ld.global.nc.u32 %r21, [%rd28];
cvt.rn.f16.s32 %h55, %r21;
sub.rn.f16 %h56, %h4, %h55;
mul.rn.f16 %h57, %h56, %h6;
sub.rn.f16 %h58, %h53, %h57;
cvt.f32.f16 %f143, %h58;
sub.rn.f32 %f144, %f143, %f3;
mul.rn.f32 %f145, %f144, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f146, %f145;
add.rn.f32 %f147, %f146, 0f00000000;
ex2.approx.f32 %f148, %f147;
fma.rn.f32 %f149, %f146, 0fBF317200, %f144;
fma.rn.f32 %f150, %f146, 0fB5BFBE8E, %f149;
mul.rn.f32 %f151, %f150, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f152, %f151;
mul.rn.f32 %f153, %f148, %f152;
setp.lt.f32 %p21, %f144, 0fC2D20000;
selp.f32 %f154, 0f00000000, %f153, %p21;
setp.gt.f32 %p22, %f144, 0f42D20000;
selp.f32 %f155, 0f7F800000, %f154, %p22;
add.rn.f32 %f156, %f142, %f155;
ld.global.nc.u32 %r22, [%rd16+1284];
cvt.rn.f16.s32 %h59, %r22;
sub.rn.f16 %h60, %h4, %h59;
mul.rn.f16 %h61, %h60, %h6;
sub.rn.f16 %h62, %h54, %h61;
cvt.f32.f16 %f157, %h62;
sub.rn.f32 %f158, %f157, %f3;
mul.rn.f32 %f159, %f158, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f160, %f159;
add.rn.f32 %f161, %f160, 0f00000000;
ex2.approx.f32 %f162, %f161;
fma.rn.f32 %f163, %f160, 0fBF317200, %f158;
fma.rn.f32 %f164, %f160, 0fB5BFBE8E, %f163;
mul.rn.f32 %f165, %f164, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f166, %f165;
mul.rn.f32 %f167, %f162, %f166;
setp.lt.f32 %p23, %f158, 0fC2D20000;
selp.f32 %f168, 0f00000000, %f167, %p23;
setp.gt.f32 %p24, %f158, 0f42D20000;
selp.f32 %f169, 0f7F800000, %f168, %p24;
add.rn.f32 %f170, %f156, %f169;
or.b32 %r23, %r3, 384;
ld.global.nc.b32 %hh7, [%rd14+768];
mov.b32 {%h63, %h64}, %hh7;
mul.wide.u32 %rd29, %r23, 4;
add.s64 %rd30, %rd6, %rd29;
ld.global.nc.u32 %r24, [%rd30];
cvt.rn.f16.s32 %h65, %r24;
sub.rn.f16 %h66, %h4, %h65;
mul.rn.f16 %h67, %h66, %h6;
sub.rn.f16 %h68, %h63, %h67;
cvt.f32.f16 %f171, %h68;
sub.rn.f32 %f172, %f171, %f3;
mul.rn.f32 %f173, %f172, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f174, %f173;
add.rn.f32 %f175, %f174, 0f00000000;
ex2.approx.f32 %f176, %f175;
fma.rn.f32 %f177, %f174, 0fBF317200, %f172;
fma.rn.f32 %f178, %f174, 0fB5BFBE8E, %f177;
mul.rn.f32 %f179, %f178, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f180, %f179;
mul.rn.f32 %f181, %f176, %f180;
setp.lt.f32 %p25, %f172, 0fC2D20000;
selp.f32 %f182, 0f00000000, %f181, %p25;
setp.gt.f32 %p26, %f172, 0f42D20000;
selp.f32 %f183, 0f7F800000, %f182, %p26;
add.rn.f32 %f184, %f170, %f183;
ld.global.nc.u32 %r25, [%rd16+1540];
cvt.rn.f16.s32 %h69, %r25;
sub.rn.f16 %h70, %h4, %h69;
mul.rn.f16 %h71, %h70, %h6;
sub.rn.f16 %h72, %h64, %h71;
cvt.f32.f16 %f185, %h72;
sub.rn.f32 %f186, %f185, %f3;
mul.rn.f32 %f187, %f186, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f188, %f187;
add.rn.f32 %f189, %f188, 0f00000000;
ex2.approx.f32 %f190, %f189;
fma.rn.f32 %f191, %f188, 0fBF317200, %f186;
fma.rn.f32 %f192, %f188, 0fB5BFBE8E, %f191;
mul.rn.f32 %f193, %f192, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f194, %f193;
mul.rn.f32 %f195, %f190, %f194;
setp.lt.f32 %p27, %f186, 0fC2D20000;
selp.f32 %f196, 0f00000000, %f195, %p27;
setp.gt.f32 %p28, %f186, 0f42D20000;
selp.f32 %f197, 0f7F800000, %f196, %p28;
add.rn.f32 %f198, %f184, %f197;
or.b32 %r26, %r3, 448;
ld.global.nc.b32 %hh8, [%rd14+896];
mov.b32 {%h73, %h74}, %hh8;
mul.wide.u32 %rd31, %r26, 4;
add.s64 %rd32, %rd6, %rd31;
ld.global.nc.u32 %r27, [%rd32];
cvt.rn.f16.s32 %h75, %r27;
sub.rn.f16 %h76, %h4, %h75;
mul.rn.f16 %h77, %h76, %h6;
sub.rn.f16 %h78, %h73, %h77;
cvt.f32.f16 %f199, %h78;
sub.rn.f32 %f200, %f199, %f3;
mul.rn.f32 %f201, %f200, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f202, %f201;
add.rn.f32 %f203, %f202, 0f00000000;
ex2.approx.f32 %f204, %f203;
fma.rn.f32 %f205, %f202, 0fBF317200, %f200;
fma.rn.f32 %f206, %f202, 0fB5BFBE8E, %f205;
mul.rn.f32 %f207, %f206, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f208, %f207;
mul.rn.f32 %f209, %f204, %f208;
setp.lt.f32 %p29, %f200, 0fC2D20000;
selp.f32 %f210, 0f00000000, %f209, %p29;
setp.gt.f32 %p30, %f200, 0f42D20000;
selp.f32 %f211, 0f7F800000, %f210, %p30;
add.rn.f32 %f212, %f198, %f211;
ld.global.nc.u32 %r28, [%rd16+1796];
cvt.rn.f16.s32 %h79, %r28;
sub.rn.f16 %h80, %h4, %h79;
mul.rn.f16 %h81, %h80, %h6;
sub.rn.f16 %h82, %h74, %h81;
cvt.f32.f16 %f213, %h82;
sub.rn.f32 %f214, %f213, %f3;
mul.rn.f32 %f215, %f214, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f216, %f215;
add.rn.f32 %f217, %f216, 0f00000000;
ex2.approx.f32 %f218, %f217;
fma.rn.f32 %f219, %f216, 0fBF317200, %f214;
fma.rn.f32 %f220, %f216, 0fB5BFBE8E, %f219;
mul.rn.f32 %f221, %f220, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f222, %f221;
mul.rn.f32 %f223, %f218, %f222;
setp.lt.f32 %p31, %f214, 0fC2D20000;
selp.f32 %f224, 0f00000000, %f223, %p31;
setp.gt.f32 %p32, %f214, 0f42D20000;
selp.f32 %f225, 0f7F800000, %f224, %p32;
add.rn.f32 %f226, %f212, %f225;
shfl.sync.down.b32 %f227, %f226, 16, 31, -1;
add.rn.f32 %f228, %f227, %f226;
shfl.sync.down.b32 %f229, %f228, 8, 31, -1;
add.rn.f32 %f230, %f229, %f228;
shfl.sync.down.b32 %f231, %f230, 4, 31, -1;
add.rn.f32 %f232, %f231, %f230;
shfl.sync.down.b32 %f233, %f232, 2, 31, -1;
add.rn.f32 %f234, %f233, %f232;
shfl.sync.down.b32 %f235, %f234, 1, 31, -1;
setp.eq.s32 %p33, %r1, 0;
@%p33 bra LBB62_3;
bra.uni LBB62_1;
LBB62_3:
add.rn.f32 %f1, %f235, %f234;
st.shared.f32 [shared_cache_015], %f1;
LBB62_1:
bar.sync 0;
mul.wide.u32 %rd36, %r1, 4;
mov.u64 %rd37, shared_cache_015;
add.s64 %rd3, %rd37, %rd36;
cvta.shared.u64 %rd38, %rd3;
mov.u32 %r31, 0;
st.local.u32 [%rd1], %r31;
selp.b64 %rd40, %rd38, %rd12, %p33;
ld.f32 %f236, [%rd40];
shfl.sync.down.b32 %f237, %f236, 16, 31, -1;
add.rn.f32 %f238, %f236, %f237;
shfl.sync.down.b32 %f239, %f238, 8, 31, -1;
add.rn.f32 %f240, %f238, %f239;
shfl.sync.down.b32 %f241, %f240, 4, 31, -1;
add.rn.f32 %f242, %f240, %f241;
shfl.sync.down.b32 %f243, %f242, 2, 31, -1;
add.rn.f32 %f244, %f242, %f243;
shfl.sync.down.b32 %f245, %f244, 1, 31, -1;
add.rn.f32 %f246, %f244, %f245;
st.f32 [%rd40], %f246;
@%p33 bra LBB62_4;
bra.uni LBB62_2;
LBB62_4:
ld.param.u64 %rd7, [fusion_2201_param_1];
shr.u32 %r30, %r2, 9;
cvta.to.global.u64 %rd10, %rd7;
and.b32 %r29, %r2, 511;
mul.wide.u32 %rd33, %r30, 2048;
add.s64 %rd34, %rd10, %rd33;
mul.wide.u32 %rd35, %r29, 4;
add.s64 %rd2, %rd34, %rd35;
ld.shared.f32 %f247, [%rd3];
atom.global.add.f32 %f248, [%rd2], %f247;
LBB62_2:
ret;
}
// .globl fusion_2200
// fusion_2200
//   Each thread produces 4 consecutive f32 results; with 256 threads per CTA
//   that is 1024 elements per CTA, starting at idx = (ctaid.x << 10) | (tid.x << 2).
//   For k = 0..3 the kernel computes
//       x = f16(param_0[idx+k]) - (1.0 - f16(param_4[(tid.x*4 + k) & 511])) * 10000   (f16 math)
//       y = exp(f32(x) - param_3[idx >> 9]) / param_2[idx >> 9]                       (f32 math)
//   and stores y to param_1[idx+k].
//   Buffers as used below:
//     param_0 : f16 input, read as one 4 x b16 vector per thread
//     param_1 : f32 output, written as one 4 x f32 vector per thread
//     param_2 : f32, one divisor per 512-element row
//     param_3 : f32, one value per 512-element row, subtracted before exp
//     param_4 : s32, indexed by position within the 512-element row only
//     param_5 : not referenced by this kernel
//   NOTE(review): this looks like a masked softmax normalisation (row max in
//   param_3, row sum in param_2) — confirm against the producing HLO.
.visible .entry fusion_2200(
.param .u64 fusion_2200_param_0,
.param .u64 fusion_2200_param_1,
.param .u64 fusion_2200_param_2,
.param .u64 fusion_2200_param_3,
.param .u64 fusion_2200_param_4,
.param .u64 fusion_2200_param_5
)
.reqntid 256, 1, 1
{
.reg .pred %p<9>;
.reg .b16 %h<27>;
.reg .b32 %hh<3>;
.reg .f32 %f<59>;
.reg .b32 %r<18>;
.reg .b64 %rd<26>;
// Load the buffer pointers and convert them to global-space addresses.
ld.param.u64 %rd1, [fusion_2200_param_0];
ld.param.u64 %rd2, [fusion_2200_param_4];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2200_param_1];
ld.param.u64 %rd5, [fusion_2200_param_3];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2200_param_2];
cvta.to.global.u64 %rd8, %rd7;
cvta.to.global.u64 %rd9, %rd4;
cvta.to.global.u64 %rd10, %rd1;
// %r5 = linear element index of this thread's first element.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
or.b32 %r8, %r4, 3;
// %r9 = row (idx / 512); %r13, %r12, %r11, %r10 = in-row positions of elements 0..3.
shr.u32 %r9, %r5, 9;
and.b32 %r10, %r8, 511;
and.b32 %r11, %r7, 510;
and.b32 %r12, %r6, 509;
and.b32 %r13, %r4, 508;
// Fetch the four f16 inputs with a single vector load; they end up in %h5..%h8.
mul.wide.u32 %rd11, %r5, 2;
add.s64 %rd12, %rd10, %rd11;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12];
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// ---- element 0 ----
// x = in - (1.0 - f16(int value)) * 10000; 0x3C00 is f16 1.0, 0x70E2 is f16 10000.
mul.wide.u32 %rd13, %r13, 4;
add.s64 %rd14, %rd3, %rd13;
ld.global.nc.u32 %r14, [%rd14];
cvt.rn.f16.s32 %h9, %r14;
mov.b16 %h10, 0x3C00;
sub.rn.f16 %h11, %h10, %h9;
mov.b16 %h12, 0x70E2;
mul.rn.f16 %h13, %h11, %h12;
sub.rn.f16 %h14, %h5, %h13;
cvt.f32.f16 %f1, %h14;
// Subtract the per-row value from param_3 (%f2 is reused for all four elements).
mul.wide.u32 %rd15, %r9, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.f32 %f2, [%rd16];
sub.rn.f32 %f3, %f1, %f2;
// exp(%f3) by range reduction: n = trunc(x * log2(e)) (0f3FB8AA3B ~ 1.442695),
// r = x - n*ln2 using a two-constant split of ln2 (0fBF317200 ~ -0.69314575,
// 0fB5BFBE8E ~ -1.4286e-6), result = 2^n * 2^(r * log2(e)).
mul.rn.f32 %f4, %f3, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f5, %f4;
add.rn.f32 %f6, %f5, 0f00000000;
ex2.approx.f32 %f7, %f6;
fma.rn.f32 %f8, %f5, 0fBF317200, %f3;
fma.rn.f32 %f9, %f5, 0fB5BFBE8E, %f8;
mul.rn.f32 %f10, %f9, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f11, %f10;
mul.rn.f32 %f12, %f7, %f11;
// Clamp: x < -105 gives 0, x > 105 gives +inf (0f7F800000).
setp.lt.f32 %p1, %f3, 0fC2D20000;
selp.f32 %f13, 0f00000000, %f12, %p1;
setp.gt.f32 %p2, %f3, 0f42D20000;
selp.f32 %f14, 0f7F800000, %f13, %p2;
// Divide by the per-row value from param_2 (%f15 is reused for all four elements).
add.s64 %rd17, %rd8, %rd15;
ld.global.nc.f32 %f15, [%rd17];
div.full.f32 %f16, %f14, %f15;
// %rd19 = output address in param_1 (f32, so idx * 4 bytes).
mul.wide.u32 %rd18, %r5, 4;
add.s64 %rd19, %rd9, %rd18;
// ---- element 1: same computation with in-row position %r12 and input %h6 ----
mul.wide.u32 %rd20, %r12, 4;
add.s64 %rd21, %rd3, %rd20;
ld.global.nc.u32 %r15, [%rd21];
cvt.rn.f16.s32 %h15, %r15;
sub.rn.f16 %h16, %h10, %h15;
mul.rn.f16 %h17, %h16, %h12;
sub.rn.f16 %h18, %h6, %h17;
cvt.f32.f16 %f17, %h18;
sub.rn.f32 %f18, %f17, %f2;
mul.rn.f32 %f19, %f18, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f20, %f19;
add.rn.f32 %f21, %f20, 0f00000000;
ex2.approx.f32 %f22, %f21;
fma.rn.f32 %f23, %f20, 0fBF317200, %f18;
fma.rn.f32 %f24, %f20, 0fB5BFBE8E, %f23;
mul.rn.f32 %f25, %f24, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f26, %f25;
mul.rn.f32 %f27, %f22, %f26;
setp.lt.f32 %p3, %f18, 0fC2D20000;
selp.f32 %f28, 0f00000000, %f27, %p3;
setp.gt.f32 %p4, %f18, 0f42D20000;
selp.f32 %f29, 0f7F800000, %f28, %p4;
div.full.f32 %f30, %f29, %f15;
// ---- element 2: in-row position %r11, input %h7 ----
mul.wide.u32 %rd22, %r11, 4;
add.s64 %rd23, %rd3, %rd22;
ld.global.nc.u32 %r16, [%rd23];
cvt.rn.f16.s32 %h19, %r16;
sub.rn.f16 %h20, %h10, %h19;
mul.rn.f16 %h21, %h20, %h12;
sub.rn.f16 %h22, %h7, %h21;
cvt.f32.f16 %f31, %h22;
sub.rn.f32 %f32, %f31, %f2;
mul.rn.f32 %f33, %f32, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f34, %f33;
add.rn.f32 %f35, %f34, 0f00000000;
ex2.approx.f32 %f36, %f35;
fma.rn.f32 %f37, %f34, 0fBF317200, %f32;
fma.rn.f32 %f38, %f34, 0fB5BFBE8E, %f37;
mul.rn.f32 %f39, %f38, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f40, %f39;
mul.rn.f32 %f41, %f36, %f40;
setp.lt.f32 %p5, %f32, 0fC2D20000;
selp.f32 %f42, 0f00000000, %f41, %p5;
setp.gt.f32 %p6, %f32, 0f42D20000;
selp.f32 %f43, 0f7F800000, %f42, %p6;
div.full.f32 %f44, %f43, %f15;
// ---- element 3: in-row position %r10, input %h8 ----
mul.wide.u32 %rd24, %r10, 4;
add.s64 %rd25, %rd3, %rd24;
ld.global.nc.u32 %r17, [%rd25];
cvt.rn.f16.s32 %h23, %r17;
sub.rn.f16 %h24, %h10, %h23;
mul.rn.f16 %h25, %h24, %h12;
sub.rn.f16 %h26, %h8, %h25;
cvt.f32.f16 %f45, %h26;
sub.rn.f32 %f46, %f45, %f2;
mul.rn.f32 %f47, %f46, 0f3FB8AA3B;
cvt.rzi.f32.f32 %f48, %f47;
add.rn.f32 %f49, %f48, 0f00000000;
ex2.approx.f32 %f50, %f49;
fma.rn.f32 %f51, %f48, 0fBF317200, %f46;
fma.rn.f32 %f52, %f48, 0fB5BFBE8E, %f51;
mul.rn.f32 %f53, %f52, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f54, %f53;
mul.rn.f32 %f55, %f50, %f54;
setp.lt.f32 %p7, %f46, 0fC2D20000;
selp.f32 %f56, 0f00000000, %f55, %p7;
setp.gt.f32 %p8, %f46, 0f42D20000;
selp.f32 %f57, 0f7F800000, %f56, %p8;
div.full.f32 %f58, %f57, %f15;
// Write all four results with one 16-byte vector store.
st.global.v4.f32 [%rd19], {%f16, %f30, %f44, %f58};
ret;
}
// .globl rng_get_and_update_state_39
// rng_get_and_update_state_39
//   Copies the current 128-bit value of the global `rng_state` to the buffer
//   named by param_0, then advances `rng_state` by 4194304 (2^22).
//     param_0 : 16-byte destination; receives the value rng_state held
//               *before* the increment (low u64 at +0, high u64 at +8)
//     param_1 : not referenced by this kernel
//   The update is a plain load / add / store sequence with no atomics.
//   NOTE(review): presumably launched with a single thread — confirm at the
//   launch site.
.visible .entry rng_get_and_update_state_39(
.param .u64 rng_get_and_update_state_39_param_0,
.param .u64 rng_get_and_update_state_39_param_1
)
{
    .reg .pred %wrapped, %below_step;
    .reg .b64 %dst_generic, %dst;
    .reg .b64 %old_lo, %old_hi, %new_lo, %new_hi;
    .reg .b64 %carry_wrap, %carry;

    ld.param.u64        %dst_generic, [rng_get_and_update_state_39_param_0];
    cvta.to.global.u64  %dst, %dst_generic;

    // Snapshot the state: high word first, then low word.
    ld.global.u64       %old_hi, [rng_state+8];
    ld.global.u64       %old_lo, [rng_state];

    // Low word += 2^22. A carry into the high word is signalled when the sum
    // is smaller than either addend; both comparisons feed the select chain.
    add.s64             %new_lo, %old_lo, 4194304;
    setp.lt.u64         %below_step, %new_lo, 4194304;
    setp.lt.u64         %wrapped, %new_lo, %old_lo;
    selp.u64            %carry_wrap, 1, 0, %wrapped;
    selp.b64            %carry, 1, %carry_wrap, %below_step;
    add.s64             %new_hi, %old_hi, %carry;

    // Publish the advanced state, then hand the old state to the caller.
    st.global.u64       [rng_state], %new_lo;
    st.global.u64       [rng_state+8], %new_hi;
    st.global.u64       [%dst+8], %old_hi;
    st.global.u64       [%dst], %old_lo;
    ret;
}
// .globl fusion_2199
// fusion_2199
//   Each thread produces 4 consecutive f16 results (256 threads -> 1024 per CTA),
//   starting at idx = (ctaid.x << 10) | (tid.x << 2). For k = 0..3:
//       u_k   = f16( (23-bit pseudo-random integer) * 2^-23 )
//       out_k = (u_k >= 0x2E66) ? f16(param_1[idx+k]) * 0x3C72 : 0
//   where 0x2E66 is f16 ~0.09998 and 0x3C72 is f16 ~1.1113.
//   Buffers as used below:
//     param_0 : f16 output, one 4 x b16 vector store per thread
//     param_1 : f32 input, one 4 x f32 vector load per thread
//     param_2 : two u64 words (128-bit RNG state), read-only here
//     param_3 : not referenced by this kernel
//   NOTE(review): this looks like dropout with rate 0.1 and 1/0.9 rescaling,
//   with random bits from a Philox4x32-style generator (the multipliers
//   3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 and the constants
//   0x9E3779B9 / 0xBB67AE85 match Philox) — confirm against the producing HLO.
.visible .entry fusion_2199(
.param .u64 fusion_2199_param_0,
.param .u64 fusion_2199_param_1,
.param .u64 fusion_2199_param_2,
.param .u64 fusion_2199_param_3
)
.reqntid 256, 1, 1
{
.reg .pred %p<6>;
.reg .b16 %h<19>;
.reg .f32 %f<13>;
.reg .b32 %r<29>;
.reg .b64 %rd<119>;
// Load the buffer pointers and convert them to global-space addresses.
ld.param.u64 %rd1, [fusion_2199_param_0];
ld.param.u64 %rd2, [fusion_2199_param_2];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2199_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd1;
// %r5 = linear element index of this thread's first element.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
// Read the 128-bit state {%rd7 = low, %rd8 = high} and add this thread's
// group index (idx / 4) to it, propagating the carry into the high word.
ld.global.nc.v2.u64 {%rd7, %rd8}, [%rd3];
shr.u32 %r6, %r5, 2;
cvt.u64.u32 %rd9, %r6;
add.s64 %rd10, %rd7, %rd9;
setp.lt.u64 %p1, %rd10, %rd7;
// Mixing rounds: 32-bit halves are multiplied by the two multipliers in
// 64-bit arithmetic, the products are split into high/low 32-bit parts
// (shr 32 / and 0xFFFFFFFF) and xor-ed with per-round constants.
and.b64 %rd11, %rd10, 4294967295;
mul.lo.s64 %rd12, %rd11, 3528531795;
selp.u64 %rd13, 1, 0, %p1;
add.s64 %rd14, %rd8, %rd13;
xor.b64 %rd15, %rd14, %rd12;
shr.u64 %rd16, %rd15, 32;
mul.lo.s64 %rd17, %rd16, 3449720151;
shr.u64 %rd18, %rd17, 32;
and.b64 %rd19, %rd14, 4294967295;
mul.lo.s64 %rd20, %rd19, 3449720151;
and.b64 %rd21, %rd20, 4294967295;
xor.b64 %rd22, %rd21, %rd18;
xor.b64 %rd23, %rd22, 2654435769;
mul.lo.s64 %rd24, %rd23, 3528531795;
shr.u64 %rd25, %rd24, 32;
xor.b64 %rd26, %rd20, %rd10;
shr.u64 %rd27, %rd26, 32;
mul.lo.s64 %rd28, %rd27, 3528531795;
and.b64 %rd29, %rd28, 4294967295;
xor.b64 %rd30, %rd29, %rd25;
xor.b64 %rd31, %rd30, 1993301258;
mul.lo.s64 %rd32, %rd31, 3449720151;
shr.u64 %rd33, %rd32, 32;
shr.u64 %rd34, %rd28, 32;
and.b64 %rd35, %rd12, 4294967295;
xor.b64 %rd36, %rd35, %rd34;
xor.b64 %rd37, %rd36, 3144134277;
mul.lo.s64 %rd38, %rd37, 3449720151;
and.b64 %rd39, %rd38, 4294967295;
xor.b64 %rd40, %rd39, %rd33;
xor.b64 %rd41, %rd40, 3668340011;
mul.lo.s64 %rd42, %rd41, 3528531795;
shr.u64 %rd43, %rd42, 32;
shr.u64 %rd44, %rd38, 32;
and.b64 %rd45, %rd17, 4294967295;
xor.b64 %rd46, %rd45, %rd44;
xor.b64 %rd47, %rd46, 1013904242;
mul.lo.s64 %rd48, %rd47, 3528531795;
and.b64 %rd49, %rd48, 4294967295;
xor.b64 %rd50, %rd49, %rd43;
xor.b64 %rd51, %rd50, 3986602516;
mul.lo.s64 %rd52, %rd51, 3449720151;
shr.u64 %rd53, %rd52, 32;
shr.u64 %rd54, %rd48, 32;
and.b64 %rd55, %rd24, 4294967295;
xor.b64 %rd56, %rd55, %rd54;
xor.b64 %rd57, %rd56, 842468239;
mul.lo.s64 %rd58, %rd57, 3449720151;
and.b64 %rd59, %rd58, 4294967295;
xor.b64 %rd60, %rd59, %rd53;
xor.b64 %rd61, %rd60, 387276957;
mul.lo.s64 %rd62, %rd61, 3528531795;
shr.u64 %rd63, %rd62, 32;
shr.u64 %rd64, %rd58, 32;
and.b64 %rd65, %rd32, 4294967295;
xor.b64 %rd66, %rd65, %rd64;
xor.b64 %rd67, %rd66, 2027808484;
mul.lo.s64 %rd68, %rd67, 3528531795;
and.b64 %rd69, %rd68, 4294967295;
xor.b64 %rd70, %rd69, %rd63;
xor.b64 %rd71, %rd70, 1684936478;
mul.lo.s64 %rd72, %rd71, 3449720151;
shr.u64 %rd73, %rd72, 32;
shr.u64 %rd74, %rd68, 32;
and.b64 %rd75, %rd42, 4294967295;
xor.b64 %rd76, %rd75, %rd74;
xor.b64 %rd77, %rd76, 2835769497;
mul.lo.s64 %rd78, %rd77, 3449720151;
and.b64 %rd79, %rd78, 4294967295;
xor.b64 %rd80, %rd79, %rd73;
xor.b64 %rd81, %rd80, 1401181199;
mul.lo.s64 %rd82, %rd81, 3528531795;
shr.u64 %rd83, %rd82, 32;
shr.u64 %rd84, %rd78, 32;
and.b64 %rd85, %rd52, 4294967295;
xor.b64 %rd86, %rd85, %rd84;
xor.b64 %rd87, %rd86, 3041712726;
mul.lo.s64 %rd88, %rd87, 3528531795;
and.b64 %rd89, %rd88, 4294967295;
xor.b64 %rd90, %rd89, %rd83;
xor.b64 %rd91, %rd90, 3678237736;
mul.lo.s64 %rd92, %rd91, 3449720151;
shr.u64 %rd93, %rd92, 32;
// ---- element 0 ----
// Final 32-bit random word -> keep the top 23 bits (shr 9) -> scale by
// 2^-23 (0f34000000) -> round to f16 -> compare against the threshold.
cvt.u32.u64 %r7, %rd93;
shr.u64 %rd94, %rd88, 32;
xor.b64 %rd95, %rd94, %rd62;
cvt.u32.u64 %r8, %rd95;
xor.b32 %r9, %r8, 534103459;
mul.lo.s32 %r10, %r9, -845247145;
xor.b32 %r11, %r10, %r7;
shr.u32 %r12, %r11, 9;
xor.b32 %r13, %r12, 4716963;
cvt.rn.f32.u32 %f1, %r13;
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h1, %f2;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p2, %h1, %h2;
// Load the four f32 inputs; scale the first one and zero it when %p2 is false.
mul.wide.u32 %rd96, %r5, 4;
add.s64 %rd97, %rd5, %rd96;
ld.global.nc.v4.f32 {%f3, %f4, %f5, %f6}, [%rd97];
cvt.rn.f16.f32 %h3, %f3;
mov.b16 %h4, 0x3C72;
mul.rn.f16 %h5, %h3, %h4;
selp.b16 %h6, %h5, 0x0000, %p2;
// %rd99 = output address in param_0 (f16, so idx * 2 bytes).
mul.wide.u32 %rd98, %r5, 2;
add.s64 %rd99, %rd6, %rd98;
// ---- element 1: second random word, input %f4 ----
xor.b64 %rd100, %rd84, %rd52;
xor.b64 %rd101, %rd100, 3041712726;
mul.lo.s64 %rd102, %rd101, 3528531795;
xor.b64 %rd103, %rd83, %rd102;
cvt.u32.u64 %r14, %rd103;
xor.b32 %r15, %r14, -616729560;
mul.lo.s32 %r16, %r15, -845247145;
shr.u32 %r17, %r16, 9;
cvt.rn.f32.u32 %f7, %r17;
mul.rn.f32 %f8, %f7, 0f34000000;
cvt.rn.f16.f32 %h7, %f8;
setp.ge.f16 %p3, %h7, %h2;
cvt.rn.f16.f32 %h8, %f4;
mul.rn.f16 %h9, %h8, %h4;
selp.b16 %h10, %h9, 0x0000, %p3;
// ---- element 2: third random word, input %f5 ----
and.b64 %rd104, %rd62, 4294967295;
xor.b64 %rd105, %rd104, %rd94;
xor.b64 %rd106, %rd105, 534103459;
mul.lo.s64 %rd107, %rd106, 3449720151;
shr.u64 %rd108, %rd107, 32;
and.b64 %rd109, %rd72, 4294967295;
xor.b64 %rd110, %rd109, %rd108;
xor.b64 %rd111, %rd110, 4055616968;
mul.lo.s64 %rd112, %rd111, 3528531795;
shr.u64 %rd113, %rd112, 32;
cvt.u32.u64 %r18, %rd113;
xor.b64 %rd114, %rd73, %rd78;
cvt.u32.u64 %r19, %rd114;
xor.b32 %r20, %r19, 1401181199;
mul.lo.s32 %r21, %r20, -766435501;
xor.b32 %r22, %r21, %r18;
shr.u32 %r23, %r22, 9;
xor.b32 %r24, %r23, 4936337;
cvt.rn.f32.u32 %f9, %r24;
mul.rn.f32 %f10, %f9, 0f34000000;
cvt.rn.f16.f32 %h11, %f10;
setp.ge.f16 %p4, %h11, %h2;
cvt.rn.f16.f32 %h12, %f5;
mul.rn.f16 %h13, %h12, %h4;
selp.b16 %h14, %h13, 0x0000, %p4;
// ---- element 3: fourth random word, input %f6 ----
xor.b64 %rd115, %rd63, %rd68;
xor.b64 %rd116, %rd115, 1684936478;
mul.lo.s64 %rd117, %rd116, 3449720151;
xor.b64 %rd118, %rd108, %rd117;
cvt.u32.u64 %r25, %rd118;
xor.b32 %r26, %r25, -239350328;
mul.lo.s32 %r27, %r26, -766435501;
shr.u32 %r28, %r27, 9;
cvt.rn.f32.u32 %f11, %r28;
mul.rn.f32 %f12, %f11, 0f34000000;
cvt.rn.f16.f32 %h15, %f12;
setp.ge.f16 %p5, %h15, %h2;
cvt.rn.f16.f32 %h16, %f6;
mul.rn.f16 %h17, %h16, %h4;
selp.b16 %h18, %h17, 0x0000, %p5;
// Write all four f16 results with one 8-byte vector store.
st.global.v4.b16 [%rd99], {%h6, %h10, %h14, %h18};
ret;
}
// .globl fusion_2701
// fusion_2701
//   Element-wise f32 -> f16 conversion (cvt.rn). Each thread converts 4
//   consecutive elements, so a 256-thread CTA covers 1024 elements starting
//   at ctaid.x * 1024.
//     param_0 : f32 source, read with one 16-byte vector load per thread
//     param_1 : f16 destination, written with one 8-byte vector store per thread
//     param_2 : not referenced by this kernel
.visible .entry fusion_2701(
.param .u64 fusion_2701_param_0,
.param .u64 fusion_2701_param_1,
.param .u64 fusion_2701_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %hv<4>;
    .reg .f32 %fv<4>;
    .reg .b32 %cta, %thread, %cta_base, %thread_base, %elem;
    .reg .b64 %src_generic, %dst_generic, %src, %dst;
    .reg .b64 %src_off, %dst_off, %src_ptr, %dst_ptr;

    ld.param.u64        %src_generic, [fusion_2701_param_0];
    ld.param.u64        %dst_generic, [fusion_2701_param_1];
    cvta.to.global.u64  %src, %src_generic;
    cvta.to.global.u64  %dst, %dst_generic;

    // elem = (ctaid.x << 10) | (tid.x << 2): index of this thread's first element.
    mov.u32             %thread, %tid.x;
    mov.u32             %cta, %ctaid.x;
    shl.b32             %thread_base, %thread, 2;
    shl.b32             %cta_base, %cta, 10;
    or.b32              %elem, %thread_base, %cta_base;

    // Byte offsets: 4 bytes per source element, 2 bytes per destination element.
    mul.wide.u32        %src_off, %elem, 4;
    mul.wide.u32        %dst_off, %elem, 2;
    add.s64             %src_ptr, %src, %src_off;
    add.s64             %dst_ptr, %dst, %dst_off;

    ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src_ptr];
    cvt.rn.f16.f32      %hv0, %fv0;
    cvt.rn.f16.f32      %hv1, %fv1;
    cvt.rn.f16.f32      %hv2, %fv2;
    cvt.rn.f16.f32      %hv3, %fv3;
    st.global.v4.b16    [%dst_ptr], {%hv0, %hv1, %hv2, %hv3};
    ret;
}
// .globl fusion_2197
// fusion_2197
//   Each thread produces 4 consecutive f16 results (256 threads -> 1024 per CTA),
//   starting at idx = (ctaid.x << 10) | (tid.x << 2). With
//       g   = ctaid.x >> 5
//       row = bits [6, 15) of idx            (bfe.u32 idx, 6, 9)
//       c_k = (tid.x*4 + k) & 63             for k = 0..3
//   the kernel computes, in f16 arithmetic,
//       out[idx+k] = in0[row*1024 + g*64 + c_k] + f16(in2[g*64 + c_k])
//   Buffers as used below:
//     param_0 : f16 input, rows 2048 bytes apart
//     param_1 : f16 output, one 4 x b16 vector store per thread
//     param_2 : f32 input, groups 256 bytes apart
//     param_3 : not referenced by this kernel
//   NOTE(review): this looks like a head-split transpose fused with a bias
//   add — confirm against the producing HLO.
.visible .entry fusion_2197(
.param .u64 fusion_2197_param_0,
.param .u64 fusion_2197_param_1,
.param .u64 fusion_2197_param_2,
.param .u64 fusion_2197_param_3
)
.reqntid 256, 1, 1
{
.reg .b16 %h<13>;
.reg .f32 %f<5>;
.reg .b32 %r<20>;
.reg .b64 %rd<29>;
// Load the buffer pointers and convert them to global-space addresses.
ld.param.u64 %rd1, [fusion_2197_param_0];
ld.param.u64 %rd2, [fusion_2197_param_2];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2197_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd1;
// %r5 = linear output index of this thread's first element.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
// %r6, %r9, %r11, %r13 = c_0..c_3; %r7 = g.
and.b32 %r6, %r4, 60;
shr.u32 %r7, %r1, 5;
or.b32 %r8, %r4, 1;
and.b32 %r9, %r8, 61;
or.b32 %r10, %r4, 2;
and.b32 %r11, %r10, 62;
or.b32 %r12, %r4, 3;
and.b32 %r13, %r12, 63;
// %r14 = row; %r15 = g * 64; %rd8 = start of the param_0 row.
bfe.u32 %r14, %r5, 6, 9;
shl.b32 %r15, %r7, 6;
or.b32 %r16, %r6, %r15;
mul.wide.u32 %rd7, %r14, 2048;
add.s64 %rd8, %rd6, %rd7;
// ---- element 0 ----
mul.wide.u32 %rd9, %r16, 2;
add.s64 %rd10, %rd8, %rd9;
ld.global.nc.b16 %h1, [%rd10];
// %rd12 = start of group g in param_2 (64 f32 values = 256 bytes per group).
mul.wide.u32 %rd11, %r7, 256;
add.s64 %rd12, %rd3, %rd11;
mul.wide.u32 %rd13, %r6, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.nc.f32 %f1, [%rd14];
cvt.rn.f16.f32 %h2, %f1;
add.rn.f16 %h3, %h1, %h2;
// %rd16 = output address in param_1 (f16, so idx * 2 bytes).
mul.wide.u32 %rd15, %r5, 2;
add.s64 %rd16, %rd5, %rd15;
// ---- element 1 ----
or.b32 %r17, %r9, %r15;
mul.wide.u32 %rd17, %r17, 2;
add.s64 %rd18, %rd8, %rd17;
ld.global.nc.b16 %h4, [%rd18];
mul.wide.u32 %rd19, %r9, 4;
add.s64 %rd20, %rd12, %rd19;
ld.global.nc.f32 %f2, [%rd20];
cvt.rn.f16.f32 %h5, %f2;
add.rn.f16 %h6, %h4, %h5;
// ---- element 2 ----
or.b32 %r18, %r11, %r15;
mul.wide.u32 %rd21, %r18, 2;
add.s64 %rd22, %rd8, %rd21;
ld.global.nc.b16 %h7, [%rd22];
mul.wide.u32 %rd23, %r11, 4;
add.s64 %rd24, %rd12, %rd23;
ld.global.nc.f32 %f3, [%rd24];
cvt.rn.f16.f32 %h8, %f3;
add.rn.f16 %h9, %h7, %h8;
// ---- element 3 ----
or.b32 %r19, %r13, %r15;
mul.wide.u32 %rd25, %r19, 2;
add.s64 %rd26, %rd8, %rd25;
ld.global.nc.b16 %h10, [%rd26];
mul.wide.u32 %rd27, %r13, 4;
add.s64 %rd28, %rd12, %rd27;
ld.global.nc.f32 %f4, [%rd28];
cvt.rn.f16.f32 %h11, %f4;
add.rn.f16 %h12, %h10, %h11;
// Write all four f16 results with one 8-byte vector store.
st.global.v4.b16 [%rd16], {%h3, %h6, %h9, %h12};
ret;
}
// .globl fusion_2196
// fusion_2196
//   Pure f16 data movement: each thread gathers 4 values from param_1 and
//   writes them as 4 consecutive elements of param_0 (256 threads -> 1024
//   elements per CTA, starting at ctaid.x * 1024).
//   Source address for element k of a thread (byte offsets from param_1):
//       (tid.x >> 4) * 65536  +  ctaid.x * 128  +  ((tid.x*4 + k) & 63) * 2
//     param_0 : f16 destination, one 8-byte vector store per thread
//     param_1 : f16 source, four scalar 2-byte loads per thread
//     param_2 : not referenced by this kernel
//   NOTE(review): the indexing looks like a swap of the two leading
//   dimensions of a [16][N][64] array — confirm against the producing HLO.
.visible .entry fusion_2196(
.param .u64 fusion_2196_param_0,
.param .u64 fusion_2196_param_1,
.param .u64 fusion_2196_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %v<4>;
    .reg .b32 %cta, %thread, %cta_base, %quad, %out_elem;
    .reg .b32 %quad1, %quad2, %col0, %col1, %col2, %slab;
    .reg .b64 %out_generic, %in_generic, %in, %out;
    .reg .b64 %slab_off, %slab_ptr, %row_off, %row_ptr;
    .reg .b64 %col0_off, %col1_off, %col2_off, %src0, %src1, %src2;
    .reg .b64 %out_off, %out_ptr;

    ld.param.u64        %out_generic, [fusion_2196_param_0];
    ld.param.u64        %in_generic, [fusion_2196_param_1];
    cvta.to.global.u64  %in, %in_generic;
    cvta.to.global.u64  %out, %out_generic;

    // out_elem = (ctaid.x << 10) | (tid.x << 2): first destination element.
    mov.u32             %cta, %ctaid.x;
    mov.u32             %thread, %tid.x;
    shl.b32             %cta_base, %cta, 10;
    shl.b32             %quad, %thread, 2;
    or.b32              %out_elem, %quad, %cta_base;

    // Source row: 65536-byte slab chosen by tid.x / 16, then a 128-byte row
    // chosen by ctaid.x.
    shr.u32             %slab, %thread, 4;
    mul.wide.u32        %slab_off, %slab, 65536;
    add.s64             %slab_ptr, %in, %slab_off;
    mul.wide.u32        %row_off, %cta, 128;
    add.s64             %row_ptr, %slab_ptr, %row_off;

    // Column indices of elements 0..2 inside the 64-element row.
    and.b32             %col0, %quad, 60;
    or.b32              %quad1, %quad, 1;
    and.b32             %col1, %quad1, 61;
    or.b32              %quad2, %quad, 2;
    and.b32             %col2, %quad2, 62;

    mul.wide.u32        %col0_off, %col0, 2;
    add.s64             %src0, %row_ptr, %col0_off;
    mul.wide.u32        %col1_off, %col1, 2;
    add.s64             %src1, %row_ptr, %col1_off;
    mul.wide.u32        %col2_off, %col2, 2;
    add.s64             %src2, %row_ptr, %col2_off;

    // Element 3 sits 6 bytes (three f16 values) after element 0.
    ld.global.nc.b16    %v0, [%src0];
    ld.global.nc.b16    %v1, [%src1];
    ld.global.nc.b16    %v2, [%src2];
    ld.global.nc.b16    %v3, [%src0+6];

    mul.wide.u32        %out_off, %out_elem, 2;
    add.s64             %out_ptr, %out, %out_off;
    st.global.v4.b16    [%out_ptr], {%v0, %v1, %v2, %v3};
    ret;
}
// .globl fusion_2700
// fusion_2700
//   Element-wise f32 -> f16 conversion (cvt.rn). Each thread converts 4
//   consecutive elements, so a 256-thread CTA covers 1024 elements starting
//   at ctaid.x * 1024.
//     param_0 : f32 source, read with one 16-byte vector load per thread
//     param_1 : f16 destination, written with one 8-byte vector store per thread
//     param_2 : not referenced by this kernel
.visible .entry fusion_2700(
.param .u64 fusion_2700_param_0,
.param .u64 fusion_2700_param_1,
.param .u64 fusion_2700_param_2
)
.reqntid 256, 1, 1
{
    .reg .b16 %hv<4>;
    .reg .f32 %fv<4>;
    .reg .b32 %cta, %thread, %cta_base, %thread_base, %elem;
    .reg .b64 %src_generic, %dst_generic, %src, %dst;
    .reg .b64 %src_off, %dst_off, %src_ptr, %dst_ptr;

    ld.param.u64        %src_generic, [fusion_2700_param_0];
    ld.param.u64        %dst_generic, [fusion_2700_param_1];
    cvta.to.global.u64  %src, %src_generic;
    cvta.to.global.u64  %dst, %dst_generic;

    // elem = (ctaid.x << 10) | (tid.x << 2): index of this thread's first element.
    mov.u32             %thread, %tid.x;
    mov.u32             %cta, %ctaid.x;
    shl.b32             %thread_base, %thread, 2;
    shl.b32             %cta_base, %cta, 10;
    or.b32              %elem, %thread_base, %cta_base;

    // Byte offsets: 4 bytes per source element, 2 bytes per destination element.
    mul.wide.u32        %src_off, %elem, 4;
    mul.wide.u32        %dst_off, %elem, 2;
    add.s64             %src_ptr, %src, %src_off;
    add.s64             %dst_ptr, %dst, %dst_off;

    ld.global.nc.v4.f32 {%fv0, %fv1, %fv2, %fv3}, [%src_ptr];
    cvt.rn.f16.f32      %hv0, %fv0;
    cvt.rn.f16.f32      %hv1, %fv1;
    cvt.rn.f16.f32      %hv2, %fv2;
    cvt.rn.f16.f32      %hv3, %fv3;
    st.global.v4.b16    [%dst_ptr], {%hv0, %hv1, %hv2, %hv3};
    ret;
}
// .globl rng_get_and_update_state_37
// rng_get_and_update_state_37
//   Copies the current 128-bit value of the global `rng_state` to the buffer
//   named by param_0, then advances `rng_state` by 524288 (2^19).
//     param_0 : 16-byte destination; receives the value rng_state held
//               *before* the increment (low u64 at +0, high u64 at +8)
//     param_1 : not referenced by this kernel
//   The update is a plain load / add / store sequence with no atomics.
//   NOTE(review): presumably launched with a single thread — confirm at the
//   launch site.
.visible .entry rng_get_and_update_state_37(
.param .u64 rng_get_and_update_state_37_param_0,
.param .u64 rng_get_and_update_state_37_param_1
)
{
    .reg .pred %wrapped, %below_step;
    .reg .b64 %dst_generic, %dst;
    .reg .b64 %old_lo, %old_hi, %new_lo, %new_hi;
    .reg .b64 %carry_wrap, %carry;

    ld.param.u64        %dst_generic, [rng_get_and_update_state_37_param_0];
    cvta.to.global.u64  %dst, %dst_generic;

    // Snapshot the state: high word first, then low word.
    ld.global.u64       %old_hi, [rng_state+8];
    ld.global.u64       %old_lo, [rng_state];

    // Low word += 2^19. A carry into the high word is signalled when the sum
    // is smaller than either addend; both comparisons feed the select chain.
    add.s64             %new_lo, %old_lo, 524288;
    setp.lt.u64         %below_step, %new_lo, 524288;
    setp.lt.u64         %wrapped, %new_lo, %old_lo;
    selp.u64            %carry_wrap, 1, 0, %wrapped;
    selp.b64            %carry, 1, %carry_wrap, %below_step;
    add.s64             %new_hi, %old_hi, %carry;

    // Publish the advanced state, then hand the old state to the caller.
    st.global.u64       [rng_state], %new_lo;
    st.global.u64       [rng_state+8], %new_hi;
    st.global.u64       [%dst+8], %old_hi;
    st.global.u64       [%dst], %old_lo;
    ret;
}
// .globl fusion_2195
// fusion_2195
//   Each thread produces 4 consecutive f16 results (256 threads -> 1024 per CTA),
//   starting at idx = (ctaid.x << 10) | (tid.x << 2). For k = 0..3, in f16 math:
//       t_k   = (in2[idx+k] + f16(in4[tid.x*4 + k])) * 0x3C72
//       u_k   = f16( (23-bit pseudo-random integer) * 2^-23 )
//       out_k = in1[idx+k] + ((u_k >= 0x2E66) ? t_k : 0)
//   where 0x2E66 is f16 ~0.09998 and 0x3C72 is f16 ~1.1113.
//   Buffers as used below:
//     param_0 : f16 output, one 4 x b16 vector store per thread
//     param_1 : f16 input added after the select (4 x b16 vector load)
//     param_2 : f16 input that is biased, scaled and conditionally zeroed
//     param_3 : two u64 words (128-bit RNG state), read-only here
//     param_4 : f32 input indexed by tid.x*4 + k only (independent of ctaid.x)
//     param_5 : not referenced by this kernel
//   NOTE(review): this looks like bias-add + dropout(rate 0.1) + residual add,
//   with random bits from a Philox4x32-style generator (multipliers
//   3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57) — confirm against
//   the producing HLO.
.visible .entry fusion_2195(
.param .u64 fusion_2195_param_0,
.param .u64 fusion_2195_param_1,
.param .u64 fusion_2195_param_2,
.param .u64 fusion_2195_param_3,
.param .u64 fusion_2195_param_4,
.param .u64 fusion_2195_param_5
)
.reqntid 256, 1, 1
{
.reg .pred %p<6>;
.reg .b16 %h<43>;
.reg .b32 %hh<5>;
.reg .f32 %f<13>;
.reg .b32 %r<31>;
.reg .b64 %rd<129>;
// Load the buffer pointers and convert them to global-space addresses.
ld.param.u64 %rd1, [fusion_2195_param_0];
ld.param.u64 %rd2, [fusion_2195_param_4];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2195_param_1];
ld.param.u64 %rd5, [fusion_2195_param_3];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2195_param_2];
cvta.to.global.u64 %rd8, %rd7;
cvta.to.global.u64 %rd9, %rd4;
cvta.to.global.u64 %rd10, %rd1;
// %r5 = linear element index of this thread's first element.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
// Load the four param_1 values; they end up in %h5..%h8.
// %rd11 (idx * 2 bytes) is reused for the param_2 load and the param_0 store.
mul.wide.u32 %rd11, %r5, 2;
add.s64 %rd12, %rd9, %rd11;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd12];
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// Read the 128-bit state {%rd13 = low, %rd14 = high} and add this thread's
// group index (idx / 4) to it, propagating the carry into the high word.
shr.u32 %r8, %r5, 2;
ld.global.nc.v2.u64 {%rd13, %rd14}, [%rd6];
cvt.u64.u32 %rd15, %r8;
add.s64 %rd16, %rd13, %rd15;
setp.lt.u64 %p1, %rd16, %rd13;
// Mixing rounds: 32-bit halves are multiplied by the two multipliers in
// 64-bit arithmetic, the products are split into high/low 32-bit parts
// (shr 32 / and 0xFFFFFFFF) and xor-ed with per-round constants.
and.b64 %rd17, %rd16, 4294967295;
mul.lo.s64 %rd18, %rd17, 3528531795;
selp.u64 %rd19, 1, 0, %p1;
add.s64 %rd20, %rd14, %rd19;
xor.b64 %rd21, %rd20, %rd18;
shr.u64 %rd22, %rd21, 32;
mul.lo.s64 %rd23, %rd22, 3449720151;
shr.u64 %rd24, %rd23, 32;
and.b64 %rd25, %rd20, 4294967295;
mul.lo.s64 %rd26, %rd25, 3449720151;
and.b64 %rd27, %rd26, 4294967295;
xor.b64 %rd28, %rd27, %rd24;
xor.b64 %rd29, %rd28, 2654435769;
mul.lo.s64 %rd30, %rd29, 3528531795;
shr.u64 %rd31, %rd30, 32;
xor.b64 %rd32, %rd26, %rd16;
shr.u64 %rd33, %rd32, 32;
mul.lo.s64 %rd34, %rd33, 3528531795;
and.b64 %rd35, %rd34, 4294967295;
xor.b64 %rd36, %rd35, %rd31;
xor.b64 %rd37, %rd36, 1993301258;
mul.lo.s64 %rd38, %rd37, 3449720151;
shr.u64 %rd39, %rd38, 32;
shr.u64 %rd40, %rd34, 32;
and.b64 %rd41, %rd18, 4294967295;
xor.b64 %rd42, %rd41, %rd40;
xor.b64 %rd43, %rd42, 3144134277;
mul.lo.s64 %rd44, %rd43, 3449720151;
and.b64 %rd45, %rd44, 4294967295;
xor.b64 %rd46, %rd45, %rd39;
xor.b64 %rd47, %rd46, 3668340011;
mul.lo.s64 %rd48, %rd47, 3528531795;
shr.u64 %rd49, %rd48, 32;
shr.u64 %rd50, %rd44, 32;
and.b64 %rd51, %rd23, 4294967295;
xor.b64 %rd52, %rd51, %rd50;
xor.b64 %rd53, %rd52, 1013904242;
mul.lo.s64 %rd54, %rd53, 3528531795;
and.b64 %rd55, %rd54, 4294967295;
xor.b64 %rd56, %rd55, %rd49;
xor.b64 %rd57, %rd56, 3986602516;
mul.lo.s64 %rd58, %rd57, 3449720151;
shr.u64 %rd59, %rd58, 32;
shr.u64 %rd60, %rd54, 32;
and.b64 %rd61, %rd30, 4294967295;
xor.b64 %rd62, %rd61, %rd60;
xor.b64 %rd63, %rd62, 842468239;
mul.lo.s64 %rd64, %rd63, 3449720151;
and.b64 %rd65, %rd64, 4294967295;
xor.b64 %rd66, %rd65, %rd59;
xor.b64 %rd67, %rd66, 387276957;
mul.lo.s64 %rd68, %rd67, 3528531795;
shr.u64 %rd69, %rd68, 32;
shr.u64 %rd70, %rd64, 32;
and.b64 %rd71, %rd38, 4294967295;
xor.b64 %rd72, %rd71, %rd70;
xor.b64 %rd73, %rd72, 2027808484;
mul.lo.s64 %rd74, %rd73, 3528531795;
and.b64 %rd75, %rd74, 4294967295;
shr.u64 %rd76, %rd74, 32;
and.b64 %rd77, %rd48, 4294967295;
xor.b64 %rd78, %rd77, %rd76;
xor.b64 %rd79, %rd78, 2835769497;
mul.lo.s64 %rd80, %rd79, 3449720151;
and.b64 %rd81, %rd80, 4294967295;
shr.u64 %rd82, %rd80, 32;
and.b64 %rd83, %rd58, 4294967295;
xor.b64 %rd84, %rd83, %rd82;
xor.b64 %rd85, %rd84, 3041712726;
mul.lo.s64 %rd86, %rd85, 3528531795;
and.b64 %rd87, %rd86, 4294967295;
xor.b64 %rd88, %rd75, %rd69;
xor.b64 %rd89, %rd88, 1684936478;
mul.lo.s64 %rd90, %rd89, 3449720151;
shr.u64 %rd91, %rd90, 32;
xor.b64 %rd92, %rd81, %rd91;
xor.b64 %rd93, %rd92, 1401181199;
mul.lo.s64 %rd94, %rd93, 3528531795;
shr.u64 %rd95, %rd94, 32;
xor.b64 %rd96, %rd87, %rd95;
xor.b64 %rd97, %rd96, 3678237736;
mul.lo.s64 %rd98, %rd97, 3449720151;
shr.u64 %rd99, %rd98, 32;
// ---- element 0 ----
// Final 32-bit random word -> keep the top 23 bits (shr 9) -> scale by
// 2^-23 (0f34000000) -> round to f16 -> compare against the threshold.
cvt.u32.u64 %r9, %rd99;
shr.u64 %rd100, %rd86, 32;
xor.b64 %rd101, %rd100, %rd68;
cvt.u32.u64 %r10, %rd101;
xor.b32 %r11, %r10, 534103459;
mul.lo.s32 %r12, %r11, -845247145;
xor.b32 %r13, %r12, %r9;
shr.u32 %r14, %r13, 9;
xor.b32 %r15, %r14, 4716963;
cvt.rn.f32.u32 %f1, %r15;
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h9, %f2;
mov.b16 %h10, 0x2E66;
setp.ge.f16 %p2, %h9, %h10;
// Load the four param_2 values; they end up in %h15..%h18.
add.s64 %rd102, %rd8, %rd11;
ld.global.nc.v4.b16 {%h11, %h12, %h13, %h14}, [%rd102];
mov.b32 %hh3, {%h13, %h14};
mov.b32 %hh4, {%h11, %h12};
mov.b32 {%h15, %h16}, %hh4;
mov.b32 {%h17, %h18}, %hh3;
// Add the f32 value from param_4 (rounded to f16), scale, select, then add
// the param_1 value.
mul.wide.u32 %rd103, %r4, 4;
add.s64 %rd104, %rd3, %rd103;
ld.global.nc.f32 %f3, [%rd104];
cvt.rn.f16.f32 %h19, %f3;
add.rn.f16 %h20, %h15, %h19;
mov.b16 %h21, 0x3C72;
mul.rn.f16 %h22, %h20, %h21;
selp.b16 %h23, %h22, 0x0000, %p2;
add.rn.f16 %h24, %h5, %h23;
// %rd105 = output address in param_0.
add.s64 %rd105, %rd10, %rd11;
// ---- element 1: second random word ----
xor.b64 %rd106, %rd58, %rd82;
xor.b64 %rd107, %rd106, 3041712726;
mul.lo.s64 %rd108, %rd107, 3528531795;
xor.b64 %rd109, %rd95, %rd108;
cvt.u32.u64 %r16, %rd109;
xor.b32 %r17, %r16, -616729560;
mul.lo.s32 %r18, %r17, -845247145;
shr.u32 %r19, %r18, 9;
cvt.rn.f32.u32 %f4, %r19;
mul.rn.f32 %f5, %f4, 0f34000000;
cvt.rn.f16.f32 %h25, %f5;
setp.ge.f16 %p3, %h25, %h10;
mul.wide.u32 %rd110, %r6, 4;
add.s64 %rd111, %rd3, %rd110;
ld.global.nc.f32 %f6, [%rd111];
cvt.rn.f16.f32 %h26, %f6;
add.rn.f16 %h27, %h16, %h26;
mul.rn.f16 %h28, %h27, %h21;
selp.b16 %h29, %h28, 0x0000, %p3;
add.rn.f16 %h30, %h6, %h29;
// ---- element 2: third random word ----
and.b64 %rd112, %rd90, 4294967295;
and.b64 %rd113, %rd68, 4294967295;
xor.b64 %rd114, %rd113, %rd100;
xor.b64 %rd115, %rd114, 534103459;
mul.lo.s64 %rd116, %rd115, 3449720151;
shr.u64 %rd117, %rd116, 32;
xor.b64 %rd118, %rd112, %rd117;
xor.b64 %rd119, %rd118, 4055616968;
mul.lo.s64 %rd120, %rd119, 3528531795;
shr.u64 %rd121, %rd120, 32;
cvt.u32.u64 %r20, %rd121;
xor.b64 %rd122, %rd91, %rd80;
cvt.u32.u64 %r21, %rd122;
xor.b32 %r22, %r21, 1401181199;
mul.lo.s32 %r23, %r22, -766435501;
xor.b32 %r24, %r23, %r20;
shr.u32 %r25, %r24, 9;
xor.b32 %r26, %r25, 4936337;
cvt.rn.f32.u32 %f7, %r26;
mul.rn.f32 %f8, %f7, 0f34000000;
cvt.rn.f16.f32 %h31, %f8;
setp.ge.f16 %p4, %h31, %h10;
mul.wide.u32 %rd123, %r7, 4;
add.s64 %rd124, %rd3, %rd123;
ld.global.nc.f32 %f9, [%rd124];
cvt.rn.f16.f32 %h32, %f9;
add.rn.f16 %h33, %h17, %h32;
mul.rn.f16 %h34, %h33, %h21;
selp.b16 %h35, %h34, 0x0000, %p4;
add.rn.f16 %h36, %h7, %h35;
// ---- element 3: fourth random word ----
xor.b64 %rd125, %rd69, %rd74;
xor.b64 %rd126, %rd125, 1684936478;
mul.lo.s64 %rd127, %rd126, 3449720151;
xor.b64 %rd128, %rd117, %rd127;
cvt.u32.u64 %r27, %rd128;
xor.b32 %r28, %r27, -239350328;
mul.lo.s32 %r29, %r28, -766435501;
shr.u32 %r30, %r29, 9;
cvt.rn.f32.u32 %f10, %r30;
mul.rn.f32 %f11, %f10, 0f34000000;
cvt.rn.f16.f32 %h37, %f11;
setp.ge.f16 %p5, %h37, %h10;
// The fourth param_4 value sits 12 bytes (three f32 values) after the first.
ld.global.nc.f32 %f12, [%rd104+12];
cvt.rn.f16.f32 %h38, %f12;
add.rn.f16 %h39, %h18, %h38;
mul.rn.f16 %h40, %h39, %h21;
selp.b16 %h41, %h40, 0x0000, %p5;
add.rn.f16 %h42, %h8, %h41;
// Write all four f16 results with one 8-byte vector store.
st.global.v4.b16 [%rd105], {%h24, %h30, %h36, %h42};
ret;
}
// .globl fusion_2194
// fusion_2194
//   Sums 1024 f16 values per CTA in f32 and atomically adds the total to
//   param_1[ctaid.x].
//     param_0 : f16 input; CTA b reads the 2048 bytes starting at b * 2048
//     param_1 : f32 accumulator, one slot per CTA, updated with atom.add
//     param_2 : not referenced by this kernel
//   Structure (64 threads = 2 warps per CTA):
//     1. each thread accumulates 16 values (8 loads of an f16 pair, 256 bytes apart)
//     2. shuffle-down reduction inside each warp
//     3. lane 0 of each warp stores its warp total to shared_cache_016[warp]
//     4. after the barrier, warp 0 reduces the per-warp totals the same way
//     5. thread 0 adds the final value to param_1[ctaid.x]
//   NOTE(review): param_1 is only ever added to here, so it presumably has to
//   be zeroed by an earlier kernel — confirm at the call site.
.visible .entry fusion_2194(
.param .u64 fusion_2194_param_0,
.param .u64 fusion_2194_param_1,
.param .u64 fusion_2194_param_2
)
.reqntid 64, 1, 1
{
.local .align 4 .b8 __local_depot72[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b16 %h<17>;
.reg .b32 %hh<9>;
.reg .f32 %f<56>;
.reg .b32 %r<7>;
.reg .b64 %rd<22>;
// %SPL = local-space address of the 4-byte scratch slot, %SP = its generic address.
mov.u64 %SPL, __local_depot72;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd5, [fusion_2194_param_0];
cvta.to.global.u64 %rd8, %rd5;
mov.u32 %r1, %tid.x;
mov.u32 %r4, %ctaid.x;
// %rd13 = param_0 + ctaid.x * 2048 + tid.x * 4: this thread's first f16 pair.
shl.b32 %r5, %r1, 1;
mul.wide.u32 %rd10, %r4, 2048;
add.s64 %rd11, %rd8, %rd10;
mul.wide.u32 %rd12, %r5, 2;
add.s64 %rd13, %rd11, %rd12;
// Per-thread accumulation: widen each f16 to f32 and add sequentially,
// starting from 0.0.
ld.global.nc.b32 %hh1, [%rd13];
mov.b32 {%h1, %h2}, %hh1;
cvt.f32.f16 %f2, %h1;
add.rn.f32 %f3, %f2, 0f00000000;
cvt.f32.f16 %f4, %h2;
add.rn.f32 %f5, %f3, %f4;
ld.global.nc.b32 %hh2, [%rd13+256];
mov.b32 {%h3, %h4}, %hh2;
cvt.f32.f16 %f6, %h3;
add.rn.f32 %f7, %f5, %f6;
cvt.f32.f16 %f8, %h4;
add.rn.f32 %f9, %f7, %f8;
ld.global.nc.b32 %hh3, [%rd13+512];
mov.b32 {%h5, %h6}, %hh3;
cvt.f32.f16 %f10, %h5;
add.rn.f32 %f11, %f9, %f10;
cvt.f32.f16 %f12, %h6;
add.rn.f32 %f13, %f11, %f12;
ld.global.nc.b32 %hh4, [%rd13+768];
mov.b32 {%h7, %h8}, %hh4;
cvt.f32.f16 %f14, %h7;
add.rn.f32 %f15, %f13, %f14;
cvt.f32.f16 %f16, %h8;
add.rn.f32 %f17, %f15, %f16;
ld.global.nc.b32 %hh5, [%rd13+1024];
mov.b32 {%h9, %h10}, %hh5;
cvt.f32.f16 %f18, %h9;
add.rn.f32 %f19, %f17, %f18;
cvt.f32.f16 %f20, %h10;
add.rn.f32 %f21, %f19, %f20;
ld.global.nc.b32 %hh6, [%rd13+1280];
mov.b32 {%h11, %h12}, %hh6;
cvt.f32.f16 %f22, %h11;
add.rn.f32 %f23, %f21, %f22;
cvt.f32.f16 %f24, %h12;
add.rn.f32 %f25, %f23, %f24;
ld.global.nc.b32 %hh7, [%rd13+1536];
mov.b32 {%h13, %h14}, %hh7;
cvt.f32.f16 %f26, %h13;
add.rn.f32 %f27, %f25, %f26;
cvt.f32.f16 %f28, %h14;
add.rn.f32 %f29, %f27, %f28;
ld.global.nc.b32 %hh8, [%rd13+1792];
mov.b32 {%h15, %h16}, %hh8;
cvt.f32.f16 %f30, %h15;
add.rn.f32 %f31, %f29, %f30;
cvt.f32.f16 %f32, %h16;
add.rn.f32 %f33, %f31, %f32;
// Intra-warp reduction: %r2 = lane (tid.x & 31); shuffle down by 16, 8, 4, 2, 1
// with a full member mask. The last add is deferred to the lane-0 branch below.
and.b32 %r2, %r1, 31;
shfl.sync.down.b32 %f34, %f33, 16, 31, -1;
add.rn.f32 %f35, %f34, %f33;
shfl.sync.down.b32 %f36, %f35, 8, 31, -1;
add.rn.f32 %f37, %f36, %f35;
shfl.sync.down.b32 %f38, %f37, 4, 31, -1;
add.rn.f32 %f39, %f38, %f37;
shfl.sync.down.b32 %f40, %f39, 2, 31, -1;
add.rn.f32 %f41, %f40, %f39;
shfl.sync.down.b32 %f42, %f41, 1, 31, -1;
// %r3 = warp index (tid.x >> 5); only lane 0 of each warp takes LBB72_3.
shr.u32 %r3, %r1, 5;
setp.eq.s32 %p1, %r2, 0;
mov.u64 %rd16, shared_cache_016;
@%p1 bra LBB72_3;
bra.uni LBB72_1;
LBB72_3:
// Lane 0: finish the warp sum and publish it to shared_cache_016[warp].
mul.wide.u32 %rd15, %r3, 4;
add.s64 %rd3, %rd16, %rd15;
add.rn.f32 %f1, %f42, %f41;
st.shared.f32 [%rd3], %f1;
LBB72_1:
// All threads wait here so both warp totals are visible in shared memory.
bar.sync 0;
// Only warp 0 continues; every other thread returns.
setp.eq.s32 %p2, %r3, 0;
@%p2 bra LBB72_4;
bra.uni LBB72_2;
LBB72_4:
add.u64 %rd9, %SP, 0;
add.u64 %rd1, %SPL, 0;
mul.wide.u32 %rd17, %r2, 4;
add.s64 %rd4, %rd16, %rd17;
cvta.shared.u64 %rd19, %rd4;
// Zero the local scratch slot; lanes without a warp total read this 0.0.
mov.u32 %r6, 0;
st.local.u32 [%rd1], %r6;
// Threads 0 and 1 read shared_cache_016[lane]; all other lanes read the
// zeroed local slot (both through a generic address).
setp.lt.u32 %p3, %r1, 2;
selp.b64 %rd21, %rd19, %rd9, %p3;
ld.f32 %f43, [%rd21];
// Second shuffle-down reduction, this time over the per-warp totals.
shfl.sync.down.b32 %f44, %f43, 16, 31, -1;
add.rn.f32 %f45, %f43, %f44;
shfl.sync.down.b32 %f46, %f45, 8, 31, -1;
add.rn.f32 %f47, %f45, %f46;
shfl.sync.down.b32 %f48, %f47, 4, 31, -1;
add.rn.f32 %f49, %f47, %f48;
shfl.sync.down.b32 %f50, %f49, 2, 31, -1;
add.rn.f32 %f51, %f49, %f50;
shfl.sync.down.b32 %f52, %f51, 1, 31, -1;
add.rn.f32 %f53, %f51, %f52;
// Each lane writes its result back to the location it read from; for
// thread 0 that is shared_cache_016[0], which then holds the CTA total.
st.f32 [%rd21], %f53;
setp.ne.s32 %p4, %r1, 0;
@%p4 bra LBB72_2;
// Thread 0 only: accumulate the CTA total into param_1[ctaid.x].
ld.param.u64 %rd6, [fusion_2194_param_1];
cvta.to.global.u64 %rd7, %rd6;
mul.wide.u32 %rd14, %r4, 4;
add.s64 %rd2, %rd7, %rd14;
ld.shared.f32 %f54, [%rd4];
atom.global.add.f32 %f55, [%rd2], %f54;
LBB72_2:
ret;
}
// .globl fusion_2191
//
// fusion_2191: per-CTA sum of squared deviations, atomically accumulated.
//
// Launch shape: 64 threads per CTA (.reqntid 64), i.e. two warps.
// For CTA index c = %ctaid.x the kernel reads 2048 bytes (1024 f16 values)
// starting at param_0 + c*2048, subtracts m = param_2[c] * 2^-10 from each
// value, squares the difference in f32 and sums all 1024 squares.
// The CTA-wide sum is added with atom.global.add.f32 into param_1[c].
//
// Parameters (as used by the code below):
//   param_0  f16 input, read through the non-coherent path (ld.global.nc)
//   param_1  f32 accumulator, updated atomically by thread 0 only
//   param_2  f32 per-CTA value, scaled by 2^-10 before use
//            (presumably a row sum being turned into a mean -- confirm
//            against the producer of this buffer)
//   param_3  never read in this kernel
.visible .entry fusion_2191(
.param .u64 fusion_2191_param_0,
.param .u64 fusion_2191_param_1,
.param .u64 fusion_2191_param_2,
.param .u64 fusion_2191_param_3
)
.reqntid 64, 1, 1
{
// 4-byte local slot; zeroed later and used as the "neutral element" source
// for lanes that have no per-warp partial to read in the second stage.
.local .align 4 .b8 __local_depot73[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b16 %h<17>;
.reg .b32 %hh<9>;
.reg .f32 %f<90>;
.reg .b32 %r<7>;
.reg .b64 %rd<25>;
mov.u64 %SPL, __local_depot73;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd5, [fusion_2191_param_0];
ld.param.u64 %rd6, [fusion_2191_param_2];
cvta.to.global.u64 %rd7, %rd6;
cvta.to.global.u64 %rd10, %rd5;
// %r1 = thread index, %r4 = CTA index.
mov.u32 %r1, %tid.x;
mov.u32 %r4, %ctaid.x;
// Each thread handles element pairs: element index = 2*tid, byte offset = 4*tid.
shl.b32 %r5, %r1, 1;
mul.wide.u32 %rd12, %r4, 2048;
add.s64 %rd13, %rd10, %rd12;
mul.wide.u32 %rd14, %r5, 2;
add.s64 %rd15, %rd13, %rd14;
// %rd15 = this thread's first packed f16 pair; later pairs are at +256*k bytes
// (64 threads * 4 bytes = 256 bytes per sweep, 8 sweeps = 2048 bytes).
ld.global.nc.b32 %hh1, [%rd15];
mov.b32 {%h1, %h2}, %hh1;
cvt.f32.f16 %f2, %h1;
// %rd16 = c*4 is reused at the end to address param_1[c].
mul.wide.u32 %rd16, %r4, 4;
add.s64 %rd17, %rd7, %rd16;
ld.global.nc.f32 %f3, [%rd17];
// 0f3A800000 is 2^-10 (1/1024); %f4 is the value subtracted from every element.
mul.rn.f32 %f4, %f3, 0f3A800000;
// Accumulation pattern repeated for all 16 elements:
//   d = x - %f4 ; acc = acc + d*d   (accumulator starts from +0.0)
sub.rn.f32 %f5, %f2, %f4;
mul.rn.f32 %f6, %f5, %f5;
add.rn.f32 %f7, %f6, 0f00000000;
cvt.f32.f16 %f8, %h2;
sub.rn.f32 %f9, %f8, %f4;
mul.rn.f32 %f10, %f9, %f9;
add.rn.f32 %f11, %f7, %f10;
// Pair 2 (+256 bytes).
ld.global.nc.b32 %hh2, [%rd15+256];
mov.b32 {%h3, %h4}, %hh2;
cvt.f32.f16 %f12, %h3;
sub.rn.f32 %f13, %f12, %f4;
mul.rn.f32 %f14, %f13, %f13;
add.rn.f32 %f15, %f11, %f14;
cvt.f32.f16 %f16, %h4;
sub.rn.f32 %f17, %f16, %f4;
mul.rn.f32 %f18, %f17, %f17;
add.rn.f32 %f19, %f15, %f18;
// Pair 3 (+512 bytes).
ld.global.nc.b32 %hh3, [%rd15+512];
mov.b32 {%h5, %h6}, %hh3;
cvt.f32.f16 %f20, %h5;
sub.rn.f32 %f21, %f20, %f4;
mul.rn.f32 %f22, %f21, %f21;
add.rn.f32 %f23, %f19, %f22;
cvt.f32.f16 %f24, %h6;
sub.rn.f32 %f25, %f24, %f4;
mul.rn.f32 %f26, %f25, %f25;
add.rn.f32 %f27, %f23, %f26;
// Pair 4 (+768 bytes).
ld.global.nc.b32 %hh4, [%rd15+768];
mov.b32 {%h7, %h8}, %hh4;
cvt.f32.f16 %f28, %h7;
sub.rn.f32 %f29, %f28, %f4;
mul.rn.f32 %f30, %f29, %f29;
add.rn.f32 %f31, %f27, %f30;
cvt.f32.f16 %f32, %h8;
sub.rn.f32 %f33, %f32, %f4;
mul.rn.f32 %f34, %f33, %f33;
add.rn.f32 %f35, %f31, %f34;
// Pair 5 (+1024 bytes).
ld.global.nc.b32 %hh5, [%rd15+1024];
mov.b32 {%h9, %h10}, %hh5;
cvt.f32.f16 %f36, %h9;
sub.rn.f32 %f37, %f36, %f4;
mul.rn.f32 %f38, %f37, %f37;
add.rn.f32 %f39, %f35, %f38;
cvt.f32.f16 %f40, %h10;
sub.rn.f32 %f41, %f40, %f4;
mul.rn.f32 %f42, %f41, %f41;
add.rn.f32 %f43, %f39, %f42;
// Pair 6 (+1280 bytes).
ld.global.nc.b32 %hh6, [%rd15+1280];
mov.b32 {%h11, %h12}, %hh6;
cvt.f32.f16 %f44, %h11;
sub.rn.f32 %f45, %f44, %f4;
mul.rn.f32 %f46, %f45, %f45;
add.rn.f32 %f47, %f43, %f46;
cvt.f32.f16 %f48, %h12;
sub.rn.f32 %f49, %f48, %f4;
mul.rn.f32 %f50, %f49, %f49;
add.rn.f32 %f51, %f47, %f50;
// Pair 7 (+1536 bytes).
ld.global.nc.b32 %hh7, [%rd15+1536];
mov.b32 {%h13, %h14}, %hh7;
cvt.f32.f16 %f52, %h13;
sub.rn.f32 %f53, %f52, %f4;
mul.rn.f32 %f54, %f53, %f53;
add.rn.f32 %f55, %f51, %f54;
cvt.f32.f16 %f56, %h14;
sub.rn.f32 %f57, %f56, %f4;
mul.rn.f32 %f58, %f57, %f57;
add.rn.f32 %f59, %f55, %f58;
// Pair 8 (+1792 bytes).
ld.global.nc.b32 %hh8, [%rd15+1792];
mov.b32 {%h15, %h16}, %hh8;
cvt.f32.f16 %f60, %h15;
sub.rn.f32 %f61, %f60, %f4;
mul.rn.f32 %f62, %f61, %f61;
add.rn.f32 %f63, %f59, %f62;
cvt.f32.f16 %f64, %h16;
sub.rn.f32 %f65, %f64, %f4;
mul.rn.f32 %f66, %f65, %f65;
add.rn.f32 %f67, %f63, %f66;
// Stage 1: intra-warp reduction of the per-thread sum %f67 using
// shuffle-down by 16, 8, 4, 2, 1 lanes with a full member mask (-1).
// %r2 = lane id (tid & 31).
and.b32 %r2, %r1, 31;
shfl.sync.down.b32 %f68, %f67, 16, 31, -1;
add.rn.f32 %f69, %f68, %f67;
shfl.sync.down.b32 %f70, %f69, 8, 31, -1;
add.rn.f32 %f71, %f70, %f69;
shfl.sync.down.b32 %f72, %f71, 4, 31, -1;
add.rn.f32 %f73, %f72, %f71;
shfl.sync.down.b32 %f74, %f73, 2, 31, -1;
add.rn.f32 %f75, %f74, %f73;
shfl.sync.down.b32 %f76, %f75, 1, 31, -1;
// %r3 = warp id (tid >> 5). The final add of the shuffle chain is deferred
// into the lane-0 branch below because only lane 0 needs the result.
shr.u32 %r3, %r1, 5;
setp.eq.s32 %p1, %r2, 0;
mov.u64 %rd19, shared_cache_017;
@%p1 bra LBB73_3;
bra.uni LBB73_1;
LBB73_3:
// Lane 0 of each warp publishes its warp total to shared_cache_017[warp id].
mul.wide.u32 %rd18, %r3, 4;
add.s64 %rd3, %rd19, %rd18;
add.rn.f32 %f1, %f76, %f75;
st.shared.f32 [%rd3], %f1;
LBB73_1:
// CTA-wide barrier: per-warp totals are visible to warp 0 after this point.
bar.sync 0;
// Stage 2 runs in warp 0 only; the branch is uniform per warp because it
// depends on tid >> 5.
setp.eq.s32 %p2, %r3, 0;
@%p2 bra LBB73_4;
bra.uni LBB73_2;
LBB73_4:
// %rd11 = generic address of the local slot, %rd1 = its local-space address.
add.u64 %rd11, %SP, 0;
add.u64 %rd1, %SPL, 0;
// %rd4 = &shared_cache_017[lane]; converted to a generic address so it can be
// selected against the generic local-slot address.
mul.wide.u32 %rd20, %r2, 4;
add.s64 %rd4, %rd19, %rd20;
cvta.shared.u64 %rd22, %rd4;
// Zero the local slot: lanes without a partial will load 0.0 from it.
mov.u32 %r6, 0;
st.local.u32 [%rd1], %r6;
// Only tid 0 and 1 have a valid partial (two warps per CTA); every other
// lane reads the zeroed local slot through the same generic load.
setp.lt.u32 %p3, %r1, 2;
selp.b64 %rd24, %rd22, %rd11, %p3;
ld.f32 %f77, [%rd24];
// Second shuffle-down reduction across warp 0.
shfl.sync.down.b32 %f78, %f77, 16, 31, -1;
add.rn.f32 %f79, %f77, %f78;
shfl.sync.down.b32 %f80, %f79, 8, 31, -1;
add.rn.f32 %f81, %f79, %f80;
shfl.sync.down.b32 %f82, %f81, 4, 31, -1;
add.rn.f32 %f83, %f81, %f82;
shfl.sync.down.b32 %f84, %f83, 2, 31, -1;
add.rn.f32 %f85, %f83, %f84;
shfl.sync.down.b32 %f86, %f85, 1, 31, -1;
add.rn.f32 %f87, %f85, %f86;
// Each lane writes its result back through the pointer it loaded from
// (shared slot for tid < 2, local slot otherwise).
st.f32 [%rd24], %f87;
// Only thread 0 commits the CTA total.
setp.ne.s32 %p4, %r1, 0;
@%p4 bra LBB73_2;
ld.param.u64 %rd8, [fusion_2191_param_1];
cvta.to.global.u64 %rd9, %rd8;
// param_1 + c*4 (reuses %rd16 computed during the load phase).
add.s64 %rd2, %rd9, %rd16;
// For thread 0, %rd4 is shared_cache_017[0], which it just overwrote with
// the CTA total.
ld.shared.f32 %f88, [%rd4];
// Atomic add; the returned old value in %f89 is not used.
atom.global.add.f32 %f89, [%rd2], %f88;
LBB73_2:
ret;
}
// .globl fusion_2187
//
// fusion_2187: element-wise scale-and-shift of f16 data using per-row
// statistics and per-column f32 coefficients.
//
// Launch shape: 256 threads per CTA (.reqntid 256); each thread produces 4
// consecutive f16 outputs, so one CTA covers 1024 elements.
// With row = %ctaid.x, col = 4*%tid.x + k (k = 0..3) and idx = row*1024 + col:
//
//   s      = rsqrt.approx(param_2[row] * 2^-10 + 0f2B8CBCCC)
//   g      = s * param_5[col]
//   out    = g * f32(param_1[idx]) + (param_4[col] - g * (param_3[row] * 2^-10))
//   param_0[idx] = f16(out)
//
// 0f3A800000 is 2^-10 (1/1024); 0f2B8CBCCC is a small positive constant
// (approximately 1e-12). This has the shape of a layer-normalization apply
// step (param_2 ~ variance sum, param_3 ~ mean sum, param_5 ~ scale,
// param_4 ~ shift) -- NOTE(review): naming inferred from the arithmetic,
// confirm against the producing HLO.
// param_6 is never read in this kernel.
.visible .entry fusion_2187(
.param .u64 fusion_2187_param_0,
.param .u64 fusion_2187_param_1,
.param .u64 fusion_2187_param_2,
.param .u64 fusion_2187_param_3,
.param .u64 fusion_2187_param_4,
.param .u64 fusion_2187_param_5,
.param .u64 fusion_2187_param_6
)
.reqntid 256, 1, 1
{
.reg .b16 %h<13>;
.reg .b32 %hh<3>;
.reg .f32 %f<39>;
.reg .b32 %r<8>;
.reg .b64 %rd<28>;
// Global pointers after conversion:
//   %rd12 = param_0 (output), %rd11 = param_1 (input), %rd10 = param_2,
//   %rd9  = param_3,          %rd6  = param_4,         %rd3  = param_5.
ld.param.u64 %rd1, [fusion_2187_param_0];
ld.param.u64 %rd2, [fusion_2187_param_5];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2187_param_1];
ld.param.u64 %rd5, [fusion_2187_param_4];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2187_param_2];
ld.param.u64 %rd8, [fusion_2187_param_3];
cvta.to.global.u64 %rd9, %rd8;
cvta.to.global.u64 %rd10, %rd7;
cvta.to.global.u64 %rd11, %rd4;
cvta.to.global.u64 %rd12, %rd1;
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
// %r3 = row*1024, %r4 = col of element 0; the OR acts as an add because the
// bit ranges do not overlap (%r4 < 1024).
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
// Column indices of elements 1 and 2 (element 3 uses a +12 byte offset below).
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
// Load four f16 inputs with one vector load.
mul.wide.u32 %rd13, %r5, 2;
add.s64 %rd14, %rd11, %rd13;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd14];
// Pack/unpack round trip emitted by the code generator:
// afterwards %h5..%h8 equal %h1..%h4.
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// ---- element 0 ----
cvt.f32.f16 %f1, %h5;
// Per-row statistic from param_2 -> %f5 = rsqrt(param_2[row]/1024 + eps).
mul.wide.u32 %rd15, %r1, 4;
add.s64 %rd16, %rd10, %rd15;
ld.global.nc.f32 %f2, [%rd16];
mul.rn.f32 %f3, %f2, 0f3A800000;
add.rn.f32 %f4, %f3, 0f2B8CBCCC;
rsqrt.approx.f32 %f5, %f4;
// Per-column coefficient from param_5 -> %f7 = g.
mul.wide.u32 %rd17, %r4, 4;
add.s64 %rd18, %rd3, %rd17;
ld.global.nc.f32 %f6, [%rd18];
mul.rn.f32 %f7, %f5, %f6;
mul.rn.f32 %f8, %f7, %f1;
// Per-column offset from param_4 and per-row statistic from param_3;
// %f11 = param_3[row]/1024 is shared by all four elements.
add.s64 %rd19, %rd6, %rd17;
ld.global.nc.f32 %f9, [%rd19];
add.s64 %rd20, %rd9, %rd15;
ld.global.nc.f32 %f10, [%rd20];
mul.rn.f32 %f11, %f10, 0f3A800000;
mul.rn.f32 %f12, %f7, %f11;
sub.rn.f32 %f13, %f9, %f12;
add.rn.f32 %f14, %f8, %f13;
cvt.rn.f16.f32 %h9, %f14;
// %rd21 = output address for the 4-element vector store at the end.
add.s64 %rd21, %rd12, %rd13;
// ---- element 1 (column %r6) ----
cvt.f32.f16 %f15, %h6;
mul.wide.u32 %rd22, %r6, 4;
add.s64 %rd23, %rd3, %rd22;
ld.global.nc.f32 %f16, [%rd23];
mul.rn.f32 %f17, %f5, %f16;
mul.rn.f32 %f18, %f17, %f15;
add.s64 %rd24, %rd6, %rd22;
ld.global.nc.f32 %f19, [%rd24];
mul.rn.f32 %f20, %f11, %f17;
sub.rn.f32 %f21, %f19, %f20;
add.rn.f32 %f22, %f18, %f21;
cvt.rn.f16.f32 %h10, %f22;
// ---- element 2 (column %r7) ----
cvt.f32.f16 %f23, %h7;
mul.wide.u32 %rd25, %r7, 4;
add.s64 %rd26, %rd3, %rd25;
ld.global.nc.f32 %f24, [%rd26];
mul.rn.f32 %f25, %f5, %f24;
mul.rn.f32 %f26, %f25, %f23;
add.s64 %rd27, %rd6, %rd25;
ld.global.nc.f32 %f27, [%rd27];
mul.rn.f32 %f28, %f11, %f25;
sub.rn.f32 %f29, %f27, %f28;
add.rn.f32 %f30, %f26, %f29;
cvt.rn.f16.f32 %h11, %f30;
// ---- element 3 (element-0 addresses + 12 bytes = +3 floats) ----
cvt.f32.f16 %f31, %h8;
ld.global.nc.f32 %f32, [%rd18+12];
mul.rn.f32 %f33, %f5, %f32;
mul.rn.f32 %f34, %f33, %f31;
ld.global.nc.f32 %f35, [%rd19+12];
mul.rn.f32 %f36, %f11, %f33;
sub.rn.f32 %f37, %f35, %f36;
add.rn.f32 %f38, %f34, %f37;
cvt.rn.f16.f32 %h12, %f38;
// Store the four f16 results with one vector store.
st.global.v4.b16 [%rd21], {%h9, %h10, %h11, %h12};
ret;
}
// .globl convert_1585
//
// convert_1585: f32 -> f16 conversion (round-to-nearest-even) from param_1
// into param_0, four elements per vector load/store.
//
// Launch shape: 128 threads per CTA (.reqntid 128); base element index is
//   idx = (%ctaid.x << 9) | (%tid.x << 2)      (512 elements per CTA).
// Each thread converts up to seven 4-element groups at element indices
//   idx + k*655360, k = 0..6
// (source byte stride 2621440 = 655360*4, destination byte stride
// 1310720 = 655360*2). Groups k = 0..4 are unconditional; groups k = 5 and
// k = 6 are skipped when their element index exceeds 4194303.
// param_2 is never read in this kernel.
.visible .entry convert_1585(
.param .u64 convert_1585_param_0,
.param .u64 convert_1585_param_1,
.param .u64 convert_1585_param_2
)
.reqntid 128, 1, 1
{
.reg .pred %p<3>;
.reg .b16 %h<29>;
.reg .f32 %f<29>;
.reg .b32 %r<9>;
.reg .b64 %rd<9>;
// %rd5 = source (param_1, f32), %rd6 = destination (param_0, f16).
ld.param.u64 %rd3, [convert_1585_param_0];
ld.param.u64 %rd4, [convert_1585_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd3;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %tid.x;
// %r1 = CTA base (ctaid*512), %r2 = thread offset (tid*4), %r5 = idx.
shl.b32 %r1, %r3, 9;
shl.b32 %r2, %r4, 2;
or.b32 %r5, %r1, %r2;
// %rd1 = &src[idx] (4 bytes/element), %rd2 = &dst[idx] (2 bytes/element).
mul.wide.u32 %rd7, %r5, 4;
add.s64 %rd1, %rd5, %rd7;
// Group k = 0.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
cvt.rn.f16.f32 %h1, %f1;
mul.wide.u32 %rd8, %r5, 2;
add.s64 %rd2, %rd6, %rd8;
cvt.rn.f16.f32 %h2, %f2;
cvt.rn.f16.f32 %h3, %f3;
cvt.rn.f16.f32 %h4, %f4;
st.global.v4.b16 [%rd2], {%h1, %h2, %h3, %h4};
// Group k = 1.
ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+2621440];
cvt.rn.f16.f32 %h5, %f5;
cvt.rn.f16.f32 %h6, %f6;
cvt.rn.f16.f32 %h7, %f7;
cvt.rn.f16.f32 %h8, %f8;
st.global.v4.b16 [%rd2+1310720], {%h5, %h6, %h7, %h8};
// Group k = 2.
ld.global.nc.v4.f32 {%f9, %f10, %f11, %f12}, [%rd1+5242880];
cvt.rn.f16.f32 %h9, %f9;
cvt.rn.f16.f32 %h10, %f10;
cvt.rn.f16.f32 %h11, %f11;
cvt.rn.f16.f32 %h12, %f12;
st.global.v4.b16 [%rd2+2621440], {%h9, %h10, %h11, %h12};
// Group k = 3.
ld.global.nc.v4.f32 {%f13, %f14, %f15, %f16}, [%rd1+7864320];
cvt.rn.f16.f32 %h13, %f13;
cvt.rn.f16.f32 %h14, %f14;
cvt.rn.f16.f32 %h15, %f15;
cvt.rn.f16.f32 %h16, %f16;
st.global.v4.b16 [%rd2+3932160], {%h13, %h14, %h15, %h16};
// Group k = 4.
ld.global.nc.v4.f32 {%f17, %f18, %f19, %f20}, [%rd1+10485760];
cvt.rn.f16.f32 %h17, %f17;
cvt.rn.f16.f32 %h18, %f18;
cvt.rn.f16.f32 %h19, %f19;
cvt.rn.f16.f32 %h20, %f20;
st.global.v4.b16 [%rd2+5242880], {%h17, %h18, %h19, %h20};
// Group k = 5, guarded: skip when idx + 3276800 > 4194303.
add.s32 %r6, %r5, 3276800;
setp.gt.u32 %p1, %r6, 4194303;
@%p1 bra LBB75_2;
ld.global.nc.v4.f32 {%f21, %f22, %f23, %f24}, [%rd1+13107200];
cvt.rn.f16.f32 %h21, %f21;
cvt.rn.f16.f32 %h22, %f22;
cvt.rn.f16.f32 %h23, %f23;
cvt.rn.f16.f32 %h24, %f24;
st.global.v4.b16 [%rd2+6553600], {%h21, %h22, %h23, %h24};
LBB75_2:
// Group k = 6, guarded: index rebuilt as (ctaid*512 + 3932160) | (tid*4).
// Both addends are multiples of 512 and tid*4 < 512, so the OR equals an add.
add.s32 %r7, %r1, 3932160;
or.b32 %r8, %r7, %r2;
setp.gt.u32 %p2, %r8, 4194303;
@%p2 bra LBB75_4;
ld.global.nc.v4.f32 {%f25, %f26, %f27, %f28}, [%rd1+15728640];
cvt.rn.f16.f32 %h25, %f25;
cvt.rn.f16.f32 %h26, %f26;
cvt.rn.f16.f32 %h27, %f27;
cvt.rn.f16.f32 %h28, %f28;
st.global.v4.b16 [%rd2+7864320], {%h25, %h26, %h27, %h28};
LBB75_4:
ret;
}
// .globl fusion_2182
//
// fusion_2182: element-wise "add f32 term, then smooth gating" on f16 data.
//
// Launch shape: 256 threads per CTA (.reqntid 256); each thread produces 4
// consecutive f16 outputs at idx = (%ctaid.x << 10) | (%tid.x << 2) + k.
// For every element:
//
//   x   = f32( f16_add( param_1[idx], f16(param_2[idx & 4095]) ) )
//   z   = (x + 0f3D372713 * x^3) * 0f3F4C422A
//   t   = z                       if |z| < 0f39D1B717
//         clamp(z,-9,9)*P / Q     otherwise (P, Q polynomials in clamp(z)^2)
//         then replaced by +-1.0 (sign of z) when |z| >= 20.0
//   out = f16( 0.5 * (t + 1.0) * x )
//
// 0f3D372713 ~ 0.044715 and 0f3F4C422A ~ 0.7978846, so this has the form of
// the tanh-based GELU approximation with t standing in for tanh(z) --
// NOTE(review): the operator name is inferred from the constants; confirm
// against the source HLO. param_2 is indexed modulo 4096 (presumably a
// 4096-element bias vector -- confirm).
// param_0 is the output; param_3 is never read in this kernel.
.visible .entry fusion_2182(
.param .u64 fusion_2182_param_0,
.param .u64 fusion_2182_param_1,
.param .u64 fusion_2182_param_2,
.param .u64 fusion_2182_param_3
)
.reqntid 256, 1, 1
{
.reg .pred %p<21>;
.reg .b16 %h<21>;
.reg .b32 %hh<3>;
.reg .f32 %f<150>;
.reg .b32 %r<25>;
.reg .b64 %rd<18>;
// %rd6 = param_0 (output f16), %rd5 = param_1 (input f16), %rd3 = param_2 (f32).
ld.param.u64 %rd1, [fusion_2182_param_0];
ld.param.u64 %rd2, [fusion_2182_param_2];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2182_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd1;
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
// %r5..%r8 = element indices idx+0..idx+3 (low two bits of %r5 are zero, so
// OR-ing 1, 2, 3 is the same as adding).
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r5, 1;
or.b32 %r7, %r5, 2;
or.b32 %r8, %r5, 3;
// %r12, %r11, %r10, %r9 = indices into param_2 for elements 0..3; each mask
// keeps the index below 4096 and preserves that element's low two bits.
and.b32 %r9, %r8, 4095;
and.b32 %r10, %r7, 4094;
and.b32 %r11, %r6, 4093;
and.b32 %r12, %r5, 4092;
// Load four f16 inputs with one vector load.
mul.wide.u32 %rd7, %r5, 2;
add.s64 %rd8, %rd5, %rd7;
ld.global.nc.v4.b16 {%h1, %h2, %h3, %h4}, [%rd8];
// Pack/unpack round trip emitted by the code generator:
// afterwards %h5..%h8 equal %h1..%h4.
mov.b32 %hh1, {%h3, %h4};
mov.b32 %hh2, {%h1, %h2};
mov.b32 {%h5, %h6}, %hh2;
mov.b32 {%h7, %h8}, %hh1;
// ================= element 0 =================
// x = f32(input + f16(param_2[...])); the add is performed in f16.
mul.wide.u32 %rd9, %r12, 4;
add.s64 %rd10, %rd3, %rd9;
ld.global.nc.f32 %f1, [%rd10];
cvt.rn.f16.f32 %h9, %f1;
add.rn.f16 %h10, %h5, %h9;
cvt.f32.f16 %f2, %h10;
// z = (x + c1*x^3) * c2
mul.rn.f32 %f3, %f2, %f2;
mul.rn.f32 %f4, %f3, %f2;
mul.rn.f32 %f5, %f4, 0f3D372713;
add.rn.f32 %f6, %f5, %f2;
mul.rn.f32 %f7, %f6, 0f3F4C422A;
// %p1: |z| is tiny, so z itself is used instead of the rational form.
abs.f32 %f8, %f7;
setp.lt.f32 %p1, %f8, 0f39D1B717;
// Clamp z to [-9.0, 9.0] (0fC1100000 / 0f41100000) -> %f10.
setp.lt.f32 %p2, %f7, 0fC1100000;
selp.f32 %f9, 0fC1100000, %f7, %p2;
setp.gt.f32 %p3, %f9, 0f41100000;
selp.f32 %f10, 0f41100000, %f9, %p3;
// %f11 = clamp(z)^2. Numerator: Horner evaluation of a polynomial in %f11,
// multiplied by clamp(z) -> %f25.
mul.rn.f32 %f11, %f10, %f10;
mul.rn.f32 %f12, %f11, 0f259F25C0;
// %f13 holds a polynomial coefficient and is reused by elements 1..3.
mov.f32 %f13, 0f2A61337E;
sub.rn.f32 %f14, %f13, %f12;
mul.rn.f32 %f15, %f11, %f14;
add.rn.f32 %f16, %f15, 0fAEBD37FF;
mul.rn.f32 %f17, %f11, %f16;
add.rn.f32 %f18, %f17, 0f335C0041;
mul.rn.f32 %f19, %f11, %f18;
add.rn.f32 %f20, %f19, 0f3779434A;
mul.rn.f32 %f21, %f11, %f20;
add.rn.f32 %f22, %f21, 0f3A270DED;
mul.rn.f32 %f23, %f11, %f22;
add.rn.f32 %f24, %f23, 0f3BA059DC;
mul.rn.f32 %f25, %f10, %f24;
// Denominator: Horner evaluation of a second polynomial in %f11 -> %f31.
mul.rn.f32 %f26, %f11, 0f35A0D3D8;
add.rn.f32 %f27, %f26, 0f38F895D6;
mul.rn.f32 %f28, %f11, %f27;
add.rn.f32 %f29, %f28, 0f3B14AA05;
mul.rn.f32 %f30, %f11, %f29;
add.rn.f32 %f31, %f30, 0f3BA059DD;
div.full.f32 %f32, %f25, %f31;
// t = z when |z| is tiny, else the rational result.
selp.f32 %f33, %f7, %f32, %p1;
// %f34 = -1.0 if the sign bit of z is set, else +1.0.
mov.b32 %r13, %f7;
shr.u32 %r14, %r13, 31;
and.b32 %r15, %r14, 1;
setp.eq.b32 %p4, %r15, 1;
selp.f32 %f34, 0fBF800000, 0f3F800000, %p4;
// Keep t while |z| < 20.0 or the compare is unordered (ltu); otherwise
// saturate to +-1.0.
setp.ltu.f32 %p5, %f8, 0f41A00000;
selp.f32 %f35, %f33, %f34, %p5;
// out = 0.5 * (t + 1.0) * x, rounded to f16.
add.rn.f32 %f36, %f35, 0f3F800000;
mul.rn.f32 %f37, %f36, 0f3F000000;
mul.rn.f32 %f38, %f37, %f2;
cvt.rn.f16.f32 %h11, %f38;
// %rd11 = output address for the 4-element vector store at the end.
add.s64 %rd11, %rd6, %rd7;
// ================= element 1 (same sequence, input %h6, index %r11) =====
mul.wide.u32 %rd12, %r11, 4;
add.s64 %rd13, %rd3, %rd12;
ld.global.nc.f32 %f39, [%rd13];
cvt.rn.f16.f32 %h12, %f39;
add.rn.f16 %h13, %h6, %h12;
cvt.f32.f16 %f40, %h13;
mul.rn.f32 %f41, %f40, %f40;
mul.rn.f32 %f42, %f41, %f40;
mul.rn.f32 %f43, %f42, 0f3D372713;
add.rn.f32 %f44, %f43, %f40;
mul.rn.f32 %f45, %f44, 0f3F4C422A;
abs.f32 %f46, %f45;
setp.lt.f32 %p6, %f46, 0f39D1B717;
setp.lt.f32 %p7, %f45, 0fC1100000;
selp.f32 %f47, 0fC1100000, %f45, %p7;
setp.gt.f32 %p8, %f47, 0f41100000;
selp.f32 %f48, 0f41100000, %f47, %p8;
mul.rn.f32 %f49, %f48, %f48;
mul.rn.f32 %f50, %f49, 0f259F25C0;
sub.rn.f32 %f51, %f13, %f50;
mul.rn.f32 %f52, %f49, %f51;
add.rn.f32 %f53, %f52, 0fAEBD37FF;
mul.rn.f32 %f54, %f49, %f53;
add.rn.f32 %f55, %f54, 0f335C0041;
mul.rn.f32 %f56, %f49, %f55;
add.rn.f32 %f57, %f56, 0f3779434A;
mul.rn.f32 %f58, %f49, %f57;
add.rn.f32 %f59, %f58, 0f3A270DED;
mul.rn.f32 %f60, %f49, %f59;
add.rn.f32 %f61, %f60, 0f3BA059DC;
mul.rn.f32 %f62, %f48, %f61;
mul.rn.f32 %f63, %f49, 0f35A0D3D8;
add.rn.f32 %f64, %f63, 0f38F895D6;
mul.rn.f32 %f65, %f49, %f64;
add.rn.f32 %f66, %f65, 0f3B14AA05;
mul.rn.f32 %f67, %f49, %f66;
add.rn.f32 %f68, %f67, 0f3BA059DD;
div.full.f32 %f69, %f62, %f68;
selp.f32 %f70, %f45, %f69, %p6;
mov.b32 %r16, %f45;
shr.u32 %r17, %r16, 31;
and.b32 %r18, %r17, 1;
setp.eq.b32 %p9, %r18, 1;
selp.f32 %f71, 0fBF800000, 0f3F800000, %p9;
setp.ltu.f32 %p10, %f46, 0f41A00000;
selp.f32 %f72, %f70, %f71, %p10;
add.rn.f32 %f73, %f72, 0f3F800000;
mul.rn.f32 %f74, %f73, 0f3F000000;
mul.rn.f32 %f75, %f74, %f40;
cvt.rn.f16.f32 %h14, %f75;
// ================= element 2 (same sequence, input %h7, index %r10) =====
mul.wide.u32 %rd14, %r10, 4;
add.s64 %rd15, %rd3, %rd14;
ld.global.nc.f32 %f76, [%rd15];
cvt.rn.f16.f32 %h15, %f76;
add.rn.f16 %h16, %h7, %h15;
cvt.f32.f16 %f77, %h16;
mul.rn.f32 %f78, %f77, %f77;
mul.rn.f32 %f79, %f78, %f77;
mul.rn.f32 %f80, %f79, 0f3D372713;
add.rn.f32 %f81, %f80, %f77;
mul.rn.f32 %f82, %f81, 0f3F4C422A;
abs.f32 %f83, %f82;
setp.lt.f32 %p11, %f83, 0f39D1B717;
setp.lt.f32 %p12, %f82, 0fC1100000;
selp.f32 %f84, 0fC1100000, %f82, %p12;
setp.gt.f32 %p13, %f84, 0f41100000;
selp.f32 %f85, 0f41100000, %f84, %p13;
mul.rn.f32 %f86, %f85, %f85;
mul.rn.f32 %f87, %f86, 0f259F25C0;
sub.rn.f32 %f88, %f13, %f87;
mul.rn.f32 %f89, %f86, %f88;
add.rn.f32 %f90, %f89, 0fAEBD37FF;
mul.rn.f32 %f91, %f86, %f90;
add.rn.f32 %f92, %f91, 0f335C0041;
mul.rn.f32 %f93, %f86, %f92;
add.rn.f32 %f94, %f93, 0f3779434A;
mul.rn.f32 %f95, %f86, %f94;
add.rn.f32 %f96, %f95, 0f3A270DED;
mul.rn.f32 %f97, %f86, %f96;
add.rn.f32 %f98, %f97, 0f3BA059DC;
mul.rn.f32 %f99, %f85, %f98;
mul.rn.f32 %f100, %f86, 0f35A0D3D8;
add.rn.f32 %f101, %f100, 0f38F895D6;
mul.rn.f32 %f102, %f86, %f101;
add.rn.f32 %f103, %f102, 0f3B14AA05;
mul.rn.f32 %f104, %f86, %f103;
add.rn.f32 %f105, %f104, 0f3BA059DD;
div.full.f32 %f106, %f99, %f105;
selp.f32 %f107, %f82, %f106, %p11;
mov.b32 %r19, %f82;
shr.u32 %r20, %r19, 31;
and.b32 %r21, %r20, 1;
setp.eq.b32 %p14, %r21, 1;
selp.f32 %f108, 0fBF800000, 0f3F800000, %p14;
setp.ltu.f32 %p15, %f83, 0f41A00000;
selp.f32 %f109, %f107, %f108, %p15;
add.rn.f32 %f110, %f109, 0f3F800000;
mul.rn.f32 %f111, %f110, 0f3F000000;
mul.rn.f32 %f112, %f111, %f77;
cvt.rn.f16.f32 %h17, %f112;
// ================= element 3 (same sequence, input %h8, index %r9) ======
mul.wide.u32 %rd16, %r9, 4;
add.s64 %rd17, %rd3, %rd16;
ld.global.nc.f32 %f113, [%rd17];
cvt.rn.f16.f32 %h18, %f113;
add.rn.f16 %h19, %h8, %h18;
cvt.f32.f16 %f114, %h19;
mul.rn.f32 %f115, %f114, %f114;
mul.rn.f32 %f116, %f115, %f114;
mul.rn.f32 %f117, %f116, 0f3D372713;
add.rn.f32 %f118, %f117, %f114;
mul.rn.f32 %f119, %f118, 0f3F4C422A;
abs.f32 %f120, %f119;
setp.lt.f32 %p16, %f120, 0f39D1B717;
setp.lt.f32 %p17, %f119, 0fC1100000;
selp.f32 %f121, 0fC1100000, %f119, %p17;
setp.gt.f32 %p18, %f121, 0f41100000;
selp.f32 %f122, 0f41100000, %f121, %p18;
mul.rn.f32 %f123, %f122, %f122;
mul.rn.f32 %f124, %f123, 0f259F25C0;
sub.rn.f32 %f125, %f13, %f124;
mul.rn.f32 %f126, %f123, %f125;
add.rn.f32 %f127, %f126, 0fAEBD37FF;
mul.rn.f32 %f128, %f123, %f127;
add.rn.f32 %f129, %f128, 0f335C0041;
mul.rn.f32 %f130, %f123, %f129;
add.rn.f32 %f131, %f130, 0f3779434A;
mul.rn.f32 %f132, %f123, %f131;
add.rn.f32 %f133, %f132, 0f3A270DED;
mul.rn.f32 %f134, %f123, %f133;
add.rn.f32 %f135, %f134, 0f3BA059DC;
mul.rn.f32 %f136, %f122, %f135;
mul.rn.f32 %f137, %f123, 0f35A0D3D8;
add.rn.f32 %f138, %f137, 0f38F895D6;
mul.rn.f32 %f139, %f123, %f138;
add.rn.f32 %f140, %f139, 0f3B14AA05;
mul.rn.f32 %f141, %f123, %f140;
add.rn.f32 %f142, %f141, 0f3BA059DD;
div.full.f32 %f143, %f136, %f142;
selp.f32 %f144, %f119, %f143, %p16;
mov.b32 %r22, %f119;
shr.u32 %r23, %r22, 31;
and.b32 %r24, %r23, 1;
setp.eq.b32 %p19, %r24, 1;
selp.f32 %f145, 0fBF800000, 0f3F800000, %p19;
setp.ltu.f32 %p20, %f120, 0f41A00000;
selp.f32 %f146, %f144, %f145, %p20;
add.rn.f32 %f147, %f146, 0f3F800000;
mul.rn.f32 %f148, %f147, 0f3F000000;
mul.rn.f32 %f149, %f148, %f114;
cvt.rn.f16.f32 %h20, %f149;
// Store the four f16 results with one vector store.
st.global.v4.b16 [%rd11], {%h11, %h14, %h17, %h20};
ret;
}
// .globl convert_1587
//
// convert_1587: f32 -> f16 conversion (round-to-nearest-even) from param_1
// into param_0, four elements per vector load/store.
//
// Launch shape: 128 threads per CTA (.reqntid 128); base element index is
//   idx = (%ctaid.x << 9) | (%tid.x << 2)      (512 elements per CTA).
// Each thread converts up to seven 4-element groups at element indices
//   idx + k*655360, k = 0..6
// (source byte stride 2621440 = 655360*4, destination byte stride
// 1310720 = 655360*2). Groups k = 0..4 are unconditional; groups k = 5 and
// k = 6 are skipped when their element index exceeds 4194303.
// param_2 is never read in this kernel.
.visible .entry convert_1587(
.param .u64 convert_1587_param_0,
.param .u64 convert_1587_param_1,
.param .u64 convert_1587_param_2
)
.reqntid 128, 1, 1
{
.reg .pred %p<3>;
.reg .b16 %h<29>;
.reg .f32 %f<29>;
.reg .b32 %r<9>;
.reg .b64 %rd<9>;
// %rd5 = source (param_1, f32), %rd6 = destination (param_0, f16).
ld.param.u64 %rd3, [convert_1587_param_0];
ld.param.u64 %rd4, [convert_1587_param_1];
cvta.to.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd3;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %tid.x;
// %r1 = CTA base (ctaid*512), %r2 = thread offset (tid*4), %r5 = idx.
shl.b32 %r1, %r3, 9;
shl.b32 %r2, %r4, 2;
or.b32 %r5, %r1, %r2;
// %rd1 = &src[idx] (4 bytes/element), %rd2 = &dst[idx] (2 bytes/element).
mul.wide.u32 %rd7, %r5, 4;
add.s64 %rd1, %rd5, %rd7;
// Group k = 0.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
cvt.rn.f16.f32 %h1, %f1;
mul.wide.u32 %rd8, %r5, 2;
add.s64 %rd2, %rd6, %rd8;
cvt.rn.f16.f32 %h2, %f2;
cvt.rn.f16.f32 %h3, %f3;
cvt.rn.f16.f32 %h4, %f4;
st.global.v4.b16 [%rd2], {%h1, %h2, %h3, %h4};
// Group k = 1.
ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+2621440];
cvt.rn.f16.f32 %h5, %f5;
cvt.rn.f16.f32 %h6, %f6;
cvt.rn.f16.f32 %h7, %f7;
cvt.rn.f16.f32 %h8, %f8;
st.global.v4.b16 [%rd2+1310720], {%h5, %h6, %h7, %h8};
// Group k = 2.
ld.global.nc.v4.f32 {%f9, %f10, %f11, %f12}, [%rd1+5242880];
cvt.rn.f16.f32 %h9, %f9;
cvt.rn.f16.f32 %h10, %f10;
cvt.rn.f16.f32 %h11, %f11;
cvt.rn.f16.f32 %h12, %f12;
st.global.v4.b16 [%rd2+2621440], {%h9, %h10, %h11, %h12};
// Group k = 3.
ld.global.nc.v4.f32 {%f13, %f14, %f15, %f16}, [%rd1+7864320];
cvt.rn.f16.f32 %h13, %f13;
cvt.rn.f16.f32 %h14, %f14;
cvt.rn.f16.f32 %h15, %f15;
cvt.rn.f16.f32 %h16, %f16;
st.global.v4.b16 [%rd2+3932160], {%h13, %h14, %h15, %h16};
// Group k = 4.
ld.global.nc.v4.f32 {%f17, %f18, %f19, %f20}, [%rd1+10485760];
cvt.rn.f16.f32 %h17, %f17;
cvt.rn.f16.f32 %h18, %f18;
cvt.rn.f16.f32 %h19, %f19;
cvt.rn.f16.f32 %h20, %f20;
st.global.v4.b16 [%rd2+5242880], {%h17, %h18, %h19, %h20};
// Group k = 5, guarded: skip when idx + 3276800 > 4194303.
add.s32 %r6, %r5, 3276800;
setp.gt.u32 %p1, %r6, 4194303;
@%p1 bra LBB77_2;
ld.global.nc.v4.f32 {%f21, %f22, %f23, %f24}, [%rd1+13107200];
cvt.rn.f16.f32 %h21, %f21;
cvt.rn.f16.f32 %h22, %f22;
cvt.rn.f16.f32 %h23, %f23;
cvt.rn.f16.f32 %h24, %f24;
st.global.v4.b16 [%rd2+6553600], {%h21, %h22, %h23, %h24};
LBB77_2:
// Group k = 6, guarded: index rebuilt as (ctaid*512 + 3932160) | (tid*4).
// Both addends are multiples of 512 and tid*4 < 512, so the OR equals an add.
add.s32 %r7, %r1, 3932160;
or.b32 %r8, %r7, %r2;
setp.gt.u32 %p2, %r8, 4194303;
@%p2 bra LBB77_4;
ld.global.nc.v4.f32 {%f25, %f26, %f27, %f28}, [%rd1+15728640];
cvt.rn.f16.f32 %h25, %f25;
cvt.rn.f16.f32 %h26, %f26;
cvt.rn.f16.f32 %h27, %f27;
cvt.rn.f16.f32 %h28, %f28;
st.global.v4.b16 [%rd2+7864320], {%h25, %h26, %h27, %h28};
LBB77_4:
ret;
}
// .globl rng_get_and_update_state_38
//
// rng_get_and_update_state_38: snapshot the 128-bit counter held in the
// module-level global `rng_state`, advance it by 524288, and write the
// pre-increment value to the buffer passed as param_0.
//
//   rng_state[0..7]   low  64 bits
//   rng_state[8..15]  high 64 bits
//
// The read-modify-write uses plain (non-atomic) loads and stores.
// NOTE(review): this is only safe if a single thread executes the kernel --
// the launch configuration is not visible here, confirm at the call site.
// param_1 is never read.
.visible .entry rng_get_and_update_state_38(
.param .u64 rng_get_and_update_state_38_param_0,
.param .u64 rng_get_and_update_state_38_param_1
)
{
.reg .b64 %out_arg;
.reg .b64 %out_ptr;
.reg .b64 %old_lo;
.reg .b64 %old_hi;
.reg .b64 %new_lo;
.reg .b64 %new_hi;
ld.param.u64 %out_arg, [rng_get_and_update_state_38_param_0];
// Read the current state: high word first, then low word.
ld.global.u64 %old_hi, [rng_state+8];
ld.global.u64 %old_lo, [rng_state];
cvta.to.global.u64 %out_ptr, %out_arg;
// 128-bit increment: the carry out of the low-word add is folded into the
// high word through the condition-code register.
add.cc.u64 %new_lo, %old_lo, 524288;
addc.u64 %new_hi, %old_hi, 0;
// Publish the advanced state.
st.global.u64 [rng_state], %new_lo;
st.global.u64 [rng_state+8], %new_hi;
// Hand the pre-increment state back to the caller's buffer.
st.global.u64 [%out_ptr+8], %old_hi;
st.global.u64 [%out_ptr], %old_lo;
ret;
}
// .globl fusion_2180
.visible .entry fusion_2180(
.param .u64 fusion_2180_param_0,
.param .u64 fusion_2180_param_1,
.param .u64 fusion_2180_param_2,
.param .u64 fusion_2180_param_3,
.param .u64 fusion_2180_param_4,
.param .u64 fusion_2180_param_5,
.param .u64 fusion_2180_param_6,
.param .u64 fusion_2180_param_7,
.param .u64 fusion_2180_param_8,
.param .u64 fusion_2180_param_9
)
.reqntid 64, 1, 1
{
.local .align 4 .b8 __local_depot79[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<75>;
.reg .b16 %h<145>;
.reg .f32 %f<254>;
.reg .b32 %r<350>;
.reg .b64 %rd<2739>;
mov.u64 %SPL, __local_depot79;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd463, [fusion_2180_param_0];
ld.param.u64 %rd464, [fusion_2180_param_8];
cvta.to.global.u64 %rd1, %rd464;
ld.param.u64 %rd465, [fusion_2180_param_1];
ld.param.u64 %rd466, [fusion_2180_param_7];
cvta.to.global.u64 %rd2, %rd466;
ld.param.u64 %rd467, [fusion_2180_param_2];
ld.param.u64 %rd468, [fusion_2180_param_6];
cvta.to.global.u64 %rd3, %rd468;
ld.param.u64 %rd470, [fusion_2180_param_5];
cvta.to.global.u64 %rd4, %rd470;
ld.param.u64 %rd471, [fusion_2180_param_4];
cvta.to.global.u64 %rd5, %rd471;
cvta.to.global.u64 %rd7, %rd467;
cvta.to.global.u64 %rd8, %rd465;
cvta.to.global.u64 %rd9, %rd463;
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
shl.b32 %r3, %r1, 1;
shl.b32 %r4, %r2, 10;
or.b32 %r48, %r4, %r3;
shr.u32 %r49, %r48, 2;
and.b32 %r5, %r1, 1;
setp.eq.s32 %p1, %r5, 0;
ld.global.nc.u64 %rd11, [%rd7];
cvt.u64.u32 %rd473, %r49;
add.s64 %rd12, %rd11, %rd473;
setp.lt.u64 %p69, %rd12, %rd11;
and.b64 %rd2384, %rd12, 4294967295;
@%p1 bra LBB79_1;
bra.uni LBB79_4;
LBB79_1:
mul.lo.s64 %rd2446, %rd2384, 3528531795;
ld.global.nc.u64 %rd2461, [%rd7+8];
selp.u64 %rd516, 1, 0, %p69;
add.s64 %rd517, %rd2461, %rd516;
xor.b64 %rd518, %rd517, %rd2446;
shr.u64 %rd519, %rd518, 32;
mul.lo.s64 %rd2449, %rd519, 3449720151;
shr.u64 %rd520, %rd2449, 32;
and.b64 %rd521, %rd517, 4294967295;
mul.lo.s64 %rd522, %rd521, 3449720151;
and.b64 %rd523, %rd522, 4294967295;
xor.b64 %rd524, %rd523, %rd520;
xor.b64 %rd525, %rd524, 2654435769;
mul.lo.s64 %rd2452, %rd525, 3528531795;
xor.b64 %rd2442, %rd522, %rd12;
mov.u32 %r312, -1879881855;
mov.u32 %r311, -845247145;
mov.u32 %r310, 534103459;
mov.u64 %rd2460, 3678237736;
mov.u64 %rd2459, 3041712726;
mov.u64 %rd2458, 1401181199;
mov.u64 %rd2457, 2835769497;
mov.u64 %rd2456, 1684936478;
mov.u64 %rd2455, 2027808484;
mov.u64 %rd2454, 387276957;
mov.u64 %rd2453, 842468239;
mov.u64 %rd2451, 3986602516;
mov.u64 %rd2450, 1013904242;
mov.u64 %rd2448, 3668340011;
mov.u64 %rd2447, 3144134277;
mov.u64 %rd2445, 3449720151;
mov.u64 %rd2444, 1993301258;
mov.u64 %rd2443, 3528531795;
bra.uni LBB79_5;
LBB79_4:
mov.u32 %r311, -766435501;
mov.u64 %rd2459, 1684936478;
mov.u64 %rd2458, 534103459;
mov.u64 %rd2457, 387276957;
mov.u64 %rd2456, 3041712726;
mov.u64 %rd2455, 3986602516;
mov.u64 %rd2454, 2835769497;
mov.u64 %rd2453, 3668340011;
mov.u64 %rd2451, 2027808484;
mov.u64 %rd2450, 1993301258;
mov.u64 %rd2448, 842468239;
mov.u64 %rd2447, 2654435769;
mov.u64 %rd2445, 3528531795;
mov.u64 %rd2444, 1013904242;
mov.u64 %rd2443, 3449720151;
mov.u32 %r312, -1767562579;
mov.u32 %r310, 1401181199;
mov.u64 %rd2460, 4055616968;
ld.global.nc.u64 %rd2461, [%rd7+8];
selp.u64 %rd489, 1, 0, %p69;
add.s64 %rd490, %rd2461, %rd489;
and.b64 %rd491, %rd490, 4294967295;
mul.lo.s64 %rd2446, %rd491, 3449720151;
xor.b64 %rd492, %rd2446, %rd12;
shr.u64 %rd493, %rd492, 32;
mul.lo.s64 %rd2449, %rd493, 3528531795;
shr.u64 %rd494, %rd2449, 32;
mul.lo.s64 %rd496, %rd2384, 3528531795;
and.b64 %rd497, %rd496, 4294967295;
xor.b64 %rd498, %rd497, %rd494;
xor.b64 %rd499, %rd498, 3144134277;
mul.lo.s64 %rd2452, %rd499, 3449720151;
xor.b64 %rd2442, %rd490, %rd496;
LBB79_5:
shr.u64 %rd526, %rd2452, 32;
shr.u64 %rd527, %rd2442, 32;
mul.lo.s64 %rd528, %rd527, %rd2443;
and.b64 %rd529, %rd528, 4294967295;
xor.b64 %rd530, %rd529, %rd526;
xor.b64 %rd531, %rd530, %rd2444;
mul.lo.s64 %rd532, %rd531, %rd2445;
shr.u64 %rd533, %rd532, 32;
shr.u64 %rd534, %rd528, 32;
and.b64 %rd535, %rd2446, 4294967295;
xor.b64 %rd536, %rd535, %rd534;
xor.b64 %rd537, %rd536, %rd2447;
mul.lo.s64 %rd538, %rd537, %rd2445;
and.b64 %rd539, %rd538, 4294967295;
xor.b64 %rd540, %rd539, %rd533;
xor.b64 %rd541, %rd540, %rd2448;
mul.lo.s64 %rd542, %rd541, %rd2443;
shr.u64 %rd543, %rd542, 32;
shr.u64 %rd544, %rd538, 32;
and.b64 %rd545, %rd2449, 4294967295;
xor.b64 %rd546, %rd545, %rd544;
xor.b64 %rd547, %rd546, %rd2450;
mul.lo.s64 %rd548, %rd547, %rd2443;
and.b64 %rd549, %rd548, 4294967295;
xor.b64 %rd550, %rd549, %rd543;
xor.b64 %rd551, %rd550, %rd2451;
mul.lo.s64 %rd552, %rd551, %rd2445;
shr.u64 %rd553, %rd552, 32;
shr.u64 %rd554, %rd548, 32;
and.b64 %rd555, %rd2452, 4294967295;
xor.b64 %rd556, %rd555, %rd554;
xor.b64 %rd557, %rd556, %rd2453;
mul.lo.s64 %rd558, %rd557, %rd2445;
and.b64 %rd559, %rd558, 4294967295;
xor.b64 %rd560, %rd559, %rd553;
xor.b64 %rd561, %rd560, %rd2454;
mul.lo.s64 %rd562, %rd561, %rd2443;
shr.u64 %rd563, %rd562, 32;
shr.u64 %rd564, %rd558, 32;
and.b64 %rd565, %rd532, 4294967295;
xor.b64 %rd566, %rd565, %rd564;
xor.b64 %rd567, %rd566, %rd2455;
mul.lo.s64 %rd568, %rd567, %rd2443;
and.b64 %rd569, %rd568, 4294967295;
xor.b64 %rd570, %rd569, %rd563;
xor.b64 %rd571, %rd570, %rd2456;
mul.lo.s64 %rd572, %rd571, %rd2445;
shr.u64 %rd573, %rd572, 32;
shr.u64 %rd574, %rd568, 32;
and.b64 %rd575, %rd542, 4294967295;
xor.b64 %rd576, %rd575, %rd574;
xor.b64 %rd577, %rd576, %rd2457;
mul.lo.s64 %rd578, %rd577, %rd2445;
and.b64 %rd579, %rd578, 4294967295;
xor.b64 %rd580, %rd579, %rd573;
xor.b64 %rd581, %rd580, %rd2458;
mul.lo.s64 %rd582, %rd581, %rd2443;
shr.u64 %rd583, %rd582, 32;
shr.u64 %rd584, %rd578, 32;
and.b64 %rd585, %rd552, 4294967295;
xor.b64 %rd586, %rd585, %rd584;
xor.b64 %rd587, %rd586, %rd2459;
mul.lo.s64 %rd588, %rd587, %rd2443;
and.b64 %rd589, %rd588, 4294967295;
xor.b64 %rd590, %rd589, %rd583;
xor.b64 %rd591, %rd590, %rd2460;
mul.lo.s64 %rd592, %rd591, %rd2445;
shr.u64 %rd593, %rd592, 32;
cvt.u32.u64 %r56, %rd593;
shr.u64 %rd594, %rd588, 32;
xor.b64 %rd595, %rd594, %rd562;
cvt.u32.u64 %r57, %rd595;
xor.b32 %r58, %r310, %r57;
mul.lo.s32 %r59, %r58, %r311;
xor.b32 %r60, %r59, %r56;
xor.b32 %r61, %r60, %r312;
shr.u32 %r62, %r61, 9;
cvt.rn.f32.u32 %f19, %r62;
mul.rn.f32 %f20, %f19, 0f34000000;
cvt.rn.f16.f32 %h1, %f20;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p4, %h1, %h2;
mul.wide.u32 %rd596, %r2, 2048;
add.s64 %rd597, %rd9, %rd596;
mul.wide.u32 %rd598, %r3, 2;
add.s64 %rd44, %rd597, %rd598;
ld.global.nc.b16 %h3, [%rd44];
mul.wide.u32 %rd599, %r3, 4;
add.s64 %rd45, %rd1, %rd599;
ld.global.nc.f32 %f21, [%rd45];
cvt.rn.f16.f32 %h4, %f21;
add.rn.f16 %h5, %h3, %h4;
mov.b16 %h6, 0x3C72;
mul.rn.f16 %h7, %h5, %h6;
selp.b16 %h8, %h7, 0x0000, %p4;
cvt.f32.f16 %f22, %h8;
add.s64 %rd600, %rd8, %rd596;
add.s64 %rd46, %rd600, %rd598;
ld.global.nc.b16 %h9, [%rd46];
cvt.f32.f16 %f23, %h9;
mul.wide.u32 %rd601, %r2, 4;
add.s64 %rd602, %rd5, %rd601;
ld.global.nc.f32 %f24, [%rd602];
mul.rn.f32 %f25, %f24, 0f3A800000;
add.rn.f32 %f26, %f25, 0f2B8CBCCC;
rsqrt.approx.f32 %f1, %f26;
add.s64 %rd47, %rd2, %rd599;
ld.global.nc.f32 %f27, [%rd47];
mul.rn.f32 %f28, %f1, %f27;
mul.rn.f32 %f29, %f28, %f23;
add.s64 %rd48, %rd3, %rd599;
ld.global.nc.f32 %f30, [%rd48];
add.s64 %rd603, %rd4, %rd601;
ld.global.nc.f32 %f31, [%rd603];
mul.rn.f32 %f2, %f31, 0f3A800000;
mul.rn.f32 %f32, %f28, %f2;
sub.rn.f32 %f33, %f30, %f32;
add.rn.f32 %f34, %f29, %f33;
add.rn.f32 %f35, %f34, %f22;
add.rn.f32 %f3, %f35, 0f00000000;
or.b32 %r63, %r3, 1;
and.b32 %r64, %r63, 3;
setp.ne.s32 %p5, %r64, 1;
@%p5 bra LBB79_7;
mul.lo.s64 %rd2466, %rd2384, 3528531795;
selp.u64 %rd644, 1, 0, %p69;
add.s64 %rd645, %rd2461, %rd644;
xor.b64 %rd646, %rd645, %rd2466;
shr.u64 %rd647, %rd646, 32;
mul.lo.s64 %rd2469, %rd647, 3449720151;
shr.u64 %rd648, %rd2469, 32;
and.b64 %rd649, %rd645, 4294967295;
mul.lo.s64 %rd650, %rd649, 3449720151;
and.b64 %rd651, %rd650, 4294967295;
xor.b64 %rd652, %rd651, %rd648;
xor.b64 %rd653, %rd652, 2654435769;
mul.lo.s64 %rd2472, %rd653, 3528531795;
xor.b64 %rd2462, %rd650, %rd12;
mov.u32 %r314, -845247145;
mov.u32 %r313, -616729560;
mov.u64 %rd2479, 3041712726;
mov.u64 %rd2478, 1401181199;
mov.u64 %rd2477, 2835769497;
mov.u64 %rd2476, 1684936478;
mov.u64 %rd2475, 2027808484;
mov.u64 %rd2474, 387276957;
mov.u64 %rd2473, 842468239;
mov.u64 %rd2471, 3986602516;
mov.u64 %rd2470, 1013904242;
mov.u64 %rd2468, 3668340011;
mov.u64 %rd2467, 3144134277;
mov.u64 %rd2465, 3449720151;
mov.u64 %rd2464, 1993301258;
mov.u64 %rd2463, 3528531795;
bra.uni LBB79_8;
LBB79_7:
mov.u32 %r313, -239350328;
selp.u64 %rd618, 1, 0, %p69;
add.s64 %rd619, %rd2461, %rd618;
and.b64 %rd620, %rd619, 4294967295;
mul.lo.s64 %rd2466, %rd620, 3449720151;
xor.b64 %rd621, %rd2466, %rd12;
shr.u64 %rd622, %rd621, 32;
mul.lo.s64 %rd2469, %rd622, 3528531795;
shr.u64 %rd623, %rd2469, 32;
mul.lo.s64 %rd625, %rd2384, 3528531795;
and.b64 %rd626, %rd625, 4294967295;
xor.b64 %rd627, %rd626, %rd623;
xor.b64 %rd628, %rd627, 3144134277;
mul.lo.s64 %rd2472, %rd628, 3449720151;
xor.b64 %rd2462, %rd619, %rd625;
mov.u32 %r314, -766435501;
mov.u64 %rd2479, 1684936478;
mov.u64 %rd2478, 534103459;
mov.u64 %rd2477, 387276957;
mov.u64 %rd2476, 3041712726;
mov.u64 %rd2475, 3986602516;
mov.u64 %rd2474, 2835769497;
mov.u64 %rd2473, 3668340011;
mov.u64 %rd2471, 2027808484;
mov.u64 %rd2470, 1993301258;
mov.u64 %rd2468, 842468239;
mov.u64 %rd2467, 2654435769;
mov.u64 %rd2465, 3528531795;
mov.u64 %rd2464, 1013904242;
mov.u64 %rd2463, 3449720151;
LBB79_8:
setp.ne.s32 %p8, %r5, 0;
shr.u64 %rd654, %rd2472, 32;
shr.u64 %rd655, %rd2462, 32;
mul.lo.s64 %rd656, %rd655, %rd2463;
and.b64 %rd657, %rd656, 4294967295;
xor.b64 %rd658, %rd657, %rd654;
xor.b64 %rd659, %rd658, %rd2464;
mul.lo.s64 %rd660, %rd659, %rd2465;
shr.u64 %rd661, %rd660, 32;
shr.u64 %rd662, %rd656, 32;
and.b64 %rd663, %rd2466, 4294967295;
xor.b64 %rd664, %rd663, %rd662;
xor.b64 %rd665, %rd664, %rd2467;
mul.lo.s64 %rd666, %rd665, %rd2465;
and.b64 %rd667, %rd666, 4294967295;
xor.b64 %rd668, %rd667, %rd661;
xor.b64 %rd669, %rd668, %rd2468;
mul.lo.s64 %rd670, %rd669, %rd2463;
shr.u64 %rd671, %rd670, 32;
shr.u64 %rd672, %rd666, 32;
and.b64 %rd673, %rd2469, 4294967295;
xor.b64 %rd674, %rd673, %rd672;
xor.b64 %rd675, %rd674, %rd2470;
mul.lo.s64 %rd676, %rd675, %rd2463;
and.b64 %rd677, %rd676, 4294967295;
xor.b64 %rd678, %rd677, %rd671;
xor.b64 %rd679, %rd678, %rd2471;
mul.lo.s64 %rd680, %rd679, %rd2465;
shr.u64 %rd681, %rd680, 32;
shr.u64 %rd682, %rd676, 32;
and.b64 %rd683, %rd2472, 4294967295;
xor.b64 %rd684, %rd683, %rd682;
xor.b64 %rd685, %rd684, %rd2473;
mul.lo.s64 %rd686, %rd685, %rd2465;
and.b64 %rd687, %rd686, 4294967295;
xor.b64 %rd688, %rd687, %rd681;
xor.b64 %rd689, %rd688, %rd2474;
mul.lo.s64 %rd690, %rd689, %rd2463;
shr.u64 %rd691, %rd690, 32;
shr.u64 %rd692, %rd686, 32;
and.b64 %rd693, %rd660, 4294967295;
xor.b64 %rd694, %rd693, %rd692;
xor.b64 %rd695, %rd694, %rd2475;
mul.lo.s64 %rd696, %rd695, %rd2463;
and.b64 %rd697, %rd696, 4294967295;
xor.b64 %rd698, %rd697, %rd691;
xor.b64 %rd699, %rd698, %rd2476;
mul.lo.s64 %rd700, %rd699, %rd2465;
shr.u64 %rd701, %rd700, 32;
shr.u64 %rd702, %rd696, 32;
and.b64 %rd703, %rd670, 4294967295;
xor.b64 %rd704, %rd703, %rd702;
xor.b64 %rd705, %rd704, %rd2477;
mul.lo.s64 %rd706, %rd705, %rd2465;
and.b64 %rd707, %rd706, 4294967295;
xor.b64 %rd708, %rd707, %rd701;
xor.b64 %rd709, %rd708, %rd2478;
mul.lo.s64 %rd710, %rd709, %rd2463;
shr.u64 %rd711, %rd710, 32;
shr.u64 %rd712, %rd706, 32;
xor.b64 %rd713, %rd680, %rd712;
xor.b64 %rd714, %rd713, %rd2479;
mul.lo.s64 %rd715, %rd714, %rd2463;
xor.b64 %rd716, %rd711, %rd715;
cvt.u32.u64 %r69, %rd716;
xor.b32 %r70, %r313, %r69;
mul.lo.s32 %r71, %r70, %r314;
shr.u32 %r72, %r71, 9;
cvt.rn.f32.u32 %f36, %r72;
mul.rn.f32 %f37, %f36, 0f34000000;
cvt.rn.f16.f32 %h10, %f37;
mov.b16 %h11, 0x2E66;
setp.ge.f16 %p9, %h10, %h11;
ld.global.nc.b16 %h12, [%rd44+2];
ld.global.nc.f32 %f38, [%rd45+4];
cvt.rn.f16.f32 %h13, %f38;
add.rn.f16 %h14, %h12, %h13;
mov.b16 %h15, 0x3C72;
mul.rn.f16 %h16, %h14, %h15;
selp.b16 %h17, %h16, 0x0000, %p9;
cvt.f32.f16 %f39, %h17;
ld.global.nc.b16 %h18, [%rd46+2];
cvt.f32.f16 %f40, %h18;
ld.global.nc.f32 %f41, [%rd47+4];
mul.rn.f32 %f42, %f1, %f41;
mul.rn.f32 %f43, %f42, %f40;
ld.global.nc.f32 %f44, [%rd48+4];
mul.rn.f32 %f45, %f2, %f42;
sub.rn.f32 %f46, %f44, %f45;
add.rn.f32 %f47, %f43, %f46;
add.rn.f32 %f48, %f47, %f39;
add.rn.f32 %f4, %f3, %f48;
or.b32 %r73, %r3, %r4;
or.b32 %r74, %r73, 128;
shr.u32 %r75, %r74, 2;
cvt.u64.u32 %rd717, %r75;
add.s64 %rd75, %rd11, %rd717;
and.b64 %rd2433, %rd75, 4294967295;
setp.lt.u64 %p74, %rd75, %rd11;
@%p8 bra LBB79_10;
// Fall-through arm of `@%p8 bra LBB79_10` (runs when %p8 is false; %p8 is set
// outside this excerpt) for the element at index offset 128.
// It performs the opening multiply/xor mixing of the two counter words and
// loads the constant set (%rd2481..%rd2498, %r315..%r317) read by LBB79_11.
// Multipliers: 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57.
// NOTE(review): these multipliers and the xor constant 2654435769 (0x9E3779B9)
// match the Philox4x32 generator; presumably an inlined Philox RNG - confirm.
mul.lo.s64 %rd2484, %rd2433, 3528531795;
// %p74 is the unsigned wrap-around flag of the low-word add (%rd75 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd760, 1, 0, %p74;
add.s64 %rd761, %rd2461, %rd760;
xor.b64 %rd762, %rd761, %rd2484;
shr.u64 %rd763, %rd762, 32;
mul.lo.s64 %rd2487, %rd763, 3449720151;
shr.u64 %rd764, %rd2487, 32;
and.b64 %rd765, %rd761, 4294967295;
mul.lo.s64 %rd766, %rd765, 3449720151;
and.b64 %rd767, %rd766, 4294967295;
xor.b64 %rd768, %rd767, %rd764;
xor.b64 %rd769, %rd768, 2654435769;
mul.lo.s64 %rd2490, %rd769, 3528531795;
xor.b64 %rd2480, %rd766, %rd75;
// Constants for this arm. LBB79_10 writes the same registers with the two
// multipliers (%rd2481/%rd2483) and most xor constants exchanged.
// %r316 = -845247145 is the 32-bit signed view of 3449720151 (0xCD9E8D57).
mov.u32 %r317, -1879881855;
mov.u32 %r316, -845247145;
mov.u32 %r315, 534103459;
mov.u64 %rd2498, 3678237736;
mov.u64 %rd2497, 3041712726;
mov.u64 %rd2496, 1401181199;
mov.u64 %rd2495, 2835769497;
mov.u64 %rd2494, 1684936478;
mov.u64 %rd2493, 2027808484;
mov.u64 %rd2492, 387276957;
mov.u64 %rd2491, 842468239;
mov.u64 %rd2489, 3986602516;
mov.u64 %rd2488, 1013904242;
mov.u64 %rd2486, 3668340011;
mov.u64 %rd2485, 3144134277;
mov.u64 %rd2483, 3449720151;
mov.u64 %rd2482, 1993301258;
mov.u64 %rd2481, 3528531795;
// Skip the alternate arm and join at the shared mixing code.
bra.uni LBB79_11;
// Alternate arm for the element at index offset 128, entered when %p8 is true
// (%p8 is set outside this excerpt). It mirrors the fall-through arm above it
// with the roles of the two multipliers exchanged: the carry-adjusted high word
// is multiplied by 3449720151 (0xCD9E8D57) and the pre-masked low word %rd2433
// by 3528531795 (0xD2511F53). Falls through into LBB79_11.
LBB79_10:
// %p74 is the unsigned wrap-around flag of the low-word add (%rd75 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd733, 1, 0, %p74;
add.s64 %rd734, %rd2461, %rd733;
and.b64 %rd735, %rd734, 4294967295;
mul.lo.s64 %rd2484, %rd735, 3449720151;
xor.b64 %rd736, %rd2484, %rd75;
shr.u64 %rd737, %rd736, 32;
mul.lo.s64 %rd2487, %rd737, 3528531795;
shr.u64 %rd738, %rd2487, 32;
mul.lo.s64 %rd740, %rd2433, 3528531795;
and.b64 %rd741, %rd740, 4294967295;
xor.b64 %rd742, %rd741, %rd738;
// 3144134277 = 0xBB67AE85 (the fall-through arm xors 0x9E3779B9 here instead).
xor.b64 %rd743, %rd742, 3144134277;
mul.lo.s64 %rd2490, %rd743, 3449720151;
xor.b64 %rd2480, %rd734, %rd740;
// Constants for this arm, read by LBB79_11.
// %r316 = -766435501 is the 32-bit signed view of 3528531795 (0xD2511F53).
mov.u32 %r317, -1767562579;
mov.u32 %r316, -766435501;
mov.u32 %r315, 1401181199;
mov.u64 %rd2498, 4055616968;
mov.u64 %rd2497, 1684936478;
mov.u64 %rd2496, 534103459;
mov.u64 %rd2495, 387276957;
mov.u64 %rd2494, 3041712726;
mov.u64 %rd2493, 3986602516;
mov.u64 %rd2492, 2835769497;
mov.u64 %rd2491, 3668340011;
mov.u64 %rd2489, 2027808484;
mov.u64 %rd2488, 1993301258;
mov.u64 %rd2486, 842468239;
mov.u64 %rd2485, 2654435769;
mov.u64 %rd2483, 3528531795;
mov.u64 %rd2482, 1013904242;
mov.u64 %rd2481, 3449720151;
// Join point of the two %p8 arms for the element at index offset 128.
// Steps: (1) finish the multiply/xor mixing using the arm-selected constants,
// (2) turn the resulting 32-bit word into an f16 value, (3) use it to mask a
// scaled f16 sum, (4) add this element's term to the running f32 sum
// (%f5 = %f4 + term), (5) set up the counter and predicate for offset 129.
LBB79_11:
// (1) Each step multiplies a 32-bit lane by %rd2481 or %rd2483, splits the
// 64-bit product into high (shr 32) and low (and 0xFFFFFFFF) halves, and xors
// in one of the constants %rd2482..%rd2498.
shr.u64 %rd770, %rd2490, 32;
shr.u64 %rd771, %rd2480, 32;
mul.lo.s64 %rd772, %rd771, %rd2481;
and.b64 %rd773, %rd772, 4294967295;
xor.b64 %rd774, %rd773, %rd770;
xor.b64 %rd775, %rd774, %rd2482;
mul.lo.s64 %rd776, %rd775, %rd2483;
shr.u64 %rd777, %rd776, 32;
shr.u64 %rd778, %rd772, 32;
and.b64 %rd779, %rd2484, 4294967295;
xor.b64 %rd780, %rd779, %rd778;
xor.b64 %rd781, %rd780, %rd2485;
mul.lo.s64 %rd782, %rd781, %rd2483;
and.b64 %rd783, %rd782, 4294967295;
xor.b64 %rd784, %rd783, %rd777;
xor.b64 %rd785, %rd784, %rd2486;
mul.lo.s64 %rd786, %rd785, %rd2481;
shr.u64 %rd787, %rd786, 32;
shr.u64 %rd788, %rd782, 32;
and.b64 %rd789, %rd2487, 4294967295;
xor.b64 %rd790, %rd789, %rd788;
xor.b64 %rd791, %rd790, %rd2488;
mul.lo.s64 %rd792, %rd791, %rd2481;
and.b64 %rd793, %rd792, 4294967295;
xor.b64 %rd794, %rd793, %rd787;
xor.b64 %rd795, %rd794, %rd2489;
mul.lo.s64 %rd796, %rd795, %rd2483;
shr.u64 %rd797, %rd796, 32;
shr.u64 %rd798, %rd792, 32;
and.b64 %rd799, %rd2490, 4294967295;
xor.b64 %rd800, %rd799, %rd798;
xor.b64 %rd801, %rd800, %rd2491;
mul.lo.s64 %rd802, %rd801, %rd2483;
and.b64 %rd803, %rd802, 4294967295;
xor.b64 %rd804, %rd803, %rd797;
xor.b64 %rd805, %rd804, %rd2492;
mul.lo.s64 %rd806, %rd805, %rd2481;
shr.u64 %rd807, %rd806, 32;
shr.u64 %rd808, %rd802, 32;
and.b64 %rd809, %rd776, 4294967295;
xor.b64 %rd810, %rd809, %rd808;
xor.b64 %rd811, %rd810, %rd2493;
mul.lo.s64 %rd812, %rd811, %rd2481;
and.b64 %rd813, %rd812, 4294967295;
xor.b64 %rd814, %rd813, %rd807;
xor.b64 %rd815, %rd814, %rd2494;
mul.lo.s64 %rd816, %rd815, %rd2483;
shr.u64 %rd817, %rd816, 32;
shr.u64 %rd818, %rd812, 32;
and.b64 %rd819, %rd786, 4294967295;
xor.b64 %rd820, %rd819, %rd818;
xor.b64 %rd821, %rd820, %rd2495;
mul.lo.s64 %rd822, %rd821, %rd2483;
and.b64 %rd823, %rd822, 4294967295;
xor.b64 %rd824, %rd823, %rd817;
xor.b64 %rd825, %rd824, %rd2496;
mul.lo.s64 %rd826, %rd825, %rd2481;
shr.u64 %rd827, %rd826, 32;
shr.u64 %rd828, %rd822, 32;
and.b64 %rd829, %rd796, 4294967295;
xor.b64 %rd830, %rd829, %rd828;
xor.b64 %rd831, %rd830, %rd2497;
mul.lo.s64 %rd832, %rd831, %rd2481;
and.b64 %rd833, %rd832, 4294967295;
xor.b64 %rd834, %rd833, %rd827;
xor.b64 %rd835, %rd834, %rd2498;
mul.lo.s64 %rd836, %rd835, %rd2483;
// Final 32-bit combine: ((%r315 ^ lane) * %r316) ^ high half ^ %r317.
shr.u64 %rd837, %rd836, 32;
cvt.u32.u64 %r82, %rd837;
shr.u64 %rd838, %rd832, 32;
xor.b64 %rd839, %rd838, %rd806;
cvt.u32.u64 %r83, %rd839;
xor.b32 %r84, %r315, %r83;
mul.lo.s32 %r85, %r84, %r316;
xor.b32 %r86, %r85, %r82;
xor.b32 %r87, %r86, %r317;
// (2) Keep the top 23 bits and scale by 0f34000000 (= 2^-23), giving an f32 in
// [0, 1) that is then rounded to f16.
shr.u32 %r88, %r87, 9;
cvt.rn.f32.u32 %f49, %r88;
mul.rn.f32 %f50, %f49, 0f34000000;
cvt.rn.f16.f32 %h19, %f50;
// (3) 0x2E66 is ~0.09998 in f16; %p12 is true when the value is >= it.
// NOTE(review): with the ~1.1113 (0x3C72) scale below this is consistent with
// inverted dropout at drop rate 0.1 - confirm against the HLO for this fusion.
mov.b16 %h20, 0x2E66;
setp.ge.f16 %p12, %h19, %h20;
// Read-only loads at element offset 128 (f16 arrays: +256 bytes, f32: +512).
ld.global.nc.b16 %h21, [%rd44+256];
ld.global.nc.f32 %f51, [%rd45+512];
cvt.rn.f16.f32 %h22, %f51;
add.rn.f16 %h23, %h21, %h22;
mov.b16 %h24, 0x3C72;
mul.rn.f16 %h25, %h23, %h24;
// Keep the scaled sum when %p12 holds, otherwise +0.0.
selp.b16 %h26, %h25, 0x0000, %p12;
cvt.f32.f16 %f52, %h26;
// (4) term = (%f1*a)*x + (b - %f2*(%f1*a)) + masked, with x = [%rd46],
// a = [%rd47], b = [%rd48]; %f1 and %f2 are set outside this excerpt.
ld.global.nc.b16 %h27, [%rd46+256];
cvt.f32.f16 %f53, %h27;
ld.global.nc.f32 %f54, [%rd47+512];
mul.rn.f32 %f55, %f1, %f54;
mul.rn.f32 %f56, %f55, %f53;
ld.global.nc.f32 %f57, [%rd48+512];
mul.rn.f32 %f58, %f2, %f55;
sub.rn.f32 %f59, %f57, %f58;
add.rn.f32 %f60, %f56, %f59;
add.rn.f32 %f61, %f60, %f52;
add.rn.f32 %f5, %f4, %f61;
// (5) Next element: index = %r3 | %r4 | 129. index >> 2 is added to the low
// counter word %rd11. Bit 0 is forced by the 129, so %p13 (low two bits of
// %r3 | 129 differ from 1) is true exactly when bit 1 of %r3 is set.
or.b32 %r89, %r3, 129;
or.b32 %r90, %r89, %r4;
and.b32 %r91, %r89, 3;
shr.u32 %r92, %r90, 2;
setp.ne.s32 %p13, %r91, 1;
cvt.u64.u32 %rd840, %r92;
add.s64 %rd103, %rd11, %rd840;
and.b64 %rd2430, %rd103, 4294967295;
// %p73: the 64-bit add wrapped, so a carry is owed to the high word.
setp.lt.u64 %p73, %rd103, %rd11;
@%p13 bra LBB79_13;
// Fall-through arm of `@%p13 bra LBB79_13` (runs when the low two bits of
// %r3 | 129 equal 1) for the element at index offset 129.
// It performs the opening multiply/xor mixing of the two counter words and
// loads the constant set (%rd2500..%rd2516, %r318, %r319) read by LBB79_14.
// Multipliers: 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57.
// NOTE(review): these multipliers and the xor constant 2654435769 (0x9E3779B9)
// match the Philox4x32 generator; presumably an inlined Philox RNG - confirm.
mul.lo.s64 %rd2503, %rd2430, 3528531795;
// %p73 is the unsigned wrap-around flag of the low-word add (%rd103 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd881, 1, 0, %p73;
add.s64 %rd882, %rd2461, %rd881;
xor.b64 %rd883, %rd882, %rd2503;
shr.u64 %rd884, %rd883, 32;
mul.lo.s64 %rd2506, %rd884, 3449720151;
shr.u64 %rd885, %rd2506, 32;
and.b64 %rd886, %rd882, 4294967295;
mul.lo.s64 %rd887, %rd886, 3449720151;
and.b64 %rd888, %rd887, 4294967295;
xor.b64 %rd889, %rd888, %rd885;
xor.b64 %rd890, %rd889, 2654435769;
mul.lo.s64 %rd2509, %rd890, 3528531795;
xor.b64 %rd2499, %rd887, %rd103;
// Constants for this arm. LBB79_13 writes the same registers with the two
// multipliers (%rd2500/%rd2502) and most xor constants exchanged.
// %r319 = -845247145 is the 32-bit signed view of 3449720151 (0xCD9E8D57).
mov.u32 %r319, -845247145;
mov.u32 %r318, -616729560;
mov.u64 %rd2516, 3041712726;
mov.u64 %rd2515, 1401181199;
mov.u64 %rd2514, 2835769497;
mov.u64 %rd2513, 1684936478;
mov.u64 %rd2512, 2027808484;
mov.u64 %rd2511, 387276957;
mov.u64 %rd2510, 842468239;
mov.u64 %rd2508, 3986602516;
mov.u64 %rd2507, 1013904242;
mov.u64 %rd2505, 3668340011;
mov.u64 %rd2504, 3144134277;
mov.u64 %rd2502, 3449720151;
mov.u64 %rd2501, 1993301258;
mov.u64 %rd2500, 3528531795;
// Skip the alternate arm and join at the shared mixing code.
bra.uni LBB79_14;
// Alternate arm for the element at index offset 129, entered when %p13 is true.
// It mirrors the fall-through arm above it with the roles of the two
// multipliers exchanged: the carry-adjusted high word is multiplied by
// 3449720151 (0xCD9E8D57) and the pre-masked low word %rd2430 by 3528531795
// (0xD2511F53). Falls through into LBB79_14.
LBB79_13:
// %p73 is the unsigned wrap-around flag of the low-word add (%rd103 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd855, 1, 0, %p73;
add.s64 %rd856, %rd2461, %rd855;
and.b64 %rd857, %rd856, 4294967295;
mul.lo.s64 %rd2503, %rd857, 3449720151;
xor.b64 %rd858, %rd2503, %rd103;
shr.u64 %rd859, %rd858, 32;
mul.lo.s64 %rd2506, %rd859, 3528531795;
shr.u64 %rd860, %rd2506, 32;
mul.lo.s64 %rd862, %rd2430, 3528531795;
and.b64 %rd863, %rd862, 4294967295;
xor.b64 %rd864, %rd863, %rd860;
// 3144134277 = 0xBB67AE85 (the fall-through arm xors 0x9E3779B9 here instead).
xor.b64 %rd865, %rd864, 3144134277;
mul.lo.s64 %rd2509, %rd865, 3449720151;
xor.b64 %rd2499, %rd856, %rd862;
// Constants for this arm, read by LBB79_14.
// %r319 = -766435501 is the 32-bit signed view of 3528531795 (0xD2511F53).
mov.u32 %r319, -766435501;
mov.u32 %r318, -239350328;
mov.u64 %rd2516, 1684936478;
mov.u64 %rd2515, 534103459;
mov.u64 %rd2514, 387276957;
mov.u64 %rd2513, 3041712726;
mov.u64 %rd2512, 3986602516;
mov.u64 %rd2511, 2835769497;
mov.u64 %rd2510, 3668340011;
mov.u64 %rd2508, 2027808484;
mov.u64 %rd2507, 1993301258;
mov.u64 %rd2505, 842468239;
mov.u64 %rd2504, 2654435769;
mov.u64 %rd2502, 3528531795;
mov.u64 %rd2501, 1013904242;
mov.u64 %rd2500, 3449720151;
// Join point of the two %p13 arms for the element at index offset 129.
// Steps: (1) finish the multiply/xor mixing using the arm-selected constants,
// (2) turn the resulting 32-bit word into an f16 value, (3) use it to mask a
// scaled f16 sum, (4) add this element's term to the running f32 sum
// (%f6 = %f5 + term), (5) set up the counter for offset 256.
LBB79_14:
// (1) Each step multiplies a 32-bit lane by %rd2500 or %rd2502, splits the
// 64-bit product into high (shr 32) and low (and 0xFFFFFFFF) halves, and xors
// in one of the constants %rd2501..%rd2516.
shr.u64 %rd891, %rd2509, 32;
shr.u64 %rd892, %rd2499, 32;
mul.lo.s64 %rd893, %rd892, %rd2500;
and.b64 %rd894, %rd893, 4294967295;
xor.b64 %rd895, %rd894, %rd891;
xor.b64 %rd896, %rd895, %rd2501;
mul.lo.s64 %rd897, %rd896, %rd2502;
shr.u64 %rd898, %rd897, 32;
shr.u64 %rd899, %rd893, 32;
and.b64 %rd900, %rd2503, 4294967295;
xor.b64 %rd901, %rd900, %rd899;
xor.b64 %rd902, %rd901, %rd2504;
mul.lo.s64 %rd903, %rd902, %rd2502;
and.b64 %rd904, %rd903, 4294967295;
xor.b64 %rd905, %rd904, %rd898;
xor.b64 %rd906, %rd905, %rd2505;
mul.lo.s64 %rd907, %rd906, %rd2500;
shr.u64 %rd908, %rd907, 32;
shr.u64 %rd909, %rd903, 32;
and.b64 %rd910, %rd2506, 4294967295;
xor.b64 %rd911, %rd910, %rd909;
xor.b64 %rd912, %rd911, %rd2507;
mul.lo.s64 %rd913, %rd912, %rd2500;
and.b64 %rd914, %rd913, 4294967295;
xor.b64 %rd915, %rd914, %rd908;
xor.b64 %rd916, %rd915, %rd2508;
mul.lo.s64 %rd917, %rd916, %rd2502;
shr.u64 %rd918, %rd917, 32;
shr.u64 %rd919, %rd913, 32;
and.b64 %rd920, %rd2509, 4294967295;
xor.b64 %rd921, %rd920, %rd919;
xor.b64 %rd922, %rd921, %rd2510;
mul.lo.s64 %rd923, %rd922, %rd2502;
and.b64 %rd924, %rd923, 4294967295;
xor.b64 %rd925, %rd924, %rd918;
xor.b64 %rd926, %rd925, %rd2511;
mul.lo.s64 %rd927, %rd926, %rd2500;
shr.u64 %rd928, %rd927, 32;
shr.u64 %rd929, %rd923, 32;
and.b64 %rd930, %rd897, 4294967295;
xor.b64 %rd931, %rd930, %rd929;
xor.b64 %rd932, %rd931, %rd2512;
mul.lo.s64 %rd933, %rd932, %rd2500;
and.b64 %rd934, %rd933, 4294967295;
xor.b64 %rd935, %rd934, %rd928;
xor.b64 %rd936, %rd935, %rd2513;
mul.lo.s64 %rd937, %rd936, %rd2502;
shr.u64 %rd938, %rd937, 32;
shr.u64 %rd939, %rd933, 32;
and.b64 %rd940, %rd907, 4294967295;
xor.b64 %rd941, %rd940, %rd939;
xor.b64 %rd942, %rd941, %rd2514;
mul.lo.s64 %rd943, %rd942, %rd2502;
and.b64 %rd944, %rd943, 4294967295;
xor.b64 %rd945, %rd944, %rd938;
xor.b64 %rd946, %rd945, %rd2515;
mul.lo.s64 %rd947, %rd946, %rd2500;
shr.u64 %rd948, %rd947, 32;
shr.u64 %rd949, %rd943, 32;
xor.b64 %rd950, %rd917, %rd949;
xor.b64 %rd951, %rd950, %rd2516;
mul.lo.s64 %rd952, %rd951, %rd2500;
// Final 32-bit combine: (%r318 ^ low 32 bits of the mix) * %r319.
xor.b64 %rd953, %rd948, %rd952;
cvt.u32.u64 %r97, %rd953;
xor.b32 %r98, %r318, %r97;
mul.lo.s32 %r99, %r98, %r319;
// (2) Keep the top 23 bits and scale by 0f34000000 (= 2^-23), giving an f32 in
// [0, 1) that is then rounded to f16.
shr.u32 %r100, %r99, 9;
cvt.rn.f32.u32 %f62, %r100;
mul.rn.f32 %f63, %f62, 0f34000000;
cvt.rn.f16.f32 %h28, %f63;
// (3) 0x2E66 is ~0.09998 in f16; %p17 is true when the value is >= it.
// NOTE(review): with the ~1.1113 (0x3C72) scale below this is consistent with
// inverted dropout at drop rate 0.1 - confirm against the HLO for this fusion.
mov.b16 %h29, 0x2E66;
setp.ge.f16 %p17, %h28, %h29;
// Read-only loads at element offset 129 (f16 arrays: +258 bytes, f32: +516).
ld.global.nc.b16 %h30, [%rd44+258];
ld.global.nc.f32 %f64, [%rd45+516];
cvt.rn.f16.f32 %h31, %f64;
add.rn.f16 %h32, %h30, %h31;
mov.b16 %h33, 0x3C72;
mul.rn.f16 %h34, %h32, %h33;
// Keep the scaled sum when %p17 holds, otherwise +0.0.
selp.b16 %h35, %h34, 0x0000, %p17;
cvt.f32.f16 %f65, %h35;
// (4) term = (%f1*a)*x + (b - %f2*(%f1*a)) + masked, with x = [%rd46],
// a = [%rd47], b = [%rd48]; %f1 and %f2 are set outside this excerpt.
ld.global.nc.b16 %h36, [%rd46+258];
cvt.f32.f16 %f66, %h36;
ld.global.nc.f32 %f67, [%rd47+516];
mul.rn.f32 %f68, %f1, %f67;
mul.rn.f32 %f69, %f68, %f66;
ld.global.nc.f32 %f70, [%rd48+516];
mul.rn.f32 %f71, %f2, %f68;
sub.rn.f32 %f72, %f70, %f71;
add.rn.f32 %f73, %f69, %f72;
add.rn.f32 %f74, %f73, %f65;
add.rn.f32 %f6, %f5, %f74;
// (5) Next element: index = %r73 | 256 (%r73 = %r3 | %r4, computed earlier in
// the kernel). index >> 2 is added to the low counter word %rd11.
or.b32 %r102, %r73, 256;
shr.u32 %r103, %r102, 2;
cvt.u64.u32 %rd954, %r103;
add.s64 %rd130, %rd11, %rd954;
and.b64 %rd2426, %rd130, 4294967295;
// %p72: the 64-bit add wrapped, so a carry is owed to the high word.
setp.lt.u64 %p72, %rd130, %rd11;
// %p8 is set outside this excerpt and selects the arm.
@%p8 bra LBB79_16;
// Fall-through arm of `@%p8 bra LBB79_16` (runs when %p8 is false; %p8 is set
// outside this excerpt) for the element at index offset 256.
// It performs the opening multiply/xor mixing of the two counter words and
// loads the constant set (%rd2518..%rd2535, %r320..%r322) read by LBB79_17.
// Multipliers: 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57.
// NOTE(review): these multipliers and the xor constant 2654435769 (0x9E3779B9)
// match the Philox4x32 generator; presumably an inlined Philox RNG - confirm.
mul.lo.s64 %rd2521, %rd2426, 3528531795;
// %p72 is the unsigned wrap-around flag of the low-word add (%rd130 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd997, 1, 0, %p72;
add.s64 %rd998, %rd2461, %rd997;
xor.b64 %rd999, %rd998, %rd2521;
shr.u64 %rd1000, %rd999, 32;
mul.lo.s64 %rd2524, %rd1000, 3449720151;
shr.u64 %rd1001, %rd2524, 32;
and.b64 %rd1002, %rd998, 4294967295;
mul.lo.s64 %rd1003, %rd1002, 3449720151;
and.b64 %rd1004, %rd1003, 4294967295;
xor.b64 %rd1005, %rd1004, %rd1001;
xor.b64 %rd1006, %rd1005, 2654435769;
mul.lo.s64 %rd2527, %rd1006, 3528531795;
xor.b64 %rd2517, %rd1003, %rd130;
// Constants for this arm. LBB79_16 writes the same registers with the two
// multipliers (%rd2518/%rd2520) and most xor constants exchanged.
// %r321 = -845247145 is the 32-bit signed view of 3449720151 (0xCD9E8D57).
mov.u32 %r322, -1879881855;
mov.u32 %r321, -845247145;
mov.u32 %r320, 534103459;
mov.u64 %rd2535, 3678237736;
mov.u64 %rd2534, 3041712726;
mov.u64 %rd2533, 1401181199;
mov.u64 %rd2532, 2835769497;
mov.u64 %rd2531, 1684936478;
mov.u64 %rd2530, 2027808484;
mov.u64 %rd2529, 387276957;
mov.u64 %rd2528, 842468239;
mov.u64 %rd2526, 3986602516;
mov.u64 %rd2525, 1013904242;
mov.u64 %rd2523, 3668340011;
mov.u64 %rd2522, 3144134277;
mov.u64 %rd2520, 3449720151;
mov.u64 %rd2519, 1993301258;
mov.u64 %rd2518, 3528531795;
// Skip the alternate arm and join at the shared mixing code.
bra.uni LBB79_17;
// Alternate arm for the element at index offset 256, entered when %p8 is true
// (%p8 is set outside this excerpt). It mirrors the fall-through arm above it
// with the roles of the two multipliers exchanged: the carry-adjusted high word
// is multiplied by 3449720151 (0xCD9E8D57) and the pre-masked low word %rd2426
// by 3528531795 (0xD2511F53). Falls through into LBB79_17.
LBB79_16:
// %p72 is the unsigned wrap-around flag of the low-word add (%rd130 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd970, 1, 0, %p72;
add.s64 %rd971, %rd2461, %rd970;
and.b64 %rd972, %rd971, 4294967295;
mul.lo.s64 %rd2521, %rd972, 3449720151;
xor.b64 %rd973, %rd2521, %rd130;
shr.u64 %rd974, %rd973, 32;
mul.lo.s64 %rd2524, %rd974, 3528531795;
shr.u64 %rd975, %rd2524, 32;
mul.lo.s64 %rd977, %rd2426, 3528531795;
and.b64 %rd978, %rd977, 4294967295;
xor.b64 %rd979, %rd978, %rd975;
// 3144134277 = 0xBB67AE85 (the fall-through arm xors 0x9E3779B9 here instead).
xor.b64 %rd980, %rd979, 3144134277;
mul.lo.s64 %rd2527, %rd980, 3449720151;
xor.b64 %rd2517, %rd971, %rd977;
// Constants for this arm, read by LBB79_17.
// %r321 = -766435501 is the 32-bit signed view of 3528531795 (0xD2511F53).
mov.u32 %r322, -1767562579;
mov.u32 %r321, -766435501;
mov.u32 %r320, 1401181199;
mov.u64 %rd2535, 4055616968;
mov.u64 %rd2534, 1684936478;
mov.u64 %rd2533, 534103459;
mov.u64 %rd2532, 387276957;
mov.u64 %rd2531, 3041712726;
mov.u64 %rd2530, 3986602516;
mov.u64 %rd2529, 2835769497;
mov.u64 %rd2528, 3668340011;
mov.u64 %rd2526, 2027808484;
mov.u64 %rd2525, 1993301258;
mov.u64 %rd2523, 842468239;
mov.u64 %rd2522, 2654435769;
mov.u64 %rd2520, 3528531795;
mov.u64 %rd2519, 1013904242;
mov.u64 %rd2518, 3449720151;
// Join point of the two %p8 arms for the element at index offset 256.
// Steps: (1) finish the multiply/xor mixing using the arm-selected constants,
// (2) turn the resulting 32-bit word into an f16 value, (3) use it to mask a
// scaled f16 sum, (4) add this element's term to the running f32 sum
// (%f7 = %f6 + term), (5) set up the counter and predicate for offset 257.
LBB79_17:
// (1) Each step multiplies a 32-bit lane by %rd2518 or %rd2520, splits the
// 64-bit product into high (shr 32) and low (and 0xFFFFFFFF) halves, and xors
// in one of the constants %rd2519..%rd2535.
shr.u64 %rd1007, %rd2527, 32;
shr.u64 %rd1008, %rd2517, 32;
mul.lo.s64 %rd1009, %rd1008, %rd2518;
and.b64 %rd1010, %rd1009, 4294967295;
xor.b64 %rd1011, %rd1010, %rd1007;
xor.b64 %rd1012, %rd1011, %rd2519;
mul.lo.s64 %rd1013, %rd1012, %rd2520;
shr.u64 %rd1014, %rd1013, 32;
shr.u64 %rd1015, %rd1009, 32;
and.b64 %rd1016, %rd2521, 4294967295;
xor.b64 %rd1017, %rd1016, %rd1015;
xor.b64 %rd1018, %rd1017, %rd2522;
mul.lo.s64 %rd1019, %rd1018, %rd2520;
and.b64 %rd1020, %rd1019, 4294967295;
xor.b64 %rd1021, %rd1020, %rd1014;
xor.b64 %rd1022, %rd1021, %rd2523;
mul.lo.s64 %rd1023, %rd1022, %rd2518;
shr.u64 %rd1024, %rd1023, 32;
shr.u64 %rd1025, %rd1019, 32;
and.b64 %rd1026, %rd2524, 4294967295;
xor.b64 %rd1027, %rd1026, %rd1025;
xor.b64 %rd1028, %rd1027, %rd2525;
mul.lo.s64 %rd1029, %rd1028, %rd2518;
and.b64 %rd1030, %rd1029, 4294967295;
xor.b64 %rd1031, %rd1030, %rd1024;
xor.b64 %rd1032, %rd1031, %rd2526;
mul.lo.s64 %rd1033, %rd1032, %rd2520;
shr.u64 %rd1034, %rd1033, 32;
shr.u64 %rd1035, %rd1029, 32;
and.b64 %rd1036, %rd2527, 4294967295;
xor.b64 %rd1037, %rd1036, %rd1035;
xor.b64 %rd1038, %rd1037, %rd2528;
mul.lo.s64 %rd1039, %rd1038, %rd2520;
and.b64 %rd1040, %rd1039, 4294967295;
xor.b64 %rd1041, %rd1040, %rd1034;
xor.b64 %rd1042, %rd1041, %rd2529;
mul.lo.s64 %rd1043, %rd1042, %rd2518;
shr.u64 %rd1044, %rd1043, 32;
shr.u64 %rd1045, %rd1039, 32;
and.b64 %rd1046, %rd1013, 4294967295;
xor.b64 %rd1047, %rd1046, %rd1045;
xor.b64 %rd1048, %rd1047, %rd2530;
mul.lo.s64 %rd1049, %rd1048, %rd2518;
and.b64 %rd1050, %rd1049, 4294967295;
xor.b64 %rd1051, %rd1050, %rd1044;
xor.b64 %rd1052, %rd1051, %rd2531;
mul.lo.s64 %rd1053, %rd1052, %rd2520;
shr.u64 %rd1054, %rd1053, 32;
shr.u64 %rd1055, %rd1049, 32;
and.b64 %rd1056, %rd1023, 4294967295;
xor.b64 %rd1057, %rd1056, %rd1055;
xor.b64 %rd1058, %rd1057, %rd2532;
mul.lo.s64 %rd1059, %rd1058, %rd2520;
and.b64 %rd1060, %rd1059, 4294967295;
xor.b64 %rd1061, %rd1060, %rd1054;
xor.b64 %rd1062, %rd1061, %rd2533;
mul.lo.s64 %rd1063, %rd1062, %rd2518;
shr.u64 %rd1064, %rd1063, 32;
shr.u64 %rd1065, %rd1059, 32;
and.b64 %rd1066, %rd1033, 4294967295;
xor.b64 %rd1067, %rd1066, %rd1065;
xor.b64 %rd1068, %rd1067, %rd2534;
mul.lo.s64 %rd1069, %rd1068, %rd2518;
and.b64 %rd1070, %rd1069, 4294967295;
xor.b64 %rd1071, %rd1070, %rd1064;
xor.b64 %rd1072, %rd1071, %rd2535;
mul.lo.s64 %rd1073, %rd1072, %rd2520;
// Final 32-bit combine: ((%r320 ^ lane) * %r321) ^ high half ^ %r322.
shr.u64 %rd1074, %rd1073, 32;
cvt.u32.u64 %r110, %rd1074;
shr.u64 %rd1075, %rd1069, 32;
xor.b64 %rd1076, %rd1075, %rd1043;
cvt.u32.u64 %r111, %rd1076;
xor.b32 %r112, %r320, %r111;
mul.lo.s32 %r113, %r112, %r321;
xor.b32 %r114, %r113, %r110;
xor.b32 %r115, %r114, %r322;
// (2) Keep the top 23 bits and scale by 0f34000000 (= 2^-23), giving an f32 in
// [0, 1) that is then rounded to f16.
shr.u32 %r116, %r115, 9;
cvt.rn.f32.u32 %f75, %r116;
mul.rn.f32 %f76, %f75, 0f34000000;
cvt.rn.f16.f32 %h37, %f76;
// (3) 0x2E66 is ~0.09998 in f16; %p20 is true when the value is >= it.
// NOTE(review): with the ~1.1113 (0x3C72) scale below this is consistent with
// inverted dropout at drop rate 0.1 - confirm against the HLO for this fusion.
mov.b16 %h38, 0x2E66;
setp.ge.f16 %p20, %h37, %h38;
// Read-only loads at element offset 256 (f16 arrays: +512 bytes, f32: +1024).
ld.global.nc.b16 %h39, [%rd44+512];
ld.global.nc.f32 %f77, [%rd45+1024];
cvt.rn.f16.f32 %h40, %f77;
add.rn.f16 %h41, %h39, %h40;
mov.b16 %h42, 0x3C72;
mul.rn.f16 %h43, %h41, %h42;
// Keep the scaled sum when %p20 holds, otherwise +0.0.
selp.b16 %h44, %h43, 0x0000, %p20;
cvt.f32.f16 %f78, %h44;
// (4) term = (%f1*a)*x + (b - %f2*(%f1*a)) + masked, with x = [%rd46],
// a = [%rd47], b = [%rd48]; %f1 and %f2 are set outside this excerpt.
ld.global.nc.b16 %h45, [%rd46+512];
cvt.f32.f16 %f79, %h45;
ld.global.nc.f32 %f80, [%rd47+1024];
mul.rn.f32 %f81, %f1, %f80;
mul.rn.f32 %f82, %f81, %f79;
ld.global.nc.f32 %f83, [%rd48+1024];
mul.rn.f32 %f84, %f2, %f81;
sub.rn.f32 %f85, %f83, %f84;
add.rn.f32 %f86, %f82, %f85;
add.rn.f32 %f87, %f86, %f78;
add.rn.f32 %f7, %f6, %f87;
// (5) Next element: index = %r3 | %r4 | 257. index >> 2 is added to the low
// counter word %rd11. Bit 0 is forced by the 257, so %p21 (low two bits of
// %r3 | 257 differ from 1) is true exactly when bit 1 of %r3 is set.
or.b32 %r117, %r3, 257;
or.b32 %r118, %r117, %r4;
and.b32 %r119, %r117, 3;
shr.u32 %r120, %r118, 2;
setp.ne.s32 %p21, %r119, 1;
cvt.u64.u32 %rd1077, %r120;
add.s64 %rd158, %rd11, %rd1077;
and.b64 %rd2423, %rd158, 4294967295;
// %p71: the 64-bit add wrapped, so a carry is owed to the high word.
setp.lt.u64 %p71, %rd158, %rd11;
@%p21 bra LBB79_19;
// Fall-through arm of `@%p21 bra LBB79_19` (runs when the low two bits of
// %r3 | 257 equal 1) for the element at index offset 257.
// It performs the opening multiply/xor mixing of the two counter words and
// loads the constant set (%rd2537..%rd2553, %r323, %r324) read by LBB79_20.
// Multipliers: 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57.
// NOTE(review): these multipliers and the xor constant 2654435769 (0x9E3779B9)
// match the Philox4x32 generator; presumably an inlined Philox RNG - confirm.
mul.lo.s64 %rd2540, %rd2423, 3528531795;
// %p71 is the unsigned wrap-around flag of the low-word add (%rd158 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd1118, 1, 0, %p71;
add.s64 %rd1119, %rd2461, %rd1118;
xor.b64 %rd1120, %rd1119, %rd2540;
shr.u64 %rd1121, %rd1120, 32;
mul.lo.s64 %rd2543, %rd1121, 3449720151;
shr.u64 %rd1122, %rd2543, 32;
and.b64 %rd1123, %rd1119, 4294967295;
mul.lo.s64 %rd1124, %rd1123, 3449720151;
and.b64 %rd1125, %rd1124, 4294967295;
xor.b64 %rd1126, %rd1125, %rd1122;
xor.b64 %rd1127, %rd1126, 2654435769;
mul.lo.s64 %rd2546, %rd1127, 3528531795;
xor.b64 %rd2536, %rd1124, %rd158;
// Constants for this arm. LBB79_19 writes the same registers with the two
// multipliers (%rd2537/%rd2539) and most xor constants exchanged.
// %r324 = -845247145 is the 32-bit signed view of 3449720151 (0xCD9E8D57).
mov.u32 %r324, -845247145;
mov.u32 %r323, -616729560;
mov.u64 %rd2553, 3041712726;
mov.u64 %rd2552, 1401181199;
mov.u64 %rd2551, 2835769497;
mov.u64 %rd2550, 1684936478;
mov.u64 %rd2549, 2027808484;
mov.u64 %rd2548, 387276957;
mov.u64 %rd2547, 842468239;
mov.u64 %rd2545, 3986602516;
mov.u64 %rd2544, 1013904242;
mov.u64 %rd2542, 3668340011;
mov.u64 %rd2541, 3144134277;
mov.u64 %rd2539, 3449720151;
mov.u64 %rd2538, 1993301258;
mov.u64 %rd2537, 3528531795;
// Skip the alternate arm and join at the shared mixing code.
bra.uni LBB79_20;
// Alternate arm for the element at index offset 257, entered when %p21 is true.
// It mirrors the fall-through arm above it with the roles of the two
// multipliers exchanged: the carry-adjusted high word is multiplied by
// 3449720151 (0xCD9E8D57) and the pre-masked low word %rd2423 by 3528531795
// (0xD2511F53). Falls through into LBB79_20.
LBB79_19:
// %p71 is the unsigned wrap-around flag of the low-word add (%rd158 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd1092, 1, 0, %p71;
add.s64 %rd1093, %rd2461, %rd1092;
and.b64 %rd1094, %rd1093, 4294967295;
mul.lo.s64 %rd2540, %rd1094, 3449720151;
xor.b64 %rd1095, %rd2540, %rd158;
shr.u64 %rd1096, %rd1095, 32;
mul.lo.s64 %rd2543, %rd1096, 3528531795;
shr.u64 %rd1097, %rd2543, 32;
mul.lo.s64 %rd1099, %rd2423, 3528531795;
and.b64 %rd1100, %rd1099, 4294967295;
xor.b64 %rd1101, %rd1100, %rd1097;
// 3144134277 = 0xBB67AE85 (the fall-through arm xors 0x9E3779B9 here instead).
xor.b64 %rd1102, %rd1101, 3144134277;
mul.lo.s64 %rd2546, %rd1102, 3449720151;
xor.b64 %rd2536, %rd1093, %rd1099;
// Constants for this arm, read by LBB79_20.
// %r324 = -766435501 is the 32-bit signed view of 3528531795 (0xD2511F53).
mov.u32 %r324, -766435501;
mov.u32 %r323, -239350328;
mov.u64 %rd2553, 1684936478;
mov.u64 %rd2552, 534103459;
mov.u64 %rd2551, 387276957;
mov.u64 %rd2550, 3041712726;
mov.u64 %rd2549, 3986602516;
mov.u64 %rd2548, 2835769497;
mov.u64 %rd2547, 3668340011;
mov.u64 %rd2545, 2027808484;
mov.u64 %rd2544, 1993301258;
mov.u64 %rd2542, 842468239;
mov.u64 %rd2541, 2654435769;
mov.u64 %rd2539, 3528531795;
mov.u64 %rd2538, 1013904242;
mov.u64 %rd2537, 3449720151;
// Join point of the two %p21 arms for the element at index offset 257.
// Steps: (1) finish the multiply/xor mixing using the arm-selected constants,
// (2) turn the resulting 32-bit word into an f16 value, (3) use it to mask a
// scaled f16 sum, (4) add this element's term to the running f32 sum
// (%f8 = %f7 + term), (5) set up the counter for offset 384.
LBB79_20:
// (1) Each step multiplies a 32-bit lane by %rd2537 or %rd2539, splits the
// 64-bit product into high (shr 32) and low (and 0xFFFFFFFF) halves, and xors
// in one of the constants %rd2538..%rd2553.
shr.u64 %rd1128, %rd2546, 32;
shr.u64 %rd1129, %rd2536, 32;
mul.lo.s64 %rd1130, %rd1129, %rd2537;
and.b64 %rd1131, %rd1130, 4294967295;
xor.b64 %rd1132, %rd1131, %rd1128;
xor.b64 %rd1133, %rd1132, %rd2538;
mul.lo.s64 %rd1134, %rd1133, %rd2539;
shr.u64 %rd1135, %rd1134, 32;
shr.u64 %rd1136, %rd1130, 32;
and.b64 %rd1137, %rd2540, 4294967295;
xor.b64 %rd1138, %rd1137, %rd1136;
xor.b64 %rd1139, %rd1138, %rd2541;
mul.lo.s64 %rd1140, %rd1139, %rd2539;
and.b64 %rd1141, %rd1140, 4294967295;
xor.b64 %rd1142, %rd1141, %rd1135;
xor.b64 %rd1143, %rd1142, %rd2542;
mul.lo.s64 %rd1144, %rd1143, %rd2537;
shr.u64 %rd1145, %rd1144, 32;
shr.u64 %rd1146, %rd1140, 32;
and.b64 %rd1147, %rd2543, 4294967295;
xor.b64 %rd1148, %rd1147, %rd1146;
xor.b64 %rd1149, %rd1148, %rd2544;
mul.lo.s64 %rd1150, %rd1149, %rd2537;
and.b64 %rd1151, %rd1150, 4294967295;
xor.b64 %rd1152, %rd1151, %rd1145;
xor.b64 %rd1153, %rd1152, %rd2545;
mul.lo.s64 %rd1154, %rd1153, %rd2539;
shr.u64 %rd1155, %rd1154, 32;
shr.u64 %rd1156, %rd1150, 32;
and.b64 %rd1157, %rd2546, 4294967295;
xor.b64 %rd1158, %rd1157, %rd1156;
xor.b64 %rd1159, %rd1158, %rd2547;
mul.lo.s64 %rd1160, %rd1159, %rd2539;
and.b64 %rd1161, %rd1160, 4294967295;
xor.b64 %rd1162, %rd1161, %rd1155;
xor.b64 %rd1163, %rd1162, %rd2548;
mul.lo.s64 %rd1164, %rd1163, %rd2537;
shr.u64 %rd1165, %rd1164, 32;
shr.u64 %rd1166, %rd1160, 32;
and.b64 %rd1167, %rd1134, 4294967295;
xor.b64 %rd1168, %rd1167, %rd1166;
xor.b64 %rd1169, %rd1168, %rd2549;
mul.lo.s64 %rd1170, %rd1169, %rd2537;
and.b64 %rd1171, %rd1170, 4294967295;
xor.b64 %rd1172, %rd1171, %rd1165;
xor.b64 %rd1173, %rd1172, %rd2550;
mul.lo.s64 %rd1174, %rd1173, %rd2539;
shr.u64 %rd1175, %rd1174, 32;
shr.u64 %rd1176, %rd1170, 32;
and.b64 %rd1177, %rd1144, 4294967295;
xor.b64 %rd1178, %rd1177, %rd1176;
xor.b64 %rd1179, %rd1178, %rd2551;
mul.lo.s64 %rd1180, %rd1179, %rd2539;
and.b64 %rd1181, %rd1180, 4294967295;
xor.b64 %rd1182, %rd1181, %rd1175;
xor.b64 %rd1183, %rd1182, %rd2552;
mul.lo.s64 %rd1184, %rd1183, %rd2537;
shr.u64 %rd1185, %rd1184, 32;
shr.u64 %rd1186, %rd1180, 32;
xor.b64 %rd1187, %rd1154, %rd1186;
xor.b64 %rd1188, %rd1187, %rd2553;
mul.lo.s64 %rd1189, %rd1188, %rd2537;
// Final 32-bit combine: (%r323 ^ low 32 bits of the mix) * %r324.
xor.b64 %rd1190, %rd1185, %rd1189;
cvt.u32.u64 %r125, %rd1190;
xor.b32 %r126, %r323, %r125;
mul.lo.s32 %r127, %r126, %r324;
// (2) Keep the top 23 bits and scale by 0f34000000 (= 2^-23), giving an f32 in
// [0, 1) that is then rounded to f16.
shr.u32 %r128, %r127, 9;
cvt.rn.f32.u32 %f88, %r128;
mul.rn.f32 %f89, %f88, 0f34000000;
cvt.rn.f16.f32 %h46, %f89;
// (3) 0x2E66 is ~0.09998 in f16; %p25 is true when the value is >= it.
// NOTE(review): with the ~1.1113 (0x3C72) scale below this is consistent with
// inverted dropout at drop rate 0.1 - confirm against the HLO for this fusion.
mov.b16 %h47, 0x2E66;
setp.ge.f16 %p25, %h46, %h47;
// Read-only loads at element offset 257 (f16 arrays: +514 bytes, f32: +1028).
ld.global.nc.b16 %h48, [%rd44+514];
ld.global.nc.f32 %f90, [%rd45+1028];
cvt.rn.f16.f32 %h49, %f90;
add.rn.f16 %h50, %h48, %h49;
mov.b16 %h51, 0x3C72;
mul.rn.f16 %h52, %h50, %h51;
// Keep the scaled sum when %p25 holds, otherwise +0.0.
selp.b16 %h53, %h52, 0x0000, %p25;
cvt.f32.f16 %f91, %h53;
// (4) term = (%f1*a)*x + (b - %f2*(%f1*a)) + masked, with x = [%rd46],
// a = [%rd47], b = [%rd48]; %f1 and %f2 are set outside this excerpt.
ld.global.nc.b16 %h54, [%rd46+514];
cvt.f32.f16 %f92, %h54;
ld.global.nc.f32 %f93, [%rd47+1028];
mul.rn.f32 %f94, %f1, %f93;
mul.rn.f32 %f95, %f94, %f92;
ld.global.nc.f32 %f96, [%rd48+1028];
mul.rn.f32 %f97, %f2, %f94;
sub.rn.f32 %f98, %f96, %f97;
add.rn.f32 %f99, %f95, %f98;
add.rn.f32 %f100, %f99, %f91;
add.rn.f32 %f8, %f7, %f100;
// (5) Next element: index = %r73 | 384 (%r73 = %r3 | %r4, computed earlier in
// the kernel). index >> 2 is added to the low counter word %rd11.
or.b32 %r130, %r73, 384;
shr.u32 %r131, %r130, 2;
cvt.u64.u32 %rd1191, %r131;
add.s64 %rd185, %rd11, %rd1191;
and.b64 %rd2419, %rd185, 4294967295;
// %p70: the 64-bit add wrapped, so a carry is owed to the high word.
setp.lt.u64 %p70, %rd185, %rd11;
// %p8 is set outside this excerpt and selects the arm.
@%p8 bra LBB79_22;
// Fall-through arm of `@%p8 bra LBB79_22` (runs when %p8 is false; %p8 is set
// outside this excerpt) for the element at index offset 384.
// It performs the opening multiply/xor mixing of the two counter words and
// loads the constant set (%rd2555..%rd2572, %r325..%r327) read by LBB79_23.
// Multipliers: 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57.
// NOTE(review): these multipliers and the xor constant 2654435769 (0x9E3779B9)
// match the Philox4x32 generator; presumably an inlined Philox RNG - confirm.
mul.lo.s64 %rd2558, %rd2419, 3528531795;
// %p70 is the unsigned wrap-around flag of the low-word add (%rd185 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd1234, 1, 0, %p70;
add.s64 %rd1235, %rd2461, %rd1234;
xor.b64 %rd1236, %rd1235, %rd2558;
shr.u64 %rd1237, %rd1236, 32;
mul.lo.s64 %rd2561, %rd1237, 3449720151;
shr.u64 %rd1238, %rd2561, 32;
and.b64 %rd1239, %rd1235, 4294967295;
mul.lo.s64 %rd1240, %rd1239, 3449720151;
and.b64 %rd1241, %rd1240, 4294967295;
xor.b64 %rd1242, %rd1241, %rd1238;
xor.b64 %rd1243, %rd1242, 2654435769;
mul.lo.s64 %rd2564, %rd1243, 3528531795;
xor.b64 %rd2554, %rd1240, %rd185;
// Constants for this arm. LBB79_22 writes the same registers with the two
// multipliers (%rd2555/%rd2557) and most xor constants exchanged.
// %r326 = -845247145 is the 32-bit signed view of 3449720151 (0xCD9E8D57).
mov.u32 %r327, -1879881855;
mov.u32 %r326, -845247145;
mov.u32 %r325, 534103459;
mov.u64 %rd2572, 3678237736;
mov.u64 %rd2571, 3041712726;
mov.u64 %rd2570, 1401181199;
mov.u64 %rd2569, 2835769497;
mov.u64 %rd2568, 1684936478;
mov.u64 %rd2567, 2027808484;
mov.u64 %rd2566, 387276957;
mov.u64 %rd2565, 842468239;
mov.u64 %rd2563, 3986602516;
mov.u64 %rd2562, 1013904242;
mov.u64 %rd2560, 3668340011;
mov.u64 %rd2559, 3144134277;
mov.u64 %rd2557, 3449720151;
mov.u64 %rd2556, 1993301258;
mov.u64 %rd2555, 3528531795;
// Skip the alternate arm and join at the shared mixing code.
bra.uni LBB79_23;
// Alternate arm for the element at index offset 384, entered when %p8 is true
// (%p8 is set outside this excerpt). It mirrors the fall-through arm above it
// with the roles of the two multipliers exchanged: the carry-adjusted high word
// is multiplied by 3449720151 (0xCD9E8D57) and the pre-masked low word %rd2419
// by 3528531795 (0xD2511F53). Falls through into LBB79_23.
LBB79_22:
// %p70 is the unsigned wrap-around flag of the low-word add (%rd185 < %rd11);
// the carry is added into the high word %rd2461.
selp.u64 %rd1207, 1, 0, %p70;
add.s64 %rd1208, %rd2461, %rd1207;
and.b64 %rd1209, %rd1208, 4294967295;
mul.lo.s64 %rd2558, %rd1209, 3449720151;
xor.b64 %rd1210, %rd2558, %rd185;
shr.u64 %rd1211, %rd1210, 32;
mul.lo.s64 %rd2561, %rd1211, 3528531795;
shr.u64 %rd1212, %rd2561, 32;
mul.lo.s64 %rd1214, %rd2419, 3528531795;
and.b64 %rd1215, %rd1214, 4294967295;
xor.b64 %rd1216, %rd1215, %rd1212;
// 3144134277 = 0xBB67AE85 (the fall-through arm xors 0x9E3779B9 here instead).
xor.b64 %rd1217, %rd1216, 3144134277;
mul.lo.s64 %rd2564, %rd1217, 3449720151;
xor.b64 %rd2554, %rd1208, %rd1214;
// Constants for this arm, read by LBB79_23.
// %r326 = -766435501 is the 32-bit signed view of 3528531795 (0xD2511F53).
mov.u32 %r327, -1767562579;
mov.u32 %r326, -766435501;
mov.u32 %r325, 1401181199;
mov.u64 %rd2572, 4055616968;
mov.u64 %rd2571, 1684936478;
mov.u64 %rd2570, 534103459;
mov.u64 %rd2569, 387276957;
mov.u64 %rd2568, 3041712726;
mov.u64 %rd2567, 3986602516;
mov.u64 %rd2566, 2835769497;
mov.u64 %rd2565, 3668340011;
mov.u64 %rd2563, 2027808484;
mov.u64 %rd2562, 1993301258;
mov.u64 %rd2560, 842468239;
mov.u64 %rd2559, 2654435769;
mov.u64 %rd2557, 3528531795;
mov.u64 %rd2556, 1013904242;
mov.u64 %rd2555, 3449720151;
// Join point of the two %p8 arms for the element at index offset 384.
// Steps: (1) finish the multiply/xor mixing using the arm-selected constants,
// (2) turn the resulting 32-bit word into an f16 value, (3) use it to mask a
// scaled f16 sum, (4) add this element's term to the running f32 sum
// (%f9 = %f8 + term), (5) set up the counter and predicate for offset 385.
LBB79_23:
// (1) Each step multiplies a 32-bit lane by %rd2555 or %rd2557, splits the
// 64-bit product into high (shr 32) and low (and 0xFFFFFFFF) halves, and xors
// in one of the constants %rd2556..%rd2572.
shr.u64 %rd1244, %rd2564, 32;
shr.u64 %rd1245, %rd2554, 32;
mul.lo.s64 %rd1246, %rd1245, %rd2555;
and.b64 %rd1247, %rd1246, 4294967295;
xor.b64 %rd1248, %rd1247, %rd1244;
xor.b64 %rd1249, %rd1248, %rd2556;
mul.lo.s64 %rd1250, %rd1249, %rd2557;
shr.u64 %rd1251, %rd1250, 32;
shr.u64 %rd1252, %rd1246, 32;
and.b64 %rd1253, %rd2558, 4294967295;
xor.b64 %rd1254, %rd1253, %rd1252;
xor.b64 %rd1255, %rd1254, %rd2559;
mul.lo.s64 %rd1256, %rd1255, %rd2557;
and.b64 %rd1257, %rd1256, 4294967295;
xor.b64 %rd1258, %rd1257, %rd1251;
xor.b64 %rd1259, %rd1258, %rd2560;
mul.lo.s64 %rd1260, %rd1259, %rd2555;
shr.u64 %rd1261, %rd1260, 32;
shr.u64 %rd1262, %rd1256, 32;
and.b64 %rd1263, %rd2561, 4294967295;
xor.b64 %rd1264, %rd1263, %rd1262;
xor.b64 %rd1265, %rd1264, %rd2562;
mul.lo.s64 %rd1266, %rd1265, %rd2555;
and.b64 %rd1267, %rd1266, 4294967295;
xor.b64 %rd1268, %rd1267, %rd1261;
xor.b64 %rd1269, %rd1268, %rd2563;
mul.lo.s64 %rd1270, %rd1269, %rd2557;
shr.u64 %rd1271, %rd1270, 32;
shr.u64 %rd1272, %rd1266, 32;
and.b64 %rd1273, %rd2564, 4294967295;
xor.b64 %rd1274, %rd1273, %rd1272;
xor.b64 %rd1275, %rd1274, %rd2565;
mul.lo.s64 %rd1276, %rd1275, %rd2557;
and.b64 %rd1277, %rd1276, 4294967295;
xor.b64 %rd1278, %rd1277, %rd1271;
xor.b64 %rd1279, %rd1278, %rd2566;
mul.lo.s64 %rd1280, %rd1279, %rd2555;
shr.u64 %rd1281, %rd1280, 32;
shr.u64 %rd1282, %rd1276, 32;
and.b64 %rd1283, %rd1250, 4294967295;
xor.b64 %rd1284, %rd1283, %rd1282;
xor.b64 %rd1285, %rd1284, %rd2567;
mul.lo.s64 %rd1286, %rd1285, %rd2555;
and.b64 %rd1287, %rd1286, 4294967295;
xor.b64 %rd1288, %rd1287, %rd1281;
xor.b64 %rd1289, %rd1288, %rd2568;
mul.lo.s64 %rd1290, %rd1289, %rd2557;
shr.u64 %rd1291, %rd1290, 32;
shr.u64 %rd1292, %rd1286, 32;
and.b64 %rd1293, %rd1260, 4294967295;
xor.b64 %rd1294, %rd1293, %rd1292;
xor.b64 %rd1295, %rd1294, %rd2569;
mul.lo.s64 %rd1296, %rd1295, %rd2557;
and.b64 %rd1297, %rd1296, 4294967295;
xor.b64 %rd1298, %rd1297, %rd1291;
xor.b64 %rd1299, %rd1298, %rd2570;
mul.lo.s64 %rd1300, %rd1299, %rd2555;
shr.u64 %rd1301, %rd1300, 32;
shr.u64 %rd1302, %rd1296, 32;
and.b64 %rd1303, %rd1270, 4294967295;
xor.b64 %rd1304, %rd1303, %rd1302;
xor.b64 %rd1305, %rd1304, %rd2571;
mul.lo.s64 %rd1306, %rd1305, %rd2555;
and.b64 %rd1307, %rd1306, 4294967295;
xor.b64 %rd1308, %rd1307, %rd1301;
xor.b64 %rd1309, %rd1308, %rd2572;
mul.lo.s64 %rd1310, %rd1309, %rd2557;
// Final 32-bit combine: ((%r325 ^ lane) * %r326) ^ high half ^ %r327.
shr.u64 %rd1311, %rd1310, 32;
cvt.u32.u64 %r138, %rd1311;
shr.u64 %rd1312, %rd1306, 32;
xor.b64 %rd1313, %rd1312, %rd1280;
cvt.u32.u64 %r139, %rd1313;
xor.b32 %r140, %r325, %r139;
mul.lo.s32 %r141, %r140, %r326;
xor.b32 %r142, %r141, %r138;
xor.b32 %r143, %r142, %r327;
// (2) Keep the top 23 bits and scale by 0f34000000 (= 2^-23), giving an f32 in
// [0, 1) that is then rounded to f16.
shr.u32 %r144, %r143, 9;
cvt.rn.f32.u32 %f101, %r144;
mul.rn.f32 %f102, %f101, 0f34000000;
cvt.rn.f16.f32 %h55, %f102;
// (3) 0x2E66 is ~0.09998 in f16; %p28 is true when the value is >= it.
// NOTE(review): with the ~1.1113 (0x3C72) scale below this is consistent with
// inverted dropout at drop rate 0.1 - confirm against the HLO for this fusion.
mov.b16 %h56, 0x2E66;
setp.ge.f16 %p28, %h55, %h56;
// Read-only loads at element offset 384 (f16 arrays: +768 bytes, f32: +1536).
ld.global.nc.b16 %h57, [%rd44+768];
ld.global.nc.f32 %f103, [%rd45+1536];
cvt.rn.f16.f32 %h58, %f103;
add.rn.f16 %h59, %h57, %h58;
mov.b16 %h60, 0x3C72;
mul.rn.f16 %h61, %h59, %h60;
// Keep the scaled sum when %p28 holds, otherwise +0.0.
selp.b16 %h62, %h61, 0x0000, %p28;
cvt.f32.f16 %f104, %h62;
// (4) term = (%f1*a)*x + (b - %f2*(%f1*a)) + masked, with x = [%rd46],
// a = [%rd47], b = [%rd48]; %f1 and %f2 are set outside this excerpt.
ld.global.nc.b16 %h63, [%rd46+768];
cvt.f32.f16 %f105, %h63;
ld.global.nc.f32 %f106, [%rd47+1536];
mul.rn.f32 %f107, %f1, %f106;
mul.rn.f32 %f108, %f107, %f105;
ld.global.nc.f32 %f109, [%rd48+1536];
mul.rn.f32 %f110, %f2, %f107;
sub.rn.f32 %f111, %f109, %f110;
add.rn.f32 %f112, %f108, %f111;
add.rn.f32 %f113, %f112, %f104;
add.rn.f32 %f9, %f8, %f113;
// (5) Next element: index = %r3 | %r4 | 385. index >> 2 is added to the low
// counter word %rd11. Bit 0 is forced by the 385, so %p29 (low two bits of
// %r3 | 385 differ from 1) is true exactly when bit 1 of %r3 is set.
// The carry test for this add is done inside each arm.
or.b32 %r145, %r3, 385;
or.b32 %r146, %r145, %r4;
and.b32 %r147, %r145, 3;
shr.u32 %r148, %r146, 2;
setp.ne.s32 %p29, %r147, 1;
cvt.u64.u32 %rd1314, %r148;
add.s64 %rd213, %rd11, %rd1314;
@%p29 bra LBB79_25;
// Fall-through arm of `@%p29 bra LBB79_25` (runs when the low two bits of
// %r3 | 385 equal 1) for the element at index offset 385.
// It performs the opening multiply/xor mixing of the two counter words and
// loads the constant set (%rd2574..%rd2590, %r328, %r329) read by LBB79_26.
// Multipliers: 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57.
// NOTE(review): these multipliers and the xor constant 2654435769 (0x9E3779B9)
// match the Philox4x32 generator; presumably an inlined Philox RNG - confirm.
and.b64 %rd1354, %rd213, 4294967295;
mul.lo.s64 %rd2577, %rd1354, 3528531795;
// %p31 is the unsigned wrap-around flag of the low-word add (%rd213 < %rd11);
// the carry is added into the high word %rd2461.
setp.lt.u64 %p31, %rd213, %rd11;
selp.u64 %rd1355, 1, 0, %p31;
add.s64 %rd1356, %rd2461, %rd1355;
xor.b64 %rd1357, %rd1356, %rd2577;
shr.u64 %rd1358, %rd1357, 32;
mul.lo.s64 %rd2580, %rd1358, 3449720151;
shr.u64 %rd1359, %rd2580, 32;
and.b64 %rd1360, %rd1356, 4294967295;
mul.lo.s64 %rd1361, %rd1360, 3449720151;
and.b64 %rd1362, %rd1361, 4294967295;
xor.b64 %rd1363, %rd1362, %rd1359;
xor.b64 %rd1364, %rd1363, 2654435769;
mul.lo.s64 %rd2583, %rd1364, 3528531795;
xor.b64 %rd2573, %rd1361, %rd213;
// Constants for this arm. LBB79_25 writes the same registers with the two
// multipliers (%rd2574/%rd2576) and most xor constants exchanged.
// %r329 = -845247145 is the 32-bit signed view of 3449720151 (0xCD9E8D57).
mov.u32 %r329, -845247145;
mov.u32 %r328, -616729560;
mov.u64 %rd2590, 3041712726;
mov.u64 %rd2589, 1401181199;
mov.u64 %rd2588, 2835769497;
mov.u64 %rd2587, 1684936478;
mov.u64 %rd2586, 2027808484;
mov.u64 %rd2585, 387276957;
mov.u64 %rd2584, 842468239;
mov.u64 %rd2582, 3986602516;
mov.u64 %rd2581, 1013904242;
mov.u64 %rd2579, 3668340011;
mov.u64 %rd2578, 3144134277;
mov.u64 %rd2576, 3449720151;
mov.u64 %rd2575, 1993301258;
mov.u64 %rd2574, 3528531795;
// Skip the alternate arm and join at the shared mixing code.
bra.uni LBB79_26;
// Alternate arm for the element at index offset 385, entered when %p29 is true.
// It mirrors the fall-through arm above it with the roles of the two
// multipliers exchanged: the carry-adjusted high word is multiplied by
// 3449720151 (0xCD9E8D57) and the masked low word of %rd213 by 3528531795
// (0xD2511F53). Falls through into LBB79_26.
LBB79_25:
// %p30 is the unsigned wrap-around flag of the low-word add (%rd213 < %rd11);
// the carry is added into the high word %rd2461.
setp.lt.u64 %p30, %rd213, %rd11;
selp.u64 %rd1329, 1, 0, %p30;
add.s64 %rd1330, %rd2461, %rd1329;
and.b64 %rd1331, %rd1330, 4294967295;
mul.lo.s64 %rd2577, %rd1331, 3449720151;
xor.b64 %rd1332, %rd2577, %rd213;
shr.u64 %rd1333, %rd1332, 32;
mul.lo.s64 %rd2580, %rd1333, 3528531795;
shr.u64 %rd1334, %rd2580, 32;
and.b64 %rd1335, %rd213, 4294967295;
mul.lo.s64 %rd1336, %rd1335, 3528531795;
and.b64 %rd1337, %rd1336, 4294967295;
xor.b64 %rd1338, %rd1337, %rd1334;
// 3144134277 = 0xBB67AE85 (the fall-through arm xors 0x9E3779B9 here instead).
xor.b64 %rd1339, %rd1338, 3144134277;
mul.lo.s64 %rd2583, %rd1339, 3449720151;
xor.b64 %rd2573, %rd1330, %rd1336;
// Constants for this arm, read by LBB79_26.
// %r329 = -766435501 is the 32-bit signed view of 3528531795 (0xD2511F53).
mov.u32 %r329, -766435501;
mov.u32 %r328, -239350328;
mov.u64 %rd2590, 1684936478;
mov.u64 %rd2589, 534103459;
mov.u64 %rd2588, 387276957;
mov.u64 %rd2587, 3041712726;
mov.u64 %rd2586, 3986602516;
mov.u64 %rd2585, 2835769497;
mov.u64 %rd2584, 3668340011;
mov.u64 %rd2582, 2027808484;
mov.u64 %rd2581, 1993301258;
mov.u64 %rd2579, 842468239;
mov.u64 %rd2578, 2654435769;
mov.u64 %rd2576, 3528531795;
mov.u64 %rd2575, 1013904242;
mov.u64 %rd2574, 3449720151;
// Join point of the two %p29 arms for the element at index offset 385.
// Steps: (1) finish the multiply/xor mixing using the arm-selected constants,
// (2) turn the resulting 32-bit word into an f16 value, (3) use it to mask a
// scaled f16 sum, (4) add this element's term to the running f32 sum
// (%f10 = %f9 + term), (5) set up the counter for offset 512.
LBB79_26:
// (1) Each step multiplies a 32-bit lane by %rd2574 or %rd2576, splits the
// 64-bit product into high (shr 32) and low (and 0xFFFFFFFF) halves, and xors
// in one of the constants %rd2575..%rd2590.
shr.u64 %rd1365, %rd2583, 32;
shr.u64 %rd1366, %rd2573, 32;
mul.lo.s64 %rd1367, %rd1366, %rd2574;
and.b64 %rd1368, %rd1367, 4294967295;
xor.b64 %rd1369, %rd1368, %rd1365;
xor.b64 %rd1370, %rd1369, %rd2575;
mul.lo.s64 %rd1371, %rd1370, %rd2576;
shr.u64 %rd1372, %rd1371, 32;
shr.u64 %rd1373, %rd1367, 32;
and.b64 %rd1374, %rd2577, 4294967295;
xor.b64 %rd1375, %rd1374, %rd1373;
xor.b64 %rd1376, %rd1375, %rd2578;
mul.lo.s64 %rd1377, %rd1376, %rd2576;
and.b64 %rd1378, %rd1377, 4294967295;
xor.b64 %rd1379, %rd1378, %rd1372;
xor.b64 %rd1380, %rd1379, %rd2579;
mul.lo.s64 %rd1381, %rd1380, %rd2574;
shr.u64 %rd1382, %rd1381, 32;
shr.u64 %rd1383, %rd1377, 32;
and.b64 %rd1384, %rd2580, 4294967295;
xor.b64 %rd1385, %rd1384, %rd1383;
xor.b64 %rd1386, %rd1385, %rd2581;
mul.lo.s64 %rd1387, %rd1386, %rd2574;
and.b64 %rd1388, %rd1387, 4294967295;
xor.b64 %rd1389, %rd1388, %rd1382;
xor.b64 %rd1390, %rd1389, %rd2582;
mul.lo.s64 %rd1391, %rd1390, %rd2576;
shr.u64 %rd1392, %rd1391, 32;
shr.u64 %rd1393, %rd1387, 32;
and.b64 %rd1394, %rd2583, 4294967295;
xor.b64 %rd1395, %rd1394, %rd1393;
xor.b64 %rd1396, %rd1395, %rd2584;
mul.lo.s64 %rd1397, %rd1396, %rd2576;
and.b64 %rd1398, %rd1397, 4294967295;
xor.b64 %rd1399, %rd1398, %rd1392;
xor.b64 %rd1400, %rd1399, %rd2585;
mul.lo.s64 %rd1401, %rd1400, %rd2574;
shr.u64 %rd1402, %rd1401, 32;
shr.u64 %rd1403, %rd1397, 32;
and.b64 %rd1404, %rd1371, 4294967295;
xor.b64 %rd1405, %rd1404, %rd1403;
xor.b64 %rd1406, %rd1405, %rd2586;
mul.lo.s64 %rd1407, %rd1406, %rd2574;
and.b64 %rd1408, %rd1407, 4294967295;
xor.b64 %rd1409, %rd1408, %rd1402;
xor.b64 %rd1410, %rd1409, %rd2587;
mul.lo.s64 %rd1411, %rd1410, %rd2576;
shr.u64 %rd1412, %rd1411, 32;
shr.u64 %rd1413, %rd1407, 32;
and.b64 %rd1414, %rd1381, 4294967295;
xor.b64 %rd1415, %rd1414, %rd1413;
xor.b64 %rd1416, %rd1415, %rd2588;
mul.lo.s64 %rd1417, %rd1416, %rd2576;
and.b64 %rd1418, %rd1417, 4294967295;
xor.b64 %rd1419, %rd1418, %rd1412;
xor.b64 %rd1420, %rd1419, %rd2589;
mul.lo.s64 %rd1421, %rd1420, %rd2574;
shr.u64 %rd1422, %rd1421, 32;
shr.u64 %rd1423, %rd1417, 32;
xor.b64 %rd1424, %rd1391, %rd1423;
xor.b64 %rd1425, %rd1424, %rd2590;
mul.lo.s64 %rd1426, %rd1425, %rd2574;
// Final 32-bit combine: (%r328 ^ low 32 bits of the mix) * %r329.
xor.b64 %rd1427, %rd1422, %rd1426;
cvt.u32.u64 %r153, %rd1427;
xor.b32 %r154, %r328, %r153;
mul.lo.s32 %r155, %r154, %r329;
// (2) Keep the top 23 bits and scale by 0f34000000 (= 2^-23), giving an f32 in
// [0, 1) that is then rounded to f16.
shr.u32 %r156, %r155, 9;
cvt.rn.f32.u32 %f114, %r156;
mul.rn.f32 %f115, %f114, 0f34000000;
cvt.rn.f16.f32 %h64, %f115;
// (3) 0x2E66 is ~0.09998 in f16; %p33 is true when the value is >= it.
// NOTE(review): with the ~1.1113 (0x3C72) scale below this is consistent with
// inverted dropout at drop rate 0.1 - confirm against the HLO for this fusion.
mov.b16 %h65, 0x2E66;
setp.ge.f16 %p33, %h64, %h65;
// Read-only loads at element offset 385 (f16 arrays: +770 bytes, f32: +1540).
ld.global.nc.b16 %h66, [%rd44+770];
ld.global.nc.f32 %f116, [%rd45+1540];
cvt.rn.f16.f32 %h67, %f116;
add.rn.f16 %h68, %h66, %h67;
mov.b16 %h69, 0x3C72;
mul.rn.f16 %h70, %h68, %h69;
// Keep the scaled sum when %p33 holds, otherwise +0.0.
selp.b16 %h71, %h70, 0x0000, %p33;
cvt.f32.f16 %f117, %h71;
// (4) term = (%f1*a)*x + (b - %f2*(%f1*a)) + masked, with x = [%rd46],
// a = [%rd47], b = [%rd48]; %f1 and %f2 are set outside this excerpt.
ld.global.nc.b16 %h72, [%rd46+770];
cvt.f32.f16 %f118, %h72;
ld.global.nc.f32 %f119, [%rd47+1540];
mul.rn.f32 %f120, %f1, %f119;
mul.rn.f32 %f121, %f120, %f118;
ld.global.nc.f32 %f122, [%rd48+1540];
mul.rn.f32 %f123, %f2, %f120;
sub.rn.f32 %f124, %f122, %f123;
add.rn.f32 %f125, %f121, %f124;
add.rn.f32 %f126, %f125, %f117;
add.rn.f32 %f10, %f9, %f126;
// (5) Next element: index = %r73 | 512 (%r73 = %r3 | %r4, computed earlier in
// the kernel). index >> 2 is added to the low counter word %rd11; the carry
// test for this add is done inside each arm.
or.b32 %r158, %r73, 512;
shr.u32 %r159, %r158, 2;
cvt.u64.u32 %rd1428, %r159;
add.s64 %rd240, %rd11, %rd1428;
// %p8 is set outside this excerpt and selects the arm.
@%p8 bra LBB79_28;
// Not-taken side of the `@%p8 bra LBB79_28` just above (runs when %p8 is false).
// Produces %rd2591-%rd2609 and %r330-%r332, all of which are read by LBB79_29.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9.
// NOTE(review): these equal the published Philox4x32 multiplier/key constants, so
// this is presumably a constant-folded Philox round -- confirm against the generator.
and.b64 %rd1470, %rd240, 4294967295;
mul.lo.s64 %rd2595, %rd1470, 3528531795;
// %rd240 < %rd11 (unsigned) only if the add that formed %rd240 wrapped; carry into %rd2461.
setp.lt.u64 %p35, %rd240, %rd11;
selp.u64 %rd1471, 1, 0, %p35;
add.s64 %rd1472, %rd2461, %rd1471;
xor.b64 %rd1473, %rd1472, %rd2595;
shr.u64 %rd1474, %rd1473, 32;
mul.lo.s64 %rd2598, %rd1474, 3449720151;
shr.u64 %rd1475, %rd2598, 32;
and.b64 %rd1476, %rd1472, 4294967295;
mul.lo.s64 %rd1477, %rd1476, 3449720151;
and.b64 %rd1478, %rd1477, 4294967295;
xor.b64 %rd1479, %rd1478, %rd1475;
xor.b64 %rd1480, %rd1479, 2654435769;
mul.lo.s64 %rd2601, %rd1480, 3528531795;
xor.b64 %rd2591, %rd1477, %rd240;
// Per-path constants; LBB79_28 writes different values to the same registers.
mov.u32 %r332, -1879881855;
mov.u32 %r331, -845247145;
mov.u32 %r330, 534103459;
mov.u64 %rd2609, 3678237736;
mov.u64 %rd2608, 3041712726;
mov.u64 %rd2607, 1401181199;
mov.u64 %rd2606, 2835769497;
mov.u64 %rd2605, 1684936478;
mov.u64 %rd2604, 2027808484;
mov.u64 %rd2603, 387276957;
mov.u64 %rd2602, 842468239;
mov.u64 %rd2600, 3986602516;
mov.u64 %rd2599, 1013904242;
mov.u64 %rd2597, 3668340011;
mov.u64 %rd2596, 3144134277;
mov.u64 %rd2594, 3449720151;
mov.u64 %rd2593, 1993301258;
mov.u64 %rd2592, 3528531795;
bra.uni LBB79_29;
// Taken side of `@%p8 bra LBB79_28` (runs when %p8 is true).
// Writes the same registers as the fall-through path (%rd2591-%rd2609, %r330-%r332),
// but mirrored: the roles of %rd240 and (%rd2461 + carry) are exchanged, the two
// multipliers 3449720151 (0xCD9E8D57) / 3528531795 (0xD2511F53) swap places, and the
// XOR constant is 3144134277 (0xBB67AE85) instead of 2654435769.
LBB79_28:
// Carry out of the 64-bit add that formed %rd240, propagated into %rd2461.
setp.lt.u64 %p34, %rd240, %rd11;
selp.u64 %rd1444, 1, 0, %p34;
add.s64 %rd1445, %rd2461, %rd1444;
and.b64 %rd1446, %rd1445, 4294967295;
mul.lo.s64 %rd2595, %rd1446, 3449720151;
xor.b64 %rd1447, %rd2595, %rd240;
shr.u64 %rd1448, %rd1447, 32;
mul.lo.s64 %rd2598, %rd1448, 3528531795;
shr.u64 %rd1449, %rd2598, 32;
and.b64 %rd1450, %rd240, 4294967295;
mul.lo.s64 %rd1451, %rd1450, 3528531795;
and.b64 %rd1452, %rd1451, 4294967295;
xor.b64 %rd1453, %rd1452, %rd1449;
xor.b64 %rd1454, %rd1453, 3144134277;
mul.lo.s64 %rd2601, %rd1454, 3449720151;
xor.b64 %rd2591, %rd1445, %rd1451;
// Per-path constants for LBB79_29 (falls through into it).
mov.u32 %r332, -1767562579;
mov.u32 %r331, -766435501;
mov.u32 %r330, 1401181199;
mov.u64 %rd2609, 4055616968;
mov.u64 %rd2608, 1684936478;
mov.u64 %rd2607, 534103459;
mov.u64 %rd2606, 387276957;
mov.u64 %rd2605, 3041712726;
mov.u64 %rd2604, 3986602516;
mov.u64 %rd2603, 2835769497;
mov.u64 %rd2602, 3668340011;
mov.u64 %rd2600, 2027808484;
mov.u64 %rd2599, 1993301258;
mov.u64 %rd2597, 842468239;
mov.u64 %rd2596, 2654435769;
mov.u64 %rd2594, 3528531795;
mov.u64 %rd2593, 1013904242;
mov.u64 %rd2592, 3449720151;
// Join point of the two setup paths. Runs a fixed chain of
// "split 64-bit product into halves, XOR with a per-round constant, multiply"
// steps over %rd2591/%rd2595/%rd2598/%rd2601 using the path-selected constants
// (%rd2592-%rd2609), derives one 32-bit word, maps it to a float in [0, 1),
// uses it to select a scaled value or zero, and adds one more term to the
// running f32 sum (%f10 -> %f11). Ends by forming the next counter (%rd268)
// and branching to LBB79_31 when (%r173 & 3) != 1.
LBB79_29:
shr.u64 %rd1481, %rd2601, 32;
shr.u64 %rd1482, %rd2591, 32;
mul.lo.s64 %rd1483, %rd1482, %rd2592;
and.b64 %rd1484, %rd1483, 4294967295;
xor.b64 %rd1485, %rd1484, %rd1481;
xor.b64 %rd1486, %rd1485, %rd2593;
mul.lo.s64 %rd1487, %rd1486, %rd2594;
shr.u64 %rd1488, %rd1487, 32;
shr.u64 %rd1489, %rd1483, 32;
and.b64 %rd1490, %rd2595, 4294967295;
xor.b64 %rd1491, %rd1490, %rd1489;
xor.b64 %rd1492, %rd1491, %rd2596;
mul.lo.s64 %rd1493, %rd1492, %rd2594;
and.b64 %rd1494, %rd1493, 4294967295;
xor.b64 %rd1495, %rd1494, %rd1488;
xor.b64 %rd1496, %rd1495, %rd2597;
mul.lo.s64 %rd1497, %rd1496, %rd2592;
shr.u64 %rd1498, %rd1497, 32;
shr.u64 %rd1499, %rd1493, 32;
and.b64 %rd1500, %rd2598, 4294967295;
xor.b64 %rd1501, %rd1500, %rd1499;
xor.b64 %rd1502, %rd1501, %rd2599;
mul.lo.s64 %rd1503, %rd1502, %rd2592;
and.b64 %rd1504, %rd1503, 4294967295;
xor.b64 %rd1505, %rd1504, %rd1498;
xor.b64 %rd1506, %rd1505, %rd2600;
mul.lo.s64 %rd1507, %rd1506, %rd2594;
shr.u64 %rd1508, %rd1507, 32;
shr.u64 %rd1509, %rd1503, 32;
and.b64 %rd1510, %rd2601, 4294967295;
xor.b64 %rd1511, %rd1510, %rd1509;
xor.b64 %rd1512, %rd1511, %rd2602;
mul.lo.s64 %rd1513, %rd1512, %rd2594;
and.b64 %rd1514, %rd1513, 4294967295;
xor.b64 %rd1515, %rd1514, %rd1508;
xor.b64 %rd1516, %rd1515, %rd2603;
mul.lo.s64 %rd1517, %rd1516, %rd2592;
shr.u64 %rd1518, %rd1517, 32;
shr.u64 %rd1519, %rd1513, 32;
and.b64 %rd1520, %rd1487, 4294967295;
xor.b64 %rd1521, %rd1520, %rd1519;
xor.b64 %rd1522, %rd1521, %rd2604;
mul.lo.s64 %rd1523, %rd1522, %rd2592;
and.b64 %rd1524, %rd1523, 4294967295;
xor.b64 %rd1525, %rd1524, %rd1518;
xor.b64 %rd1526, %rd1525, %rd2605;
mul.lo.s64 %rd1527, %rd1526, %rd2594;
shr.u64 %rd1528, %rd1527, 32;
shr.u64 %rd1529, %rd1523, 32;
and.b64 %rd1530, %rd1497, 4294967295;
xor.b64 %rd1531, %rd1530, %rd1529;
xor.b64 %rd1532, %rd1531, %rd2606;
mul.lo.s64 %rd1533, %rd1532, %rd2594;
and.b64 %rd1534, %rd1533, 4294967295;
xor.b64 %rd1535, %rd1534, %rd1528;
xor.b64 %rd1536, %rd1535, %rd2607;
mul.lo.s64 %rd1537, %rd1536, %rd2592;
shr.u64 %rd1538, %rd1537, 32;
shr.u64 %rd1539, %rd1533, 32;
and.b64 %rd1540, %rd1507, 4294967295;
xor.b64 %rd1541, %rd1540, %rd1539;
xor.b64 %rd1542, %rd1541, %rd2608;
mul.lo.s64 %rd1543, %rd1542, %rd2592;
and.b64 %rd1544, %rd1543, 4294967295;
xor.b64 %rd1545, %rd1544, %rd1538;
xor.b64 %rd1546, %rd1545, %rd2609;
mul.lo.s64 %rd1547, %rd1546, %rd2594;
// Collapse the last products into a single 32-bit word (%r171).
shr.u64 %rd1548, %rd1547, 32;
cvt.u32.u64 %r166, %rd1548;
shr.u64 %rd1549, %rd1543, 32;
xor.b64 %rd1550, %rd1549, %rd1517;
cvt.u32.u64 %r167, %rd1550;
xor.b32 %r168, %r330, %r167;
mul.lo.s32 %r169, %r168, %r331;
xor.b32 %r170, %r169, %r166;
xor.b32 %r171, %r170, %r332;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): a float in [0, 1).
shr.u32 %r172, %r171, 9;
cvt.rn.f32.u32 %f127, %r172;
mul.rn.f32 %f128, %f127, 0f34000000;
// Round to f16 and test >= 0x2E66 (f16 ~0.09998).
cvt.rn.f16.f32 %h73, %f128;
mov.b16 %h74, 0x2E66;
setp.ge.f16 %p36, %h73, %h74;
// (f16 [%rd44+1024] + f16(f32 [%rd45+2048])) * 0x3C72 (f16 ~1.1113), or +0 when %p36 is false.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout -- confirm.
ld.global.nc.b16 %h75, [%rd44+1024];
ld.global.nc.f32 %f129, [%rd45+2048];
cvt.rn.f16.f32 %h76, %f129;
add.rn.f16 %h77, %h75, %h76;
mov.b16 %h78, 0x3C72;
mul.rn.f16 %h79, %h77, %h78;
selp.b16 %h80, %h79, 0x0000, %p36;
cvt.f32.f16 %f130, %h80;
// With t = %f1 * [%rd47+2048]: term = t * f16[%rd46+1024] + ([%rd48+2048] - %f2 * t) + selected value.
ld.global.nc.b16 %h81, [%rd46+1024];
cvt.f32.f16 %f131, %h81;
ld.global.nc.f32 %f132, [%rd47+2048];
mul.rn.f32 %f133, %f1, %f132;
mul.rn.f32 %f134, %f133, %f131;
ld.global.nc.f32 %f135, [%rd48+2048];
mul.rn.f32 %f136, %f2, %f133;
sub.rn.f32 %f137, %f135, %f136;
add.rn.f32 %f138, %f134, %f137;
add.rn.f32 %f139, %f138, %f130;
// Running sum: %f11 = %f10 + term.
add.rn.f32 %f11, %f10, %f139;
// Next counter: %rd268 = %rd11 + ((%r3 | 513 | %r4) >> 2); path chosen by (%r3 | 513) & 3.
or.b32 %r173, %r3, 513;
or.b32 %r174, %r173, %r4;
and.b32 %r175, %r173, 3;
shr.u32 %r176, %r174, 2;
setp.ne.s32 %p37, %r175, 1;
cvt.u64.u32 %rd1551, %r176;
add.s64 %rd268, %rd11, %rd1551;
@%p37 bra LBB79_31;
// Not-taken side of the `@%p37 bra LBB79_31` just above (runs when (%r173 & 3) == 1).
// Produces %rd2610-%rd2627 and %r333/%r334, all of which are read by LBB79_32.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9.
// NOTE(review): these equal the published Philox4x32 constants -- presumably a
// constant-folded Philox round; confirm against the generator.
and.b64 %rd1591, %rd268, 4294967295;
mul.lo.s64 %rd2614, %rd1591, 3528531795;
// %rd268 < %rd11 (unsigned) only if the add that formed %rd268 wrapped; carry into %rd2461.
setp.lt.u64 %p39, %rd268, %rd11;
selp.u64 %rd1592, 1, 0, %p39;
add.s64 %rd1593, %rd2461, %rd1592;
xor.b64 %rd1594, %rd1593, %rd2614;
shr.u64 %rd1595, %rd1594, 32;
mul.lo.s64 %rd2617, %rd1595, 3449720151;
shr.u64 %rd1596, %rd2617, 32;
and.b64 %rd1597, %rd1593, 4294967295;
mul.lo.s64 %rd1598, %rd1597, 3449720151;
and.b64 %rd1599, %rd1598, 4294967295;
xor.b64 %rd1600, %rd1599, %rd1596;
xor.b64 %rd1601, %rd1600, 2654435769;
mul.lo.s64 %rd2620, %rd1601, 3528531795;
xor.b64 %rd2610, %rd1598, %rd268;
// Per-path constants; LBB79_31 writes different values to the same registers.
mov.u32 %r334, -845247145;
mov.u32 %r333, -616729560;
mov.u64 %rd2627, 3041712726;
mov.u64 %rd2626, 1401181199;
mov.u64 %rd2625, 2835769497;
mov.u64 %rd2624, 1684936478;
mov.u64 %rd2623, 2027808484;
mov.u64 %rd2622, 387276957;
mov.u64 %rd2621, 842468239;
mov.u64 %rd2619, 3986602516;
mov.u64 %rd2618, 1013904242;
mov.u64 %rd2616, 3668340011;
mov.u64 %rd2615, 3144134277;
mov.u64 %rd2613, 3449720151;
mov.u64 %rd2612, 1993301258;
mov.u64 %rd2611, 3528531795;
bra.uni LBB79_32;
// Taken side of `@%p37 bra LBB79_31` (runs when (%r173 & 3) != 1).
// Writes the same registers as the fall-through path (%rd2610-%rd2627, %r333/%r334),
// mirrored: %rd268 and (%rd2461 + carry) exchange roles, the multipliers
// 3449720151 (0xCD9E8D57) / 3528531795 (0xD2511F53) swap places, and the XOR
// constant is 3144134277 (0xBB67AE85) instead of 2654435769.
LBB79_31:
// Carry out of the 64-bit add that formed %rd268, propagated into %rd2461.
setp.lt.u64 %p38, %rd268, %rd11;
selp.u64 %rd1566, 1, 0, %p38;
add.s64 %rd1567, %rd2461, %rd1566;
and.b64 %rd1568, %rd1567, 4294967295;
mul.lo.s64 %rd2614, %rd1568, 3449720151;
xor.b64 %rd1569, %rd2614, %rd268;
shr.u64 %rd1570, %rd1569, 32;
mul.lo.s64 %rd2617, %rd1570, 3528531795;
shr.u64 %rd1571, %rd2617, 32;
and.b64 %rd1572, %rd268, 4294967295;
mul.lo.s64 %rd1573, %rd1572, 3528531795;
and.b64 %rd1574, %rd1573, 4294967295;
xor.b64 %rd1575, %rd1574, %rd1571;
xor.b64 %rd1576, %rd1575, 3144134277;
mul.lo.s64 %rd2620, %rd1576, 3449720151;
xor.b64 %rd2610, %rd1567, %rd1573;
// Per-path constants for LBB79_32 (falls through into it).
mov.u32 %r334, -766435501;
mov.u32 %r333, -239350328;
mov.u64 %rd2627, 1684936478;
mov.u64 %rd2626, 534103459;
mov.u64 %rd2625, 387276957;
mov.u64 %rd2624, 3041712726;
mov.u64 %rd2623, 3986602516;
mov.u64 %rd2622, 2835769497;
mov.u64 %rd2621, 3668340011;
mov.u64 %rd2619, 2027808484;
mov.u64 %rd2618, 1993301258;
mov.u64 %rd2616, 842468239;
mov.u64 %rd2615, 2654435769;
mov.u64 %rd2613, 3528531795;
mov.u64 %rd2612, 1013904242;
mov.u64 %rd2611, 3449720151;
// Join point of the two setup paths. Runs a fixed chain of
// "split 64-bit product into halves, XOR with a per-round constant, multiply"
// steps over %rd2610/%rd2614/%rd2617/%rd2620 using the path-selected constants
// (%rd2611-%rd2627), derives one 32-bit word, maps it to a float in [0, 1),
// uses it to select a scaled value or zero, and adds one more term to the
// running f32 sum (%f11 -> %f12). Ends by forming the next counter (%rd295)
// and branching to LBB79_34 when %p8 is true.
LBB79_32:
shr.u64 %rd1602, %rd2620, 32;
shr.u64 %rd1603, %rd2610, 32;
mul.lo.s64 %rd1604, %rd1603, %rd2611;
and.b64 %rd1605, %rd1604, 4294967295;
xor.b64 %rd1606, %rd1605, %rd1602;
xor.b64 %rd1607, %rd1606, %rd2612;
mul.lo.s64 %rd1608, %rd1607, %rd2613;
shr.u64 %rd1609, %rd1608, 32;
shr.u64 %rd1610, %rd1604, 32;
and.b64 %rd1611, %rd2614, 4294967295;
xor.b64 %rd1612, %rd1611, %rd1610;
xor.b64 %rd1613, %rd1612, %rd2615;
mul.lo.s64 %rd1614, %rd1613, %rd2613;
and.b64 %rd1615, %rd1614, 4294967295;
xor.b64 %rd1616, %rd1615, %rd1609;
xor.b64 %rd1617, %rd1616, %rd2616;
mul.lo.s64 %rd1618, %rd1617, %rd2611;
shr.u64 %rd1619, %rd1618, 32;
shr.u64 %rd1620, %rd1614, 32;
and.b64 %rd1621, %rd2617, 4294967295;
xor.b64 %rd1622, %rd1621, %rd1620;
xor.b64 %rd1623, %rd1622, %rd2618;
mul.lo.s64 %rd1624, %rd1623, %rd2611;
and.b64 %rd1625, %rd1624, 4294967295;
xor.b64 %rd1626, %rd1625, %rd1619;
xor.b64 %rd1627, %rd1626, %rd2619;
mul.lo.s64 %rd1628, %rd1627, %rd2613;
shr.u64 %rd1629, %rd1628, 32;
shr.u64 %rd1630, %rd1624, 32;
and.b64 %rd1631, %rd2620, 4294967295;
xor.b64 %rd1632, %rd1631, %rd1630;
xor.b64 %rd1633, %rd1632, %rd2621;
mul.lo.s64 %rd1634, %rd1633, %rd2613;
and.b64 %rd1635, %rd1634, 4294967295;
xor.b64 %rd1636, %rd1635, %rd1629;
xor.b64 %rd1637, %rd1636, %rd2622;
mul.lo.s64 %rd1638, %rd1637, %rd2611;
shr.u64 %rd1639, %rd1638, 32;
shr.u64 %rd1640, %rd1634, 32;
and.b64 %rd1641, %rd1608, 4294967295;
xor.b64 %rd1642, %rd1641, %rd1640;
xor.b64 %rd1643, %rd1642, %rd2623;
mul.lo.s64 %rd1644, %rd1643, %rd2611;
and.b64 %rd1645, %rd1644, 4294967295;
xor.b64 %rd1646, %rd1645, %rd1639;
xor.b64 %rd1647, %rd1646, %rd2624;
mul.lo.s64 %rd1648, %rd1647, %rd2613;
shr.u64 %rd1649, %rd1648, 32;
shr.u64 %rd1650, %rd1644, 32;
and.b64 %rd1651, %rd1618, 4294967295;
xor.b64 %rd1652, %rd1651, %rd1650;
xor.b64 %rd1653, %rd1652, %rd2625;
mul.lo.s64 %rd1654, %rd1653, %rd2613;
and.b64 %rd1655, %rd1654, 4294967295;
xor.b64 %rd1656, %rd1655, %rd1649;
xor.b64 %rd1657, %rd1656, %rd2626;
mul.lo.s64 %rd1658, %rd1657, %rd2611;
// Collapse the last products into a single 32-bit word (%r183).
shr.u64 %rd1659, %rd1658, 32;
shr.u64 %rd1660, %rd1654, 32;
xor.b64 %rd1661, %rd1628, %rd1660;
xor.b64 %rd1662, %rd1661, %rd2627;
mul.lo.s64 %rd1663, %rd1662, %rd2611;
xor.b64 %rd1664, %rd1659, %rd1663;
cvt.u32.u64 %r181, %rd1664;
xor.b32 %r182, %r333, %r181;
mul.lo.s32 %r183, %r182, %r334;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): a float in [0, 1).
shr.u32 %r184, %r183, 9;
cvt.rn.f32.u32 %f140, %r184;
mul.rn.f32 %f141, %f140, 0f34000000;
// Round to f16 and test >= 0x2E66 (f16 ~0.09998).
cvt.rn.f16.f32 %h82, %f141;
mov.b16 %h83, 0x2E66;
setp.ge.f16 %p41, %h82, %h83;
// (f16 [%rd44+1026] + f16(f32 [%rd45+2052])) * 0x3C72 (f16 ~1.1113), or +0 when %p41 is false.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout -- confirm.
ld.global.nc.b16 %h84, [%rd44+1026];
ld.global.nc.f32 %f142, [%rd45+2052];
cvt.rn.f16.f32 %h85, %f142;
add.rn.f16 %h86, %h84, %h85;
mov.b16 %h87, 0x3C72;
mul.rn.f16 %h88, %h86, %h87;
selp.b16 %h89, %h88, 0x0000, %p41;
cvt.f32.f16 %f143, %h89;
// With t = %f1 * [%rd47+2052]: term = t * f16[%rd46+1026] + ([%rd48+2052] - %f2 * t) + selected value.
ld.global.nc.b16 %h90, [%rd46+1026];
cvt.f32.f16 %f144, %h90;
ld.global.nc.f32 %f145, [%rd47+2052];
mul.rn.f32 %f146, %f1, %f145;
mul.rn.f32 %f147, %f146, %f144;
ld.global.nc.f32 %f148, [%rd48+2052];
mul.rn.f32 %f149, %f2, %f146;
sub.rn.f32 %f150, %f148, %f149;
add.rn.f32 %f151, %f147, %f150;
add.rn.f32 %f152, %f151, %f143;
// Running sum: %f12 = %f11 + term.
add.rn.f32 %f12, %f11, %f152;
// Next counter: %rd295 = %rd11 + ((%r73 | 640) >> 2); path chosen by %p8.
or.b32 %r186, %r73, 640;
shr.u32 %r187, %r186, 2;
cvt.u64.u32 %rd1665, %r187;
add.s64 %rd295, %rd11, %rd1665;
@%p8 bra LBB79_34;
// Not-taken side of the `@%p8 bra LBB79_34` just above (runs when %p8 is false).
// Produces %rd2628-%rd2646 and %r335-%r337, all of which are read by LBB79_35.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9.
// NOTE(review): these equal the published Philox4x32 constants -- presumably a
// constant-folded Philox round; confirm against the generator.
and.b64 %rd1707, %rd295, 4294967295;
mul.lo.s64 %rd2632, %rd1707, 3528531795;
// %rd295 < %rd11 (unsigned) only if the add that formed %rd295 wrapped; carry into %rd2461.
setp.lt.u64 %p43, %rd295, %rd11;
selp.u64 %rd1708, 1, 0, %p43;
add.s64 %rd1709, %rd2461, %rd1708;
xor.b64 %rd1710, %rd1709, %rd2632;
shr.u64 %rd1711, %rd1710, 32;
mul.lo.s64 %rd2635, %rd1711, 3449720151;
shr.u64 %rd1712, %rd2635, 32;
and.b64 %rd1713, %rd1709, 4294967295;
mul.lo.s64 %rd1714, %rd1713, 3449720151;
and.b64 %rd1715, %rd1714, 4294967295;
xor.b64 %rd1716, %rd1715, %rd1712;
xor.b64 %rd1717, %rd1716, 2654435769;
mul.lo.s64 %rd2638, %rd1717, 3528531795;
xor.b64 %rd2628, %rd1714, %rd295;
// Per-path constants; LBB79_34 writes different values to the same registers.
mov.u32 %r337, -1879881855;
mov.u32 %r336, -845247145;
mov.u32 %r335, 534103459;
mov.u64 %rd2646, 3678237736;
mov.u64 %rd2645, 3041712726;
mov.u64 %rd2644, 1401181199;
mov.u64 %rd2643, 2835769497;
mov.u64 %rd2642, 1684936478;
mov.u64 %rd2641, 2027808484;
mov.u64 %rd2640, 387276957;
mov.u64 %rd2639, 842468239;
mov.u64 %rd2637, 3986602516;
mov.u64 %rd2636, 1013904242;
mov.u64 %rd2634, 3668340011;
mov.u64 %rd2633, 3144134277;
mov.u64 %rd2631, 3449720151;
mov.u64 %rd2630, 1993301258;
mov.u64 %rd2629, 3528531795;
bra.uni LBB79_35;
// Taken side of `@%p8 bra LBB79_34` (runs when %p8 is true).
// Writes the same registers as the fall-through path (%rd2628-%rd2646, %r335-%r337),
// mirrored: %rd295 and (%rd2461 + carry) exchange roles, the multipliers
// 3449720151 (0xCD9E8D57) / 3528531795 (0xD2511F53) swap places, and the XOR
// constant is 3144134277 (0xBB67AE85) instead of 2654435769.
LBB79_34:
// Carry out of the 64-bit add that formed %rd295, propagated into %rd2461.
setp.lt.u64 %p42, %rd295, %rd11;
selp.u64 %rd1681, 1, 0, %p42;
add.s64 %rd1682, %rd2461, %rd1681;
and.b64 %rd1683, %rd1682, 4294967295;
mul.lo.s64 %rd2632, %rd1683, 3449720151;
xor.b64 %rd1684, %rd2632, %rd295;
shr.u64 %rd1685, %rd1684, 32;
mul.lo.s64 %rd2635, %rd1685, 3528531795;
shr.u64 %rd1686, %rd2635, 32;
and.b64 %rd1687, %rd295, 4294967295;
mul.lo.s64 %rd1688, %rd1687, 3528531795;
and.b64 %rd1689, %rd1688, 4294967295;
xor.b64 %rd1690, %rd1689, %rd1686;
xor.b64 %rd1691, %rd1690, 3144134277;
mul.lo.s64 %rd2638, %rd1691, 3449720151;
xor.b64 %rd2628, %rd1682, %rd1688;
// Per-path constants for LBB79_35 (falls through into it).
mov.u32 %r337, -1767562579;
mov.u32 %r336, -766435501;
mov.u32 %r335, 1401181199;
mov.u64 %rd2646, 4055616968;
mov.u64 %rd2645, 1684936478;
mov.u64 %rd2644, 534103459;
mov.u64 %rd2643, 387276957;
mov.u64 %rd2642, 3041712726;
mov.u64 %rd2641, 3986602516;
mov.u64 %rd2640, 2835769497;
mov.u64 %rd2639, 3668340011;
mov.u64 %rd2637, 2027808484;
mov.u64 %rd2636, 1993301258;
mov.u64 %rd2634, 842468239;
mov.u64 %rd2633, 2654435769;
mov.u64 %rd2631, 3528531795;
mov.u64 %rd2630, 1013904242;
mov.u64 %rd2629, 3449720151;
// Join point of the two setup paths. Runs a fixed chain of
// "split 64-bit product into halves, XOR with a per-round constant, multiply"
// steps over %rd2628/%rd2632/%rd2635/%rd2638 using the path-selected constants
// (%rd2629-%rd2646), derives one 32-bit word, maps it to a float in [0, 1),
// uses it to select a scaled value or zero, and adds one more term to the
// running f32 sum (%f12 -> %f13). Ends by forming the next counter (%rd323)
// and branching to LBB79_37 when (%r201 & 3) != 1.
LBB79_35:
shr.u64 %rd1718, %rd2638, 32;
shr.u64 %rd1719, %rd2628, 32;
mul.lo.s64 %rd1720, %rd1719, %rd2629;
and.b64 %rd1721, %rd1720, 4294967295;
xor.b64 %rd1722, %rd1721, %rd1718;
xor.b64 %rd1723, %rd1722, %rd2630;
mul.lo.s64 %rd1724, %rd1723, %rd2631;
shr.u64 %rd1725, %rd1724, 32;
shr.u64 %rd1726, %rd1720, 32;
and.b64 %rd1727, %rd2632, 4294967295;
xor.b64 %rd1728, %rd1727, %rd1726;
xor.b64 %rd1729, %rd1728, %rd2633;
mul.lo.s64 %rd1730, %rd1729, %rd2631;
and.b64 %rd1731, %rd1730, 4294967295;
xor.b64 %rd1732, %rd1731, %rd1725;
xor.b64 %rd1733, %rd1732, %rd2634;
mul.lo.s64 %rd1734, %rd1733, %rd2629;
shr.u64 %rd1735, %rd1734, 32;
shr.u64 %rd1736, %rd1730, 32;
and.b64 %rd1737, %rd2635, 4294967295;
xor.b64 %rd1738, %rd1737, %rd1736;
xor.b64 %rd1739, %rd1738, %rd2636;
mul.lo.s64 %rd1740, %rd1739, %rd2629;
and.b64 %rd1741, %rd1740, 4294967295;
xor.b64 %rd1742, %rd1741, %rd1735;
xor.b64 %rd1743, %rd1742, %rd2637;
mul.lo.s64 %rd1744, %rd1743, %rd2631;
shr.u64 %rd1745, %rd1744, 32;
shr.u64 %rd1746, %rd1740, 32;
and.b64 %rd1747, %rd2638, 4294967295;
xor.b64 %rd1748, %rd1747, %rd1746;
xor.b64 %rd1749, %rd1748, %rd2639;
mul.lo.s64 %rd1750, %rd1749, %rd2631;
and.b64 %rd1751, %rd1750, 4294967295;
xor.b64 %rd1752, %rd1751, %rd1745;
xor.b64 %rd1753, %rd1752, %rd2640;
mul.lo.s64 %rd1754, %rd1753, %rd2629;
shr.u64 %rd1755, %rd1754, 32;
shr.u64 %rd1756, %rd1750, 32;
and.b64 %rd1757, %rd1724, 4294967295;
xor.b64 %rd1758, %rd1757, %rd1756;
xor.b64 %rd1759, %rd1758, %rd2641;
mul.lo.s64 %rd1760, %rd1759, %rd2629;
and.b64 %rd1761, %rd1760, 4294967295;
xor.b64 %rd1762, %rd1761, %rd1755;
xor.b64 %rd1763, %rd1762, %rd2642;
mul.lo.s64 %rd1764, %rd1763, %rd2631;
shr.u64 %rd1765, %rd1764, 32;
shr.u64 %rd1766, %rd1760, 32;
and.b64 %rd1767, %rd1734, 4294967295;
xor.b64 %rd1768, %rd1767, %rd1766;
xor.b64 %rd1769, %rd1768, %rd2643;
mul.lo.s64 %rd1770, %rd1769, %rd2631;
and.b64 %rd1771, %rd1770, 4294967295;
xor.b64 %rd1772, %rd1771, %rd1765;
xor.b64 %rd1773, %rd1772, %rd2644;
mul.lo.s64 %rd1774, %rd1773, %rd2629;
shr.u64 %rd1775, %rd1774, 32;
shr.u64 %rd1776, %rd1770, 32;
and.b64 %rd1777, %rd1744, 4294967295;
xor.b64 %rd1778, %rd1777, %rd1776;
xor.b64 %rd1779, %rd1778, %rd2645;
mul.lo.s64 %rd1780, %rd1779, %rd2629;
and.b64 %rd1781, %rd1780, 4294967295;
xor.b64 %rd1782, %rd1781, %rd1775;
xor.b64 %rd1783, %rd1782, %rd2646;
mul.lo.s64 %rd1784, %rd1783, %rd2631;
// Collapse the last products into a single 32-bit word (%r199).
shr.u64 %rd1785, %rd1784, 32;
cvt.u32.u64 %r194, %rd1785;
shr.u64 %rd1786, %rd1780, 32;
xor.b64 %rd1787, %rd1786, %rd1754;
cvt.u32.u64 %r195, %rd1787;
xor.b32 %r196, %r335, %r195;
mul.lo.s32 %r197, %r196, %r336;
xor.b32 %r198, %r197, %r194;
xor.b32 %r199, %r198, %r337;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): a float in [0, 1).
shr.u32 %r200, %r199, 9;
cvt.rn.f32.u32 %f153, %r200;
mul.rn.f32 %f154, %f153, 0f34000000;
// Round to f16 and test >= 0x2E66 (f16 ~0.09998).
cvt.rn.f16.f32 %h91, %f154;
mov.b16 %h92, 0x2E66;
setp.ge.f16 %p44, %h91, %h92;
// (f16 [%rd44+1280] + f16(f32 [%rd45+2560])) * 0x3C72 (f16 ~1.1113), or +0 when %p44 is false.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout -- confirm.
ld.global.nc.b16 %h93, [%rd44+1280];
ld.global.nc.f32 %f155, [%rd45+2560];
cvt.rn.f16.f32 %h94, %f155;
add.rn.f16 %h95, %h93, %h94;
mov.b16 %h96, 0x3C72;
mul.rn.f16 %h97, %h95, %h96;
selp.b16 %h98, %h97, 0x0000, %p44;
cvt.f32.f16 %f156, %h98;
// With t = %f1 * [%rd47+2560]: term = t * f16[%rd46+1280] + ([%rd48+2560] - %f2 * t) + selected value.
ld.global.nc.b16 %h99, [%rd46+1280];
cvt.f32.f16 %f157, %h99;
ld.global.nc.f32 %f158, [%rd47+2560];
mul.rn.f32 %f159, %f1, %f158;
mul.rn.f32 %f160, %f159, %f157;
ld.global.nc.f32 %f161, [%rd48+2560];
mul.rn.f32 %f162, %f2, %f159;
sub.rn.f32 %f163, %f161, %f162;
add.rn.f32 %f164, %f160, %f163;
add.rn.f32 %f165, %f164, %f156;
// Running sum: %f13 = %f12 + term.
add.rn.f32 %f13, %f12, %f165;
// Next counter: %rd323 = %rd11 + ((%r3 | 641 | %r4) >> 2); path chosen by (%r3 | 641) & 3.
or.b32 %r201, %r3, 641;
or.b32 %r202, %r201, %r4;
and.b32 %r203, %r201, 3;
shr.u32 %r204, %r202, 2;
setp.ne.s32 %p45, %r203, 1;
cvt.u64.u32 %rd1788, %r204;
add.s64 %rd323, %rd11, %rd1788;
@%p45 bra LBB79_37;
// Not-taken side of the `@%p45 bra LBB79_37` just above (runs when (%r201 & 3) == 1).
// Produces %rd2647-%rd2664 and %r338/%r339, all of which are read by LBB79_38.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9.
// NOTE(review): these equal the published Philox4x32 constants -- presumably a
// constant-folded Philox round; confirm against the generator.
and.b64 %rd1828, %rd323, 4294967295;
mul.lo.s64 %rd2651, %rd1828, 3528531795;
// %rd323 < %rd11 (unsigned) only if the add that formed %rd323 wrapped; carry into %rd2461.
setp.lt.u64 %p47, %rd323, %rd11;
selp.u64 %rd1829, 1, 0, %p47;
add.s64 %rd1830, %rd2461, %rd1829;
xor.b64 %rd1831, %rd1830, %rd2651;
shr.u64 %rd1832, %rd1831, 32;
mul.lo.s64 %rd2654, %rd1832, 3449720151;
shr.u64 %rd1833, %rd2654, 32;
and.b64 %rd1834, %rd1830, 4294967295;
mul.lo.s64 %rd1835, %rd1834, 3449720151;
and.b64 %rd1836, %rd1835, 4294967295;
xor.b64 %rd1837, %rd1836, %rd1833;
xor.b64 %rd1838, %rd1837, 2654435769;
mul.lo.s64 %rd2657, %rd1838, 3528531795;
xor.b64 %rd2647, %rd1835, %rd323;
// Per-path constants; LBB79_37 writes different values to the same registers.
mov.u32 %r339, -845247145;
mov.u32 %r338, -616729560;
mov.u64 %rd2664, 3041712726;
mov.u64 %rd2663, 1401181199;
mov.u64 %rd2662, 2835769497;
mov.u64 %rd2661, 1684936478;
mov.u64 %rd2660, 2027808484;
mov.u64 %rd2659, 387276957;
mov.u64 %rd2658, 842468239;
mov.u64 %rd2656, 3986602516;
mov.u64 %rd2655, 1013904242;
mov.u64 %rd2653, 3668340011;
mov.u64 %rd2652, 3144134277;
mov.u64 %rd2650, 3449720151;
mov.u64 %rd2649, 1993301258;
mov.u64 %rd2648, 3528531795;
bra.uni LBB79_38;
// Taken side of `@%p45 bra LBB79_37` (runs when (%r201 & 3) != 1).
// Writes the same registers as the fall-through path (%rd2647-%rd2664, %r338/%r339),
// mirrored: %rd323 and (%rd2461 + carry) exchange roles, the multipliers
// 3449720151 (0xCD9E8D57) / 3528531795 (0xD2511F53) swap places, and the XOR
// constant is 3144134277 (0xBB67AE85) instead of 2654435769.
LBB79_37:
// Carry out of the 64-bit add that formed %rd323, propagated into %rd2461.
setp.lt.u64 %p46, %rd323, %rd11;
selp.u64 %rd1803, 1, 0, %p46;
add.s64 %rd1804, %rd2461, %rd1803;
and.b64 %rd1805, %rd1804, 4294967295;
mul.lo.s64 %rd2651, %rd1805, 3449720151;
xor.b64 %rd1806, %rd2651, %rd323;
shr.u64 %rd1807, %rd1806, 32;
mul.lo.s64 %rd2654, %rd1807, 3528531795;
shr.u64 %rd1808, %rd2654, 32;
and.b64 %rd1809, %rd323, 4294967295;
mul.lo.s64 %rd1810, %rd1809, 3528531795;
and.b64 %rd1811, %rd1810, 4294967295;
xor.b64 %rd1812, %rd1811, %rd1808;
xor.b64 %rd1813, %rd1812, 3144134277;
mul.lo.s64 %rd2657, %rd1813, 3449720151;
xor.b64 %rd2647, %rd1804, %rd1810;
// Per-path constants for LBB79_38 (falls through into it).
mov.u32 %r339, -766435501;
mov.u32 %r338, -239350328;
mov.u64 %rd2664, 1684936478;
mov.u64 %rd2663, 534103459;
mov.u64 %rd2662, 387276957;
mov.u64 %rd2661, 3041712726;
mov.u64 %rd2660, 3986602516;
mov.u64 %rd2659, 2835769497;
mov.u64 %rd2658, 3668340011;
mov.u64 %rd2656, 2027808484;
mov.u64 %rd2655, 1993301258;
mov.u64 %rd2653, 842468239;
mov.u64 %rd2652, 2654435769;
mov.u64 %rd2650, 3528531795;
mov.u64 %rd2649, 1013904242;
mov.u64 %rd2648, 3449720151;
// Join point of the two setup paths. Runs a fixed chain of
// "split 64-bit product into halves, XOR with a per-round constant, multiply"
// steps over %rd2647/%rd2651/%rd2654/%rd2657 using the path-selected constants
// (%rd2648-%rd2664), derives one 32-bit word, maps it to a float in [0, 1),
// uses it to select a scaled value or zero, and adds one more term to the
// running f32 sum (%f13 -> %f14). Ends by forming the next counter (%rd350)
// and branching to LBB79_40 when %p8 is true.
LBB79_38:
shr.u64 %rd1839, %rd2657, 32;
shr.u64 %rd1840, %rd2647, 32;
mul.lo.s64 %rd1841, %rd1840, %rd2648;
and.b64 %rd1842, %rd1841, 4294967295;
xor.b64 %rd1843, %rd1842, %rd1839;
xor.b64 %rd1844, %rd1843, %rd2649;
mul.lo.s64 %rd1845, %rd1844, %rd2650;
shr.u64 %rd1846, %rd1845, 32;
shr.u64 %rd1847, %rd1841, 32;
and.b64 %rd1848, %rd2651, 4294967295;
xor.b64 %rd1849, %rd1848, %rd1847;
xor.b64 %rd1850, %rd1849, %rd2652;
mul.lo.s64 %rd1851, %rd1850, %rd2650;
and.b64 %rd1852, %rd1851, 4294967295;
xor.b64 %rd1853, %rd1852, %rd1846;
xor.b64 %rd1854, %rd1853, %rd2653;
mul.lo.s64 %rd1855, %rd1854, %rd2648;
shr.u64 %rd1856, %rd1855, 32;
shr.u64 %rd1857, %rd1851, 32;
and.b64 %rd1858, %rd2654, 4294967295;
xor.b64 %rd1859, %rd1858, %rd1857;
xor.b64 %rd1860, %rd1859, %rd2655;
mul.lo.s64 %rd1861, %rd1860, %rd2648;
and.b64 %rd1862, %rd1861, 4294967295;
xor.b64 %rd1863, %rd1862, %rd1856;
xor.b64 %rd1864, %rd1863, %rd2656;
mul.lo.s64 %rd1865, %rd1864, %rd2650;
shr.u64 %rd1866, %rd1865, 32;
shr.u64 %rd1867, %rd1861, 32;
and.b64 %rd1868, %rd2657, 4294967295;
xor.b64 %rd1869, %rd1868, %rd1867;
xor.b64 %rd1870, %rd1869, %rd2658;
mul.lo.s64 %rd1871, %rd1870, %rd2650;
and.b64 %rd1872, %rd1871, 4294967295;
xor.b64 %rd1873, %rd1872, %rd1866;
xor.b64 %rd1874, %rd1873, %rd2659;
mul.lo.s64 %rd1875, %rd1874, %rd2648;
shr.u64 %rd1876, %rd1875, 32;
shr.u64 %rd1877, %rd1871, 32;
and.b64 %rd1878, %rd1845, 4294967295;
xor.b64 %rd1879, %rd1878, %rd1877;
xor.b64 %rd1880, %rd1879, %rd2660;
mul.lo.s64 %rd1881, %rd1880, %rd2648;
and.b64 %rd1882, %rd1881, 4294967295;
xor.b64 %rd1883, %rd1882, %rd1876;
xor.b64 %rd1884, %rd1883, %rd2661;
mul.lo.s64 %rd1885, %rd1884, %rd2650;
shr.u64 %rd1886, %rd1885, 32;
shr.u64 %rd1887, %rd1881, 32;
and.b64 %rd1888, %rd1855, 4294967295;
xor.b64 %rd1889, %rd1888, %rd1887;
xor.b64 %rd1890, %rd1889, %rd2662;
mul.lo.s64 %rd1891, %rd1890, %rd2650;
and.b64 %rd1892, %rd1891, 4294967295;
xor.b64 %rd1893, %rd1892, %rd1886;
xor.b64 %rd1894, %rd1893, %rd2663;
mul.lo.s64 %rd1895, %rd1894, %rd2648;
// Collapse the last products into a single 32-bit word (%r211).
shr.u64 %rd1896, %rd1895, 32;
shr.u64 %rd1897, %rd1891, 32;
xor.b64 %rd1898, %rd1865, %rd1897;
xor.b64 %rd1899, %rd1898, %rd2664;
mul.lo.s64 %rd1900, %rd1899, %rd2648;
xor.b64 %rd1901, %rd1896, %rd1900;
cvt.u32.u64 %r209, %rd1901;
xor.b32 %r210, %r338, %r209;
mul.lo.s32 %r211, %r210, %r339;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): a float in [0, 1).
shr.u32 %r212, %r211, 9;
cvt.rn.f32.u32 %f166, %r212;
mul.rn.f32 %f167, %f166, 0f34000000;
// Round to f16 and test >= 0x2E66 (f16 ~0.09998).
cvt.rn.f16.f32 %h100, %f167;
mov.b16 %h101, 0x2E66;
setp.ge.f16 %p49, %h100, %h101;
// (f16 [%rd44+1282] + f16(f32 [%rd45+2564])) * 0x3C72 (f16 ~1.1113), or +0 when %p49 is false.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout -- confirm.
ld.global.nc.b16 %h102, [%rd44+1282];
ld.global.nc.f32 %f168, [%rd45+2564];
cvt.rn.f16.f32 %h103, %f168;
add.rn.f16 %h104, %h102, %h103;
mov.b16 %h105, 0x3C72;
mul.rn.f16 %h106, %h104, %h105;
selp.b16 %h107, %h106, 0x0000, %p49;
cvt.f32.f16 %f169, %h107;
// With t = %f1 * [%rd47+2564]: term = t * f16[%rd46+1282] + ([%rd48+2564] - %f2 * t) + selected value.
ld.global.nc.b16 %h108, [%rd46+1282];
cvt.f32.f16 %f170, %h108;
ld.global.nc.f32 %f171, [%rd47+2564];
mul.rn.f32 %f172, %f1, %f171;
mul.rn.f32 %f173, %f172, %f170;
ld.global.nc.f32 %f174, [%rd48+2564];
mul.rn.f32 %f175, %f2, %f172;
sub.rn.f32 %f176, %f174, %f175;
add.rn.f32 %f177, %f173, %f176;
add.rn.f32 %f178, %f177, %f169;
// Running sum: %f14 = %f13 + term.
add.rn.f32 %f14, %f13, %f178;
// Next counter: %rd350 = %rd11 + ((%r73 | 768) >> 2); path chosen by %p8.
or.b32 %r214, %r73, 768;
shr.u32 %r215, %r214, 2;
cvt.u64.u32 %rd1902, %r215;
add.s64 %rd350, %rd11, %rd1902;
@%p8 bra LBB79_40;
// Not-taken side of the `@%p8 bra LBB79_40` just above (runs when %p8 is false).
// Produces %rd2665-%rd2683 and %r340-%r342, all of which are read by LBB79_41.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9.
// NOTE(review): these equal the published Philox4x32 constants -- presumably a
// constant-folded Philox round; confirm against the generator.
and.b64 %rd1944, %rd350, 4294967295;
mul.lo.s64 %rd2669, %rd1944, 3528531795;
// %rd350 < %rd11 (unsigned) only if the add that formed %rd350 wrapped; carry into %rd2461.
setp.lt.u64 %p51, %rd350, %rd11;
selp.u64 %rd1945, 1, 0, %p51;
add.s64 %rd1946, %rd2461, %rd1945;
xor.b64 %rd1947, %rd1946, %rd2669;
shr.u64 %rd1948, %rd1947, 32;
mul.lo.s64 %rd2672, %rd1948, 3449720151;
shr.u64 %rd1949, %rd2672, 32;
and.b64 %rd1950, %rd1946, 4294967295;
mul.lo.s64 %rd1951, %rd1950, 3449720151;
and.b64 %rd1952, %rd1951, 4294967295;
xor.b64 %rd1953, %rd1952, %rd1949;
xor.b64 %rd1954, %rd1953, 2654435769;
mul.lo.s64 %rd2675, %rd1954, 3528531795;
xor.b64 %rd2665, %rd1951, %rd350;
// Per-path constants; LBB79_40 writes different values to the same registers.
mov.u32 %r342, -1879881855;
mov.u32 %r341, -845247145;
mov.u32 %r340, 534103459;
mov.u64 %rd2683, 3678237736;
mov.u64 %rd2682, 3041712726;
mov.u64 %rd2681, 1401181199;
mov.u64 %rd2680, 2835769497;
mov.u64 %rd2679, 1684936478;
mov.u64 %rd2678, 2027808484;
mov.u64 %rd2677, 387276957;
mov.u64 %rd2676, 842468239;
mov.u64 %rd2674, 3986602516;
mov.u64 %rd2673, 1013904242;
mov.u64 %rd2671, 3668340011;
mov.u64 %rd2670, 3144134277;
mov.u64 %rd2668, 3449720151;
mov.u64 %rd2667, 1993301258;
mov.u64 %rd2666, 3528531795;
bra.uni LBB79_41;
// Taken side of `@%p8 bra LBB79_40` (runs when %p8 is true).
// Writes the same registers as the fall-through path (%rd2665-%rd2683, %r340-%r342),
// mirrored: %rd350 and (%rd2461 + carry) exchange roles, the multipliers
// 3449720151 (0xCD9E8D57) / 3528531795 (0xD2511F53) swap places, and the XOR
// constant is 3144134277 (0xBB67AE85) instead of 2654435769.
LBB79_40:
// Carry out of the 64-bit add that formed %rd350, propagated into %rd2461.
setp.lt.u64 %p50, %rd350, %rd11;
selp.u64 %rd1918, 1, 0, %p50;
add.s64 %rd1919, %rd2461, %rd1918;
and.b64 %rd1920, %rd1919, 4294967295;
mul.lo.s64 %rd2669, %rd1920, 3449720151;
xor.b64 %rd1921, %rd2669, %rd350;
shr.u64 %rd1922, %rd1921, 32;
mul.lo.s64 %rd2672, %rd1922, 3528531795;
shr.u64 %rd1923, %rd2672, 32;
and.b64 %rd1924, %rd350, 4294967295;
mul.lo.s64 %rd1925, %rd1924, 3528531795;
and.b64 %rd1926, %rd1925, 4294967295;
xor.b64 %rd1927, %rd1926, %rd1923;
xor.b64 %rd1928, %rd1927, 3144134277;
mul.lo.s64 %rd2675, %rd1928, 3449720151;
xor.b64 %rd2665, %rd1919, %rd1925;
// Per-path constants for LBB79_41 (falls through into it).
mov.u32 %r342, -1767562579;
mov.u32 %r341, -766435501;
mov.u32 %r340, 1401181199;
mov.u64 %rd2683, 4055616968;
mov.u64 %rd2682, 1684936478;
mov.u64 %rd2681, 534103459;
mov.u64 %rd2680, 387276957;
mov.u64 %rd2679, 3041712726;
mov.u64 %rd2678, 3986602516;
mov.u64 %rd2677, 2835769497;
mov.u64 %rd2676, 3668340011;
mov.u64 %rd2674, 2027808484;
mov.u64 %rd2673, 1993301258;
mov.u64 %rd2671, 842468239;
mov.u64 %rd2670, 2654435769;
mov.u64 %rd2668, 3528531795;
mov.u64 %rd2667, 1013904242;
mov.u64 %rd2666, 3449720151;
// Join point of the two setup paths. Runs a fixed chain of
// "split 64-bit product into halves, XOR with a per-round constant, multiply"
// steps over %rd2665/%rd2669/%rd2672/%rd2675 using the path-selected constants
// (%rd2666-%rd2683), derives one 32-bit word, maps it to a float in [0, 1),
// uses it to select a scaled value or zero, and adds one more term to the
// running f32 sum (%f14 -> %f15). Ends by forming the next counter (%rd378)
// and branching to LBB79_43 when (%r229 & 3) != 1.
LBB79_41:
shr.u64 %rd1955, %rd2675, 32;
shr.u64 %rd1956, %rd2665, 32;
mul.lo.s64 %rd1957, %rd1956, %rd2666;
and.b64 %rd1958, %rd1957, 4294967295;
xor.b64 %rd1959, %rd1958, %rd1955;
xor.b64 %rd1960, %rd1959, %rd2667;
mul.lo.s64 %rd1961, %rd1960, %rd2668;
shr.u64 %rd1962, %rd1961, 32;
shr.u64 %rd1963, %rd1957, 32;
and.b64 %rd1964, %rd2669, 4294967295;
xor.b64 %rd1965, %rd1964, %rd1963;
xor.b64 %rd1966, %rd1965, %rd2670;
mul.lo.s64 %rd1967, %rd1966, %rd2668;
and.b64 %rd1968, %rd1967, 4294967295;
xor.b64 %rd1969, %rd1968, %rd1962;
xor.b64 %rd1970, %rd1969, %rd2671;
mul.lo.s64 %rd1971, %rd1970, %rd2666;
shr.u64 %rd1972, %rd1971, 32;
shr.u64 %rd1973, %rd1967, 32;
and.b64 %rd1974, %rd2672, 4294967295;
xor.b64 %rd1975, %rd1974, %rd1973;
xor.b64 %rd1976, %rd1975, %rd2673;
mul.lo.s64 %rd1977, %rd1976, %rd2666;
and.b64 %rd1978, %rd1977, 4294967295;
xor.b64 %rd1979, %rd1978, %rd1972;
xor.b64 %rd1980, %rd1979, %rd2674;
mul.lo.s64 %rd1981, %rd1980, %rd2668;
shr.u64 %rd1982, %rd1981, 32;
shr.u64 %rd1983, %rd1977, 32;
and.b64 %rd1984, %rd2675, 4294967295;
xor.b64 %rd1985, %rd1984, %rd1983;
xor.b64 %rd1986, %rd1985, %rd2676;
mul.lo.s64 %rd1987, %rd1986, %rd2668;
and.b64 %rd1988, %rd1987, 4294967295;
xor.b64 %rd1989, %rd1988, %rd1982;
xor.b64 %rd1990, %rd1989, %rd2677;
mul.lo.s64 %rd1991, %rd1990, %rd2666;
shr.u64 %rd1992, %rd1991, 32;
shr.u64 %rd1993, %rd1987, 32;
and.b64 %rd1994, %rd1961, 4294967295;
xor.b64 %rd1995, %rd1994, %rd1993;
xor.b64 %rd1996, %rd1995, %rd2678;
mul.lo.s64 %rd1997, %rd1996, %rd2666;
and.b64 %rd1998, %rd1997, 4294967295;
xor.b64 %rd1999, %rd1998, %rd1992;
xor.b64 %rd2000, %rd1999, %rd2679;
mul.lo.s64 %rd2001, %rd2000, %rd2668;
shr.u64 %rd2002, %rd2001, 32;
shr.u64 %rd2003, %rd1997, 32;
and.b64 %rd2004, %rd1971, 4294967295;
xor.b64 %rd2005, %rd2004, %rd2003;
xor.b64 %rd2006, %rd2005, %rd2680;
mul.lo.s64 %rd2007, %rd2006, %rd2668;
and.b64 %rd2008, %rd2007, 4294967295;
xor.b64 %rd2009, %rd2008, %rd2002;
xor.b64 %rd2010, %rd2009, %rd2681;
mul.lo.s64 %rd2011, %rd2010, %rd2666;
shr.u64 %rd2012, %rd2011, 32;
shr.u64 %rd2013, %rd2007, 32;
and.b64 %rd2014, %rd1981, 4294967295;
xor.b64 %rd2015, %rd2014, %rd2013;
xor.b64 %rd2016, %rd2015, %rd2682;
mul.lo.s64 %rd2017, %rd2016, %rd2666;
and.b64 %rd2018, %rd2017, 4294967295;
xor.b64 %rd2019, %rd2018, %rd2012;
xor.b64 %rd2020, %rd2019, %rd2683;
mul.lo.s64 %rd2021, %rd2020, %rd2668;
// Collapse the last products into a single 32-bit word (%r227).
shr.u64 %rd2022, %rd2021, 32;
cvt.u32.u64 %r222, %rd2022;
shr.u64 %rd2023, %rd2017, 32;
xor.b64 %rd2024, %rd2023, %rd1991;
cvt.u32.u64 %r223, %rd2024;
xor.b32 %r224, %r340, %r223;
mul.lo.s32 %r225, %r224, %r341;
xor.b32 %r226, %r225, %r222;
xor.b32 %r227, %r226, %r342;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): a float in [0, 1).
shr.u32 %r228, %r227, 9;
cvt.rn.f32.u32 %f179, %r228;
mul.rn.f32 %f180, %f179, 0f34000000;
// Round to f16 and test >= 0x2E66 (f16 ~0.09998).
cvt.rn.f16.f32 %h109, %f180;
mov.b16 %h110, 0x2E66;
setp.ge.f16 %p52, %h109, %h110;
// (f16 [%rd44+1536] + f16(f32 [%rd45+3072])) * 0x3C72 (f16 ~1.1113), or +0 when %p52 is false.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout -- confirm.
ld.global.nc.b16 %h111, [%rd44+1536];
ld.global.nc.f32 %f181, [%rd45+3072];
cvt.rn.f16.f32 %h112, %f181;
add.rn.f16 %h113, %h111, %h112;
mov.b16 %h114, 0x3C72;
mul.rn.f16 %h115, %h113, %h114;
selp.b16 %h116, %h115, 0x0000, %p52;
cvt.f32.f16 %f182, %h116;
// With t = %f1 * [%rd47+3072]: term = t * f16[%rd46+1536] + ([%rd48+3072] - %f2 * t) + selected value.
ld.global.nc.b16 %h117, [%rd46+1536];
cvt.f32.f16 %f183, %h117;
ld.global.nc.f32 %f184, [%rd47+3072];
mul.rn.f32 %f185, %f1, %f184;
mul.rn.f32 %f186, %f185, %f183;
ld.global.nc.f32 %f187, [%rd48+3072];
mul.rn.f32 %f188, %f2, %f185;
sub.rn.f32 %f189, %f187, %f188;
add.rn.f32 %f190, %f186, %f189;
add.rn.f32 %f191, %f190, %f182;
// Running sum: %f15 = %f14 + term.
add.rn.f32 %f15, %f14, %f191;
// Next counter: %rd378 = %rd11 + ((%r3 | 769 | %r4) >> 2); path chosen by (%r3 | 769) & 3.
or.b32 %r229, %r3, 769;
or.b32 %r230, %r229, %r4;
and.b32 %r231, %r229, 3;
shr.u32 %r232, %r230, 2;
setp.ne.s32 %p53, %r231, 1;
cvt.u64.u32 %rd2025, %r232;
add.s64 %rd378, %rd11, %rd2025;
@%p53 bra LBB79_43;
// Not-taken side of the `@%p53 bra LBB79_43` just above (runs when (%r229 & 3) == 1).
// Produces %rd2684-%rd2701 and %r343/%r344, all of which are read by LBB79_44.
// 3528531795 = 0xD2511F53, 3449720151 = 0xCD9E8D57, 2654435769 = 0x9E3779B9.
// NOTE(review): these equal the published Philox4x32 constants -- presumably a
// constant-folded Philox round; confirm against the generator.
and.b64 %rd2065, %rd378, 4294967295;
mul.lo.s64 %rd2688, %rd2065, 3528531795;
// %rd378 < %rd11 (unsigned) only if the add that formed %rd378 wrapped; carry into %rd2461.
setp.lt.u64 %p55, %rd378, %rd11;
selp.u64 %rd2066, 1, 0, %p55;
add.s64 %rd2067, %rd2461, %rd2066;
xor.b64 %rd2068, %rd2067, %rd2688;
shr.u64 %rd2069, %rd2068, 32;
mul.lo.s64 %rd2691, %rd2069, 3449720151;
shr.u64 %rd2070, %rd2691, 32;
and.b64 %rd2071, %rd2067, 4294967295;
mul.lo.s64 %rd2072, %rd2071, 3449720151;
and.b64 %rd2073, %rd2072, 4294967295;
xor.b64 %rd2074, %rd2073, %rd2070;
xor.b64 %rd2075, %rd2074, 2654435769;
mul.lo.s64 %rd2694, %rd2075, 3528531795;
xor.b64 %rd2684, %rd2072, %rd378;
// Per-path constants; LBB79_43 writes different values to the same registers.
mov.u32 %r344, -845247145;
mov.u32 %r343, -616729560;
mov.u64 %rd2701, 3041712726;
mov.u64 %rd2700, 1401181199;
mov.u64 %rd2699, 2835769497;
mov.u64 %rd2698, 1684936478;
mov.u64 %rd2697, 2027808484;
mov.u64 %rd2696, 387276957;
mov.u64 %rd2695, 842468239;
mov.u64 %rd2693, 3986602516;
mov.u64 %rd2692, 1013904242;
mov.u64 %rd2690, 3668340011;
mov.u64 %rd2689, 3144134277;
mov.u64 %rd2687, 3449720151;
mov.u64 %rd2686, 1993301258;
mov.u64 %rd2685, 3528531795;
bra.uni LBB79_44;
// Taken side of `@%p53 bra LBB79_43` (runs when (%r229 & 3) != 1).
// Writes the same registers as the fall-through path (%rd2684-%rd2701, %r343/%r344),
// mirrored: %rd378 and (%rd2461 + carry) exchange roles, the multipliers
// 3449720151 (0xCD9E8D57) / 3528531795 (0xD2511F53) swap places, and the XOR
// constant is 3144134277 (0xBB67AE85) instead of 2654435769.
LBB79_43:
// Carry out of the 64-bit add that formed %rd378, propagated into %rd2461.
setp.lt.u64 %p54, %rd378, %rd11;
selp.u64 %rd2040, 1, 0, %p54;
add.s64 %rd2041, %rd2461, %rd2040;
and.b64 %rd2042, %rd2041, 4294967295;
mul.lo.s64 %rd2688, %rd2042, 3449720151;
xor.b64 %rd2043, %rd2688, %rd378;
shr.u64 %rd2044, %rd2043, 32;
mul.lo.s64 %rd2691, %rd2044, 3528531795;
shr.u64 %rd2045, %rd2691, 32;
and.b64 %rd2046, %rd378, 4294967295;
mul.lo.s64 %rd2047, %rd2046, 3528531795;
and.b64 %rd2048, %rd2047, 4294967295;
xor.b64 %rd2049, %rd2048, %rd2045;
xor.b64 %rd2050, %rd2049, 3144134277;
mul.lo.s64 %rd2694, %rd2050, 3449720151;
xor.b64 %rd2684, %rd2041, %rd2047;
// Per-path constants for LBB79_44 (falls through into it).
mov.u32 %r344, -766435501;
mov.u32 %r343, -239350328;
mov.u64 %rd2701, 1684936478;
mov.u64 %rd2700, 534103459;
mov.u64 %rd2699, 387276957;
mov.u64 %rd2698, 3041712726;
mov.u64 %rd2697, 3986602516;
mov.u64 %rd2696, 2835769497;
mov.u64 %rd2695, 3668340011;
mov.u64 %rd2693, 2027808484;
mov.u64 %rd2692, 1993301258;
mov.u64 %rd2690, 842468239;
mov.u64 %rd2689, 2654435769;
mov.u64 %rd2687, 3528531795;
mov.u64 %rd2686, 1013904242;
mov.u64 %rd2685, 3449720151;
// Join point of the two setup paths. Runs a fixed chain of
// "split 64-bit product into halves, XOR with a per-round constant, multiply"
// steps over %rd2684/%rd2688/%rd2691/%rd2694 using the path-selected constants
// (%rd2685-%rd2701), derives one 32-bit word, maps it to a float in [0, 1),
// uses it to select a scaled value or zero, and adds one more term to the
// running f32 sum (register %f15 -> register %f16). Ends by forming the next
// counter (%rd405) and branching to LBB79_46 when %p8 is true.
LBB79_44:
shr.u64 %rd2076, %rd2694, 32;
shr.u64 %rd2077, %rd2684, 32;
mul.lo.s64 %rd2078, %rd2077, %rd2685;
and.b64 %rd2079, %rd2078, 4294967295;
xor.b64 %rd2080, %rd2079, %rd2076;
xor.b64 %rd2081, %rd2080, %rd2686;
mul.lo.s64 %rd2082, %rd2081, %rd2687;
shr.u64 %rd2083, %rd2082, 32;
shr.u64 %rd2084, %rd2078, 32;
and.b64 %rd2085, %rd2688, 4294967295;
xor.b64 %rd2086, %rd2085, %rd2084;
xor.b64 %rd2087, %rd2086, %rd2689;
mul.lo.s64 %rd2088, %rd2087, %rd2687;
and.b64 %rd2089, %rd2088, 4294967295;
xor.b64 %rd2090, %rd2089, %rd2083;
xor.b64 %rd2091, %rd2090, %rd2690;
mul.lo.s64 %rd2092, %rd2091, %rd2685;
shr.u64 %rd2093, %rd2092, 32;
shr.u64 %rd2094, %rd2088, 32;
and.b64 %rd2095, %rd2691, 4294967295;
xor.b64 %rd2096, %rd2095, %rd2094;
xor.b64 %rd2097, %rd2096, %rd2692;
mul.lo.s64 %rd2098, %rd2097, %rd2685;
and.b64 %rd2099, %rd2098, 4294967295;
xor.b64 %rd2100, %rd2099, %rd2093;
xor.b64 %rd2101, %rd2100, %rd2693;
mul.lo.s64 %rd2102, %rd2101, %rd2687;
shr.u64 %rd2103, %rd2102, 32;
shr.u64 %rd2104, %rd2098, 32;
and.b64 %rd2105, %rd2694, 4294967295;
xor.b64 %rd2106, %rd2105, %rd2104;
xor.b64 %rd2107, %rd2106, %rd2695;
mul.lo.s64 %rd2108, %rd2107, %rd2687;
and.b64 %rd2109, %rd2108, 4294967295;
xor.b64 %rd2110, %rd2109, %rd2103;
xor.b64 %rd2111, %rd2110, %rd2696;
mul.lo.s64 %rd2112, %rd2111, %rd2685;
shr.u64 %rd2113, %rd2112, 32;
shr.u64 %rd2114, %rd2108, 32;
and.b64 %rd2115, %rd2082, 4294967295;
xor.b64 %rd2116, %rd2115, %rd2114;
xor.b64 %rd2117, %rd2116, %rd2697;
mul.lo.s64 %rd2118, %rd2117, %rd2685;
and.b64 %rd2119, %rd2118, 4294967295;
xor.b64 %rd2120, %rd2119, %rd2113;
xor.b64 %rd2121, %rd2120, %rd2698;
mul.lo.s64 %rd2122, %rd2121, %rd2687;
shr.u64 %rd2123, %rd2122, 32;
shr.u64 %rd2124, %rd2118, 32;
and.b64 %rd2125, %rd2092, 4294967295;
xor.b64 %rd2126, %rd2125, %rd2124;
xor.b64 %rd2127, %rd2126, %rd2699;
mul.lo.s64 %rd2128, %rd2127, %rd2687;
and.b64 %rd2129, %rd2128, 4294967295;
xor.b64 %rd2130, %rd2129, %rd2123;
xor.b64 %rd2131, %rd2130, %rd2700;
mul.lo.s64 %rd2132, %rd2131, %rd2685;
// Collapse the last products into a single 32-bit word (%r239).
shr.u64 %rd2133, %rd2132, 32;
shr.u64 %rd2134, %rd2128, 32;
xor.b64 %rd2135, %rd2102, %rd2134;
xor.b64 %rd2136, %rd2135, %rd2701;
mul.lo.s64 %rd2137, %rd2136, %rd2685;
xor.b64 %rd2138, %rd2133, %rd2137;
cvt.u32.u64 %r237, %rd2138;
xor.b32 %r238, %r343, %r237;
mul.lo.s32 %r239, %r238, %r344;
// Keep the top 23 bits and scale by 0f34000000 (2^-23): a float in [0, 1).
shr.u32 %r240, %r239, 9;
cvt.rn.f32.u32 %f192, %r240;
mul.rn.f32 %f193, %f192, 0f34000000;
// Round to f16 and test >= 0x2E66 (f16 ~0.09998).
cvt.rn.f16.f32 %h118, %f193;
mov.b16 %h119, 0x2E66;
setp.ge.f16 %p57, %h118, %h119;
// (f16 [%rd44+1538] + f16(f32 [%rd45+3076])) * 0x3C72 (f16 ~1.1113), or +0 when %p57 is false.
// NOTE(review): threshold ~0.1 with a ~1/0.9 rescale looks like dropout -- confirm.
ld.global.nc.b16 %h120, [%rd44+1538];
ld.global.nc.f32 %f194, [%rd45+3076];
cvt.rn.f16.f32 %h121, %f194;
add.rn.f16 %h122, %h120, %h121;
mov.b16 %h123, 0x3C72;
mul.rn.f16 %h124, %h122, %h123;
selp.b16 %h125, %h124, 0x0000, %p57;
cvt.f32.f16 %f195, %h125;
// With t = %f1 * [%rd47+3076]: term = t * f16[%rd46+1538] + ([%rd48+3076] - %f2 * t) + selected value.
ld.global.nc.b16 %h126, [%rd46+1538];
cvt.f32.f16 %f196, %h126;
ld.global.nc.f32 %f197, [%rd47+3076];
mul.rn.f32 %f198, %f1, %f197;
mul.rn.f32 %f199, %f198, %f196;
ld.global.nc.f32 %f200, [%rd48+3076];
mul.rn.f32 %f201, %f2, %f198;
sub.rn.f32 %f202, %f200, %f201;
add.rn.f32 %f203, %f199, %f202;
add.rn.f32 %f204, %f203, %f195;
// Running sum: register %f16 = %f15 + term.
add.rn.f32 %f16, %f15, %f204;
// Next counter: %rd405 = %rd11 + ((%r73 | 896) >> 2); path chosen by %p8.
or.b32 %r242, %r73, 896;
shr.u32 %r243, %r242, 2;
cvt.u64.u32 %rd2139, %r243;
add.s64 %rd405, %rd11, %rd2139;
@%p8 bra LBB79_46;
mov.u32 %r347, -1879881855;
mov.u32 %r345, 534103459;
mov.u64 %rd2720, 3678237736;
and.b64 %rd2181, %rd405, 4294967295;
mul.lo.s64 %rd2706, %rd2181, 3528531795;
setp.lt.u64 %p59, %rd405, %rd11;
selp.u64 %rd2182, 1, 0, %p59;
add.s64 %rd2183, %rd2461, %rd2182;
xor.b64 %rd2184, %rd2183, %rd2706;
shr.u64 %rd2185, %rd2184, 32;
mul.lo.s64 %rd2709, %rd2185, 3449720151;
shr.u64 %rd2186, %rd2709, 32;
and.b64 %rd2187, %rd2183, 4294967295;
mul.lo.s64 %rd2188, %rd2187, 3449720151;
and.b64 %rd2189, %rd2188, 4294967295;
xor.b64 %rd2190, %rd2189, %rd2186;
xor.b64 %rd2191, %rd2190, 2654435769;
mul.lo.s64 %rd2712, %rd2191, 3528531795;
xor.b64 %rd2702, %rd2188, %rd405;
mov.u32 %r346, -845247145;
mov.u64 %rd2719, 3041712726;
mov.u64 %rd2718, 1401181199;
mov.u64 %rd2717, 2835769497;
mov.u64 %rd2716, 1684936478;
mov.u64 %rd2715, 2027808484;
mov.u64 %rd2714, 387276957;
mov.u64 %rd2713, 842468239;
mov.u64 %rd2711, 3986602516;
mov.u64 %rd2710, 1013904242;
mov.u64 %rd2708, 3668340011;
mov.u64 %rd2707, 3144134277;
mov.u64 %rd2705, 3449720151;
mov.u64 %rd2704, 1993301258;
mov.u64 %rd2703, 3528531795;
bra.uni LBB79_47;
LBB79_46:
setp.lt.u64 %p58, %rd405, %rd11;
selp.u64 %rd2155, 1, 0, %p58;
add.s64 %rd2156, %rd2461, %rd2155;
and.b64 %rd2157, %rd2156, 4294967295;
mul.lo.s64 %rd2706, %rd2157, 3449720151;
xor.b64 %rd2158, %rd2706, %rd405;
shr.u64 %rd2159, %rd2158, 32;
mul.lo.s64 %rd2709, %rd2159, 3528531795;
shr.u64 %rd2160, %rd2709, 32;
and.b64 %rd2161, %rd405, 4294967295;
mul.lo.s64 %rd2162, %rd2161, 3528531795;
and.b64 %rd2163, %rd2162, 4294967295;
xor.b64 %rd2164, %rd2163, %rd2160;
xor.b64 %rd2165, %rd2164, 3144134277;
mul.lo.s64 %rd2712, %rd2165, 3449720151;
xor.b64 %rd2702, %rd2156, %rd2162;
mov.u32 %r347, -1767562579;
mov.u32 %r346, -766435501;
mov.u32 %r345, 1401181199;
mov.u64 %rd2720, 4055616968;
mov.u64 %rd2719, 1684936478;
mov.u64 %rd2718, 534103459;
mov.u64 %rd2717, 387276957;
mov.u64 %rd2716, 3041712726;
mov.u64 %rd2715, 3986602516;
mov.u64 %rd2714, 2835769497;
mov.u64 %rd2713, 3668340011;
mov.u64 %rd2711, 2027808484;
mov.u64 %rd2710, 1993301258;
mov.u64 %rd2708, 842468239;
mov.u64 %rd2707, 2654435769;
mov.u64 %rd2705, 3528531795;
mov.u64 %rd2704, 1013904242;
mov.u64 %rd2703, 3449720151;
LBB79_47:
shr.u64 %rd2192, %rd2712, 32;
shr.u64 %rd2193, %rd2702, 32;
mul.lo.s64 %rd2194, %rd2193, %rd2703;
and.b64 %rd2195, %rd2194, 4294967295;
xor.b64 %rd2196, %rd2195, %rd2192;
xor.b64 %rd2197, %rd2196, %rd2704;
mul.lo.s64 %rd2198, %rd2197, %rd2705;
shr.u64 %rd2199, %rd2198, 32;
shr.u64 %rd2200, %rd2194, 32;
and.b64 %rd2201, %rd2706, 4294967295;
xor.b64 %rd2202, %rd2201, %rd2200;
xor.b64 %rd2203, %rd2202, %rd2707;
mul.lo.s64 %rd2204, %rd2203, %rd2705;
and.b64 %rd2205, %rd2204, 4294967295;
xor.b64 %rd2206, %rd2205, %rd2199;
xor.b64 %rd2207, %rd2206, %rd2708;
mul.lo.s64 %rd2208, %rd2207, %rd2703;
shr.u64 %rd2209, %rd2208, 32;
shr.u64 %rd2210, %rd2204, 32;
and.b64 %rd2211, %rd2709, 4294967295;
xor.b64 %rd2212, %rd2211, %rd2210;
xor.b64 %rd2213, %rd2212, %rd2710;
mul.lo.s64 %rd2214, %rd2213, %rd2703;
and.b64 %rd2215, %rd2214, 4294967295;
xor.b64 %rd2216, %rd2215, %rd2209;
xor.b64 %rd2217, %rd2216, %rd2711;
mul.lo.s64 %rd2218, %rd2217, %rd2705;
shr.u64 %rd2219, %rd2218, 32;
shr.u64 %rd2220, %rd2214, 32;
and.b64 %rd2221, %rd2712, 4294967295;
xor.b64 %rd2222, %rd2221, %rd2220;
xor.b64 %rd2223, %rd2222, %rd2713;
mul.lo.s64 %rd2224, %rd2223, %rd2705;
and.b64 %rd2225, %rd2224, 4294967295;
xor.b64 %rd2226, %rd2225, %rd2219;
xor.b64 %rd2227, %rd2226, %rd2714;
mul.lo.s64 %rd2228, %rd2227, %rd2703;
shr.u64 %rd2229, %rd2228, 32;
shr.u64 %rd2230, %rd2224, 32;
and.b64 %rd2231, %rd2198, 4294967295;
xor.b64 %rd2232, %rd2231, %rd2230;
xor.b64 %rd2233, %rd2232, %rd2715;
mul.lo.s64 %rd2234, %rd2233, %rd2703;
and.b64 %rd2235, %rd2234, 4294967295;
xor.b64 %rd2236, %rd2235, %rd2229;
xor.b64 %rd2237, %rd2236, %rd2716;
mul.lo.s64 %rd2238, %rd2237, %rd2705;
shr.u64 %rd2239, %rd2238, 32;
shr.u64 %rd2240, %rd2234, 32;
and.b64 %rd2241, %rd2208, 4294967295;
xor.b64 %rd2242, %rd2241, %rd2240;
xor.b64 %rd2243, %rd2242, %rd2717;
mul.lo.s64 %rd2244, %rd2243, %rd2705;
and.b64 %rd2245, %rd2244, 4294967295;
xor.b64 %rd2246, %rd2245, %rd2239;
xor.b64 %rd2247, %rd2246, %rd2718;
mul.lo.s64 %rd2248, %rd2247, %rd2703;
shr.u64 %rd2249, %rd2248, 32;
shr.u64 %rd2250, %rd2244, 32;
and.b64 %rd2251, %rd2218, 4294967295;
xor.b64 %rd2252, %rd2251, %rd2250;
xor.b64 %rd2253, %rd2252, %rd2719;
mul.lo.s64 %rd2254, %rd2253, %rd2703;
and.b64 %rd2255, %rd2254, 4294967295;
xor.b64 %rd2256, %rd2255, %rd2249;
xor.b64 %rd2257, %rd2256, %rd2720;
mul.lo.s64 %rd2258, %rd2257, %rd2705;
shr.u64 %rd2259, %rd2258, 32;
cvt.u32.u64 %r250, %rd2259;
shr.u64 %rd2260, %rd2254, 32;
xor.b64 %rd2261, %rd2260, %rd2228;
cvt.u32.u64 %r251, %rd2261;
xor.b32 %r252, %r345, %r251;
mul.lo.s32 %r253, %r252, %r346;
xor.b32 %r254, %r253, %r250;
xor.b32 %r255, %r254, %r347;
shr.u32 %r256, %r255, 9;
cvt.rn.f32.u32 %f205, %r256;
mul.rn.f32 %f206, %f205, 0f34000000;
cvt.rn.f16.f32 %h127, %f206;
mov.b16 %h128, 0x2E66;
setp.ge.f16 %p60, %h127, %h128;
ld.global.nc.b16 %h129, [%rd44+1792];
ld.global.nc.f32 %f207, [%rd45+3584];
cvt.rn.f16.f32 %h130, %f207;
add.rn.f16 %h131, %h129, %h130;
mov.b16 %h132, 0x3C72;
mul.rn.f16 %h133, %h131, %h132;
selp.b16 %h134, %h133, 0x0000, %p60;
cvt.f32.f16 %f208, %h134;
ld.global.nc.b16 %h135, [%rd46+1792];
cvt.f32.f16 %f209, %h135;
ld.global.nc.f32 %f210, [%rd47+3584];
mul.rn.f32 %f211, %f1, %f210;
mul.rn.f32 %f212, %f211, %f209;
ld.global.nc.f32 %f213, [%rd48+3584];
mul.rn.f32 %f214, %f2, %f211;
sub.rn.f32 %f215, %f213, %f214;
add.rn.f32 %f216, %f212, %f215;
add.rn.f32 %f217, %f216, %f208;
add.rn.f32 %f17, %f16, %f217;
or.b32 %r257, %r3, 897;
or.b32 %r258, %r257, %r4;
and.b32 %r259, %r257, 3;
shr.u32 %r260, %r258, 2;
setp.ne.s32 %p61, %r259, 1;
cvt.u64.u32 %rd2262, %r260;
add.s64 %rd433, %rd11, %rd2262;
@%p61 bra LBB79_49;
mov.u32 %r349, -845247145;
mov.u64 %rd2737, 1401181199;
mov.u64 %rd2726, 3144134277;
mov.u32 %r348, -616729560;
and.b64 %rd2302, %rd433, 4294967295;
mul.lo.s64 %rd2725, %rd2302, 3528531795;
setp.lt.u64 %p63, %rd433, %rd11;
selp.u64 %rd2303, 1, 0, %p63;
add.s64 %rd2304, %rd2461, %rd2303;
xor.b64 %rd2305, %rd2304, %rd2725;
shr.u64 %rd2306, %rd2305, 32;
mul.lo.s64 %rd2728, %rd2306, 3449720151;
shr.u64 %rd2307, %rd2728, 32;
and.b64 %rd2308, %rd2304, 4294967295;
mul.lo.s64 %rd2309, %rd2308, 3449720151;
and.b64 %rd2310, %rd2309, 4294967295;
xor.b64 %rd2311, %rd2310, %rd2307;
xor.b64 %rd2312, %rd2311, 2654435769;
mul.lo.s64 %rd2731, %rd2312, 3528531795;
xor.b64 %rd2721, %rd2309, %rd433;
mov.u64 %rd2738, 3041712726;
mov.u64 %rd2736, 2835769497;
mov.u64 %rd2735, 1684936478;
mov.u64 %rd2734, 2027808484;
mov.u64 %rd2733, 387276957;
mov.u64 %rd2732, 842468239;
mov.u64 %rd2730, 3986602516;
mov.u64 %rd2729, 1013904242;
mov.u64 %rd2727, 3668340011;
mov.u64 %rd2724, 3449720151;
mov.u64 %rd2723, 1993301258;
mov.u64 %rd2722, 3528531795;
bra.uni LBB79_50;
LBB79_49:
setp.lt.u64 %p62, %rd433, %rd11;
selp.u64 %rd2277, 1, 0, %p62;
add.s64 %rd2278, %rd2461, %rd2277;
and.b64 %rd2279, %rd2278, 4294967295;
mul.lo.s64 %rd2725, %rd2279, 3449720151;
xor.b64 %rd2280, %rd2725, %rd433;
shr.u64 %rd2281, %rd2280, 32;
mul.lo.s64 %rd2728, %rd2281, 3528531795;
shr.u64 %rd2282, %rd2728, 32;
and.b64 %rd2283, %rd433, 4294967295;
mul.lo.s64 %rd2284, %rd2283, 3528531795;
and.b64 %rd2285, %rd2284, 4294967295;
xor.b64 %rd2286, %rd2285, %rd2282;
xor.b64 %rd2287, %rd2286, 3144134277;
mul.lo.s64 %rd2731, %rd2287, 3449720151;
xor.b64 %rd2721, %rd2278, %rd2284;
mov.u32 %r349, -766435501;
mov.u32 %r348, -239350328;
mov.u64 %rd2738, 1684936478;
mov.u64 %rd2737, 534103459;
mov.u64 %rd2736, 387276957;
mov.u64 %rd2735, 3041712726;
mov.u64 %rd2734, 3986602516;
mov.u64 %rd2733, 2835769497;
mov.u64 %rd2732, 3668340011;
mov.u64 %rd2730, 2027808484;
mov.u64 %rd2729, 1993301258;
mov.u64 %rd2727, 842468239;
mov.u64 %rd2726, 2654435769;
mov.u64 %rd2724, 3528531795;
mov.u64 %rd2723, 1013904242;
mov.u64 %rd2722, 3449720151;
LBB79_50:
shr.u64 %rd2313, %rd2731, 32;
shr.u64 %rd2314, %rd2721, 32;
mul.lo.s64 %rd2315, %rd2314, %rd2722;
and.b64 %rd2316, %rd2315, 4294967295;
xor.b64 %rd2317, %rd2316, %rd2313;
xor.b64 %rd2318, %rd2317, %rd2723;
mul.lo.s64 %rd2319, %rd2318, %rd2724;
shr.u64 %rd2320, %rd2319, 32;
shr.u64 %rd2321, %rd2315, 32;
and.b64 %rd2322, %rd2725, 4294967295;
xor.b64 %rd2323, %rd2322, %rd2321;
xor.b64 %rd2324, %rd2323, %rd2726;
mul.lo.s64 %rd2325, %rd2324, %rd2724;
and.b64 %rd2326, %rd2325, 4294967295;
xor.b64 %rd2327, %rd2326, %rd2320;
xor.b64 %rd2328, %rd2327, %rd2727;
mul.lo.s64 %rd2329, %rd2328, %rd2722;
shr.u64 %rd2330, %rd2329, 32;
shr.u64 %rd2331, %rd2325, 32;
and.b64 %rd2332, %rd2728, 4294967295;
xor.b64 %rd2333, %rd2332, %rd2331;
xor.b64 %rd2334, %rd2333, %rd2729;
mul.lo.s64 %rd2335, %rd2334, %rd2722;
and.b64 %rd2336, %rd2335, 4294967295;
xor.b64 %rd2337, %rd2336, %rd2330;
xor.b64 %rd2338, %rd2337, %rd2730;
mul.lo.s64 %rd2339, %rd2338, %rd2724;
shr.u64 %rd2340, %rd2339, 32;
shr.u64 %rd2341, %rd2335, 32;
and.b64 %rd2342, %rd2731, 4294967295;
xor.b64 %rd2343, %rd2342, %rd2341;
xor.b64 %rd2344, %rd2343, %rd2732;
mul.lo.s64 %rd2345, %rd2344, %rd2724;
and.b64 %rd2346, %rd2345, 4294967295;
xor.b64 %rd2347, %rd2346, %rd2340;
xor.b64 %rd2348, %rd2347, %rd2733;
mul.lo.s64 %rd2349, %rd2348, %rd2722;
shr.u64 %rd2350, %rd2349, 32;
shr.u64 %rd2351, %rd2345, 32;
and.b64 %rd2352, %rd2319, 4294967295;
xor.b64 %rd2353, %rd2352, %rd2351;
xor.b64 %rd2354, %rd2353, %rd2734;
mul.lo.s64 %rd2355, %rd2354, %rd2722;
and.b64 %rd2356, %rd2355, 4294967295;
xor.b64 %rd2357, %rd2356, %rd2350;
xor.b64 %rd2358, %rd2357, %rd2735;
mul.lo.s64 %rd2359, %rd2358, %rd2724;
shr.u64 %rd2360, %rd2359, 32;
shr.u64 %rd2361, %rd2355, 32;
and.b64 %rd2362, %rd2329, 4294967295;
xor.b64 %rd2363, %rd2362, %rd2361;
xor.b64 %rd2364, %rd2363, %rd2736;
mul.lo.s64 %rd2365, %rd2364, %rd2724;
and.b64 %rd2366, %rd2365, 4294967295;
xor.b64 %rd2367, %rd2366, %rd2360;
xor.b64 %rd2368, %rd2367, %rd2737;
mul.lo.s64 %rd2369, %rd2368, %rd2722;
shr.u64 %rd2370, %rd2369, 32;
shr.u64 %rd2371, %rd2365, 32;
xor.b64 %rd2372, %rd2339, %rd2371;
xor.b64 %rd2373, %rd2372, %rd2738;
mul.lo.s64 %rd2374, %rd2373, %rd2722;
xor.b64 %rd2375, %rd2370, %rd2374;
cvt.u32.u64 %r265, %rd2375;
xor.b32 %r266, %r348, %r265;
mul.lo.s32 %r267, %r266, %r349;
shr.u32 %r268, %r267, 9;
cvt.rn.f32.u32 %f218, %r268;
mul.rn.f32 %f219, %f218, 0f34000000;
cvt.rn.f16.f32 %h136, %f219;
mov.b16 %h137, 0x2E66;
setp.ge.f16 %p64, %h136, %h137;
ld.global.nc.b16 %h138, [%rd44+1794];
ld.global.nc.f32 %f220, [%rd45+3588];
cvt.rn.f16.f32 %h139, %f220;
add.rn.f16 %h140, %h138, %h139;
mov.b16 %h141, 0x3C72;
mul.rn.f16 %h142, %h140, %h141;
selp.b16 %h143, %h142, 0x0000, %p64;
cvt.f32.f16 %f221, %h143;
ld.global.nc.b16 %h144, [%rd46+1794];
cvt.f32.f16 %f222, %h144;
ld.global.nc.f32 %f223, [%rd47+3588];
mul.rn.f32 %f224, %f1, %f223;
mul.rn.f32 %f225, %f224, %f222;
ld.global.nc.f32 %f226, [%rd48+3588];
mul.rn.f32 %f227, %f2, %f224;
sub.rn.f32 %f228, %f226, %f227;
add.rn.f32 %f229, %f225, %f228;
add.rn.f32 %f230, %f229, %f221;
add.rn.f32 %f231, %f17, %f230;
and.b32 %r46, %r1, 31;
shfl.sync.down.b32 %f232, %f231, 16, 31, -1;
add.rn.f32 %f233, %f232, %f231;
shfl.sync.down.b32 %f234, %f233, 8, 31, -1;
add.rn.f32 %f235, %f234, %f233;
shfl.sync.down.b32 %f236, %f235, 4, 31, -1;
add.rn.f32 %f237, %f236, %f235;
shfl.sync.down.b32 %f238, %f237, 2, 31, -1;
add.rn.f32 %f239, %f238, %f237;
shfl.sync.down.b32 %f240, %f239, 1, 31, -1;
shr.u32 %r47, %r1, 5;
setp.ne.s32 %p65, %r46, 0;
mov.u64 %rd2378, shared_cache_018;
@%p65 bra LBB79_2;
mul.wide.u32 %rd2377, %r47, 4;
add.s64 %rd461, %rd2378, %rd2377;
add.rn.f32 %f18, %f240, %f239;
st.shared.f32 [%rd461], %f18;
LBB79_2:
bar.sync 0;
setp.eq.s32 %p66, %r47, 0;
@%p66 bra LBB79_52;
bra.uni LBB79_3;
LBB79_52:
add.u64 %rd472, %SP, 0;
add.u64 %rd10, %SPL, 0;
mul.wide.u32 %rd2379, %r46, 4;
add.s64 %rd462, %rd2378, %rd2379;
cvta.shared.u64 %rd2381, %rd462;
mov.u32 %r269, 0;
st.local.u32 [%rd10], %r269;
setp.lt.u32 %p67, %r1, 2;
selp.b64 %rd2383, %rd2381, %rd472, %p67;
ld.f32 %f241, [%rd2383];
shfl.sync.down.b32 %f242, %f241, 16, 31, -1;
add.rn.f32 %f243, %f241, %f242;
shfl.sync.down.b32 %f244, %f243, 8, 31, -1;
add.rn.f32 %f245, %f243, %f244;
shfl.sync.down.b32 %f246, %f245, 4, 31, -1;
add.rn.f32 %f247, %f245, %f246;
shfl.sync.down.b32 %f248, %f247, 2, 31, -1;
add.rn.f32 %f249, %f247, %f248;
shfl.sync.down.b32 %f250, %f249, 1, 31, -1;
add.rn.f32 %f251, %f249, %f250;
st.f32 [%rd2383], %f251;
setp.ne.s32 %p68, %r1, 0;
@%p68 bra LBB79_3;
ld.param.u64 %rd469, [fusion_2180_param_3];
cvt.u64.u32 %rd43, %r2;
cvta.to.global.u64 %rd6, %rd469;
shl.b64 %rd2376, %rd43, 2;
add.s64 %rd460, %rd6, %rd2376;
ld.shared.f32 %f252, [%rd462];
atom.global.add.f32 %f253, [%rd460], %f252;
LBB79_3:
ret;
}
// .globl fusion_2178
.visible .entry fusion_2178(
.param .u64 fusion_2178_param_0,
.param .u64 fusion_2178_param_1,
.param .u64 fusion_2178_param_2,
.param .u64 fusion_2178_param_3,
.param .u64 fusion_2178_param_4,
.param .u64 fusion_2178_param_5,
.param .u64 fusion_2178_param_6,
.param .u64 fusion_2178_param_7,
.param .u64 fusion_2178_param_8,
.param .u64 fusion_2178_param_9,
.param .u64 fusion_2178_param_10
)
.reqntid 64, 1, 1
{
.local .align 4 .b8 __local_depot80[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<75>;
.reg .b16 %h<145>;
.reg .f32 %f<288>;
.reg .b32 %r<350>;
.reg .b64 %rd<2742>;
mov.u64 %SPL, __local_depot80;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd464, [fusion_2178_param_0];
ld.param.u64 %rd465, [fusion_2178_param_9];
cvta.to.global.u64 %rd1, %rd465;
ld.param.u64 %rd466, [fusion_2178_param_1];
ld.param.u64 %rd467, [fusion_2178_param_8];
cvta.to.global.u64 %rd2, %rd467;
ld.param.u64 %rd468, [fusion_2178_param_2];
ld.param.u64 %rd469, [fusion_2178_param_7];
cvta.to.global.u64 %rd3, %rd469;
ld.param.u64 %rd471, [fusion_2178_param_6];
cvta.to.global.u64 %rd4, %rd471;
ld.param.u64 %rd472, [fusion_2178_param_4];
ld.param.u64 %rd473, [fusion_2178_param_5];
cvta.to.global.u64 %rd5, %rd473;
cvta.to.global.u64 %rd6, %rd472;
cvta.to.global.u64 %rd8, %rd468;
cvta.to.global.u64 %rd9, %rd466;
cvta.to.global.u64 %rd10, %rd464;
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
shl.b32 %r3, %r1, 1;
shl.b32 %r4, %r2, 10;
or.b32 %r48, %r4, %r3;
shr.u32 %r49, %r48, 2;
and.b32 %r5, %r1, 1;
setp.eq.s32 %p1, %r5, 0;
ld.global.nc.u64 %rd12, [%rd8];
cvt.u64.u32 %rd475, %r49;
add.s64 %rd13, %rd12, %rd475;
setp.lt.u64 %p69, %rd13, %rd12;
and.b64 %rd2387, %rd13, 4294967295;
@%p1 bra LBB80_1;
bra.uni LBB80_4;
LBB80_1:
mul.lo.s64 %rd2449, %rd2387, 3528531795;
ld.global.nc.u64 %rd2464, [%rd8+8];
selp.u64 %rd518, 1, 0, %p69;
add.s64 %rd519, %rd2464, %rd518;
xor.b64 %rd520, %rd519, %rd2449;
shr.u64 %rd521, %rd520, 32;
mul.lo.s64 %rd2452, %rd521, 3449720151;
shr.u64 %rd522, %rd2452, 32;
and.b64 %rd523, %rd519, 4294967295;
mul.lo.s64 %rd524, %rd523, 3449720151;
and.b64 %rd525, %rd524, 4294967295;
xor.b64 %rd526, %rd525, %rd522;
xor.b64 %rd527, %rd526, 2654435769;
mul.lo.s64 %rd2455, %rd527, 3528531795;
xor.b64 %rd2445, %rd524, %rd13;
mov.u32 %r312, -1879881855;
mov.u32 %r311, -845247145;
mov.u32 %r310, 534103459;
mov.u64 %rd2463, 3678237736;
mov.u64 %rd2462, 3041712726;
mov.u64 %rd2461, 1401181199;
mov.u64 %rd2460, 2835769497;
mov.u64 %rd2459, 1684936478;
mov.u64 %rd2458, 2027808484;
mov.u64 %rd2457, 387276957;
mov.u64 %rd2456, 842468239;
mov.u64 %rd2454, 3986602516;
mov.u64 %rd2453, 1013904242;
mov.u64 %rd2451, 3668340011;
mov.u64 %rd2450, 3144134277;
mov.u64 %rd2448, 3449720151;
mov.u64 %rd2447, 1993301258;
mov.u64 %rd2446, 3528531795;
bra.uni LBB80_5;
LBB80_4:
mov.u32 %r311, -766435501;
mov.u64 %rd2462, 1684936478;
mov.u64 %rd2461, 534103459;
mov.u64 %rd2460, 387276957;
mov.u64 %rd2459, 3041712726;
mov.u64 %rd2458, 3986602516;
mov.u64 %rd2457, 2835769497;
mov.u64 %rd2456, 3668340011;
mov.u64 %rd2454, 2027808484;
mov.u64 %rd2453, 1993301258;
mov.u64 %rd2451, 842468239;
mov.u64 %rd2450, 2654435769;
mov.u64 %rd2448, 3528531795;
mov.u64 %rd2447, 1013904242;
mov.u64 %rd2446, 3449720151;
mov.u32 %r312, -1767562579;
mov.u32 %r310, 1401181199;
mov.u64 %rd2463, 4055616968;
ld.global.nc.u64 %rd2464, [%rd8+8];
selp.u64 %rd491, 1, 0, %p69;
add.s64 %rd492, %rd2464, %rd491;
and.b64 %rd493, %rd492, 4294967295;
mul.lo.s64 %rd2449, %rd493, 3449720151;
xor.b64 %rd494, %rd2449, %rd13;
shr.u64 %rd495, %rd494, 32;
mul.lo.s64 %rd2452, %rd495, 3528531795;
shr.u64 %rd496, %rd2452, 32;
mul.lo.s64 %rd498, %rd2387, 3528531795;
and.b64 %rd499, %rd498, 4294967295;
xor.b64 %rd500, %rd499, %rd496;
xor.b64 %rd501, %rd500, 3144134277;
mul.lo.s64 %rd2455, %rd501, 3449720151;
xor.b64 %rd2445, %rd492, %rd498;
LBB80_5:
shr.u64 %rd528, %rd2455, 32;
shr.u64 %rd529, %rd2445, 32;
mul.lo.s64 %rd530, %rd529, %rd2446;
and.b64 %rd531, %rd530, 4294967295;
xor.b64 %rd532, %rd531, %rd528;
xor.b64 %rd533, %rd532, %rd2447;
mul.lo.s64 %rd534, %rd533, %rd2448;
shr.u64 %rd535, %rd534, 32;
shr.u64 %rd536, %rd530, 32;
and.b64 %rd537, %rd2449, 4294967295;
xor.b64 %rd538, %rd537, %rd536;
xor.b64 %rd539, %rd538, %rd2450;
mul.lo.s64 %rd540, %rd539, %rd2448;
and.b64 %rd541, %rd540, 4294967295;
xor.b64 %rd542, %rd541, %rd535;
xor.b64 %rd543, %rd542, %rd2451;
mul.lo.s64 %rd544, %rd543, %rd2446;
shr.u64 %rd545, %rd544, 32;
shr.u64 %rd546, %rd540, 32;
and.b64 %rd547, %rd2452, 4294967295;
xor.b64 %rd548, %rd547, %rd546;
xor.b64 %rd549, %rd548, %rd2453;
mul.lo.s64 %rd550, %rd549, %rd2446;
and.b64 %rd551, %rd550, 4294967295;
xor.b64 %rd552, %rd551, %rd545;
xor.b64 %rd553, %rd552, %rd2454;
mul.lo.s64 %rd554, %rd553, %rd2448;
shr.u64 %rd555, %rd554, 32;
shr.u64 %rd556, %rd550, 32;
and.b64 %rd557, %rd2455, 4294967295;
xor.b64 %rd558, %rd557, %rd556;
xor.b64 %rd559, %rd558, %rd2456;
mul.lo.s64 %rd560, %rd559, %rd2448;
and.b64 %rd561, %rd560, 4294967295;
xor.b64 %rd562, %rd561, %rd555;
xor.b64 %rd563, %rd562, %rd2457;
mul.lo.s64 %rd564, %rd563, %rd2446;
shr.u64 %rd565, %rd564, 32;
shr.u64 %rd566, %rd560, 32;
and.b64 %rd567, %rd534, 4294967295;
xor.b64 %rd568, %rd567, %rd566;
xor.b64 %rd569, %rd568, %rd2458;
mul.lo.s64 %rd570, %rd569, %rd2446;
and.b64 %rd571, %rd570, 4294967295;
xor.b64 %rd572, %rd571, %rd565;
xor.b64 %rd573, %rd572, %rd2459;
mul.lo.s64 %rd574, %rd573, %rd2448;
shr.u64 %rd575, %rd574, 32;
shr.u64 %rd576, %rd570, 32;
and.b64 %rd577, %rd544, 4294967295;
xor.b64 %rd578, %rd577, %rd576;
xor.b64 %rd579, %rd578, %rd2460;
mul.lo.s64 %rd580, %rd579, %rd2448;
and.b64 %rd581, %rd580, 4294967295;
xor.b64 %rd582, %rd581, %rd575;
xor.b64 %rd583, %rd582, %rd2461;
mul.lo.s64 %rd584, %rd583, %rd2446;
shr.u64 %rd585, %rd584, 32;
shr.u64 %rd586, %rd580, 32;
and.b64 %rd587, %rd554, 4294967295;
xor.b64 %rd588, %rd587, %rd586;
xor.b64 %rd589, %rd588, %rd2462;
mul.lo.s64 %rd590, %rd589, %rd2446;
and.b64 %rd591, %rd590, 4294967295;
xor.b64 %rd592, %rd591, %rd585;
xor.b64 %rd593, %rd592, %rd2463;
mul.lo.s64 %rd594, %rd593, %rd2448;
shr.u64 %rd595, %rd594, 32;
cvt.u32.u64 %r56, %rd595;
shr.u64 %rd596, %rd590, 32;
xor.b64 %rd597, %rd596, %rd564;
cvt.u32.u64 %r57, %rd597;
xor.b32 %r58, %r310, %r57;
mul.lo.s32 %r59, %r58, %r311;
xor.b32 %r60, %r59, %r56;
xor.b32 %r61, %r60, %r312;
shr.u32 %r62, %r61, 9;
cvt.rn.f32.u32 %f20, %r62;
mul.rn.f32 %f21, %f20, 0f34000000;
cvt.rn.f16.f32 %h1, %f21;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p4, %h1, %h2;
mul.wide.u32 %rd598, %r2, 2048;
add.s64 %rd599, %rd10, %rd598;
mul.wide.u32 %rd600, %r3, 2;
add.s64 %rd45, %rd599, %rd600;
ld.global.nc.b16 %h3, [%rd45];
mul.wide.u32 %rd601, %r3, 4;
add.s64 %rd46, %rd1, %rd601;
ld.global.nc.f32 %f22, [%rd46];
cvt.rn.f16.f32 %h4, %f22;
add.rn.f16 %h5, %h3, %h4;
mov.b16 %h6, 0x3C72;
mul.rn.f16 %h7, %h5, %h6;
selp.b16 %h8, %h7, 0x0000, %p4;
cvt.f32.f16 %f23, %h8;
add.s64 %rd602, %rd9, %rd598;
add.s64 %rd47, %rd602, %rd600;
ld.global.nc.b16 %h9, [%rd47];
cvt.f32.f16 %f24, %h9;
mul.wide.u32 %rd603, %r2, 4;
add.s64 %rd604, %rd5, %rd603;
ld.global.nc.f32 %f25, [%rd604];
mul.rn.f32 %f26, %f25, 0f3A800000;
add.rn.f32 %f27, %f26, 0f2B8CBCCC;
rsqrt.approx.f32 %f1, %f27;
add.s64 %rd48, %rd2, %rd601;
ld.global.nc.f32 %f28, [%rd48];
mul.rn.f32 %f29, %f1, %f28;
mul.rn.f32 %f30, %f29, %f24;
add.s64 %rd49, %rd3, %rd601;
ld.global.nc.f32 %f31, [%rd49];
add.s64 %rd605, %rd4, %rd603;
ld.global.nc.f32 %f32, [%rd605];
mul.rn.f32 %f2, %f32, 0f3A800000;
mul.rn.f32 %f33, %f29, %f2;
sub.rn.f32 %f34, %f31, %f33;
add.rn.f32 %f35, %f30, %f34;
add.rn.f32 %f36, %f35, %f23;
add.s64 %rd606, %rd6, %rd603;
ld.global.nc.f32 %f37, [%rd606];
mul.rn.f32 %f3, %f37, 0f3A800000;
sub.rn.f32 %f38, %f36, %f3;
mul.rn.f32 %f39, %f38, %f38;
add.rn.f32 %f4, %f39, 0f00000000;
or.b32 %r63, %r3, 1;
and.b32 %r64, %r63, 3;
setp.ne.s32 %p5, %r64, 1;
@%p5 bra LBB80_7;
mul.lo.s64 %rd2469, %rd2387, 3528531795;
selp.u64 %rd647, 1, 0, %p69;
add.s64 %rd648, %rd2464, %rd647;
xor.b64 %rd649, %rd648, %rd2469;
shr.u64 %rd650, %rd649, 32;
mul.lo.s64 %rd2472, %rd650, 3449720151;
shr.u64 %rd651, %rd2472, 32;
and.b64 %rd652, %rd648, 4294967295;
mul.lo.s64 %rd653, %rd652, 3449720151;
and.b64 %rd654, %rd653, 4294967295;
xor.b64 %rd655, %rd654, %rd651;
xor.b64 %rd656, %rd655, 2654435769;
mul.lo.s64 %rd2475, %rd656, 3528531795;
xor.b64 %rd2465, %rd653, %rd13;
mov.u32 %r314, -845247145;
mov.u32 %r313, -616729560;
mov.u64 %rd2482, 3041712726;
mov.u64 %rd2481, 1401181199;
mov.u64 %rd2480, 2835769497;
mov.u64 %rd2479, 1684936478;
mov.u64 %rd2478, 2027808484;
mov.u64 %rd2477, 387276957;
mov.u64 %rd2476, 842468239;
mov.u64 %rd2474, 3986602516;
mov.u64 %rd2473, 1013904242;
mov.u64 %rd2471, 3668340011;
mov.u64 %rd2470, 3144134277;
mov.u64 %rd2468, 3449720151;
mov.u64 %rd2467, 1993301258;
mov.u64 %rd2466, 3528531795;
bra.uni LBB80_8;
LBB80_7:
mov.u32 %r313, -239350328;
selp.u64 %rd621, 1, 0, %p69;
add.s64 %rd622, %rd2464, %rd621;
and.b64 %rd623, %rd622, 4294967295;
mul.lo.s64 %rd2469, %rd623, 3449720151;
xor.b64 %rd624, %rd2469, %rd13;
shr.u64 %rd625, %rd624, 32;
mul.lo.s64 %rd2472, %rd625, 3528531795;
shr.u64 %rd626, %rd2472, 32;
mul.lo.s64 %rd628, %rd2387, 3528531795;
and.b64 %rd629, %rd628, 4294967295;
xor.b64 %rd630, %rd629, %rd626;
xor.b64 %rd631, %rd630, 3144134277;
mul.lo.s64 %rd2475, %rd631, 3449720151;
xor.b64 %rd2465, %rd622, %rd628;
mov.u32 %r314, -766435501;
mov.u64 %rd2482, 1684936478;
mov.u64 %rd2481, 534103459;
mov.u64 %rd2480, 387276957;
mov.u64 %rd2479, 3041712726;
mov.u64 %rd2478, 3986602516;
mov.u64 %rd2477, 2835769497;
mov.u64 %rd2476, 3668340011;
mov.u64 %rd2474, 2027808484;
mov.u64 %rd2473, 1993301258;
mov.u64 %rd2471, 842468239;
mov.u64 %rd2470, 2654435769;
mov.u64 %rd2468, 3528531795;
mov.u64 %rd2467, 1013904242;
mov.u64 %rd2466, 3449720151;
LBB80_8:
setp.ne.s32 %p8, %r5, 0;
shr.u64 %rd657, %rd2475, 32;
shr.u64 %rd658, %rd2465, 32;
mul.lo.s64 %rd659, %rd658, %rd2466;
and.b64 %rd660, %rd659, 4294967295;
xor.b64 %rd661, %rd660, %rd657;
xor.b64 %rd662, %rd661, %rd2467;
mul.lo.s64 %rd663, %rd662, %rd2468;
shr.u64 %rd664, %rd663, 32;
shr.u64 %rd665, %rd659, 32;
and.b64 %rd666, %rd2469, 4294967295;
xor.b64 %rd667, %rd666, %rd665;
xor.b64 %rd668, %rd667, %rd2470;
mul.lo.s64 %rd669, %rd668, %rd2468;
and.b64 %rd670, %rd669, 4294967295;
xor.b64 %rd671, %rd670, %rd664;
xor.b64 %rd672, %rd671, %rd2471;
mul.lo.s64 %rd673, %rd672, %rd2466;
shr.u64 %rd674, %rd673, 32;
shr.u64 %rd675, %rd669, 32;
and.b64 %rd676, %rd2472, 4294967295;
xor.b64 %rd677, %rd676, %rd675;
xor.b64 %rd678, %rd677, %rd2473;
mul.lo.s64 %rd679, %rd678, %rd2466;
and.b64 %rd680, %rd679, 4294967295;
xor.b64 %rd681, %rd680, %rd674;
xor.b64 %rd682, %rd681, %rd2474;
mul.lo.s64 %rd683, %rd682, %rd2468;
shr.u64 %rd684, %rd683, 32;
shr.u64 %rd685, %rd679, 32;
and.b64 %rd686, %rd2475, 4294967295;
xor.b64 %rd687, %rd686, %rd685;
xor.b64 %rd688, %rd687, %rd2476;
mul.lo.s64 %rd689, %rd688, %rd2468;
and.b64 %rd690, %rd689, 4294967295;
xor.b64 %rd691, %rd690, %rd684;
xor.b64 %rd692, %rd691, %rd2477;
mul.lo.s64 %rd693, %rd692, %rd2466;
shr.u64 %rd694, %rd693, 32;
shr.u64 %rd695, %rd689, 32;
and.b64 %rd696, %rd663, 4294967295;
xor.b64 %rd697, %rd696, %rd695;
xor.b64 %rd698, %rd697, %rd2478;
mul.lo.s64 %rd699, %rd698, %rd2466;
and.b64 %rd700, %rd699, 4294967295;
xor.b64 %rd701, %rd700, %rd694;
xor.b64 %rd702, %rd701, %rd2479;
mul.lo.s64 %rd703, %rd702, %rd2468;
shr.u64 %rd704, %rd703, 32;
shr.u64 %rd705, %rd699, 32;
and.b64 %rd706, %rd673, 4294967295;
xor.b64 %rd707, %rd706, %rd705;
xor.b64 %rd708, %rd707, %rd2480;
mul.lo.s64 %rd709, %rd708, %rd2468;
and.b64 %rd710, %rd709, 4294967295;
xor.b64 %rd711, %rd710, %rd704;
xor.b64 %rd712, %rd711, %rd2481;
mul.lo.s64 %rd713, %rd712, %rd2466;
shr.u64 %rd714, %rd713, 32;
shr.u64 %rd715, %rd709, 32;
xor.b64 %rd716, %rd683, %rd715;
xor.b64 %rd717, %rd716, %rd2482;
mul.lo.s64 %rd718, %rd717, %rd2466;
xor.b64 %rd719, %rd714, %rd718;
cvt.u32.u64 %r69, %rd719;
xor.b32 %r70, %r313, %r69;
mul.lo.s32 %r71, %r70, %r314;
shr.u32 %r72, %r71, 9;
cvt.rn.f32.u32 %f40, %r72;
mul.rn.f32 %f41, %f40, 0f34000000;
cvt.rn.f16.f32 %h10, %f41;
mov.b16 %h11, 0x2E66;
setp.ge.f16 %p9, %h10, %h11;
ld.global.nc.b16 %h12, [%rd45+2];
ld.global.nc.f32 %f42, [%rd46+4];
cvt.rn.f16.f32 %h13, %f42;
add.rn.f16 %h14, %h12, %h13;
mov.b16 %h15, 0x3C72;
mul.rn.f16 %h16, %h14, %h15;
selp.b16 %h17, %h16, 0x0000, %p9;
cvt.f32.f16 %f43, %h17;
ld.global.nc.b16 %h18, [%rd47+2];
cvt.f32.f16 %f44, %h18;
ld.global.nc.f32 %f45, [%rd48+4];
mul.rn.f32 %f46, %f1, %f45;
mul.rn.f32 %f47, %f46, %f44;
ld.global.nc.f32 %f48, [%rd49+4];
mul.rn.f32 %f49, %f2, %f46;
sub.rn.f32 %f50, %f48, %f49;
add.rn.f32 %f51, %f47, %f50;
add.rn.f32 %f52, %f51, %f43;
sub.rn.f32 %f53, %f52, %f3;
mul.rn.f32 %f54, %f53, %f53;
add.rn.f32 %f5, %f4, %f54;
or.b32 %r73, %r3, %r4;
or.b32 %r74, %r73, 128;
shr.u32 %r75, %r74, 2;
cvt.u64.u32 %rd720, %r75;
add.s64 %rd76, %rd12, %rd720;
and.b64 %rd2436, %rd76, 4294967295;
setp.lt.u64 %p74, %rd76, %rd12;
@%p8 bra LBB80_10;
mul.lo.s64 %rd2487, %rd2436, 3528531795;
selp.u64 %rd763, 1, 0, %p74;
add.s64 %rd764, %rd2464, %rd763;
xor.b64 %rd765, %rd764, %rd2487;
shr.u64 %rd766, %rd765, 32;
mul.lo.s64 %rd2490, %rd766, 3449720151;
shr.u64 %rd767, %rd2490, 32;
and.b64 %rd768, %rd764, 4294967295;
mul.lo.s64 %rd769, %rd768, 3449720151;
and.b64 %rd770, %rd769, 4294967295;
xor.b64 %rd771, %rd770, %rd767;
xor.b64 %rd772, %rd771, 2654435769;
mul.lo.s64 %rd2493, %rd772, 3528531795;
xor.b64 %rd2483, %rd769, %rd76;
mov.u32 %r317, -1879881855;
mov.u32 %r316, -845247145;
mov.u32 %r315, 534103459;
mov.u64 %rd2501, 3678237736;
mov.u64 %rd2500, 3041712726;
mov.u64 %rd2499, 1401181199;
mov.u64 %rd2498, 2835769497;
mov.u64 %rd2497, 1684936478;
mov.u64 %rd2496, 2027808484;
mov.u64 %rd2495, 387276957;
mov.u64 %rd2494, 842468239;
mov.u64 %rd2492, 3986602516;
mov.u64 %rd2491, 1013904242;
mov.u64 %rd2489, 3668340011;
mov.u64 %rd2488, 3144134277;
mov.u64 %rd2486, 3449720151;
mov.u64 %rd2485, 1993301258;
mov.u64 %rd2484, 3528531795;
bra.uni LBB80_11;
LBB80_10:
selp.u64 %rd736, 1, 0, %p74;
add.s64 %rd737, %rd2464, %rd736;
and.b64 %rd738, %rd737, 4294967295;
mul.lo.s64 %rd2487, %rd738, 3449720151;
xor.b64 %rd739, %rd2487, %rd76;
shr.u64 %rd740, %rd739, 32;
mul.lo.s64 %rd2490, %rd740, 3528531795;
shr.u64 %rd741, %rd2490, 32;
mul.lo.s64 %rd743, %rd2436, 3528531795;
and.b64 %rd744, %rd743, 4294967295;
xor.b64 %rd745, %rd744, %rd741;
xor.b64 %rd746, %rd745, 3144134277;
mul.lo.s64 %rd2493, %rd746, 3449720151;
xor.b64 %rd2483, %rd737, %rd743;
mov.u32 %r317, -1767562579;
mov.u32 %r316, -766435501;
mov.u32 %r315, 1401181199;
mov.u64 %rd2501, 4055616968;
mov.u64 %rd2500, 1684936478;
mov.u64 %rd2499, 534103459;
mov.u64 %rd2498, 387276957;
mov.u64 %rd2497, 3041712726;
mov.u64 %rd2496, 3986602516;
mov.u64 %rd2495, 2835769497;
mov.u64 %rd2494, 3668340011;
mov.u64 %rd2492, 2027808484;
mov.u64 %rd2491, 1993301258;
mov.u64 %rd2489, 842468239;
mov.u64 %rd2488, 2654435769;
mov.u64 %rd2486, 3528531795;
mov.u64 %rd2485, 1013904242;
mov.u64 %rd2484, 3449720151;
LBB80_11:
shr.u64 %rd773, %rd2493, 32;
shr.u64 %rd774, %rd2483, 32;
mul.lo.s64 %rd775, %rd774, %rd2484;
and.b64 %rd776, %rd775, 4294967295;
xor.b64 %rd777, %rd776, %rd773;
xor.b64 %rd778, %rd777, %rd2485;
mul.lo.s64 %rd779, %rd778, %rd2486;
shr.u64 %rd780, %rd779, 32;
shr.u64 %rd781, %rd775, 32;
and.b64 %rd782, %rd2487, 4294967295;
xor.b64 %rd783, %rd782, %rd781;
xor.b64 %rd784, %rd783, %rd2488;
mul.lo.s64 %rd785, %rd784, %rd2486;
and.b64 %rd786, %rd785, 4294967295;
xor.b64 %rd787, %rd786, %rd780;
xor.b64 %rd788, %rd787, %rd2489;
mul.lo.s64 %rd789, %rd788, %rd2484;
shr.u64 %rd790, %rd789, 32;
shr.u64 %rd791, %rd785, 32;
and.b64 %rd792, %rd2490, 4294967295;
xor.b64 %rd793, %rd792, %rd791;
xor.b64 %rd794, %rd793, %rd2491;
mul.lo.s64 %rd795, %rd794, %rd2484;
and.b64 %rd796, %rd795, 4294967295;
xor.b64 %rd797, %rd796, %rd790;
xor.b64 %rd798, %rd797, %rd2492;
mul.lo.s64 %rd799, %rd798, %rd2486;
shr.u64 %rd800, %rd799, 32;
shr.u64 %rd801, %rd795, 32;
and.b64 %rd802, %rd2493, 4294967295;
xor.b64 %rd803, %rd802, %rd801;
xor.b64 %rd804, %rd803, %rd2494;
mul.lo.s64 %rd805, %rd804, %rd2486;
and.b64 %rd806, %rd805, 4294967295;
xor.b64 %rd807, %rd806, %rd800;
xor.b64 %rd808, %rd807, %rd2495;
mul.lo.s64 %rd809, %rd808, %rd2484;
shr.u64 %rd810, %rd809, 32;
shr.u64 %rd811, %rd805, 32;
and.b64 %rd812, %rd779, 4294967295;
xor.b64 %rd813, %rd812, %rd811;
xor.b64 %rd814, %rd813, %rd2496;
mul.lo.s64 %rd815, %rd814, %rd2484;
and.b64 %rd816, %rd815, 4294967295;
xor.b64 %rd817, %rd816, %rd810;
xor.b64 %rd818, %rd817, %rd2497;
mul.lo.s64 %rd819, %rd818, %rd2486;
shr.u64 %rd820, %rd819, 32;
shr.u64 %rd821, %rd815, 32;
and.b64 %rd822, %rd789, 4294967295;
xor.b64 %rd823, %rd822, %rd821;
xor.b64 %rd824, %rd823, %rd2498;
mul.lo.s64 %rd825, %rd824, %rd2486;
and.b64 %rd826, %rd825, 4294967295;
xor.b64 %rd827, %rd826, %rd820;
xor.b64 %rd828, %rd827, %rd2499;
mul.lo.s64 %rd829, %rd828, %rd2484;
shr.u64 %rd830, %rd829, 32;
shr.u64 %rd831, %rd825, 32;
and.b64 %rd832, %rd799, 4294967295;
xor.b64 %rd833, %rd832, %rd831;
xor.b64 %rd834, %rd833, %rd2500;
mul.lo.s64 %rd835, %rd834, %rd2484;
and.b64 %rd836, %rd835, 4294967295;
xor.b64 %rd837, %rd836, %rd830;
xor.b64 %rd838, %rd837, %rd2501;
mul.lo.s64 %rd839, %rd838, %rd2486;
shr.u64 %rd840, %rd839, 32;
cvt.u32.u64 %r82, %rd840;
shr.u64 %rd841, %rd835, 32;
xor.b64 %rd842, %rd841, %rd809;
cvt.u32.u64 %r83, %rd842;
xor.b32 %r84, %r315, %r83;
mul.lo.s32 %r85, %r84, %r316;
xor.b32 %r86, %r85, %r82;
xor.b32 %r87, %r86, %r317;
shr.u32 %r88, %r87, 9;
cvt.rn.f32.u32 %f55, %r88;
mul.rn.f32 %f56, %f55, 0f34000000;
cvt.rn.f16.f32 %h19, %f56;
mov.b16 %h20, 0x2E66;
setp.ge.f16 %p12, %h19, %h20;
ld.global.nc.b16 %h21, [%rd45+256];
ld.global.nc.f32 %f57, [%rd46+512];
cvt.rn.f16.f32 %h22, %f57;
add.rn.f16 %h23, %h21, %h22;
mov.b16 %h24, 0x3C72;
mul.rn.f16 %h25, %h23, %h24;
selp.b16 %h26, %h25, 0x0000, %p12;
cvt.f32.f16 %f58, %h26;
ld.global.nc.b16 %h27, [%rd47+256];
cvt.f32.f16 %f59, %h27;
ld.global.nc.f32 %f60, [%rd48+512];
mul.rn.f32 %f61, %f1, %f60;
mul.rn.f32 %f62, %f61, %f59;
ld.global.nc.f32 %f63, [%rd49+512];
mul.rn.f32 %f64, %f2, %f61;
sub.rn.f32 %f65, %f63, %f64;
add.rn.f32 %f66, %f62, %f65;
add.rn.f32 %f67, %f66, %f58;
sub.rn.f32 %f68, %f67, %f3;
mul.rn.f32 %f69, %f68, %f68;
add.rn.f32 %f6, %f5, %f69;
or.b32 %r89, %r3, 129;
or.b32 %r90, %r89, %r4;
and.b32 %r91, %r89, 3;
shr.u32 %r92, %r90, 2;
setp.ne.s32 %p13, %r91, 1;
cvt.u64.u32 %rd843, %r92;
add.s64 %rd104, %rd12, %rd843;
and.b64 %rd2433, %rd104, 4294967295;
setp.lt.u64 %p73, %rd104, %rd12;
@%p13 bra LBB80_13;
// Fall-through path of `@%p13 bra LBB80_13` (runs when %p13 is false).
// Computes the initial mixing state (%rd2502, %rd2506, %rd2509, %rd2512) from
// %rd104, its low 32 bits %rd2433, and %rd2464, then loads the constants that
// the shared code at LBB80_14 reads from registers.
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the
// Philox4x32 multipliers and 2654435769 = 0x9E3779B9 is its key increment, so
// this is presumably an inlined Philox-style RNG -- confirm against the
// generating compiler pass.
mul.lo.s64 %rd2506, %rd2433, 3528531795;
// Add 1 to %rd2464 when %p73 is set (%p73 is the unsigned `%rd104 < %rd12`
// test made before the branch, i.e. the 64-bit add that formed %rd104 wrapped).
selp.u64 %rd884, 1, 0, %p73;
add.s64 %rd885, %rd2464, %rd884;
xor.b64 %rd886, %rd885, %rd2506;
shr.u64 %rd887, %rd886, 32;
mul.lo.s64 %rd2509, %rd887, 3449720151;
shr.u64 %rd888, %rd2509, 32;
and.b64 %rd889, %rd885, 4294967295;
mul.lo.s64 %rd890, %rd889, 3449720151;
and.b64 %rd891, %rd890, 4294967295;
xor.b64 %rd892, %rd891, %rd888;
xor.b64 %rd893, %rd892, 2654435769;
mul.lo.s64 %rd2512, %rd893, 3528531795;
xor.b64 %rd2502, %rd890, %rd104;
// Constants for LBB80_14 as used on this path. %rd2503 / %rd2505 hold the two
// multipliers; e.g. 1013904242 = 2*0x9E3779B9 mod 2^32 and
// 1993301258 = 2*0xBB67AE85 mod 2^32.
mov.u32 %r319, -845247145;
mov.u32 %r318, -616729560;
mov.u64 %rd2519, 3041712726;
mov.u64 %rd2518, 1401181199;
mov.u64 %rd2517, 2835769497;
mov.u64 %rd2516, 1684936478;
mov.u64 %rd2515, 2027808484;
mov.u64 %rd2514, 387276957;
mov.u64 %rd2513, 842468239;
mov.u64 %rd2511, 3986602516;
mov.u64 %rd2510, 1013904242;
mov.u64 %rd2508, 3668340011;
mov.u64 %rd2507, 3144134277;
mov.u64 %rd2505, 3449720151;
mov.u64 %rd2504, 1993301258;
mov.u64 %rd2503, 3528531795;
// Skip the alternate setup in LBB80_13.
bra.uni LBB80_14;
LBB80_13:
// Alternate setup, entered through `@%p13 bra LBB80_13` when %p13 is true.
// Produces the same four state registers (%rd2502, %rd2506, %rd2509, %rd2512)
// as the fall-through path, but applies 3449720151 to the %rd2464 side and
// 3528531795 to the %rd104 side, and xors with 3144134277 (0xBB67AE85).
// Falls through into LBB80_14.
// %rd859 = %rd2464 + 1 when %p73 is set, else %rd2464.
selp.u64 %rd858, 1, 0, %p73;
add.s64 %rd859, %rd2464, %rd858;
and.b64 %rd860, %rd859, 4294967295;
mul.lo.s64 %rd2506, %rd860, 3449720151;
xor.b64 %rd861, %rd2506, %rd104;
shr.u64 %rd862, %rd861, 32;
mul.lo.s64 %rd2509, %rd862, 3528531795;
shr.u64 %rd863, %rd2509, 32;
mul.lo.s64 %rd865, %rd2433, 3528531795;
and.b64 %rd866, %rd865, 4294967295;
xor.b64 %rd867, %rd866, %rd863;
xor.b64 %rd868, %rd867, 3144134277;
mul.lo.s64 %rd2512, %rd868, 3449720151;
xor.b64 %rd2502, %rd859, %rd865;
// Constants for LBB80_14 as used on this path; %rd2503 / %rd2505 hold the two
// multipliers in the opposite order from the fall-through path.
mov.u32 %r319, -766435501;
mov.u32 %r318, -239350328;
mov.u64 %rd2519, 1684936478;
mov.u64 %rd2518, 534103459;
mov.u64 %rd2517, 387276957;
mov.u64 %rd2516, 3041712726;
mov.u64 %rd2515, 3986602516;
mov.u64 %rd2514, 2835769497;
mov.u64 %rd2513, 3668340011;
mov.u64 %rd2511, 2027808484;
mov.u64 %rd2510, 1993301258;
mov.u64 %rd2508, 842468239;
mov.u64 %rd2507, 2654435769;
mov.u64 %rd2505, 3528531795;
mov.u64 %rd2504, 1013904242;
mov.u64 %rd2503, 3449720151;
LBB80_14:
// Join point of the two setup paths (fall-through and LBB80_13).
// Runs the remaining mixing steps on %rd2502/%rd2506/%rd2509/%rd2512 with the
// constants the taken path left in the %rd2503-%rd2519 range and %r318/%r319,
// maps the result to a value in [0,1), applies a threshold-and-scale select to
// element 129 of the inputs, and adds one squared term to the running sum
// (%f6 -> %f7). It then prepares the index for the next element and branches.
//
// Each step below combines the high half (shr 32) of one 64-bit product with
// the low half (and 0xFFFFFFFF) of another, xors in a per-step constant, and
// multiplies by %rd2503 or %rd2505.
shr.u64 %rd894, %rd2512, 32;
shr.u64 %rd895, %rd2502, 32;
mul.lo.s64 %rd896, %rd895, %rd2503;
and.b64 %rd897, %rd896, 4294967295;
xor.b64 %rd898, %rd897, %rd894;
xor.b64 %rd899, %rd898, %rd2504;
mul.lo.s64 %rd900, %rd899, %rd2505;
shr.u64 %rd901, %rd900, 32;
shr.u64 %rd902, %rd896, 32;
and.b64 %rd903, %rd2506, 4294967295;
xor.b64 %rd904, %rd903, %rd902;
xor.b64 %rd905, %rd904, %rd2507;
mul.lo.s64 %rd906, %rd905, %rd2505;
and.b64 %rd907, %rd906, 4294967295;
xor.b64 %rd908, %rd907, %rd901;
xor.b64 %rd909, %rd908, %rd2508;
mul.lo.s64 %rd910, %rd909, %rd2503;
shr.u64 %rd911, %rd910, 32;
shr.u64 %rd912, %rd906, 32;
and.b64 %rd913, %rd2509, 4294967295;
xor.b64 %rd914, %rd913, %rd912;
xor.b64 %rd915, %rd914, %rd2510;
mul.lo.s64 %rd916, %rd915, %rd2503;
and.b64 %rd917, %rd916, 4294967295;
xor.b64 %rd918, %rd917, %rd911;
xor.b64 %rd919, %rd918, %rd2511;
mul.lo.s64 %rd920, %rd919, %rd2505;
shr.u64 %rd921, %rd920, 32;
shr.u64 %rd922, %rd916, 32;
and.b64 %rd923, %rd2512, 4294967295;
xor.b64 %rd924, %rd923, %rd922;
xor.b64 %rd925, %rd924, %rd2513;
mul.lo.s64 %rd926, %rd925, %rd2505;
and.b64 %rd927, %rd926, 4294967295;
xor.b64 %rd928, %rd927, %rd921;
xor.b64 %rd929, %rd928, %rd2514;
mul.lo.s64 %rd930, %rd929, %rd2503;
shr.u64 %rd931, %rd930, 32;
shr.u64 %rd932, %rd926, 32;
and.b64 %rd933, %rd900, 4294967295;
xor.b64 %rd934, %rd933, %rd932;
xor.b64 %rd935, %rd934, %rd2515;
mul.lo.s64 %rd936, %rd935, %rd2503;
and.b64 %rd937, %rd936, 4294967295;
xor.b64 %rd938, %rd937, %rd931;
xor.b64 %rd939, %rd938, %rd2516;
mul.lo.s64 %rd940, %rd939, %rd2505;
shr.u64 %rd941, %rd940, 32;
shr.u64 %rd942, %rd936, 32;
and.b64 %rd943, %rd910, 4294967295;
xor.b64 %rd944, %rd943, %rd942;
xor.b64 %rd945, %rd944, %rd2517;
mul.lo.s64 %rd946, %rd945, %rd2505;
and.b64 %rd947, %rd946, 4294967295;
xor.b64 %rd948, %rd947, %rd941;
xor.b64 %rd949, %rd948, %rd2518;
mul.lo.s64 %rd950, %rd949, %rd2503;
shr.u64 %rd951, %rd950, 32;
shr.u64 %rd952, %rd946, 32;
xor.b64 %rd953, %rd920, %rd952;
xor.b64 %rd954, %rd953, %rd2519;
mul.lo.s64 %rd955, %rd954, %rd2503;
xor.b64 %rd956, %rd951, %rd955;
// Truncate to 32 bits, finish with the 32-bit constants %r318 / %r319, and
// keep the top 23 bits (shift right by 9).
cvt.u32.u64 %r97, %rd956;
xor.b32 %r98, %r318, %r97;
mul.lo.s32 %r99, %r98, %r319;
shr.u32 %r100, %r99, 9;
// 23-bit integer times 0f34000000 (2^-23) gives an f32 in [0,1); it is then
// converted to f16 for the comparison.
cvt.rn.f32.u32 %f70, %r100;
mul.rn.f32 %f71, %f70, 0f34000000;
cvt.rn.f16.f32 %h28, %f71;
// %p17 = (random value >= 0x2E66); 0x2E66 is ~0.1 as an f16 bit pattern.
mov.b16 %h29, 0x2E66;
setp.ge.f16 %p17, %h28, %h29;
// %h35 = %p17 ? (f16[%rd45+258] + f16(f32[%rd46+516])) * 0x3C72 : 0.
// Byte offsets 258 (2-byte elements) and 516 (4-byte elements) are both
// element 129 from their base registers. 0x3C72 is ~1.111 as f16.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescaling --
// confirm against the originating HLO.
ld.global.nc.b16 %h30, [%rd45+258];
ld.global.nc.f32 %f72, [%rd46+516];
cvt.rn.f16.f32 %h31, %f72;
add.rn.f16 %h32, %h30, %h31;
mov.b16 %h33, 0x3C72;
mul.rn.f16 %h34, %h32, %h33;
selp.b16 %h35, %h34, 0x0000, %p17;
cvt.f32.f16 %f73, %h35;
// %f82 = (%f1*%f75)*%f74 + (%f78 - %f2*(%f1*%f75)) + %f73, with %f74, %f75,
// %f78 loaded from element 129 of %rd47 / %rd48 / %rd49.
ld.global.nc.b16 %h36, [%rd47+258];
cvt.f32.f16 %f74, %h36;
ld.global.nc.f32 %f75, [%rd48+516];
mul.rn.f32 %f76, %f1, %f75;
mul.rn.f32 %f77, %f76, %f74;
ld.global.nc.f32 %f78, [%rd49+516];
mul.rn.f32 %f79, %f2, %f76;
sub.rn.f32 %f80, %f78, %f79;
add.rn.f32 %f81, %f77, %f80;
add.rn.f32 %f82, %f81, %f73;
// Accumulate (%f82 - %f3)^2 into the running sum: %f7 = %f6 + square.
sub.rn.f32 %f83, %f82, %f3;
mul.rn.f32 %f84, %f83, %f83;
add.rn.f32 %f7, %f6, %f84;
// Set up the next element: %rd131 = %rd12 + ((%r73 | 256) >> 2), with its low
// 32 bits in %rd2429 and the unsigned wrap test (%rd131 < %rd12) in %p72.
or.b32 %r102, %r73, 256;
shr.u32 %r103, %r102, 2;
cvt.u64.u32 %rd957, %r103;
add.s64 %rd131, %rd12, %rd957;
and.b64 %rd2429, %rd131, 4294967295;
setp.lt.u64 %p72, %rd131, %rd12;
// %p8 is defined earlier in the kernel and is not recomputed here.
@%p8 bra LBB80_16;
// Fall-through path of `@%p8 bra LBB80_16` (runs when %p8 is false).
// Computes the initial mixing state (%rd2520, %rd2524, %rd2527, %rd2530) from
// %rd131, its low 32 bits %rd2429, and %rd2464, then loads the constants that
// the shared code at LBB80_17 reads from registers.
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the
// Philox4x32 multipliers and 2654435769 = 0x9E3779B9 is its key increment, so
// this is presumably an inlined Philox-style RNG -- confirm against the
// generating compiler pass.
mul.lo.s64 %rd2524, %rd2429, 3528531795;
// Add 1 to %rd2464 when %p72 is set (%p72 is the unsigned `%rd131 < %rd12`
// test made before the branch).
selp.u64 %rd1000, 1, 0, %p72;
add.s64 %rd1001, %rd2464, %rd1000;
xor.b64 %rd1002, %rd1001, %rd2524;
shr.u64 %rd1003, %rd1002, 32;
mul.lo.s64 %rd2527, %rd1003, 3449720151;
shr.u64 %rd1004, %rd2527, 32;
and.b64 %rd1005, %rd1001, 4294967295;
mul.lo.s64 %rd1006, %rd1005, 3449720151;
and.b64 %rd1007, %rd1006, 4294967295;
xor.b64 %rd1008, %rd1007, %rd1004;
xor.b64 %rd1009, %rd1008, 2654435769;
mul.lo.s64 %rd2530, %rd1009, 3528531795;
xor.b64 %rd2520, %rd1006, %rd131;
// Constants for LBB80_17 as used on this path: three 32-bit values
// (%r320-%r322) and the 64-bit values in the %rd2521-%rd2538 range.
// %rd2521 / %rd2523 hold the two multipliers.
mov.u32 %r322, -1879881855;
mov.u32 %r321, -845247145;
mov.u32 %r320, 534103459;
mov.u64 %rd2538, 3678237736;
mov.u64 %rd2537, 3041712726;
mov.u64 %rd2536, 1401181199;
mov.u64 %rd2535, 2835769497;
mov.u64 %rd2534, 1684936478;
mov.u64 %rd2533, 2027808484;
mov.u64 %rd2532, 387276957;
mov.u64 %rd2531, 842468239;
mov.u64 %rd2529, 3986602516;
mov.u64 %rd2528, 1013904242;
mov.u64 %rd2526, 3668340011;
mov.u64 %rd2525, 3144134277;
mov.u64 %rd2523, 3449720151;
mov.u64 %rd2522, 1993301258;
mov.u64 %rd2521, 3528531795;
// Skip the alternate setup in LBB80_16.
bra.uni LBB80_17;
LBB80_16:
// Alternate setup, entered through `@%p8 bra LBB80_16` when %p8 is true.
// Produces the same four state registers (%rd2520, %rd2524, %rd2527, %rd2530)
// as the fall-through path, but applies 3449720151 to the %rd2464 side and
// 3528531795 to the %rd131 side, and xors with 3144134277 (0xBB67AE85).
// Falls through into LBB80_17.
// %rd974 = %rd2464 + 1 when %p72 is set, else %rd2464.
selp.u64 %rd973, 1, 0, %p72;
add.s64 %rd974, %rd2464, %rd973;
and.b64 %rd975, %rd974, 4294967295;
mul.lo.s64 %rd2524, %rd975, 3449720151;
xor.b64 %rd976, %rd2524, %rd131;
shr.u64 %rd977, %rd976, 32;
mul.lo.s64 %rd2527, %rd977, 3528531795;
shr.u64 %rd978, %rd2527, 32;
mul.lo.s64 %rd980, %rd2429, 3528531795;
and.b64 %rd981, %rd980, 4294967295;
xor.b64 %rd982, %rd981, %rd978;
xor.b64 %rd983, %rd982, 3144134277;
mul.lo.s64 %rd2530, %rd983, 3449720151;
xor.b64 %rd2520, %rd974, %rd980;
// Constants for LBB80_17 as used on this path; %rd2521 / %rd2523 hold the two
// multipliers in the opposite order from the fall-through path.
mov.u32 %r322, -1767562579;
mov.u32 %r321, -766435501;
mov.u32 %r320, 1401181199;
mov.u64 %rd2538, 4055616968;
mov.u64 %rd2537, 1684936478;
mov.u64 %rd2536, 534103459;
mov.u64 %rd2535, 387276957;
mov.u64 %rd2534, 3041712726;
mov.u64 %rd2533, 3986602516;
mov.u64 %rd2532, 2835769497;
mov.u64 %rd2531, 3668340011;
mov.u64 %rd2529, 2027808484;
mov.u64 %rd2528, 1993301258;
mov.u64 %rd2526, 842468239;
mov.u64 %rd2525, 2654435769;
mov.u64 %rd2523, 3528531795;
mov.u64 %rd2522, 1013904242;
mov.u64 %rd2521, 3449720151;
LBB80_17:
// Join point of the two setup paths (fall-through and LBB80_16).
// Runs the remaining mixing steps on %rd2520/%rd2524/%rd2527/%rd2530 with the
// constants the taken path left in the %rd2521-%rd2538 range and %r320-%r322,
// maps the result to a value in [0,1), applies a threshold-and-scale select to
// element 256 of the inputs, and adds one squared term to the running sum
// (%f7 -> %f8). It then prepares the index for the next element and branches.
//
// Each step below combines the high half (shr 32) of one 64-bit product with
// the low half (and 0xFFFFFFFF) of another, xors in a per-step constant, and
// multiplies by %rd2521 or %rd2523.
shr.u64 %rd1010, %rd2530, 32;
shr.u64 %rd1011, %rd2520, 32;
mul.lo.s64 %rd1012, %rd1011, %rd2521;
and.b64 %rd1013, %rd1012, 4294967295;
xor.b64 %rd1014, %rd1013, %rd1010;
xor.b64 %rd1015, %rd1014, %rd2522;
mul.lo.s64 %rd1016, %rd1015, %rd2523;
shr.u64 %rd1017, %rd1016, 32;
shr.u64 %rd1018, %rd1012, 32;
and.b64 %rd1019, %rd2524, 4294967295;
xor.b64 %rd1020, %rd1019, %rd1018;
xor.b64 %rd1021, %rd1020, %rd2525;
mul.lo.s64 %rd1022, %rd1021, %rd2523;
and.b64 %rd1023, %rd1022, 4294967295;
xor.b64 %rd1024, %rd1023, %rd1017;
xor.b64 %rd1025, %rd1024, %rd2526;
mul.lo.s64 %rd1026, %rd1025, %rd2521;
shr.u64 %rd1027, %rd1026, 32;
shr.u64 %rd1028, %rd1022, 32;
and.b64 %rd1029, %rd2527, 4294967295;
xor.b64 %rd1030, %rd1029, %rd1028;
xor.b64 %rd1031, %rd1030, %rd2528;
mul.lo.s64 %rd1032, %rd1031, %rd2521;
and.b64 %rd1033, %rd1032, 4294967295;
xor.b64 %rd1034, %rd1033, %rd1027;
xor.b64 %rd1035, %rd1034, %rd2529;
mul.lo.s64 %rd1036, %rd1035, %rd2523;
shr.u64 %rd1037, %rd1036, 32;
shr.u64 %rd1038, %rd1032, 32;
and.b64 %rd1039, %rd2530, 4294967295;
xor.b64 %rd1040, %rd1039, %rd1038;
xor.b64 %rd1041, %rd1040, %rd2531;
mul.lo.s64 %rd1042, %rd1041, %rd2523;
and.b64 %rd1043, %rd1042, 4294967295;
xor.b64 %rd1044, %rd1043, %rd1037;
xor.b64 %rd1045, %rd1044, %rd2532;
mul.lo.s64 %rd1046, %rd1045, %rd2521;
shr.u64 %rd1047, %rd1046, 32;
shr.u64 %rd1048, %rd1042, 32;
and.b64 %rd1049, %rd1016, 4294967295;
xor.b64 %rd1050, %rd1049, %rd1048;
xor.b64 %rd1051, %rd1050, %rd2533;
mul.lo.s64 %rd1052, %rd1051, %rd2521;
and.b64 %rd1053, %rd1052, 4294967295;
xor.b64 %rd1054, %rd1053, %rd1047;
xor.b64 %rd1055, %rd1054, %rd2534;
mul.lo.s64 %rd1056, %rd1055, %rd2523;
shr.u64 %rd1057, %rd1056, 32;
shr.u64 %rd1058, %rd1052, 32;
and.b64 %rd1059, %rd1026, 4294967295;
xor.b64 %rd1060, %rd1059, %rd1058;
xor.b64 %rd1061, %rd1060, %rd2535;
mul.lo.s64 %rd1062, %rd1061, %rd2523;
and.b64 %rd1063, %rd1062, 4294967295;
xor.b64 %rd1064, %rd1063, %rd1057;
xor.b64 %rd1065, %rd1064, %rd2536;
mul.lo.s64 %rd1066, %rd1065, %rd2521;
shr.u64 %rd1067, %rd1066, 32;
shr.u64 %rd1068, %rd1062, 32;
and.b64 %rd1069, %rd1036, 4294967295;
xor.b64 %rd1070, %rd1069, %rd1068;
xor.b64 %rd1071, %rd1070, %rd2537;
mul.lo.s64 %rd1072, %rd1071, %rd2521;
and.b64 %rd1073, %rd1072, 4294967295;
xor.b64 %rd1074, %rd1073, %rd1067;
xor.b64 %rd1075, %rd1074, %rd2538;
mul.lo.s64 %rd1076, %rd1075, %rd2523;
// Final 32-bit combine: %r110 is the high half of the last product; %r111 is
// the low 32 bits of (%rd1072 >> 32) ^ %rd1046. They are mixed with
// %r320-%r322 and the top 23 bits are kept (shift right by 9).
shr.u64 %rd1077, %rd1076, 32;
cvt.u32.u64 %r110, %rd1077;
shr.u64 %rd1078, %rd1072, 32;
xor.b64 %rd1079, %rd1078, %rd1046;
cvt.u32.u64 %r111, %rd1079;
xor.b32 %r112, %r320, %r111;
mul.lo.s32 %r113, %r112, %r321;
xor.b32 %r114, %r113, %r110;
xor.b32 %r115, %r114, %r322;
shr.u32 %r116, %r115, 9;
// 23-bit integer times 0f34000000 (2^-23) gives an f32 in [0,1); it is then
// converted to f16 for the comparison.
cvt.rn.f32.u32 %f85, %r116;
mul.rn.f32 %f86, %f85, 0f34000000;
cvt.rn.f16.f32 %h37, %f86;
// %p20 = (random value >= 0x2E66); 0x2E66 is ~0.1 as an f16 bit pattern.
mov.b16 %h38, 0x2E66;
setp.ge.f16 %p20, %h37, %h38;
// %h44 = %p20 ? (f16[%rd45+512] + f16(f32[%rd46+1024])) * 0x3C72 : 0.
// Byte offsets 512 (2-byte elements) and 1024 (4-byte elements) are both
// element 256 from their base registers. 0x3C72 is ~1.111 as f16.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescaling --
// confirm against the originating HLO.
ld.global.nc.b16 %h39, [%rd45+512];
ld.global.nc.f32 %f87, [%rd46+1024];
cvt.rn.f16.f32 %h40, %f87;
add.rn.f16 %h41, %h39, %h40;
mov.b16 %h42, 0x3C72;
mul.rn.f16 %h43, %h41, %h42;
selp.b16 %h44, %h43, 0x0000, %p20;
cvt.f32.f16 %f88, %h44;
// %f97 = (%f1*%f90)*%f89 + (%f93 - %f2*(%f1*%f90)) + %f88, with %f89, %f90,
// %f93 loaded from element 256 of %rd47 / %rd48 / %rd49.
ld.global.nc.b16 %h45, [%rd47+512];
cvt.f32.f16 %f89, %h45;
ld.global.nc.f32 %f90, [%rd48+1024];
mul.rn.f32 %f91, %f1, %f90;
mul.rn.f32 %f92, %f91, %f89;
ld.global.nc.f32 %f93, [%rd49+1024];
mul.rn.f32 %f94, %f2, %f91;
sub.rn.f32 %f95, %f93, %f94;
add.rn.f32 %f96, %f92, %f95;
add.rn.f32 %f97, %f96, %f88;
// Accumulate (%f97 - %f3)^2 into the running sum: %f8 = %f7 + square.
sub.rn.f32 %f98, %f97, %f3;
mul.rn.f32 %f99, %f98, %f98;
add.rn.f32 %f8, %f7, %f99;
// Set up the next element: index bits (%r3 | 257), combined with %r4 and
// shifted right by 2, are added to %rd12 to form %rd159 (low 32 bits in
// %rd2426, unsigned wrap test in %p71). %p21 = ((%r3 | 257) & 3) != 1 selects
// which setup path runs next.
or.b32 %r117, %r3, 257;
or.b32 %r118, %r117, %r4;
and.b32 %r119, %r117, 3;
shr.u32 %r120, %r118, 2;
setp.ne.s32 %p21, %r119, 1;
cvt.u64.u32 %rd1080, %r120;
add.s64 %rd159, %rd12, %rd1080;
and.b64 %rd2426, %rd159, 4294967295;
setp.lt.u64 %p71, %rd159, %rd12;
@%p21 bra LBB80_19;
// Fall-through path of `@%p21 bra LBB80_19` (runs when %p21 is false).
// Computes the initial mixing state (%rd2539, %rd2543, %rd2546, %rd2549) from
// %rd159, its low 32 bits %rd2426, and %rd2464, then loads the constants that
// the shared code at LBB80_20 reads from registers.
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the
// Philox4x32 multipliers and 2654435769 = 0x9E3779B9 is its key increment, so
// this is presumably an inlined Philox-style RNG -- confirm against the
// generating compiler pass.
mul.lo.s64 %rd2543, %rd2426, 3528531795;
// Add 1 to %rd2464 when %p71 is set (%p71 is the unsigned `%rd159 < %rd12`
// test made before the branch).
selp.u64 %rd1121, 1, 0, %p71;
add.s64 %rd1122, %rd2464, %rd1121;
xor.b64 %rd1123, %rd1122, %rd2543;
shr.u64 %rd1124, %rd1123, 32;
mul.lo.s64 %rd2546, %rd1124, 3449720151;
shr.u64 %rd1125, %rd2546, 32;
and.b64 %rd1126, %rd1122, 4294967295;
mul.lo.s64 %rd1127, %rd1126, 3449720151;
and.b64 %rd1128, %rd1127, 4294967295;
xor.b64 %rd1129, %rd1128, %rd1125;
xor.b64 %rd1130, %rd1129, 2654435769;
mul.lo.s64 %rd2549, %rd1130, 3528531795;
xor.b64 %rd2539, %rd1127, %rd159;
// Constants for LBB80_20 as used on this path. %rd2540 / %rd2542 hold the two
// multipliers.
mov.u32 %r324, -845247145;
mov.u32 %r323, -616729560;
mov.u64 %rd2556, 3041712726;
mov.u64 %rd2555, 1401181199;
mov.u64 %rd2554, 2835769497;
mov.u64 %rd2553, 1684936478;
mov.u64 %rd2552, 2027808484;
mov.u64 %rd2551, 387276957;
mov.u64 %rd2550, 842468239;
mov.u64 %rd2548, 3986602516;
mov.u64 %rd2547, 1013904242;
mov.u64 %rd2545, 3668340011;
mov.u64 %rd2544, 3144134277;
mov.u64 %rd2542, 3449720151;
mov.u64 %rd2541, 1993301258;
mov.u64 %rd2540, 3528531795;
// Skip the alternate setup in LBB80_19.
bra.uni LBB80_20;
LBB80_19:
// Alternate setup, entered through `@%p21 bra LBB80_19` when %p21 is true.
// Produces the same four state registers (%rd2539, %rd2543, %rd2546, %rd2549)
// as the fall-through path, but applies 3449720151 to the %rd2464 side and
// 3528531795 to the %rd159 side, and xors with 3144134277 (0xBB67AE85).
// Falls through into LBB80_20.
// %rd1096 = %rd2464 + 1 when %p71 is set, else %rd2464.
selp.u64 %rd1095, 1, 0, %p71;
add.s64 %rd1096, %rd2464, %rd1095;
and.b64 %rd1097, %rd1096, 4294967295;
mul.lo.s64 %rd2543, %rd1097, 3449720151;
xor.b64 %rd1098, %rd2543, %rd159;
shr.u64 %rd1099, %rd1098, 32;
mul.lo.s64 %rd2546, %rd1099, 3528531795;
shr.u64 %rd1100, %rd2546, 32;
mul.lo.s64 %rd1102, %rd2426, 3528531795;
and.b64 %rd1103, %rd1102, 4294967295;
xor.b64 %rd1104, %rd1103, %rd1100;
xor.b64 %rd1105, %rd1104, 3144134277;
mul.lo.s64 %rd2549, %rd1105, 3449720151;
xor.b64 %rd2539, %rd1096, %rd1102;
// Constants for LBB80_20 as used on this path; %rd2540 / %rd2542 hold the two
// multipliers in the opposite order from the fall-through path.
mov.u32 %r324, -766435501;
mov.u32 %r323, -239350328;
mov.u64 %rd2556, 1684936478;
mov.u64 %rd2555, 534103459;
mov.u64 %rd2554, 387276957;
mov.u64 %rd2553, 3041712726;
mov.u64 %rd2552, 3986602516;
mov.u64 %rd2551, 2835769497;
mov.u64 %rd2550, 3668340011;
mov.u64 %rd2548, 2027808484;
mov.u64 %rd2547, 1993301258;
mov.u64 %rd2545, 842468239;
mov.u64 %rd2544, 2654435769;
mov.u64 %rd2542, 3528531795;
mov.u64 %rd2541, 1013904242;
mov.u64 %rd2540, 3449720151;
LBB80_20:
// Join point of the two setup paths (fall-through and LBB80_19).
// Runs the remaining mixing steps on %rd2539/%rd2543/%rd2546/%rd2549 with the
// constants the taken path left in the %rd2540-%rd2556 range and %r323/%r324,
// maps the result to a value in [0,1), applies a threshold-and-scale select to
// element 257 of the inputs, and adds one squared term to the running sum
// (%f8 -> %f9). It then prepares the index for the next element and branches.
//
// Each step below combines the high half (shr 32) of one 64-bit product with
// the low half (and 0xFFFFFFFF) of another, xors in a per-step constant, and
// multiplies by %rd2540 or %rd2542.
shr.u64 %rd1131, %rd2549, 32;
shr.u64 %rd1132, %rd2539, 32;
mul.lo.s64 %rd1133, %rd1132, %rd2540;
and.b64 %rd1134, %rd1133, 4294967295;
xor.b64 %rd1135, %rd1134, %rd1131;
xor.b64 %rd1136, %rd1135, %rd2541;
mul.lo.s64 %rd1137, %rd1136, %rd2542;
shr.u64 %rd1138, %rd1137, 32;
shr.u64 %rd1139, %rd1133, 32;
and.b64 %rd1140, %rd2543, 4294967295;
xor.b64 %rd1141, %rd1140, %rd1139;
xor.b64 %rd1142, %rd1141, %rd2544;
mul.lo.s64 %rd1143, %rd1142, %rd2542;
and.b64 %rd1144, %rd1143, 4294967295;
xor.b64 %rd1145, %rd1144, %rd1138;
xor.b64 %rd1146, %rd1145, %rd2545;
mul.lo.s64 %rd1147, %rd1146, %rd2540;
shr.u64 %rd1148, %rd1147, 32;
shr.u64 %rd1149, %rd1143, 32;
and.b64 %rd1150, %rd2546, 4294967295;
xor.b64 %rd1151, %rd1150, %rd1149;
xor.b64 %rd1152, %rd1151, %rd2547;
mul.lo.s64 %rd1153, %rd1152, %rd2540;
and.b64 %rd1154, %rd1153, 4294967295;
xor.b64 %rd1155, %rd1154, %rd1148;
xor.b64 %rd1156, %rd1155, %rd2548;
mul.lo.s64 %rd1157, %rd1156, %rd2542;
shr.u64 %rd1158, %rd1157, 32;
shr.u64 %rd1159, %rd1153, 32;
and.b64 %rd1160, %rd2549, 4294967295;
xor.b64 %rd1161, %rd1160, %rd1159;
xor.b64 %rd1162, %rd1161, %rd2550;
mul.lo.s64 %rd1163, %rd1162, %rd2542;
and.b64 %rd1164, %rd1163, 4294967295;
xor.b64 %rd1165, %rd1164, %rd1158;
xor.b64 %rd1166, %rd1165, %rd2551;
mul.lo.s64 %rd1167, %rd1166, %rd2540;
shr.u64 %rd1168, %rd1167, 32;
shr.u64 %rd1169, %rd1163, 32;
and.b64 %rd1170, %rd1137, 4294967295;
xor.b64 %rd1171, %rd1170, %rd1169;
xor.b64 %rd1172, %rd1171, %rd2552;
mul.lo.s64 %rd1173, %rd1172, %rd2540;
and.b64 %rd1174, %rd1173, 4294967295;
xor.b64 %rd1175, %rd1174, %rd1168;
xor.b64 %rd1176, %rd1175, %rd2553;
mul.lo.s64 %rd1177, %rd1176, %rd2542;
shr.u64 %rd1178, %rd1177, 32;
shr.u64 %rd1179, %rd1173, 32;
and.b64 %rd1180, %rd1147, 4294967295;
xor.b64 %rd1181, %rd1180, %rd1179;
xor.b64 %rd1182, %rd1181, %rd2554;
mul.lo.s64 %rd1183, %rd1182, %rd2542;
and.b64 %rd1184, %rd1183, 4294967295;
xor.b64 %rd1185, %rd1184, %rd1178;
xor.b64 %rd1186, %rd1185, %rd2555;
mul.lo.s64 %rd1187, %rd1186, %rd2540;
shr.u64 %rd1188, %rd1187, 32;
shr.u64 %rd1189, %rd1183, 32;
xor.b64 %rd1190, %rd1157, %rd1189;
xor.b64 %rd1191, %rd1190, %rd2556;
mul.lo.s64 %rd1192, %rd1191, %rd2540;
xor.b64 %rd1193, %rd1188, %rd1192;
// Truncate to 32 bits, finish with the 32-bit constants %r323 / %r324, and
// keep the top 23 bits (shift right by 9).
cvt.u32.u64 %r125, %rd1193;
xor.b32 %r126, %r323, %r125;
mul.lo.s32 %r127, %r126, %r324;
shr.u32 %r128, %r127, 9;
// 23-bit integer times 0f34000000 (2^-23) gives an f32 in [0,1); it is then
// converted to f16 for the comparison.
cvt.rn.f32.u32 %f100, %r128;
mul.rn.f32 %f101, %f100, 0f34000000;
cvt.rn.f16.f32 %h46, %f101;
// %p25 = (random value >= 0x2E66); 0x2E66 is ~0.1 as an f16 bit pattern.
mov.b16 %h47, 0x2E66;
setp.ge.f16 %p25, %h46, %h47;
// %h53 = %p25 ? (f16[%rd45+514] + f16(f32[%rd46+1028])) * 0x3C72 : 0.
// Byte offsets 514 (2-byte elements) and 1028 (4-byte elements) are both
// element 257 from their base registers. 0x3C72 is ~1.111 as f16.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescaling --
// confirm against the originating HLO.
ld.global.nc.b16 %h48, [%rd45+514];
ld.global.nc.f32 %f102, [%rd46+1028];
cvt.rn.f16.f32 %h49, %f102;
add.rn.f16 %h50, %h48, %h49;
mov.b16 %h51, 0x3C72;
mul.rn.f16 %h52, %h50, %h51;
selp.b16 %h53, %h52, 0x0000, %p25;
cvt.f32.f16 %f103, %h53;
// %f112 = (%f1*%f105)*%f104 + (%f108 - %f2*(%f1*%f105)) + %f103, with %f104,
// %f105, %f108 loaded from element 257 of %rd47 / %rd48 / %rd49.
ld.global.nc.b16 %h54, [%rd47+514];
cvt.f32.f16 %f104, %h54;
ld.global.nc.f32 %f105, [%rd48+1028];
mul.rn.f32 %f106, %f1, %f105;
mul.rn.f32 %f107, %f106, %f104;
ld.global.nc.f32 %f108, [%rd49+1028];
mul.rn.f32 %f109, %f2, %f106;
sub.rn.f32 %f110, %f108, %f109;
add.rn.f32 %f111, %f107, %f110;
add.rn.f32 %f112, %f111, %f103;
// Accumulate (%f112 - %f3)^2 into the running sum: %f9 = %f8 + square.
sub.rn.f32 %f113, %f112, %f3;
mul.rn.f32 %f114, %f113, %f113;
add.rn.f32 %f9, %f8, %f114;
// Set up the next element: %rd186 = %rd12 + ((%r73 | 384) >> 2), with its low
// 32 bits in %rd2422 and the unsigned wrap test (%rd186 < %rd12) in %p70.
or.b32 %r130, %r73, 384;
shr.u32 %r131, %r130, 2;
cvt.u64.u32 %rd1194, %r131;
add.s64 %rd186, %rd12, %rd1194;
and.b64 %rd2422, %rd186, 4294967295;
setp.lt.u64 %p70, %rd186, %rd12;
// %p8 is defined earlier in the kernel and is not recomputed here.
@%p8 bra LBB80_22;
// Fall-through path of `@%p8 bra LBB80_22` (runs when %p8 is false).
// Computes the initial mixing state (%rd2557, %rd2561, %rd2564, %rd2567) from
// %rd186, its low 32 bits %rd2422, and %rd2464, then loads the constants that
// the shared code at LBB80_23 reads from registers.
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the
// Philox4x32 multipliers and 2654435769 = 0x9E3779B9 is its key increment, so
// this is presumably an inlined Philox-style RNG -- confirm against the
// generating compiler pass.
mul.lo.s64 %rd2561, %rd2422, 3528531795;
// Add 1 to %rd2464 when %p70 is set (%p70 is the unsigned `%rd186 < %rd12`
// test made before the branch).
selp.u64 %rd1237, 1, 0, %p70;
add.s64 %rd1238, %rd2464, %rd1237;
xor.b64 %rd1239, %rd1238, %rd2561;
shr.u64 %rd1240, %rd1239, 32;
mul.lo.s64 %rd2564, %rd1240, 3449720151;
shr.u64 %rd1241, %rd2564, 32;
and.b64 %rd1242, %rd1238, 4294967295;
mul.lo.s64 %rd1243, %rd1242, 3449720151;
and.b64 %rd1244, %rd1243, 4294967295;
xor.b64 %rd1245, %rd1244, %rd1241;
xor.b64 %rd1246, %rd1245, 2654435769;
mul.lo.s64 %rd2567, %rd1246, 3528531795;
xor.b64 %rd2557, %rd1243, %rd186;
// Constants for LBB80_23 as used on this path: three 32-bit values
// (%r325-%r327) and the 64-bit values in the %rd2558-%rd2575 range.
// %rd2558 / %rd2560 hold the two multipliers.
mov.u32 %r327, -1879881855;
mov.u32 %r326, -845247145;
mov.u32 %r325, 534103459;
mov.u64 %rd2575, 3678237736;
mov.u64 %rd2574, 3041712726;
mov.u64 %rd2573, 1401181199;
mov.u64 %rd2572, 2835769497;
mov.u64 %rd2571, 1684936478;
mov.u64 %rd2570, 2027808484;
mov.u64 %rd2569, 387276957;
mov.u64 %rd2568, 842468239;
mov.u64 %rd2566, 3986602516;
mov.u64 %rd2565, 1013904242;
mov.u64 %rd2563, 3668340011;
mov.u64 %rd2562, 3144134277;
mov.u64 %rd2560, 3449720151;
mov.u64 %rd2559, 1993301258;
mov.u64 %rd2558, 3528531795;
// Skip the alternate setup in LBB80_22.
bra.uni LBB80_23;
LBB80_22:
// Alternate setup, entered through `@%p8 bra LBB80_22` when %p8 is true.
// Produces the same four state registers (%rd2557, %rd2561, %rd2564, %rd2567)
// as the fall-through path, but applies 3449720151 to the %rd2464 side and
// 3528531795 to the %rd186 side, and xors with 3144134277 (0xBB67AE85).
// Falls through into LBB80_23.
// %rd1211 = %rd2464 + 1 when %p70 is set, else %rd2464.
selp.u64 %rd1210, 1, 0, %p70;
add.s64 %rd1211, %rd2464, %rd1210;
and.b64 %rd1212, %rd1211, 4294967295;
mul.lo.s64 %rd2561, %rd1212, 3449720151;
xor.b64 %rd1213, %rd2561, %rd186;
shr.u64 %rd1214, %rd1213, 32;
mul.lo.s64 %rd2564, %rd1214, 3528531795;
shr.u64 %rd1215, %rd2564, 32;
mul.lo.s64 %rd1217, %rd2422, 3528531795;
and.b64 %rd1218, %rd1217, 4294967295;
xor.b64 %rd1219, %rd1218, %rd1215;
xor.b64 %rd1220, %rd1219, 3144134277;
mul.lo.s64 %rd2567, %rd1220, 3449720151;
xor.b64 %rd2557, %rd1211, %rd1217;
// Constants for LBB80_23 as used on this path; %rd2558 / %rd2560 hold the two
// multipliers in the opposite order from the fall-through path.
mov.u32 %r327, -1767562579;
mov.u32 %r326, -766435501;
mov.u32 %r325, 1401181199;
mov.u64 %rd2575, 4055616968;
mov.u64 %rd2574, 1684936478;
mov.u64 %rd2573, 534103459;
mov.u64 %rd2572, 387276957;
mov.u64 %rd2571, 3041712726;
mov.u64 %rd2570, 3986602516;
mov.u64 %rd2569, 2835769497;
mov.u64 %rd2568, 3668340011;
mov.u64 %rd2566, 2027808484;
mov.u64 %rd2565, 1993301258;
mov.u64 %rd2563, 842468239;
mov.u64 %rd2562, 2654435769;
mov.u64 %rd2560, 3528531795;
mov.u64 %rd2559, 1013904242;
mov.u64 %rd2558, 3449720151;
LBB80_23:
// Join point of the two setup paths (fall-through and LBB80_22).
// Runs the remaining mixing steps on %rd2557/%rd2561/%rd2564/%rd2567 with the
// constants the taken path left in the %rd2558-%rd2575 range and %r325-%r327,
// maps the result to a value in [0,1), applies a threshold-and-scale select to
// element 384 of the inputs, and adds one squared term to the running sum
// (%f9 -> %f10). It then prepares the index for the next element and branches.
//
// Each step below combines the high half (shr 32) of one 64-bit product with
// the low half (and 0xFFFFFFFF) of another, xors in a per-step constant, and
// multiplies by %rd2558 or %rd2560.
shr.u64 %rd1247, %rd2567, 32;
shr.u64 %rd1248, %rd2557, 32;
mul.lo.s64 %rd1249, %rd1248, %rd2558;
and.b64 %rd1250, %rd1249, 4294967295;
xor.b64 %rd1251, %rd1250, %rd1247;
xor.b64 %rd1252, %rd1251, %rd2559;
mul.lo.s64 %rd1253, %rd1252, %rd2560;
shr.u64 %rd1254, %rd1253, 32;
shr.u64 %rd1255, %rd1249, 32;
and.b64 %rd1256, %rd2561, 4294967295;
xor.b64 %rd1257, %rd1256, %rd1255;
xor.b64 %rd1258, %rd1257, %rd2562;
mul.lo.s64 %rd1259, %rd1258, %rd2560;
and.b64 %rd1260, %rd1259, 4294967295;
xor.b64 %rd1261, %rd1260, %rd1254;
xor.b64 %rd1262, %rd1261, %rd2563;
mul.lo.s64 %rd1263, %rd1262, %rd2558;
shr.u64 %rd1264, %rd1263, 32;
shr.u64 %rd1265, %rd1259, 32;
and.b64 %rd1266, %rd2564, 4294967295;
xor.b64 %rd1267, %rd1266, %rd1265;
xor.b64 %rd1268, %rd1267, %rd2565;
mul.lo.s64 %rd1269, %rd1268, %rd2558;
and.b64 %rd1270, %rd1269, 4294967295;
xor.b64 %rd1271, %rd1270, %rd1264;
xor.b64 %rd1272, %rd1271, %rd2566;
mul.lo.s64 %rd1273, %rd1272, %rd2560;
shr.u64 %rd1274, %rd1273, 32;
shr.u64 %rd1275, %rd1269, 32;
and.b64 %rd1276, %rd2567, 4294967295;
xor.b64 %rd1277, %rd1276, %rd1275;
xor.b64 %rd1278, %rd1277, %rd2568;
mul.lo.s64 %rd1279, %rd1278, %rd2560;
and.b64 %rd1280, %rd1279, 4294967295;
xor.b64 %rd1281, %rd1280, %rd1274;
xor.b64 %rd1282, %rd1281, %rd2569;
mul.lo.s64 %rd1283, %rd1282, %rd2558;
shr.u64 %rd1284, %rd1283, 32;
shr.u64 %rd1285, %rd1279, 32;
and.b64 %rd1286, %rd1253, 4294967295;
xor.b64 %rd1287, %rd1286, %rd1285;
xor.b64 %rd1288, %rd1287, %rd2570;
mul.lo.s64 %rd1289, %rd1288, %rd2558;
and.b64 %rd1290, %rd1289, 4294967295;
xor.b64 %rd1291, %rd1290, %rd1284;
xor.b64 %rd1292, %rd1291, %rd2571;
mul.lo.s64 %rd1293, %rd1292, %rd2560;
shr.u64 %rd1294, %rd1293, 32;
shr.u64 %rd1295, %rd1289, 32;
and.b64 %rd1296, %rd1263, 4294967295;
xor.b64 %rd1297, %rd1296, %rd1295;
xor.b64 %rd1298, %rd1297, %rd2572;
mul.lo.s64 %rd1299, %rd1298, %rd2560;
and.b64 %rd1300, %rd1299, 4294967295;
xor.b64 %rd1301, %rd1300, %rd1294;
xor.b64 %rd1302, %rd1301, %rd2573;
mul.lo.s64 %rd1303, %rd1302, %rd2558;
shr.u64 %rd1304, %rd1303, 32;
shr.u64 %rd1305, %rd1299, 32;
and.b64 %rd1306, %rd1273, 4294967295;
xor.b64 %rd1307, %rd1306, %rd1305;
xor.b64 %rd1308, %rd1307, %rd2574;
mul.lo.s64 %rd1309, %rd1308, %rd2558;
and.b64 %rd1310, %rd1309, 4294967295;
xor.b64 %rd1311, %rd1310, %rd1304;
xor.b64 %rd1312, %rd1311, %rd2575;
mul.lo.s64 %rd1313, %rd1312, %rd2560;
// Final 32-bit combine: %r138 is the high half of the last product; %r139 is
// the low 32 bits of (%rd1309 >> 32) ^ %rd1283. They are mixed with
// %r325-%r327 and the top 23 bits are kept (shift right by 9).
shr.u64 %rd1314, %rd1313, 32;
cvt.u32.u64 %r138, %rd1314;
shr.u64 %rd1315, %rd1309, 32;
xor.b64 %rd1316, %rd1315, %rd1283;
cvt.u32.u64 %r139, %rd1316;
xor.b32 %r140, %r325, %r139;
mul.lo.s32 %r141, %r140, %r326;
xor.b32 %r142, %r141, %r138;
xor.b32 %r143, %r142, %r327;
shr.u32 %r144, %r143, 9;
// 23-bit integer times 0f34000000 (2^-23) gives an f32 in [0,1); it is then
// converted to f16 for the comparison.
cvt.rn.f32.u32 %f115, %r144;
mul.rn.f32 %f116, %f115, 0f34000000;
cvt.rn.f16.f32 %h55, %f116;
// %p28 = (random value >= 0x2E66); 0x2E66 is ~0.1 as an f16 bit pattern.
mov.b16 %h56, 0x2E66;
setp.ge.f16 %p28, %h55, %h56;
// %h62 = %p28 ? (f16[%rd45+768] + f16(f32[%rd46+1536])) * 0x3C72 : 0.
// Byte offsets 768 (2-byte elements) and 1536 (4-byte elements) are both
// element 384 from their base registers. 0x3C72 is ~1.111 as f16.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescaling --
// confirm against the originating HLO.
ld.global.nc.b16 %h57, [%rd45+768];
ld.global.nc.f32 %f117, [%rd46+1536];
cvt.rn.f16.f32 %h58, %f117;
add.rn.f16 %h59, %h57, %h58;
mov.b16 %h60, 0x3C72;
mul.rn.f16 %h61, %h59, %h60;
selp.b16 %h62, %h61, 0x0000, %p28;
cvt.f32.f16 %f118, %h62;
// %f127 = (%f1*%f120)*%f119 + (%f123 - %f2*(%f1*%f120)) + %f118, with %f119,
// %f120, %f123 loaded from element 384 of %rd47 / %rd48 / %rd49.
ld.global.nc.b16 %h63, [%rd47+768];
cvt.f32.f16 %f119, %h63;
ld.global.nc.f32 %f120, [%rd48+1536];
mul.rn.f32 %f121, %f1, %f120;
mul.rn.f32 %f122, %f121, %f119;
ld.global.nc.f32 %f123, [%rd49+1536];
mul.rn.f32 %f124, %f2, %f121;
sub.rn.f32 %f125, %f123, %f124;
add.rn.f32 %f126, %f122, %f125;
add.rn.f32 %f127, %f126, %f118;
// Accumulate (%f127 - %f3)^2 into the running sum: %f10 = %f9 + square.
sub.rn.f32 %f128, %f127, %f3;
mul.rn.f32 %f129, %f128, %f128;
add.rn.f32 %f10, %f9, %f129;
// Set up the next element: index bits (%r3 | 385), combined with %r4 and
// shifted right by 2, are added to %rd12 to form %rd214.
// %p29 = ((%r3 | 385) & 3) != 1 selects which setup path runs next.
or.b32 %r145, %r3, 385;
or.b32 %r146, %r145, %r4;
and.b32 %r147, %r145, 3;
shr.u32 %r148, %r146, 2;
setp.ne.s32 %p29, %r147, 1;
cvt.u64.u32 %rd1317, %r148;
add.s64 %rd214, %rd12, %rd1317;
@%p29 bra LBB80_25;
// Fall-through path of `@%p29 bra LBB80_25` (runs when %p29 is false).
// Computes the initial mixing state (%rd2576, %rd2580, %rd2583, %rd2586) from
// %rd214 and %rd2464, then loads the constants that the shared code at
// LBB80_26 reads from registers.
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the
// Philox4x32 multipliers and 2654435769 = 0x9E3779B9 is its key increment, so
// this is presumably an inlined Philox-style RNG -- confirm against the
// generating compiler pass.
and.b64 %rd1357, %rd214, 4294967295;
mul.lo.s64 %rd2580, %rd1357, 3528531795;
// %p31 = unsigned (%rd214 < %rd12), i.e. the add that formed %rd214 wrapped;
// in that case 1 is added to %rd2464.
setp.lt.u64 %p31, %rd214, %rd12;
selp.u64 %rd1358, 1, 0, %p31;
add.s64 %rd1359, %rd2464, %rd1358;
xor.b64 %rd1360, %rd1359, %rd2580;
shr.u64 %rd1361, %rd1360, 32;
mul.lo.s64 %rd2583, %rd1361, 3449720151;
shr.u64 %rd1362, %rd2583, 32;
and.b64 %rd1363, %rd1359, 4294967295;
mul.lo.s64 %rd1364, %rd1363, 3449720151;
and.b64 %rd1365, %rd1364, 4294967295;
xor.b64 %rd1366, %rd1365, %rd1362;
xor.b64 %rd1367, %rd1366, 2654435769;
mul.lo.s64 %rd2586, %rd1367, 3528531795;
xor.b64 %rd2576, %rd1364, %rd214;
// Constants for LBB80_26 as used on this path. %rd2577 / %rd2579 hold the two
// multipliers.
mov.u32 %r329, -845247145;
mov.u32 %r328, -616729560;
mov.u64 %rd2593, 3041712726;
mov.u64 %rd2592, 1401181199;
mov.u64 %rd2591, 2835769497;
mov.u64 %rd2590, 1684936478;
mov.u64 %rd2589, 2027808484;
mov.u64 %rd2588, 387276957;
mov.u64 %rd2587, 842468239;
mov.u64 %rd2585, 3986602516;
mov.u64 %rd2584, 1013904242;
mov.u64 %rd2582, 3668340011;
mov.u64 %rd2581, 3144134277;
mov.u64 %rd2579, 3449720151;
mov.u64 %rd2578, 1993301258;
mov.u64 %rd2577, 3528531795;
// Skip the alternate setup in LBB80_25.
bra.uni LBB80_26;
LBB80_25:
// Alternate setup, entered through `@%p29 bra LBB80_25` when %p29 is true.
// Produces the same four state registers (%rd2576, %rd2580, %rd2583, %rd2586)
// as the fall-through path, but applies 3449720151 to the %rd2464 side and
// 3528531795 to the %rd214 side, and xors with 3144134277 (0xBB67AE85).
// Falls through into LBB80_26.
// %p30 = unsigned (%rd214 < %rd12); %rd1333 = %rd2464 + 1 when it is set.
setp.lt.u64 %p30, %rd214, %rd12;
selp.u64 %rd1332, 1, 0, %p30;
add.s64 %rd1333, %rd2464, %rd1332;
and.b64 %rd1334, %rd1333, 4294967295;
mul.lo.s64 %rd2580, %rd1334, 3449720151;
xor.b64 %rd1335, %rd2580, %rd214;
shr.u64 %rd1336, %rd1335, 32;
mul.lo.s64 %rd2583, %rd1336, 3528531795;
shr.u64 %rd1337, %rd2583, 32;
and.b64 %rd1338, %rd214, 4294967295;
mul.lo.s64 %rd1339, %rd1338, 3528531795;
and.b64 %rd1340, %rd1339, 4294967295;
xor.b64 %rd1341, %rd1340, %rd1337;
xor.b64 %rd1342, %rd1341, 3144134277;
mul.lo.s64 %rd2586, %rd1342, 3449720151;
xor.b64 %rd2576, %rd1333, %rd1339;
// Constants for LBB80_26 as used on this path; %rd2577 / %rd2579 hold the two
// multipliers in the opposite order from the fall-through path.
mov.u32 %r329, -766435501;
mov.u32 %r328, -239350328;
mov.u64 %rd2593, 1684936478;
mov.u64 %rd2592, 534103459;
mov.u64 %rd2591, 387276957;
mov.u64 %rd2590, 3041712726;
mov.u64 %rd2589, 3986602516;
mov.u64 %rd2588, 2835769497;
mov.u64 %rd2587, 3668340011;
mov.u64 %rd2585, 2027808484;
mov.u64 %rd2584, 1993301258;
mov.u64 %rd2582, 842468239;
mov.u64 %rd2581, 2654435769;
mov.u64 %rd2579, 3528531795;
mov.u64 %rd2578, 1013904242;
mov.u64 %rd2577, 3449720151;
LBB80_26:
// Join point of the two setup paths (fall-through and LBB80_25).
// Runs the remaining mixing steps on %rd2576/%rd2580/%rd2583/%rd2586 with the
// constants the taken path left in the %rd2577-%rd2593 range and %r328/%r329,
// maps the result to a value in [0,1), applies a threshold-and-scale select to
// element 385 of the inputs, and adds one squared term to the running sum
// (%f10 -> %f11). It then prepares the index for the next element and branches.
//
// Each step below combines the high half (shr 32) of one 64-bit product with
// the low half (and 0xFFFFFFFF) of another, xors in a per-step constant, and
// multiplies by %rd2577 or %rd2579.
shr.u64 %rd1368, %rd2586, 32;
shr.u64 %rd1369, %rd2576, 32;
mul.lo.s64 %rd1370, %rd1369, %rd2577;
and.b64 %rd1371, %rd1370, 4294967295;
xor.b64 %rd1372, %rd1371, %rd1368;
xor.b64 %rd1373, %rd1372, %rd2578;
mul.lo.s64 %rd1374, %rd1373, %rd2579;
shr.u64 %rd1375, %rd1374, 32;
shr.u64 %rd1376, %rd1370, 32;
and.b64 %rd1377, %rd2580, 4294967295;
xor.b64 %rd1378, %rd1377, %rd1376;
xor.b64 %rd1379, %rd1378, %rd2581;
mul.lo.s64 %rd1380, %rd1379, %rd2579;
and.b64 %rd1381, %rd1380, 4294967295;
xor.b64 %rd1382, %rd1381, %rd1375;
xor.b64 %rd1383, %rd1382, %rd2582;
mul.lo.s64 %rd1384, %rd1383, %rd2577;
shr.u64 %rd1385, %rd1384, 32;
shr.u64 %rd1386, %rd1380, 32;
and.b64 %rd1387, %rd2583, 4294967295;
xor.b64 %rd1388, %rd1387, %rd1386;
xor.b64 %rd1389, %rd1388, %rd2584;
mul.lo.s64 %rd1390, %rd1389, %rd2577;
and.b64 %rd1391, %rd1390, 4294967295;
xor.b64 %rd1392, %rd1391, %rd1385;
xor.b64 %rd1393, %rd1392, %rd2585;
mul.lo.s64 %rd1394, %rd1393, %rd2579;
shr.u64 %rd1395, %rd1394, 32;
shr.u64 %rd1396, %rd1390, 32;
and.b64 %rd1397, %rd2586, 4294967295;
xor.b64 %rd1398, %rd1397, %rd1396;
xor.b64 %rd1399, %rd1398, %rd2587;
mul.lo.s64 %rd1400, %rd1399, %rd2579;
and.b64 %rd1401, %rd1400, 4294967295;
xor.b64 %rd1402, %rd1401, %rd1395;
xor.b64 %rd1403, %rd1402, %rd2588;
mul.lo.s64 %rd1404, %rd1403, %rd2577;
shr.u64 %rd1405, %rd1404, 32;
shr.u64 %rd1406, %rd1400, 32;
and.b64 %rd1407, %rd1374, 4294967295;
xor.b64 %rd1408, %rd1407, %rd1406;
xor.b64 %rd1409, %rd1408, %rd2589;
mul.lo.s64 %rd1410, %rd1409, %rd2577;
and.b64 %rd1411, %rd1410, 4294967295;
xor.b64 %rd1412, %rd1411, %rd1405;
xor.b64 %rd1413, %rd1412, %rd2590;
mul.lo.s64 %rd1414, %rd1413, %rd2579;
shr.u64 %rd1415, %rd1414, 32;
shr.u64 %rd1416, %rd1410, 32;
and.b64 %rd1417, %rd1384, 4294967295;
xor.b64 %rd1418, %rd1417, %rd1416;
xor.b64 %rd1419, %rd1418, %rd2591;
mul.lo.s64 %rd1420, %rd1419, %rd2579;
and.b64 %rd1421, %rd1420, 4294967295;
xor.b64 %rd1422, %rd1421, %rd1415;
xor.b64 %rd1423, %rd1422, %rd2592;
mul.lo.s64 %rd1424, %rd1423, %rd2577;
shr.u64 %rd1425, %rd1424, 32;
shr.u64 %rd1426, %rd1420, 32;
xor.b64 %rd1427, %rd1394, %rd1426;
xor.b64 %rd1428, %rd1427, %rd2593;
mul.lo.s64 %rd1429, %rd1428, %rd2577;
xor.b64 %rd1430, %rd1425, %rd1429;
// Truncate to 32 bits, finish with the 32-bit constants %r328 / %r329, and
// keep the top 23 bits (shift right by 9).
cvt.u32.u64 %r153, %rd1430;
xor.b32 %r154, %r328, %r153;
mul.lo.s32 %r155, %r154, %r329;
shr.u32 %r156, %r155, 9;
// 23-bit integer times 0f34000000 (2^-23) gives an f32 in [0,1); it is then
// converted to f16 for the comparison.
cvt.rn.f32.u32 %f130, %r156;
mul.rn.f32 %f131, %f130, 0f34000000;
cvt.rn.f16.f32 %h64, %f131;
// %p33 = (random value >= 0x2E66); 0x2E66 is ~0.1 as an f16 bit pattern.
mov.b16 %h65, 0x2E66;
setp.ge.f16 %p33, %h64, %h65;
// %h71 = %p33 ? (f16[%rd45+770] + f16(f32[%rd46+1540])) * 0x3C72 : 0.
// Byte offsets 770 (2-byte elements) and 1540 (4-byte elements) are both
// element 385 from their base registers. 0x3C72 is ~1.111 as f16.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescaling --
// confirm against the originating HLO.
ld.global.nc.b16 %h66, [%rd45+770];
ld.global.nc.f32 %f132, [%rd46+1540];
cvt.rn.f16.f32 %h67, %f132;
add.rn.f16 %h68, %h66, %h67;
mov.b16 %h69, 0x3C72;
mul.rn.f16 %h70, %h68, %h69;
selp.b16 %h71, %h70, 0x0000, %p33;
cvt.f32.f16 %f133, %h71;
// %f142 = (%f1*%f135)*%f134 + (%f138 - %f2*(%f1*%f135)) + %f133, with %f134,
// %f135, %f138 loaded from element 385 of %rd47 / %rd48 / %rd49.
ld.global.nc.b16 %h72, [%rd47+770];
cvt.f32.f16 %f134, %h72;
ld.global.nc.f32 %f135, [%rd48+1540];
mul.rn.f32 %f136, %f1, %f135;
mul.rn.f32 %f137, %f136, %f134;
ld.global.nc.f32 %f138, [%rd49+1540];
mul.rn.f32 %f139, %f2, %f136;
sub.rn.f32 %f140, %f138, %f139;
add.rn.f32 %f141, %f137, %f140;
add.rn.f32 %f142, %f141, %f133;
// Accumulate (%f142 - %f3)^2 into the running sum: %f11 = %f10 + square.
sub.rn.f32 %f143, %f142, %f3;
mul.rn.f32 %f144, %f143, %f143;
add.rn.f32 %f11, %f10, %f144;
// Set up the next element: %rd241 = %rd12 + ((%r73 | 512) >> 2).
or.b32 %r158, %r73, 512;
shr.u32 %r159, %r158, 2;
cvt.u64.u32 %rd1431, %r159;
add.s64 %rd241, %rd12, %rd1431;
// %p8 is defined earlier in the kernel and is not recomputed here.
@%p8 bra LBB80_28;
// Fall-through path of `@%p8 bra LBB80_28` (runs when %p8 is false).
// Computes the initial mixing state (%rd2594, %rd2598, %rd2601, %rd2604) from
// %rd241 and %rd2464, then loads the constants that the shared code at
// LBB80_29 reads from registers.
// NOTE(review): 3528531795 = 0xD2511F53 and 3449720151 = 0xCD9E8D57 are the
// Philox4x32 multipliers and 2654435769 = 0x9E3779B9 is its key increment, so
// this is presumably an inlined Philox-style RNG -- confirm against the
// generating compiler pass.
and.b64 %rd1473, %rd241, 4294967295;
mul.lo.s64 %rd2598, %rd1473, 3528531795;
// %p35 = unsigned (%rd241 < %rd12), i.e. the add that formed %rd241 wrapped;
// in that case 1 is added to %rd2464.
setp.lt.u64 %p35, %rd241, %rd12;
selp.u64 %rd1474, 1, 0, %p35;
add.s64 %rd1475, %rd2464, %rd1474;
xor.b64 %rd1476, %rd1475, %rd2598;
shr.u64 %rd1477, %rd1476, 32;
mul.lo.s64 %rd2601, %rd1477, 3449720151;
shr.u64 %rd1478, %rd2601, 32;
and.b64 %rd1479, %rd1475, 4294967295;
mul.lo.s64 %rd1480, %rd1479, 3449720151;
and.b64 %rd1481, %rd1480, 4294967295;
xor.b64 %rd1482, %rd1481, %rd1478;
xor.b64 %rd1483, %rd1482, 2654435769;
mul.lo.s64 %rd2604, %rd1483, 3528531795;
xor.b64 %rd2594, %rd1480, %rd241;
// Constants for LBB80_29 as used on this path: three 32-bit values
// (%r330-%r332) and the 64-bit values in the %rd2595-%rd2612 range.
// %rd2595 / %rd2597 hold the two multipliers.
mov.u32 %r332, -1879881855;
mov.u32 %r331, -845247145;
mov.u32 %r330, 534103459;
mov.u64 %rd2612, 3678237736;
mov.u64 %rd2611, 3041712726;
mov.u64 %rd2610, 1401181199;
mov.u64 %rd2609, 2835769497;
mov.u64 %rd2608, 1684936478;
mov.u64 %rd2607, 2027808484;
mov.u64 %rd2606, 387276957;
mov.u64 %rd2605, 842468239;
mov.u64 %rd2603, 3986602516;
mov.u64 %rd2602, 1013904242;
mov.u64 %rd2600, 3668340011;
mov.u64 %rd2599, 3144134277;
mov.u64 %rd2597, 3449720151;
mov.u64 %rd2596, 1993301258;
mov.u64 %rd2595, 3528531795;
// Skip the alternate setup in LBB80_28.
bra.uni LBB80_29;
LBB80_28:
// Alternate setup, entered through `@%p8 bra LBB80_28` when %p8 is true.
// Produces the same four state registers (%rd2594, %rd2598, %rd2601, %rd2604)
// as the fall-through path, but applies 3449720151 to the %rd2464 side and
// 3528531795 to the %rd241 side, and xors with 3144134277 (0xBB67AE85).
// Falls through into LBB80_29.
// %p34 = unsigned (%rd241 < %rd12); %rd1448 = %rd2464 + 1 when it is set.
setp.lt.u64 %p34, %rd241, %rd12;
selp.u64 %rd1447, 1, 0, %p34;
add.s64 %rd1448, %rd2464, %rd1447;
and.b64 %rd1449, %rd1448, 4294967295;
mul.lo.s64 %rd2598, %rd1449, 3449720151;
xor.b64 %rd1450, %rd2598, %rd241;
shr.u64 %rd1451, %rd1450, 32;
mul.lo.s64 %rd2601, %rd1451, 3528531795;
shr.u64 %rd1452, %rd2601, 32;
and.b64 %rd1453, %rd241, 4294967295;
mul.lo.s64 %rd1454, %rd1453, 3528531795;
and.b64 %rd1455, %rd1454, 4294967295;
xor.b64 %rd1456, %rd1455, %rd1452;
xor.b64 %rd1457, %rd1456, 3144134277;
mul.lo.s64 %rd2604, %rd1457, 3449720151;
xor.b64 %rd2594, %rd1448, %rd1454;
// Constants for LBB80_29 as used on this path; %rd2595 / %rd2597 hold the two
// multipliers in the opposite order from the fall-through path.
mov.u32 %r332, -1767562579;
mov.u32 %r331, -766435501;
mov.u32 %r330, 1401181199;
mov.u64 %rd2612, 4055616968;
mov.u64 %rd2611, 1684936478;
mov.u64 %rd2610, 534103459;
mov.u64 %rd2609, 387276957;
mov.u64 %rd2608, 3041712726;
mov.u64 %rd2607, 3986602516;
mov.u64 %rd2606, 2835769497;
mov.u64 %rd2605, 3668340011;
mov.u64 %rd2603, 2027808484;
mov.u64 %rd2602, 1993301258;
mov.u64 %rd2600, 842468239;
mov.u64 %rd2599, 2654435769;
mov.u64 %rd2597, 3528531795;
mov.u64 %rd2596, 1013904242;
mov.u64 %rd2595, 3449720151;
LBB80_29:
// Join point of the two setup paths (fall-through and LBB80_28).
// Runs the remaining mixing steps on %rd2594/%rd2598/%rd2601/%rd2604 with the
// constants the taken path left in the %rd2595-%rd2612 range and %r330-%r332,
// maps the result to a value in [0,1), applies a threshold-and-scale select to
// element 512 of the inputs, and adds one squared term to the running sum
// (%f11 -> %f12). It then prepares the index for the next element and branches.
//
// Each step below combines the high half (shr 32) of one 64-bit product with
// the low half (and 0xFFFFFFFF) of another, xors in a per-step constant, and
// multiplies by %rd2595 or %rd2597.
shr.u64 %rd1484, %rd2604, 32;
shr.u64 %rd1485, %rd2594, 32;
mul.lo.s64 %rd1486, %rd1485, %rd2595;
and.b64 %rd1487, %rd1486, 4294967295;
xor.b64 %rd1488, %rd1487, %rd1484;
xor.b64 %rd1489, %rd1488, %rd2596;
mul.lo.s64 %rd1490, %rd1489, %rd2597;
shr.u64 %rd1491, %rd1490, 32;
shr.u64 %rd1492, %rd1486, 32;
and.b64 %rd1493, %rd2598, 4294967295;
xor.b64 %rd1494, %rd1493, %rd1492;
xor.b64 %rd1495, %rd1494, %rd2599;
mul.lo.s64 %rd1496, %rd1495, %rd2597;
and.b64 %rd1497, %rd1496, 4294967295;
xor.b64 %rd1498, %rd1497, %rd1491;
xor.b64 %rd1499, %rd1498, %rd2600;
mul.lo.s64 %rd1500, %rd1499, %rd2595;
shr.u64 %rd1501, %rd1500, 32;
shr.u64 %rd1502, %rd1496, 32;
and.b64 %rd1503, %rd2601, 4294967295;
xor.b64 %rd1504, %rd1503, %rd1502;
xor.b64 %rd1505, %rd1504, %rd2602;
mul.lo.s64 %rd1506, %rd1505, %rd2595;
and.b64 %rd1507, %rd1506, 4294967295;
xor.b64 %rd1508, %rd1507, %rd1501;
xor.b64 %rd1509, %rd1508, %rd2603;
mul.lo.s64 %rd1510, %rd1509, %rd2597;
shr.u64 %rd1511, %rd1510, 32;
shr.u64 %rd1512, %rd1506, 32;
and.b64 %rd1513, %rd2604, 4294967295;
xor.b64 %rd1514, %rd1513, %rd1512;
xor.b64 %rd1515, %rd1514, %rd2605;
mul.lo.s64 %rd1516, %rd1515, %rd2597;
and.b64 %rd1517, %rd1516, 4294967295;
xor.b64 %rd1518, %rd1517, %rd1511;
xor.b64 %rd1519, %rd1518, %rd2606;
mul.lo.s64 %rd1520, %rd1519, %rd2595;
shr.u64 %rd1521, %rd1520, 32;
shr.u64 %rd1522, %rd1516, 32;
and.b64 %rd1523, %rd1490, 4294967295;
xor.b64 %rd1524, %rd1523, %rd1522;
xor.b64 %rd1525, %rd1524, %rd2607;
mul.lo.s64 %rd1526, %rd1525, %rd2595;
and.b64 %rd1527, %rd1526, 4294967295;
xor.b64 %rd1528, %rd1527, %rd1521;
xor.b64 %rd1529, %rd1528, %rd2608;
mul.lo.s64 %rd1530, %rd1529, %rd2597;
shr.u64 %rd1531, %rd1530, 32;
shr.u64 %rd1532, %rd1526, 32;
and.b64 %rd1533, %rd1500, 4294967295;
xor.b64 %rd1534, %rd1533, %rd1532;
xor.b64 %rd1535, %rd1534, %rd2609;
mul.lo.s64 %rd1536, %rd1535, %rd2597;
and.b64 %rd1537, %rd1536, 4294967295;
xor.b64 %rd1538, %rd1537, %rd1531;
xor.b64 %rd1539, %rd1538, %rd2610;
mul.lo.s64 %rd1540, %rd1539, %rd2595;
shr.u64 %rd1541, %rd1540, 32;
shr.u64 %rd1542, %rd1536, 32;
and.b64 %rd1543, %rd1510, 4294967295;
xor.b64 %rd1544, %rd1543, %rd1542;
xor.b64 %rd1545, %rd1544, %rd2611;
mul.lo.s64 %rd1546, %rd1545, %rd2595;
and.b64 %rd1547, %rd1546, 4294967295;
xor.b64 %rd1548, %rd1547, %rd1541;
xor.b64 %rd1549, %rd1548, %rd2612;
mul.lo.s64 %rd1550, %rd1549, %rd2597;
// Final 32-bit combine: %r166 is the high half of the last product; %r167 is
// the low 32 bits of (%rd1546 >> 32) ^ %rd1520. They are mixed with
// %r330-%r332 and the top 23 bits are kept (shift right by 9).
shr.u64 %rd1551, %rd1550, 32;
cvt.u32.u64 %r166, %rd1551;
shr.u64 %rd1552, %rd1546, 32;
xor.b64 %rd1553, %rd1552, %rd1520;
cvt.u32.u64 %r167, %rd1553;
xor.b32 %r168, %r330, %r167;
mul.lo.s32 %r169, %r168, %r331;
xor.b32 %r170, %r169, %r166;
xor.b32 %r171, %r170, %r332;
shr.u32 %r172, %r171, 9;
// 23-bit integer times 0f34000000 (2^-23) gives an f32 in [0,1); it is then
// converted to f16 for the comparison.
cvt.rn.f32.u32 %f145, %r172;
mul.rn.f32 %f146, %f145, 0f34000000;
cvt.rn.f16.f32 %h73, %f146;
// %p36 = (random value >= 0x2E66); 0x2E66 is ~0.1 as an f16 bit pattern.
mov.b16 %h74, 0x2E66;
setp.ge.f16 %p36, %h73, %h74;
// %h80 = %p36 ? (f16[%rd45+1024] + f16(f32[%rd46+2048])) * 0x3C72 : 0.
// Byte offsets 1024 (2-byte elements) and 2048 (4-byte elements) are both
// element 512 from their base registers. 0x3C72 is ~1.111 as f16.
// NOTE(review): looks like dropout with rate 0.1 and 1/0.9 rescaling --
// confirm against the originating HLO.
ld.global.nc.b16 %h75, [%rd45+1024];
ld.global.nc.f32 %f147, [%rd46+2048];
cvt.rn.f16.f32 %h76, %f147;
add.rn.f16 %h77, %h75, %h76;
mov.b16 %h78, 0x3C72;
mul.rn.f16 %h79, %h77, %h78;
selp.b16 %h80, %h79, 0x0000, %p36;
cvt.f32.f16 %f148, %h80;
// %f157 = (%f1*%f150)*%f149 + (%f153 - %f2*(%f1*%f150)) + %f148, with %f149,
// %f150, %f153 loaded from element 512 of %rd47 / %rd48 / %rd49.
ld.global.nc.b16 %h81, [%rd47+1024];
cvt.f32.f16 %f149, %h81;
ld.global.nc.f32 %f150, [%rd48+2048];
mul.rn.f32 %f151, %f1, %f150;
mul.rn.f32 %f152, %f151, %f149;
ld.global.nc.f32 %f153, [%rd49+2048];
mul.rn.f32 %f154, %f2, %f151;
sub.rn.f32 %f155, %f153, %f154;
add.rn.f32 %f156, %f152, %f155;
add.rn.f32 %f157, %f156, %f148;
// Accumulate (%f157 - %f3)^2 into the running sum: %f12 = %f11 + square.
sub.rn.f32 %f158, %f157, %f3;
mul.rn.f32 %f159, %f158, %f158;
add.rn.f32 %f12, %f11, %f159;
// Set up the next element: index bits (%r3 | 513), combined with %r4 and
// shifted right by 2, are added to %rd12 to form %rd269.
// %p37 = ((%r3 | 513) & 3) != 1 selects which setup path runs next.
or.b32 %r173, %r3, 513;
or.b32 %r174, %r173, %r4;
and.b32 %r175, %r173, 3;
shr.u32 %r176, %r174, 2;
setp.ne.s32 %p37, %r175, 1;
cvt.u64.u32 %rd1554, %r176;
add.s64 %rd269, %rd12, %rd1554;
@%p37 bra LBB80_31;
and.b64 %rd1594, %rd269, 4294967295;
mul.lo.s64 %rd2617, %rd1594, 3528531795;
setp.lt.u64 %p39, %rd269, %rd12;
selp.u64 %rd1595, 1, 0, %p39;
add.s64 %rd1596, %rd2464, %rd1595;
xor.b64 %rd1597, %rd1596, %rd2617;
shr.u64 %rd1598, %rd1597, 32;
mul.lo.s64 %rd2620, %rd1598, 3449720151;
shr.u64 %rd1599, %rd2620, 32;
and.b64 %rd1600, %rd1596, 4294967295;
mul.lo.s64 %rd1601, %rd1600, 3449720151;
and.b64 %rd1602, %rd1601, 4294967295;
xor.b64 %rd1603, %rd1602, %rd1599;
xor.b64 %rd1604, %rd1603, 2654435769;
mul.lo.s64 %rd2623, %rd1604, 3528531795;
xor.b64 %rd2613, %rd1601, %rd269;
mov.u32 %r334, -845247145;
mov.u32 %r333, -616729560;
mov.u64 %rd2630, 3041712726;
mov.u64 %rd2629, 1401181199;
mov.u64 %rd2628, 2835769497;
mov.u64 %rd2627, 1684936478;
mov.u64 %rd2626, 2027808484;
mov.u64 %rd2625, 387276957;
mov.u64 %rd2624, 842468239;
mov.u64 %rd2622, 3986602516;
mov.u64 %rd2621, 1013904242;
mov.u64 %rd2619, 3668340011;
mov.u64 %rd2618, 3144134277;
mov.u64 %rd2616, 3449720151;
mov.u64 %rd2615, 1993301258;
mov.u64 %rd2614, 3528531795;
bra.uni LBB80_32;
// LBB80_31: arm taken when %p37 is set (low two bits of (%r3 | 513) != 1).
// It produces the same set of registers as the fall-through arm above
// (%rd2613..%rd2630, %r333, %r334), which LBB80_32 then consumes, but with the
// two multipliers 3528531795 (0xD2511F53) and 3449720151 (0xCD9E8D57) swapped
// and a different constant table.
// NOTE(review): the multipliers and the 0x9E3779B9 / 0xBB67AE85 constants match
// the published Philox-4x32 parameters; presumably this is an unrolled Philox
// round sequence - confirm against the generator that emitted this kernel.
LBB80_31:
// %rd269 was formed as %rd12 + index; an unsigned result below %rd12 means the
// add wrapped, so a carry of 1 is added to %rd2464.
setp.lt.u64 %p38, %rd269, %rd12;
selp.u64 %rd1569, 1, 0, %p38;
add.s64 %rd1570, %rd2464, %rd1569;
// First multiply/xor steps on the 32-bit halves (and 0xFFFFFFFF = low half,
// shr 32 = high half).
and.b64 %rd1571, %rd1570, 4294967295;
mul.lo.s64 %rd2617, %rd1571, 3449720151;
xor.b64 %rd1572, %rd2617, %rd269;
shr.u64 %rd1573, %rd1572, 32;
mul.lo.s64 %rd2620, %rd1573, 3528531795;
shr.u64 %rd1574, %rd2620, 32;
and.b64 %rd1575, %rd269, 4294967295;
mul.lo.s64 %rd1576, %rd1575, 3528531795;
and.b64 %rd1577, %rd1576, 4294967295;
xor.b64 %rd1578, %rd1577, %rd1574;
xor.b64 %rd1579, %rd1578, 3144134277;
mul.lo.s64 %rd2623, %rd1579, 3449720151;
xor.b64 %rd2613, %rd1570, %rd1576;
// Per-arm constants read by the shared code at LBB80_32.
mov.u32 %r334, -766435501;
mov.u32 %r333, -239350328;
mov.u64 %rd2630, 1684936478;
mov.u64 %rd2629, 534103459;
mov.u64 %rd2628, 387276957;
mov.u64 %rd2627, 3041712726;
mov.u64 %rd2626, 3986602516;
mov.u64 %rd2625, 2835769497;
mov.u64 %rd2624, 3668340011;
mov.u64 %rd2622, 2027808484;
mov.u64 %rd2621, 1993301258;
mov.u64 %rd2619, 842468239;
mov.u64 %rd2618, 2654435769;
mov.u64 %rd2616, 3528531795;
mov.u64 %rd2615, 1013904242;
mov.u64 %rd2614, 3449720151;
// LBB80_32: join point of the two seed arms for element 513.
// Section 1: straight-line mixing rounds. Each step multiplies by one of the two
//   arm-selected multipliers (%rd2614 / %rd2616), splits the product into halves
//   (shr 32 / and 0xFFFFFFFF) and XORs them with earlier halves and one of the
//   arm-selected constants %rd2615..%rd2630.
// Section 2: turns the resulting 32-bit word into a value in [0,1), builds a
//   mask from it, and accumulates one squared term into the running sum.
// Section 3: computes the index for element 640 and runs the fall-through seed
//   arm for it.
LBB80_32:
shr.u64 %rd1605, %rd2623, 32;
shr.u64 %rd1606, %rd2613, 32;
mul.lo.s64 %rd1607, %rd1606, %rd2614;
and.b64 %rd1608, %rd1607, 4294967295;
xor.b64 %rd1609, %rd1608, %rd1605;
xor.b64 %rd1610, %rd1609, %rd2615;
mul.lo.s64 %rd1611, %rd1610, %rd2616;
shr.u64 %rd1612, %rd1611, 32;
shr.u64 %rd1613, %rd1607, 32;
and.b64 %rd1614, %rd2617, 4294967295;
xor.b64 %rd1615, %rd1614, %rd1613;
xor.b64 %rd1616, %rd1615, %rd2618;
mul.lo.s64 %rd1617, %rd1616, %rd2616;
and.b64 %rd1618, %rd1617, 4294967295;
xor.b64 %rd1619, %rd1618, %rd1612;
xor.b64 %rd1620, %rd1619, %rd2619;
mul.lo.s64 %rd1621, %rd1620, %rd2614;
shr.u64 %rd1622, %rd1621, 32;
shr.u64 %rd1623, %rd1617, 32;
and.b64 %rd1624, %rd2620, 4294967295;
xor.b64 %rd1625, %rd1624, %rd1623;
xor.b64 %rd1626, %rd1625, %rd2621;
mul.lo.s64 %rd1627, %rd1626, %rd2614;
and.b64 %rd1628, %rd1627, 4294967295;
xor.b64 %rd1629, %rd1628, %rd1622;
xor.b64 %rd1630, %rd1629, %rd2622;
mul.lo.s64 %rd1631, %rd1630, %rd2616;
shr.u64 %rd1632, %rd1631, 32;
shr.u64 %rd1633, %rd1627, 32;
and.b64 %rd1634, %rd2623, 4294967295;
xor.b64 %rd1635, %rd1634, %rd1633;
xor.b64 %rd1636, %rd1635, %rd2624;
mul.lo.s64 %rd1637, %rd1636, %rd2616;
and.b64 %rd1638, %rd1637, 4294967295;
xor.b64 %rd1639, %rd1638, %rd1632;
xor.b64 %rd1640, %rd1639, %rd2625;
mul.lo.s64 %rd1641, %rd1640, %rd2614;
shr.u64 %rd1642, %rd1641, 32;
shr.u64 %rd1643, %rd1637, 32;
and.b64 %rd1644, %rd1611, 4294967295;
xor.b64 %rd1645, %rd1644, %rd1643;
xor.b64 %rd1646, %rd1645, %rd2626;
mul.lo.s64 %rd1647, %rd1646, %rd2614;
and.b64 %rd1648, %rd1647, 4294967295;
xor.b64 %rd1649, %rd1648, %rd1642;
xor.b64 %rd1650, %rd1649, %rd2627;
mul.lo.s64 %rd1651, %rd1650, %rd2616;
shr.u64 %rd1652, %rd1651, 32;
shr.u64 %rd1653, %rd1647, 32;
and.b64 %rd1654, %rd1621, 4294967295;
xor.b64 %rd1655, %rd1654, %rd1653;
xor.b64 %rd1656, %rd1655, %rd2628;
mul.lo.s64 %rd1657, %rd1656, %rd2616;
and.b64 %rd1658, %rd1657, 4294967295;
xor.b64 %rd1659, %rd1658, %rd1652;
xor.b64 %rd1660, %rd1659, %rd2629;
mul.lo.s64 %rd1661, %rd1660, %rd2614;
shr.u64 %rd1662, %rd1661, 32;
shr.u64 %rd1663, %rd1657, 32;
xor.b64 %rd1664, %rd1631, %rd1663;
xor.b64 %rd1665, %rd1664, %rd2630;
mul.lo.s64 %rd1666, %rd1665, %rd2614;
xor.b64 %rd1667, %rd1662, %rd1666;
// Final 32-bit word: low 32 bits of the last XOR, mixed with %r333 / %r334.
cvt.u32.u64 %r181, %rd1667;
xor.b32 %r182, %r333, %r181;
mul.lo.s32 %r183, %r182, %r334;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): value in [0, 1).
shr.u32 %r184, %r183, 9;
cvt.rn.f32.u32 %f160, %r184;
mul.rn.f32 %f161, %f160, 0f34000000;
cvt.rn.f16.f32 %h82, %f161;
// 0x2E66 is the f16 encoding of ~0.1; %p41 is set when the value is >= that.
mov.b16 %h83, 0x2E66;
setp.ge.f16 %p41, %h82, %h83;
// Element 513: byte offset 1026 for 2-byte data, 2052 for 4-byte data.
// Loads use the non-coherent (read-only) global path.
ld.global.nc.b16 %h84, [%rd45+1026];
ld.global.nc.f32 %f162, [%rd46+2052];
cvt.rn.f16.f32 %h85, %f162;
add.rn.f16 %h86, %h84, %h85;
// 0x3C72 is the f16 encoding of ~1.111; the scaled sum is kept only when %p41
// is set, otherwise replaced by 0.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) - confirm.
mov.b16 %h87, 0x3C72;
mul.rn.f16 %h88, %h86, %h87;
selp.b16 %h89, %h88, 0x0000, %p41;
cvt.f32.f16 %f163, %h89;
ld.global.nc.b16 %h90, [%rd47+1026];
cvt.f32.f16 %f164, %h90;
ld.global.nc.f32 %f165, [%rd48+2052];
// t = %f1 * [%rd48]; value = t * [%rd47] + ([%rd49] - %f2 * t) + masked term.
// %f1, %f2, %f3 are defined earlier in the kernel, outside this region.
mul.rn.f32 %f166, %f1, %f165;
mul.rn.f32 %f167, %f166, %f164;
ld.global.nc.f32 %f168, [%rd49+2052];
mul.rn.f32 %f169, %f2, %f166;
sub.rn.f32 %f170, %f168, %f169;
add.rn.f32 %f171, %f167, %f170;
add.rn.f32 %f172, %f171, %f163;
// Running sum of squared differences from %f3: %f13 = %f12 + (value - %f3)^2.
sub.rn.f32 %f173, %f172, %f3;
mul.rn.f32 %f174, %f173, %f173;
add.rn.f32 %f13, %f12, %f174;
// Next element (640): %rd296 = %rd12 + ((%r73 | 640) >> 2).
or.b32 %r186, %r73, 640;
shr.u32 %r187, %r186, 2;
cvt.u64.u32 %rd1668, %r187;
add.s64 %rd296, %rd12, %rd1668;
// %p8 is defined earlier in the kernel; it selects which seed arm runs.
@%p8 bra LBB80_34;
// Fall-through arm (%p8 clear): seed steps and constant table for element 640.
and.b64 %rd1710, %rd296, 4294967295;
mul.lo.s64 %rd2635, %rd1710, 3528531795;
// Carry from %rd12 + index (wrap detected by %rd296 < %rd12) goes into %rd2464.
setp.lt.u64 %p43, %rd296, %rd12;
selp.u64 %rd1711, 1, 0, %p43;
add.s64 %rd1712, %rd2464, %rd1711;
xor.b64 %rd1713, %rd1712, %rd2635;
shr.u64 %rd1714, %rd1713, 32;
mul.lo.s64 %rd2638, %rd1714, 3449720151;
shr.u64 %rd1715, %rd2638, 32;
and.b64 %rd1716, %rd1712, 4294967295;
mul.lo.s64 %rd1717, %rd1716, 3449720151;
and.b64 %rd1718, %rd1717, 4294967295;
xor.b64 %rd1719, %rd1718, %rd1715;
xor.b64 %rd1720, %rd1719, 2654435769;
mul.lo.s64 %rd2641, %rd1720, 3528531795;
xor.b64 %rd2631, %rd1717, %rd296;
mov.u32 %r337, -1879881855;
mov.u32 %r336, -845247145;
mov.u32 %r335, 534103459;
mov.u64 %rd2649, 3678237736;
mov.u64 %rd2648, 3041712726;
mov.u64 %rd2647, 1401181199;
mov.u64 %rd2646, 2835769497;
mov.u64 %rd2645, 1684936478;
mov.u64 %rd2644, 2027808484;
mov.u64 %rd2643, 387276957;
mov.u64 %rd2642, 842468239;
mov.u64 %rd2640, 3986602516;
mov.u64 %rd2639, 1013904242;
mov.u64 %rd2637, 3668340011;
mov.u64 %rd2636, 3144134277;
mov.u64 %rd2634, 3449720151;
mov.u64 %rd2633, 1993301258;
mov.u64 %rd2632, 3528531795;
bra.uni LBB80_35;
// LBB80_34: arm taken when %p8 is set, for element 640. Produces the same
// register set as the fall-through arm (%rd2631..%rd2649, %r335..%r337) for
// LBB80_35, with the multipliers 3528531795 / 3449720151 swapped and a
// different constant table.
LBB80_34:
// Carry from %rd12 + index (wrap detected by %rd296 < %rd12) goes into %rd2464.
setp.lt.u64 %p42, %rd296, %rd12;
selp.u64 %rd1684, 1, 0, %p42;
add.s64 %rd1685, %rd2464, %rd1684;
// First multiply/xor steps on the 32-bit halves.
and.b64 %rd1686, %rd1685, 4294967295;
mul.lo.s64 %rd2635, %rd1686, 3449720151;
xor.b64 %rd1687, %rd2635, %rd296;
shr.u64 %rd1688, %rd1687, 32;
mul.lo.s64 %rd2638, %rd1688, 3528531795;
shr.u64 %rd1689, %rd2638, 32;
and.b64 %rd1690, %rd296, 4294967295;
mul.lo.s64 %rd1691, %rd1690, 3528531795;
and.b64 %rd1692, %rd1691, 4294967295;
xor.b64 %rd1693, %rd1692, %rd1689;
xor.b64 %rd1694, %rd1693, 3144134277;
mul.lo.s64 %rd2641, %rd1694, 3449720151;
xor.b64 %rd2631, %rd1685, %rd1691;
// Per-arm constants read by the shared code at LBB80_35.
mov.u32 %r337, -1767562579;
mov.u32 %r336, -766435501;
mov.u32 %r335, 1401181199;
mov.u64 %rd2649, 4055616968;
mov.u64 %rd2648, 1684936478;
mov.u64 %rd2647, 534103459;
mov.u64 %rd2646, 387276957;
mov.u64 %rd2645, 3041712726;
mov.u64 %rd2644, 3986602516;
mov.u64 %rd2643, 2835769497;
mov.u64 %rd2642, 3668340011;
mov.u64 %rd2640, 2027808484;
mov.u64 %rd2639, 1993301258;
mov.u64 %rd2637, 842468239;
mov.u64 %rd2636, 2654435769;
mov.u64 %rd2634, 3528531795;
mov.u64 %rd2633, 1013904242;
mov.u64 %rd2632, 3449720151;
// LBB80_35: join point of the two seed arms for element 640.
// Section 1: straight-line mixing rounds using the arm-selected multipliers
//   (%rd2632 / %rd2634) and constants (%rd2633..%rd2649).
// Section 2: converts the resulting word to a value in [0,1), builds a mask
//   from it, and accumulates one squared term into the running sum.
// Section 3: computes the index for element 641 and runs the fall-through seed
//   arm for it.
LBB80_35:
shr.u64 %rd1721, %rd2641, 32;
shr.u64 %rd1722, %rd2631, 32;
mul.lo.s64 %rd1723, %rd1722, %rd2632;
and.b64 %rd1724, %rd1723, 4294967295;
xor.b64 %rd1725, %rd1724, %rd1721;
xor.b64 %rd1726, %rd1725, %rd2633;
mul.lo.s64 %rd1727, %rd1726, %rd2634;
shr.u64 %rd1728, %rd1727, 32;
shr.u64 %rd1729, %rd1723, 32;
and.b64 %rd1730, %rd2635, 4294967295;
xor.b64 %rd1731, %rd1730, %rd1729;
xor.b64 %rd1732, %rd1731, %rd2636;
mul.lo.s64 %rd1733, %rd1732, %rd2634;
and.b64 %rd1734, %rd1733, 4294967295;
xor.b64 %rd1735, %rd1734, %rd1728;
xor.b64 %rd1736, %rd1735, %rd2637;
mul.lo.s64 %rd1737, %rd1736, %rd2632;
shr.u64 %rd1738, %rd1737, 32;
shr.u64 %rd1739, %rd1733, 32;
and.b64 %rd1740, %rd2638, 4294967295;
xor.b64 %rd1741, %rd1740, %rd1739;
xor.b64 %rd1742, %rd1741, %rd2639;
mul.lo.s64 %rd1743, %rd1742, %rd2632;
and.b64 %rd1744, %rd1743, 4294967295;
xor.b64 %rd1745, %rd1744, %rd1738;
xor.b64 %rd1746, %rd1745, %rd2640;
mul.lo.s64 %rd1747, %rd1746, %rd2634;
shr.u64 %rd1748, %rd1747, 32;
shr.u64 %rd1749, %rd1743, 32;
and.b64 %rd1750, %rd2641, 4294967295;
xor.b64 %rd1751, %rd1750, %rd1749;
xor.b64 %rd1752, %rd1751, %rd2642;
mul.lo.s64 %rd1753, %rd1752, %rd2634;
and.b64 %rd1754, %rd1753, 4294967295;
xor.b64 %rd1755, %rd1754, %rd1748;
xor.b64 %rd1756, %rd1755, %rd2643;
mul.lo.s64 %rd1757, %rd1756, %rd2632;
shr.u64 %rd1758, %rd1757, 32;
shr.u64 %rd1759, %rd1753, 32;
and.b64 %rd1760, %rd1727, 4294967295;
xor.b64 %rd1761, %rd1760, %rd1759;
xor.b64 %rd1762, %rd1761, %rd2644;
mul.lo.s64 %rd1763, %rd1762, %rd2632;
and.b64 %rd1764, %rd1763, 4294967295;
xor.b64 %rd1765, %rd1764, %rd1758;
xor.b64 %rd1766, %rd1765, %rd2645;
mul.lo.s64 %rd1767, %rd1766, %rd2634;
shr.u64 %rd1768, %rd1767, 32;
shr.u64 %rd1769, %rd1763, 32;
and.b64 %rd1770, %rd1737, 4294967295;
xor.b64 %rd1771, %rd1770, %rd1769;
xor.b64 %rd1772, %rd1771, %rd2646;
mul.lo.s64 %rd1773, %rd1772, %rd2634;
and.b64 %rd1774, %rd1773, 4294967295;
xor.b64 %rd1775, %rd1774, %rd1768;
xor.b64 %rd1776, %rd1775, %rd2647;
mul.lo.s64 %rd1777, %rd1776, %rd2632;
shr.u64 %rd1778, %rd1777, 32;
shr.u64 %rd1779, %rd1773, 32;
and.b64 %rd1780, %rd1747, 4294967295;
xor.b64 %rd1781, %rd1780, %rd1779;
xor.b64 %rd1782, %rd1781, %rd2648;
mul.lo.s64 %rd1783, %rd1782, %rd2632;
and.b64 %rd1784, %rd1783, 4294967295;
xor.b64 %rd1785, %rd1784, %rd1778;
xor.b64 %rd1786, %rd1785, %rd2649;
mul.lo.s64 %rd1787, %rd1786, %rd2634;
// Final 32-bit word: high half of the last product (%r194) combined with a
// second word (%r195) mixed through %r335 / %r336 / %r337.
shr.u64 %rd1788, %rd1787, 32;
cvt.u32.u64 %r194, %rd1788;
shr.u64 %rd1789, %rd1783, 32;
xor.b64 %rd1790, %rd1789, %rd1757;
cvt.u32.u64 %r195, %rd1790;
xor.b32 %r196, %r335, %r195;
mul.lo.s32 %r197, %r196, %r336;
xor.b32 %r198, %r197, %r194;
xor.b32 %r199, %r198, %r337;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): value in [0, 1).
shr.u32 %r200, %r199, 9;
cvt.rn.f32.u32 %f175, %r200;
mul.rn.f32 %f176, %f175, 0f34000000;
cvt.rn.f16.f32 %h91, %f176;
// 0x2E66 is the f16 encoding of ~0.1; %p44 is set when the value is >= that.
mov.b16 %h92, 0x2E66;
setp.ge.f16 %p44, %h91, %h92;
// Element 640: byte offset 1280 for 2-byte data, 2560 for 4-byte data.
ld.global.nc.b16 %h93, [%rd45+1280];
ld.global.nc.f32 %f177, [%rd46+2560];
cvt.rn.f16.f32 %h94, %f177;
add.rn.f16 %h95, %h93, %h94;
// 0x3C72 is the f16 encoding of ~1.111; the scaled sum is kept only when %p44
// is set, otherwise replaced by 0.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) - confirm.
mov.b16 %h96, 0x3C72;
mul.rn.f16 %h97, %h95, %h96;
selp.b16 %h98, %h97, 0x0000, %p44;
cvt.f32.f16 %f178, %h98;
ld.global.nc.b16 %h99, [%rd47+1280];
cvt.f32.f16 %f179, %h99;
ld.global.nc.f32 %f180, [%rd48+2560];
// t = %f1 * [%rd48]; value = t * [%rd47] + ([%rd49] - %f2 * t) + masked term.
mul.rn.f32 %f181, %f1, %f180;
mul.rn.f32 %f182, %f181, %f179;
ld.global.nc.f32 %f183, [%rd49+2560];
mul.rn.f32 %f184, %f2, %f181;
sub.rn.f32 %f185, %f183, %f184;
add.rn.f32 %f186, %f182, %f185;
add.rn.f32 %f187, %f186, %f178;
// Running sum of squared differences from %f3: %f14 = %f13 + (value - %f3)^2.
sub.rn.f32 %f188, %f187, %f3;
mul.rn.f32 %f189, %f188, %f188;
add.rn.f32 %f14, %f13, %f189;
// Next element (641): %rd324 = %rd12 + (((%r3 | 641) | %r4) >> 2); the low two
// bits of (%r3 | 641) choose the seed arm.
or.b32 %r201, %r3, 641;
or.b32 %r202, %r201, %r4;
and.b32 %r203, %r201, 3;
shr.u32 %r204, %r202, 2;
setp.ne.s32 %p45, %r203, 1;
cvt.u64.u32 %rd1791, %r204;
add.s64 %rd324, %rd12, %rd1791;
@%p45 bra LBB80_37;
// Fall-through arm (low two bits == 1): seed steps and constants for element 641.
and.b64 %rd1831, %rd324, 4294967295;
mul.lo.s64 %rd2654, %rd1831, 3528531795;
// Carry from %rd12 + index (wrap detected by %rd324 < %rd12) goes into %rd2464.
setp.lt.u64 %p47, %rd324, %rd12;
selp.u64 %rd1832, 1, 0, %p47;
add.s64 %rd1833, %rd2464, %rd1832;
xor.b64 %rd1834, %rd1833, %rd2654;
shr.u64 %rd1835, %rd1834, 32;
mul.lo.s64 %rd2657, %rd1835, 3449720151;
shr.u64 %rd1836, %rd2657, 32;
and.b64 %rd1837, %rd1833, 4294967295;
mul.lo.s64 %rd1838, %rd1837, 3449720151;
and.b64 %rd1839, %rd1838, 4294967295;
xor.b64 %rd1840, %rd1839, %rd1836;
xor.b64 %rd1841, %rd1840, 2654435769;
mul.lo.s64 %rd2660, %rd1841, 3528531795;
xor.b64 %rd2650, %rd1838, %rd324;
mov.u32 %r339, -845247145;
mov.u32 %r338, -616729560;
mov.u64 %rd2667, 3041712726;
mov.u64 %rd2666, 1401181199;
mov.u64 %rd2665, 2835769497;
mov.u64 %rd2664, 1684936478;
mov.u64 %rd2663, 2027808484;
mov.u64 %rd2662, 387276957;
mov.u64 %rd2661, 842468239;
mov.u64 %rd2659, 3986602516;
mov.u64 %rd2658, 1013904242;
mov.u64 %rd2656, 3668340011;
mov.u64 %rd2655, 3144134277;
mov.u64 %rd2653, 3449720151;
mov.u64 %rd2652, 1993301258;
mov.u64 %rd2651, 3528531795;
bra.uni LBB80_38;
// LBB80_37: arm taken when %p45 is set (low two bits of (%r3 | 641) != 1), for
// element 641. Produces the same register set as the fall-through arm
// (%rd2650..%rd2667, %r338, %r339) for LBB80_38, with the multipliers
// 3528531795 / 3449720151 swapped and a different constant table.
LBB80_37:
// Carry from %rd12 + index (wrap detected by %rd324 < %rd12) goes into %rd2464.
setp.lt.u64 %p46, %rd324, %rd12;
selp.u64 %rd1806, 1, 0, %p46;
add.s64 %rd1807, %rd2464, %rd1806;
// First multiply/xor steps on the 32-bit halves.
and.b64 %rd1808, %rd1807, 4294967295;
mul.lo.s64 %rd2654, %rd1808, 3449720151;
xor.b64 %rd1809, %rd2654, %rd324;
shr.u64 %rd1810, %rd1809, 32;
mul.lo.s64 %rd2657, %rd1810, 3528531795;
shr.u64 %rd1811, %rd2657, 32;
and.b64 %rd1812, %rd324, 4294967295;
mul.lo.s64 %rd1813, %rd1812, 3528531795;
and.b64 %rd1814, %rd1813, 4294967295;
xor.b64 %rd1815, %rd1814, %rd1811;
xor.b64 %rd1816, %rd1815, 3144134277;
mul.lo.s64 %rd2660, %rd1816, 3449720151;
xor.b64 %rd2650, %rd1807, %rd1813;
// Per-arm constants read by the shared code at LBB80_38.
mov.u32 %r339, -766435501;
mov.u32 %r338, -239350328;
mov.u64 %rd2667, 1684936478;
mov.u64 %rd2666, 534103459;
mov.u64 %rd2665, 387276957;
mov.u64 %rd2664, 3041712726;
mov.u64 %rd2663, 3986602516;
mov.u64 %rd2662, 2835769497;
mov.u64 %rd2661, 3668340011;
mov.u64 %rd2659, 2027808484;
mov.u64 %rd2658, 1993301258;
mov.u64 %rd2656, 842468239;
mov.u64 %rd2655, 2654435769;
mov.u64 %rd2653, 3528531795;
mov.u64 %rd2652, 1013904242;
mov.u64 %rd2651, 3449720151;
// LBB80_38: join point of the two seed arms for element 641.
// Section 1: straight-line mixing rounds using the arm-selected multipliers
//   (%rd2651 / %rd2653) and constants (%rd2652..%rd2667).
// Section 2: converts the resulting word to a value in [0,1), builds a mask
//   from it, and accumulates one squared term into the running sum.
// Section 3: computes the index for element 768 and runs the fall-through seed
//   arm for it.
LBB80_38:
shr.u64 %rd1842, %rd2660, 32;
shr.u64 %rd1843, %rd2650, 32;
mul.lo.s64 %rd1844, %rd1843, %rd2651;
and.b64 %rd1845, %rd1844, 4294967295;
xor.b64 %rd1846, %rd1845, %rd1842;
xor.b64 %rd1847, %rd1846, %rd2652;
mul.lo.s64 %rd1848, %rd1847, %rd2653;
shr.u64 %rd1849, %rd1848, 32;
shr.u64 %rd1850, %rd1844, 32;
and.b64 %rd1851, %rd2654, 4294967295;
xor.b64 %rd1852, %rd1851, %rd1850;
xor.b64 %rd1853, %rd1852, %rd2655;
mul.lo.s64 %rd1854, %rd1853, %rd2653;
and.b64 %rd1855, %rd1854, 4294967295;
xor.b64 %rd1856, %rd1855, %rd1849;
xor.b64 %rd1857, %rd1856, %rd2656;
mul.lo.s64 %rd1858, %rd1857, %rd2651;
shr.u64 %rd1859, %rd1858, 32;
shr.u64 %rd1860, %rd1854, 32;
and.b64 %rd1861, %rd2657, 4294967295;
xor.b64 %rd1862, %rd1861, %rd1860;
xor.b64 %rd1863, %rd1862, %rd2658;
mul.lo.s64 %rd1864, %rd1863, %rd2651;
and.b64 %rd1865, %rd1864, 4294967295;
xor.b64 %rd1866, %rd1865, %rd1859;
xor.b64 %rd1867, %rd1866, %rd2659;
mul.lo.s64 %rd1868, %rd1867, %rd2653;
shr.u64 %rd1869, %rd1868, 32;
shr.u64 %rd1870, %rd1864, 32;
and.b64 %rd1871, %rd2660, 4294967295;
xor.b64 %rd1872, %rd1871, %rd1870;
xor.b64 %rd1873, %rd1872, %rd2661;
mul.lo.s64 %rd1874, %rd1873, %rd2653;
and.b64 %rd1875, %rd1874, 4294967295;
xor.b64 %rd1876, %rd1875, %rd1869;
xor.b64 %rd1877, %rd1876, %rd2662;
mul.lo.s64 %rd1878, %rd1877, %rd2651;
shr.u64 %rd1879, %rd1878, 32;
shr.u64 %rd1880, %rd1874, 32;
and.b64 %rd1881, %rd1848, 4294967295;
xor.b64 %rd1882, %rd1881, %rd1880;
xor.b64 %rd1883, %rd1882, %rd2663;
mul.lo.s64 %rd1884, %rd1883, %rd2651;
and.b64 %rd1885, %rd1884, 4294967295;
xor.b64 %rd1886, %rd1885, %rd1879;
xor.b64 %rd1887, %rd1886, %rd2664;
mul.lo.s64 %rd1888, %rd1887, %rd2653;
shr.u64 %rd1889, %rd1888, 32;
shr.u64 %rd1890, %rd1884, 32;
and.b64 %rd1891, %rd1858, 4294967295;
xor.b64 %rd1892, %rd1891, %rd1890;
xor.b64 %rd1893, %rd1892, %rd2665;
mul.lo.s64 %rd1894, %rd1893, %rd2653;
and.b64 %rd1895, %rd1894, 4294967295;
xor.b64 %rd1896, %rd1895, %rd1889;
xor.b64 %rd1897, %rd1896, %rd2666;
mul.lo.s64 %rd1898, %rd1897, %rd2651;
shr.u64 %rd1899, %rd1898, 32;
shr.u64 %rd1900, %rd1894, 32;
xor.b64 %rd1901, %rd1868, %rd1900;
xor.b64 %rd1902, %rd1901, %rd2667;
mul.lo.s64 %rd1903, %rd1902, %rd2651;
xor.b64 %rd1904, %rd1899, %rd1903;
// Final 32-bit word: low 32 bits of the last XOR, mixed with %r338 / %r339.
cvt.u32.u64 %r209, %rd1904;
xor.b32 %r210, %r338, %r209;
mul.lo.s32 %r211, %r210, %r339;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): value in [0, 1).
shr.u32 %r212, %r211, 9;
cvt.rn.f32.u32 %f190, %r212;
mul.rn.f32 %f191, %f190, 0f34000000;
cvt.rn.f16.f32 %h100, %f191;
// 0x2E66 is the f16 encoding of ~0.1; %p49 is set when the value is >= that.
mov.b16 %h101, 0x2E66;
setp.ge.f16 %p49, %h100, %h101;
// Element 641: byte offset 1282 for 2-byte data, 2564 for 4-byte data.
ld.global.nc.b16 %h102, [%rd45+1282];
ld.global.nc.f32 %f192, [%rd46+2564];
cvt.rn.f16.f32 %h103, %f192;
add.rn.f16 %h104, %h102, %h103;
// 0x3C72 is the f16 encoding of ~1.111; the scaled sum is kept only when %p49
// is set, otherwise replaced by 0.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) - confirm.
mov.b16 %h105, 0x3C72;
mul.rn.f16 %h106, %h104, %h105;
selp.b16 %h107, %h106, 0x0000, %p49;
cvt.f32.f16 %f193, %h107;
ld.global.nc.b16 %h108, [%rd47+1282];
cvt.f32.f16 %f194, %h108;
ld.global.nc.f32 %f195, [%rd48+2564];
// t = %f1 * [%rd48]; value = t * [%rd47] + ([%rd49] - %f2 * t) + masked term.
mul.rn.f32 %f196, %f1, %f195;
mul.rn.f32 %f197, %f196, %f194;
ld.global.nc.f32 %f198, [%rd49+2564];
mul.rn.f32 %f199, %f2, %f196;
sub.rn.f32 %f200, %f198, %f199;
add.rn.f32 %f201, %f197, %f200;
add.rn.f32 %f202, %f201, %f193;
// Running sum of squared differences from %f3: %f15 = %f14 + (value - %f3)^2.
sub.rn.f32 %f203, %f202, %f3;
mul.rn.f32 %f204, %f203, %f203;
add.rn.f32 %f15, %f14, %f204;
// Next element (768): %rd351 = %rd12 + ((%r73 | 768) >> 2).
or.b32 %r214, %r73, 768;
shr.u32 %r215, %r214, 2;
cvt.u64.u32 %rd1905, %r215;
add.s64 %rd351, %rd12, %rd1905;
// %p8 is defined earlier in the kernel; it selects which seed arm runs.
@%p8 bra LBB80_40;
// Fall-through arm (%p8 clear): seed steps and constant table for element 768.
and.b64 %rd1947, %rd351, 4294967295;
mul.lo.s64 %rd2672, %rd1947, 3528531795;
// Carry from %rd12 + index (wrap detected by %rd351 < %rd12) goes into %rd2464.
setp.lt.u64 %p51, %rd351, %rd12;
selp.u64 %rd1948, 1, 0, %p51;
add.s64 %rd1949, %rd2464, %rd1948;
xor.b64 %rd1950, %rd1949, %rd2672;
shr.u64 %rd1951, %rd1950, 32;
mul.lo.s64 %rd2675, %rd1951, 3449720151;
shr.u64 %rd1952, %rd2675, 32;
and.b64 %rd1953, %rd1949, 4294967295;
mul.lo.s64 %rd1954, %rd1953, 3449720151;
and.b64 %rd1955, %rd1954, 4294967295;
xor.b64 %rd1956, %rd1955, %rd1952;
xor.b64 %rd1957, %rd1956, 2654435769;
mul.lo.s64 %rd2678, %rd1957, 3528531795;
xor.b64 %rd2668, %rd1954, %rd351;
mov.u32 %r342, -1879881855;
mov.u32 %r341, -845247145;
mov.u32 %r340, 534103459;
mov.u64 %rd2686, 3678237736;
mov.u64 %rd2685, 3041712726;
mov.u64 %rd2684, 1401181199;
mov.u64 %rd2683, 2835769497;
mov.u64 %rd2682, 1684936478;
mov.u64 %rd2681, 2027808484;
mov.u64 %rd2680, 387276957;
mov.u64 %rd2679, 842468239;
mov.u64 %rd2677, 3986602516;
mov.u64 %rd2676, 1013904242;
mov.u64 %rd2674, 3668340011;
mov.u64 %rd2673, 3144134277;
mov.u64 %rd2671, 3449720151;
mov.u64 %rd2670, 1993301258;
mov.u64 %rd2669, 3528531795;
bra.uni LBB80_41;
// LBB80_40: arm taken when %p8 is set, for element 768. Produces the same
// register set as the fall-through arm (%rd2668..%rd2686, %r340..%r342) for
// LBB80_41, with the multipliers 3528531795 / 3449720151 swapped and a
// different constant table.
LBB80_40:
// Carry from %rd12 + index (wrap detected by %rd351 < %rd12) goes into %rd2464.
setp.lt.u64 %p50, %rd351, %rd12;
selp.u64 %rd1921, 1, 0, %p50;
add.s64 %rd1922, %rd2464, %rd1921;
// First multiply/xor steps on the 32-bit halves.
and.b64 %rd1923, %rd1922, 4294967295;
mul.lo.s64 %rd2672, %rd1923, 3449720151;
xor.b64 %rd1924, %rd2672, %rd351;
shr.u64 %rd1925, %rd1924, 32;
mul.lo.s64 %rd2675, %rd1925, 3528531795;
shr.u64 %rd1926, %rd2675, 32;
and.b64 %rd1927, %rd351, 4294967295;
mul.lo.s64 %rd1928, %rd1927, 3528531795;
and.b64 %rd1929, %rd1928, 4294967295;
xor.b64 %rd1930, %rd1929, %rd1926;
xor.b64 %rd1931, %rd1930, 3144134277;
mul.lo.s64 %rd2678, %rd1931, 3449720151;
xor.b64 %rd2668, %rd1922, %rd1928;
// Per-arm constants read by the shared code at LBB80_41.
mov.u32 %r342, -1767562579;
mov.u32 %r341, -766435501;
mov.u32 %r340, 1401181199;
mov.u64 %rd2686, 4055616968;
mov.u64 %rd2685, 1684936478;
mov.u64 %rd2684, 534103459;
mov.u64 %rd2683, 387276957;
mov.u64 %rd2682, 3041712726;
mov.u64 %rd2681, 3986602516;
mov.u64 %rd2680, 2835769497;
mov.u64 %rd2679, 3668340011;
mov.u64 %rd2677, 2027808484;
mov.u64 %rd2676, 1993301258;
mov.u64 %rd2674, 842468239;
mov.u64 %rd2673, 2654435769;
mov.u64 %rd2671, 3528531795;
mov.u64 %rd2670, 1013904242;
mov.u64 %rd2669, 3449720151;
// LBB80_41: join point of the two seed arms for element 768.
// Section 1: straight-line mixing rounds using the arm-selected multipliers
//   (%rd2669 / %rd2671) and constants (%rd2670..%rd2686).
// Section 2: converts the resulting word to a value in [0,1), builds a mask
//   from it, and accumulates one squared term into the running sum.
// Section 3: computes the index for element 769 and runs the fall-through seed
//   arm for it.
LBB80_41:
shr.u64 %rd1958, %rd2678, 32;
shr.u64 %rd1959, %rd2668, 32;
mul.lo.s64 %rd1960, %rd1959, %rd2669;
and.b64 %rd1961, %rd1960, 4294967295;
xor.b64 %rd1962, %rd1961, %rd1958;
xor.b64 %rd1963, %rd1962, %rd2670;
mul.lo.s64 %rd1964, %rd1963, %rd2671;
shr.u64 %rd1965, %rd1964, 32;
shr.u64 %rd1966, %rd1960, 32;
and.b64 %rd1967, %rd2672, 4294967295;
xor.b64 %rd1968, %rd1967, %rd1966;
xor.b64 %rd1969, %rd1968, %rd2673;
mul.lo.s64 %rd1970, %rd1969, %rd2671;
and.b64 %rd1971, %rd1970, 4294967295;
xor.b64 %rd1972, %rd1971, %rd1965;
xor.b64 %rd1973, %rd1972, %rd2674;
mul.lo.s64 %rd1974, %rd1973, %rd2669;
shr.u64 %rd1975, %rd1974, 32;
shr.u64 %rd1976, %rd1970, 32;
and.b64 %rd1977, %rd2675, 4294967295;
xor.b64 %rd1978, %rd1977, %rd1976;
xor.b64 %rd1979, %rd1978, %rd2676;
mul.lo.s64 %rd1980, %rd1979, %rd2669;
and.b64 %rd1981, %rd1980, 4294967295;
xor.b64 %rd1982, %rd1981, %rd1975;
xor.b64 %rd1983, %rd1982, %rd2677;
mul.lo.s64 %rd1984, %rd1983, %rd2671;
shr.u64 %rd1985, %rd1984, 32;
shr.u64 %rd1986, %rd1980, 32;
and.b64 %rd1987, %rd2678, 4294967295;
xor.b64 %rd1988, %rd1987, %rd1986;
xor.b64 %rd1989, %rd1988, %rd2679;
mul.lo.s64 %rd1990, %rd1989, %rd2671;
and.b64 %rd1991, %rd1990, 4294967295;
xor.b64 %rd1992, %rd1991, %rd1985;
xor.b64 %rd1993, %rd1992, %rd2680;
mul.lo.s64 %rd1994, %rd1993, %rd2669;
shr.u64 %rd1995, %rd1994, 32;
shr.u64 %rd1996, %rd1990, 32;
and.b64 %rd1997, %rd1964, 4294967295;
xor.b64 %rd1998, %rd1997, %rd1996;
xor.b64 %rd1999, %rd1998, %rd2681;
mul.lo.s64 %rd2000, %rd1999, %rd2669;
and.b64 %rd2001, %rd2000, 4294967295;
xor.b64 %rd2002, %rd2001, %rd1995;
xor.b64 %rd2003, %rd2002, %rd2682;
mul.lo.s64 %rd2004, %rd2003, %rd2671;
shr.u64 %rd2005, %rd2004, 32;
shr.u64 %rd2006, %rd2000, 32;
and.b64 %rd2007, %rd1974, 4294967295;
xor.b64 %rd2008, %rd2007, %rd2006;
xor.b64 %rd2009, %rd2008, %rd2683;
mul.lo.s64 %rd2010, %rd2009, %rd2671;
and.b64 %rd2011, %rd2010, 4294967295;
xor.b64 %rd2012, %rd2011, %rd2005;
xor.b64 %rd2013, %rd2012, %rd2684;
mul.lo.s64 %rd2014, %rd2013, %rd2669;
shr.u64 %rd2015, %rd2014, 32;
shr.u64 %rd2016, %rd2010, 32;
and.b64 %rd2017, %rd1984, 4294967295;
xor.b64 %rd2018, %rd2017, %rd2016;
xor.b64 %rd2019, %rd2018, %rd2685;
mul.lo.s64 %rd2020, %rd2019, %rd2669;
and.b64 %rd2021, %rd2020, 4294967295;
xor.b64 %rd2022, %rd2021, %rd2015;
xor.b64 %rd2023, %rd2022, %rd2686;
mul.lo.s64 %rd2024, %rd2023, %rd2671;
// Final 32-bit word: high half of the last product (%r222) combined with a
// second word (%r223) mixed through %r340 / %r341 / %r342.
shr.u64 %rd2025, %rd2024, 32;
cvt.u32.u64 %r222, %rd2025;
shr.u64 %rd2026, %rd2020, 32;
xor.b64 %rd2027, %rd2026, %rd1994;
cvt.u32.u64 %r223, %rd2027;
xor.b32 %r224, %r340, %r223;
mul.lo.s32 %r225, %r224, %r341;
xor.b32 %r226, %r225, %r222;
xor.b32 %r227, %r226, %r342;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): value in [0, 1).
shr.u32 %r228, %r227, 9;
cvt.rn.f32.u32 %f205, %r228;
mul.rn.f32 %f206, %f205, 0f34000000;
cvt.rn.f16.f32 %h109, %f206;
// 0x2E66 is the f16 encoding of ~0.1; %p52 is set when the value is >= that.
mov.b16 %h110, 0x2E66;
setp.ge.f16 %p52, %h109, %h110;
// Element 768: byte offset 1536 for 2-byte data, 3072 for 4-byte data.
ld.global.nc.b16 %h111, [%rd45+1536];
ld.global.nc.f32 %f207, [%rd46+3072];
cvt.rn.f16.f32 %h112, %f207;
add.rn.f16 %h113, %h111, %h112;
// 0x3C72 is the f16 encoding of ~1.111; the scaled sum is kept only when %p52
// is set, otherwise replaced by 0.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) - confirm.
mov.b16 %h114, 0x3C72;
mul.rn.f16 %h115, %h113, %h114;
selp.b16 %h116, %h115, 0x0000, %p52;
cvt.f32.f16 %f208, %h116;
ld.global.nc.b16 %h117, [%rd47+1536];
cvt.f32.f16 %f209, %h117;
ld.global.nc.f32 %f210, [%rd48+3072];
// t = %f1 * [%rd48]; value = t * [%rd47] + ([%rd49] - %f2 * t) + masked term.
mul.rn.f32 %f211, %f1, %f210;
mul.rn.f32 %f212, %f211, %f209;
ld.global.nc.f32 %f213, [%rd49+3072];
mul.rn.f32 %f214, %f2, %f211;
sub.rn.f32 %f215, %f213, %f214;
add.rn.f32 %f216, %f212, %f215;
add.rn.f32 %f217, %f216, %f208;
// Running sum of squared differences from %f3: %f16 = %f15 + (value - %f3)^2.
sub.rn.f32 %f218, %f217, %f3;
mul.rn.f32 %f219, %f218, %f218;
add.rn.f32 %f16, %f15, %f219;
// Next element (769): %rd379 = %rd12 + (((%r3 | 769) | %r4) >> 2); the low two
// bits of (%r3 | 769) choose the seed arm.
or.b32 %r229, %r3, 769;
or.b32 %r230, %r229, %r4;
and.b32 %r231, %r229, 3;
shr.u32 %r232, %r230, 2;
setp.ne.s32 %p53, %r231, 1;
cvt.u64.u32 %rd2028, %r232;
add.s64 %rd379, %rd12, %rd2028;
@%p53 bra LBB80_43;
// Fall-through arm (low two bits == 1): seed steps and constants for element 769.
and.b64 %rd2068, %rd379, 4294967295;
mul.lo.s64 %rd2691, %rd2068, 3528531795;
// Carry from %rd12 + index (wrap detected by %rd379 < %rd12) goes into %rd2464.
setp.lt.u64 %p55, %rd379, %rd12;
selp.u64 %rd2069, 1, 0, %p55;
add.s64 %rd2070, %rd2464, %rd2069;
xor.b64 %rd2071, %rd2070, %rd2691;
shr.u64 %rd2072, %rd2071, 32;
mul.lo.s64 %rd2694, %rd2072, 3449720151;
shr.u64 %rd2073, %rd2694, 32;
and.b64 %rd2074, %rd2070, 4294967295;
mul.lo.s64 %rd2075, %rd2074, 3449720151;
and.b64 %rd2076, %rd2075, 4294967295;
xor.b64 %rd2077, %rd2076, %rd2073;
xor.b64 %rd2078, %rd2077, 2654435769;
mul.lo.s64 %rd2697, %rd2078, 3528531795;
xor.b64 %rd2687, %rd2075, %rd379;
mov.u32 %r344, -845247145;
mov.u32 %r343, -616729560;
mov.u64 %rd2704, 3041712726;
mov.u64 %rd2703, 1401181199;
mov.u64 %rd2702, 2835769497;
mov.u64 %rd2701, 1684936478;
mov.u64 %rd2700, 2027808484;
mov.u64 %rd2699, 387276957;
mov.u64 %rd2698, 842468239;
mov.u64 %rd2696, 3986602516;
mov.u64 %rd2695, 1013904242;
mov.u64 %rd2693, 3668340011;
mov.u64 %rd2692, 3144134277;
mov.u64 %rd2690, 3449720151;
mov.u64 %rd2689, 1993301258;
mov.u64 %rd2688, 3528531795;
bra.uni LBB80_44;
// LBB80_43: arm taken when %p53 is set (low two bits of (%r3 | 769) != 1), for
// element 769. Produces the same register set as the fall-through arm
// (%rd2687..%rd2704, %r343, %r344) for LBB80_44, with the multipliers
// 3528531795 / 3449720151 swapped and a different constant table.
LBB80_43:
// Carry from %rd12 + index (wrap detected by %rd379 < %rd12) goes into %rd2464.
setp.lt.u64 %p54, %rd379, %rd12;
selp.u64 %rd2043, 1, 0, %p54;
add.s64 %rd2044, %rd2464, %rd2043;
// First multiply/xor steps on the 32-bit halves.
and.b64 %rd2045, %rd2044, 4294967295;
mul.lo.s64 %rd2691, %rd2045, 3449720151;
xor.b64 %rd2046, %rd2691, %rd379;
shr.u64 %rd2047, %rd2046, 32;
mul.lo.s64 %rd2694, %rd2047, 3528531795;
shr.u64 %rd2048, %rd2694, 32;
and.b64 %rd2049, %rd379, 4294967295;
mul.lo.s64 %rd2050, %rd2049, 3528531795;
and.b64 %rd2051, %rd2050, 4294967295;
xor.b64 %rd2052, %rd2051, %rd2048;
xor.b64 %rd2053, %rd2052, 3144134277;
mul.lo.s64 %rd2697, %rd2053, 3449720151;
xor.b64 %rd2687, %rd2044, %rd2050;
// Per-arm constants read by the shared code at LBB80_44.
mov.u32 %r344, -766435501;
mov.u32 %r343, -239350328;
mov.u64 %rd2704, 1684936478;
mov.u64 %rd2703, 534103459;
mov.u64 %rd2702, 387276957;
mov.u64 %rd2701, 3041712726;
mov.u64 %rd2700, 3986602516;
mov.u64 %rd2699, 2835769497;
mov.u64 %rd2698, 3668340011;
mov.u64 %rd2696, 2027808484;
mov.u64 %rd2695, 1993301258;
mov.u64 %rd2693, 842468239;
mov.u64 %rd2692, 2654435769;
mov.u64 %rd2690, 3528531795;
mov.u64 %rd2689, 1013904242;
mov.u64 %rd2688, 3449720151;
// LBB80_44: join point of the two seed arms for element 769.
// Section 1: straight-line mixing rounds using the arm-selected multipliers
//   (%rd2688 / %rd2690) and constants (%rd2689..%rd2704).
// Section 2: converts the resulting word to a value in [0,1), builds a mask
//   from it, and accumulates one squared term into the running sum.
// Section 3: computes the index for element 896 and runs the fall-through seed
//   arm for it.
LBB80_44:
shr.u64 %rd2079, %rd2697, 32;
shr.u64 %rd2080, %rd2687, 32;
mul.lo.s64 %rd2081, %rd2080, %rd2688;
and.b64 %rd2082, %rd2081, 4294967295;
xor.b64 %rd2083, %rd2082, %rd2079;
xor.b64 %rd2084, %rd2083, %rd2689;
mul.lo.s64 %rd2085, %rd2084, %rd2690;
shr.u64 %rd2086, %rd2085, 32;
shr.u64 %rd2087, %rd2081, 32;
and.b64 %rd2088, %rd2691, 4294967295;
xor.b64 %rd2089, %rd2088, %rd2087;
xor.b64 %rd2090, %rd2089, %rd2692;
mul.lo.s64 %rd2091, %rd2090, %rd2690;
and.b64 %rd2092, %rd2091, 4294967295;
xor.b64 %rd2093, %rd2092, %rd2086;
xor.b64 %rd2094, %rd2093, %rd2693;
mul.lo.s64 %rd2095, %rd2094, %rd2688;
shr.u64 %rd2096, %rd2095, 32;
shr.u64 %rd2097, %rd2091, 32;
and.b64 %rd2098, %rd2694, 4294967295;
xor.b64 %rd2099, %rd2098, %rd2097;
xor.b64 %rd2100, %rd2099, %rd2695;
mul.lo.s64 %rd2101, %rd2100, %rd2688;
and.b64 %rd2102, %rd2101, 4294967295;
xor.b64 %rd2103, %rd2102, %rd2096;
xor.b64 %rd2104, %rd2103, %rd2696;
mul.lo.s64 %rd2105, %rd2104, %rd2690;
shr.u64 %rd2106, %rd2105, 32;
shr.u64 %rd2107, %rd2101, 32;
and.b64 %rd2108, %rd2697, 4294967295;
xor.b64 %rd2109, %rd2108, %rd2107;
xor.b64 %rd2110, %rd2109, %rd2698;
mul.lo.s64 %rd2111, %rd2110, %rd2690;
and.b64 %rd2112, %rd2111, 4294967295;
xor.b64 %rd2113, %rd2112, %rd2106;
xor.b64 %rd2114, %rd2113, %rd2699;
mul.lo.s64 %rd2115, %rd2114, %rd2688;
shr.u64 %rd2116, %rd2115, 32;
shr.u64 %rd2117, %rd2111, 32;
and.b64 %rd2118, %rd2085, 4294967295;
xor.b64 %rd2119, %rd2118, %rd2117;
xor.b64 %rd2120, %rd2119, %rd2700;
mul.lo.s64 %rd2121, %rd2120, %rd2688;
and.b64 %rd2122, %rd2121, 4294967295;
xor.b64 %rd2123, %rd2122, %rd2116;
xor.b64 %rd2124, %rd2123, %rd2701;
mul.lo.s64 %rd2125, %rd2124, %rd2690;
shr.u64 %rd2126, %rd2125, 32;
shr.u64 %rd2127, %rd2121, 32;
and.b64 %rd2128, %rd2095, 4294967295;
xor.b64 %rd2129, %rd2128, %rd2127;
xor.b64 %rd2130, %rd2129, %rd2702;
mul.lo.s64 %rd2131, %rd2130, %rd2690;
and.b64 %rd2132, %rd2131, 4294967295;
xor.b64 %rd2133, %rd2132, %rd2126;
xor.b64 %rd2134, %rd2133, %rd2703;
mul.lo.s64 %rd2135, %rd2134, %rd2688;
shr.u64 %rd2136, %rd2135, 32;
shr.u64 %rd2137, %rd2131, 32;
xor.b64 %rd2138, %rd2105, %rd2137;
xor.b64 %rd2139, %rd2138, %rd2704;
mul.lo.s64 %rd2140, %rd2139, %rd2688;
xor.b64 %rd2141, %rd2136, %rd2140;
// Final 32-bit word: low 32 bits of the last XOR, mixed with %r343 / %r344.
cvt.u32.u64 %r237, %rd2141;
xor.b32 %r238, %r343, %r237;
mul.lo.s32 %r239, %r238, %r344;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): value in [0, 1).
shr.u32 %r240, %r239, 9;
cvt.rn.f32.u32 %f220, %r240;
mul.rn.f32 %f221, %f220, 0f34000000;
cvt.rn.f16.f32 %h118, %f221;
// 0x2E66 is the f16 encoding of ~0.1; %p57 is set when the value is >= that.
mov.b16 %h119, 0x2E66;
setp.ge.f16 %p57, %h118, %h119;
// Element 769: byte offset 1538 for 2-byte data, 3076 for 4-byte data.
ld.global.nc.b16 %h120, [%rd45+1538];
ld.global.nc.f32 %f222, [%rd46+3076];
cvt.rn.f16.f32 %h121, %f222;
add.rn.f16 %h122, %h120, %h121;
// 0x3C72 is the f16 encoding of ~1.111; the scaled sum is kept only when %p57
// is set, otherwise replaced by 0.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) - confirm.
mov.b16 %h123, 0x3C72;
mul.rn.f16 %h124, %h122, %h123;
selp.b16 %h125, %h124, 0x0000, %p57;
cvt.f32.f16 %f223, %h125;
ld.global.nc.b16 %h126, [%rd47+1538];
cvt.f32.f16 %f224, %h126;
ld.global.nc.f32 %f225, [%rd48+3076];
// t = %f1 * [%rd48]; value = t * [%rd47] + ([%rd49] - %f2 * t) + masked term.
mul.rn.f32 %f226, %f1, %f225;
mul.rn.f32 %f227, %f226, %f224;
ld.global.nc.f32 %f228, [%rd49+3076];
mul.rn.f32 %f229, %f2, %f226;
sub.rn.f32 %f230, %f228, %f229;
add.rn.f32 %f231, %f227, %f230;
add.rn.f32 %f232, %f231, %f223;
// Running sum of squared differences from %f3: %f17 = %f16 + (value - %f3)^2.
sub.rn.f32 %f233, %f232, %f3;
mul.rn.f32 %f234, %f233, %f233;
add.rn.f32 %f17, %f16, %f234;
// Next element (896): %rd406 = %rd12 + ((%r73 | 896) >> 2).
or.b32 %r242, %r73, 896;
shr.u32 %r243, %r242, 2;
cvt.u64.u32 %rd2142, %r243;
add.s64 %rd406, %rd12, %rd2142;
// %p8 is defined earlier in the kernel; it selects which seed arm runs.
@%p8 bra LBB80_46;
// Fall-through arm (%p8 clear): seed steps and constant table for element 896.
// Three of the constant loads are emitted ahead of the arithmetic here.
mov.u32 %r347, -1879881855;
mov.u32 %r345, 534103459;
mov.u64 %rd2723, 3678237736;
and.b64 %rd2184, %rd406, 4294967295;
mul.lo.s64 %rd2709, %rd2184, 3528531795;
// Carry from %rd12 + index (wrap detected by %rd406 < %rd12) goes into %rd2464.
setp.lt.u64 %p59, %rd406, %rd12;
selp.u64 %rd2185, 1, 0, %p59;
add.s64 %rd2186, %rd2464, %rd2185;
xor.b64 %rd2187, %rd2186, %rd2709;
shr.u64 %rd2188, %rd2187, 32;
mul.lo.s64 %rd2712, %rd2188, 3449720151;
shr.u64 %rd2189, %rd2712, 32;
and.b64 %rd2190, %rd2186, 4294967295;
mul.lo.s64 %rd2191, %rd2190, 3449720151;
and.b64 %rd2192, %rd2191, 4294967295;
xor.b64 %rd2193, %rd2192, %rd2189;
xor.b64 %rd2194, %rd2193, 2654435769;
mul.lo.s64 %rd2715, %rd2194, 3528531795;
xor.b64 %rd2705, %rd2191, %rd406;
mov.u32 %r346, -845247145;
mov.u64 %rd2722, 3041712726;
mov.u64 %rd2721, 1401181199;
mov.u64 %rd2720, 2835769497;
mov.u64 %rd2719, 1684936478;
mov.u64 %rd2718, 2027808484;
mov.u64 %rd2717, 387276957;
mov.u64 %rd2716, 842468239;
mov.u64 %rd2714, 3986602516;
mov.u64 %rd2713, 1013904242;
mov.u64 %rd2711, 3668340011;
mov.u64 %rd2710, 3144134277;
mov.u64 %rd2708, 3449720151;
mov.u64 %rd2707, 1993301258;
mov.u64 %rd2706, 3528531795;
bra.uni LBB80_47;
// LBB80_46: arm taken when %p8 is set, for element 896. Produces the same
// register set as the fall-through arm (%rd2705..%rd2723, %r345..%r347) for
// LBB80_47, with the multipliers 3528531795 / 3449720151 swapped and a
// different constant table.
LBB80_46:
// Carry from %rd12 + index (wrap detected by %rd406 < %rd12) goes into %rd2464.
setp.lt.u64 %p58, %rd406, %rd12;
selp.u64 %rd2158, 1, 0, %p58;
add.s64 %rd2159, %rd2464, %rd2158;
// First multiply/xor steps on the 32-bit halves.
and.b64 %rd2160, %rd2159, 4294967295;
mul.lo.s64 %rd2709, %rd2160, 3449720151;
xor.b64 %rd2161, %rd2709, %rd406;
shr.u64 %rd2162, %rd2161, 32;
mul.lo.s64 %rd2712, %rd2162, 3528531795;
shr.u64 %rd2163, %rd2712, 32;
and.b64 %rd2164, %rd406, 4294967295;
mul.lo.s64 %rd2165, %rd2164, 3528531795;
and.b64 %rd2166, %rd2165, 4294967295;
xor.b64 %rd2167, %rd2166, %rd2163;
xor.b64 %rd2168, %rd2167, 3144134277;
mul.lo.s64 %rd2715, %rd2168, 3449720151;
xor.b64 %rd2705, %rd2159, %rd2165;
// Per-arm constants read by the shared code at LBB80_47.
mov.u32 %r347, -1767562579;
mov.u32 %r346, -766435501;
mov.u32 %r345, 1401181199;
mov.u64 %rd2723, 4055616968;
mov.u64 %rd2722, 1684936478;
mov.u64 %rd2721, 534103459;
mov.u64 %rd2720, 387276957;
mov.u64 %rd2719, 3041712726;
mov.u64 %rd2718, 3986602516;
mov.u64 %rd2717, 2835769497;
mov.u64 %rd2716, 3668340011;
mov.u64 %rd2714, 2027808484;
mov.u64 %rd2713, 1993301258;
mov.u64 %rd2711, 842468239;
mov.u64 %rd2710, 2654435769;
mov.u64 %rd2708, 3528531795;
mov.u64 %rd2707, 1013904242;
mov.u64 %rd2706, 3449720151;
// LBB80_47: join point of the two seed arms for element 896.
// Section 1: straight-line mixing rounds using the arm-selected multipliers
//   (%rd2706 / %rd2708) and constants (%rd2707..%rd2723).
// Section 2: converts the resulting word to a value in [0,1), builds a mask
//   from it, and accumulates one squared term into the running sum.
// Section 3: computes the index for element 897 and runs the fall-through seed
//   arm for it.
LBB80_47:
shr.u64 %rd2195, %rd2715, 32;
shr.u64 %rd2196, %rd2705, 32;
mul.lo.s64 %rd2197, %rd2196, %rd2706;
and.b64 %rd2198, %rd2197, 4294967295;
xor.b64 %rd2199, %rd2198, %rd2195;
xor.b64 %rd2200, %rd2199, %rd2707;
mul.lo.s64 %rd2201, %rd2200, %rd2708;
shr.u64 %rd2202, %rd2201, 32;
shr.u64 %rd2203, %rd2197, 32;
and.b64 %rd2204, %rd2709, 4294967295;
xor.b64 %rd2205, %rd2204, %rd2203;
xor.b64 %rd2206, %rd2205, %rd2710;
mul.lo.s64 %rd2207, %rd2206, %rd2708;
and.b64 %rd2208, %rd2207, 4294967295;
xor.b64 %rd2209, %rd2208, %rd2202;
xor.b64 %rd2210, %rd2209, %rd2711;
mul.lo.s64 %rd2211, %rd2210, %rd2706;
shr.u64 %rd2212, %rd2211, 32;
shr.u64 %rd2213, %rd2207, 32;
and.b64 %rd2214, %rd2712, 4294967295;
xor.b64 %rd2215, %rd2214, %rd2213;
xor.b64 %rd2216, %rd2215, %rd2713;
mul.lo.s64 %rd2217, %rd2216, %rd2706;
and.b64 %rd2218, %rd2217, 4294967295;
xor.b64 %rd2219, %rd2218, %rd2212;
xor.b64 %rd2220, %rd2219, %rd2714;
mul.lo.s64 %rd2221, %rd2220, %rd2708;
shr.u64 %rd2222, %rd2221, 32;
shr.u64 %rd2223, %rd2217, 32;
and.b64 %rd2224, %rd2715, 4294967295;
xor.b64 %rd2225, %rd2224, %rd2223;
xor.b64 %rd2226, %rd2225, %rd2716;
mul.lo.s64 %rd2227, %rd2226, %rd2708;
and.b64 %rd2228, %rd2227, 4294967295;
xor.b64 %rd2229, %rd2228, %rd2222;
xor.b64 %rd2230, %rd2229, %rd2717;
mul.lo.s64 %rd2231, %rd2230, %rd2706;
shr.u64 %rd2232, %rd2231, 32;
shr.u64 %rd2233, %rd2227, 32;
and.b64 %rd2234, %rd2201, 4294967295;
xor.b64 %rd2235, %rd2234, %rd2233;
xor.b64 %rd2236, %rd2235, %rd2718;
mul.lo.s64 %rd2237, %rd2236, %rd2706;
and.b64 %rd2238, %rd2237, 4294967295;
xor.b64 %rd2239, %rd2238, %rd2232;
xor.b64 %rd2240, %rd2239, %rd2719;
mul.lo.s64 %rd2241, %rd2240, %rd2708;
shr.u64 %rd2242, %rd2241, 32;
shr.u64 %rd2243, %rd2237, 32;
and.b64 %rd2244, %rd2211, 4294967295;
xor.b64 %rd2245, %rd2244, %rd2243;
xor.b64 %rd2246, %rd2245, %rd2720;
mul.lo.s64 %rd2247, %rd2246, %rd2708;
and.b64 %rd2248, %rd2247, 4294967295;
xor.b64 %rd2249, %rd2248, %rd2242;
xor.b64 %rd2250, %rd2249, %rd2721;
mul.lo.s64 %rd2251, %rd2250, %rd2706;
shr.u64 %rd2252, %rd2251, 32;
shr.u64 %rd2253, %rd2247, 32;
and.b64 %rd2254, %rd2221, 4294967295;
xor.b64 %rd2255, %rd2254, %rd2253;
xor.b64 %rd2256, %rd2255, %rd2722;
mul.lo.s64 %rd2257, %rd2256, %rd2706;
and.b64 %rd2258, %rd2257, 4294967295;
xor.b64 %rd2259, %rd2258, %rd2252;
xor.b64 %rd2260, %rd2259, %rd2723;
mul.lo.s64 %rd2261, %rd2260, %rd2708;
// Final 32-bit word: high half of the last product (%r250) combined with a
// second word (%r251) mixed through %r345 / %r346 / %r347.
shr.u64 %rd2262, %rd2261, 32;
cvt.u32.u64 %r250, %rd2262;
shr.u64 %rd2263, %rd2257, 32;
xor.b64 %rd2264, %rd2263, %rd2231;
cvt.u32.u64 %r251, %rd2264;
xor.b32 %r252, %r345, %r251;
mul.lo.s32 %r253, %r252, %r346;
xor.b32 %r254, %r253, %r250;
xor.b32 %r255, %r254, %r347;
// Keep the top 23 bits and scale by 0f34000000 (= 2^-23): value in [0, 1).
shr.u32 %r256, %r255, 9;
cvt.rn.f32.u32 %f235, %r256;
mul.rn.f32 %f236, %f235, 0f34000000;
cvt.rn.f16.f32 %h127, %f236;
// 0x2E66 is the f16 encoding of ~0.1; %p60 is set when the value is >= that.
mov.b16 %h128, 0x2E66;
setp.ge.f16 %p60, %h127, %h128;
// Element 896: byte offset 1792 for 2-byte data, 3584 for 4-byte data.
ld.global.nc.b16 %h129, [%rd45+1792];
ld.global.nc.f32 %f237, [%rd46+3584];
cvt.rn.f16.f32 %h130, %f237;
add.rn.f16 %h131, %h129, %h130;
// 0x3C72 is the f16 encoding of ~1.111; the scaled sum is kept only when %p60
// is set, otherwise replaced by 0.
// NOTE(review): looks like dropout (rate ~0.1, survivors scaled by ~1/0.9) - confirm.
mov.b16 %h132, 0x3C72;
mul.rn.f16 %h133, %h131, %h132;
selp.b16 %h134, %h133, 0x0000, %p60;
cvt.f32.f16 %f238, %h134;
ld.global.nc.b16 %h135, [%rd47+1792];
cvt.f32.f16 %f239, %h135;
ld.global.nc.f32 %f240, [%rd48+3584];
// t = %f1 * [%rd48]; value = t * [%rd47] + ([%rd49] - %f2 * t) + masked term.
mul.rn.f32 %f241, %f1, %f240;
mul.rn.f32 %f242, %f241, %f239;
ld.global.nc.f32 %f243, [%rd49+3584];
mul.rn.f32 %f244, %f2, %f241;
sub.rn.f32 %f245, %f243, %f244;
add.rn.f32 %f246, %f242, %f245;
add.rn.f32 %f247, %f246, %f238;
// Running sum of squared differences from %f3: %f18 = %f17 + (value - %f3)^2.
sub.rn.f32 %f248, %f247, %f3;
mul.rn.f32 %f249, %f248, %f248;
add.rn.f32 %f18, %f17, %f249;
// Next element (897): %rd434 = %rd12 + (((%r3 | 897) | %r4) >> 2); the low two
// bits of (%r3 | 897) choose the seed arm.
or.b32 %r257, %r3, 897;
or.b32 %r258, %r257, %r4;
and.b32 %r259, %r257, 3;
shr.u32 %r260, %r258, 2;
setp.ne.s32 %p61, %r259, 1;
cvt.u64.u32 %rd2265, %r260;
add.s64 %rd434, %rd12, %rd2265;
@%p61 bra LBB80_49;
// Fall-through arm (low two bits == 1): seed steps and constants for element 897.
// Four of the constant loads are emitted ahead of the arithmetic here.
mov.u32 %r349, -845247145;
mov.u64 %rd2740, 1401181199;
mov.u64 %rd2729, 3144134277;
mov.u32 %r348, -616729560;
and.b64 %rd2305, %rd434, 4294967295;
mul.lo.s64 %rd2728, %rd2305, 3528531795;
// Carry from %rd12 + index (wrap detected by %rd434 < %rd12) goes into %rd2464.
setp.lt.u64 %p63, %rd434, %rd12;
selp.u64 %rd2306, 1, 0, %p63;
add.s64 %rd2307, %rd2464, %rd2306;
xor.b64 %rd2308, %rd2307, %rd2728;
shr.u64 %rd2309, %rd2308, 32;
mul.lo.s64 %rd2731, %rd2309, 3449720151;
shr.u64 %rd2310, %rd2731, 32;
and.b64 %rd2311, %rd2307, 4294967295;
mul.lo.s64 %rd2312, %rd2311, 3449720151;
and.b64 %rd2313, %rd2312, 4294967295;
xor.b64 %rd2314, %rd2313, %rd2310;
xor.b64 %rd2315, %rd2314, 2654435769;
mul.lo.s64 %rd2734, %rd2315, 3528531795;
xor.b64 %rd2724, %rd2312, %rd434;
mov.u64 %rd2741, 3041712726;
mov.u64 %rd2739, 2835769497;
mov.u64 %rd2738, 1684936478;
mov.u64 %rd2737, 2027808484;
mov.u64 %rd2736, 387276957;
mov.u64 %rd2735, 842468239;
mov.u64 %rd2733, 3986602516;
mov.u64 %rd2732, 1013904242;
mov.u64 %rd2730, 3668340011;
mov.u64 %rd2727, 3449720151;
mov.u64 %rd2726, 1993301258;
mov.u64 %rd2725, 3528531795;
bra.uni LBB80_50;
LBB80_49:
setp.lt.u64 %p62, %rd434, %rd12;
selp.u64 %rd2280, 1, 0, %p62;
add.s64 %rd2281, %rd2464, %rd2280;
and.b64 %rd2282, %rd2281, 4294967295;
mul.lo.s64 %rd2728, %rd2282, 3449720151;
xor.b64 %rd2283, %rd2728, %rd434;
shr.u64 %rd2284, %rd2283, 32;
mul.lo.s64 %rd2731, %rd2284, 3528531795;
shr.u64 %rd2285, %rd2731, 32;
and.b64 %rd2286, %rd434, 4294967295;
mul.lo.s64 %rd2287, %rd2286, 3528531795;
and.b64 %rd2288, %rd2287, 4294967295;
xor.b64 %rd2289, %rd2288, %rd2285;
xor.b64 %rd2290, %rd2289, 3144134277;
mul.lo.s64 %rd2734, %rd2290, 3449720151;
xor.b64 %rd2724, %rd2281, %rd2287;
mov.u32 %r349, -766435501;
mov.u32 %r348, -239350328;
mov.u64 %rd2741, 1684936478;
mov.u64 %rd2740, 534103459;
mov.u64 %rd2739, 387276957;
mov.u64 %rd2738, 3041712726;
mov.u64 %rd2737, 3986602516;
mov.u64 %rd2736, 2835769497;
mov.u64 %rd2735, 3668340011;
mov.u64 %rd2733, 2027808484;
mov.u64 %rd2732, 1993301258;
mov.u64 %rd2730, 842468239;
mov.u64 %rd2729, 2654435769;
mov.u64 %rd2727, 3528531795;
mov.u64 %rd2726, 1013904242;
mov.u64 %rd2725, 3449720151;
LBB80_50:
shr.u64 %rd2316, %rd2734, 32;
shr.u64 %rd2317, %rd2724, 32;
mul.lo.s64 %rd2318, %rd2317, %rd2725;
and.b64 %rd2319, %rd2318, 4294967295;
xor.b64 %rd2320, %rd2319, %rd2316;
xor.b64 %rd2321, %rd2320, %rd2726;
mul.lo.s64 %rd2322, %rd2321, %rd2727;
shr.u64 %rd2323, %rd2322, 32;
shr.u64 %rd2324, %rd2318, 32;
and.b64 %rd2325, %rd2728, 4294967295;
xor.b64 %rd2326, %rd2325, %rd2324;
xor.b64 %rd2327, %rd2326, %rd2729;
mul.lo.s64 %rd2328, %rd2327, %rd2727;
and.b64 %rd2329, %rd2328, 4294967295;
xor.b64 %rd2330, %rd2329, %rd2323;
xor.b64 %rd2331, %rd2330, %rd2730;
mul.lo.s64 %rd2332, %rd2331, %rd2725;
shr.u64 %rd2333, %rd2332, 32;
shr.u64 %rd2334, %rd2328, 32;
and.b64 %rd2335, %rd2731, 4294967295;
xor.b64 %rd2336, %rd2335, %rd2334;
xor.b64 %rd2337, %rd2336, %rd2732;
mul.lo.s64 %rd2338, %rd2337, %rd2725;
and.b64 %rd2339, %rd2338, 4294967295;
xor.b64 %rd2340, %rd2339, %rd2333;
xor.b64 %rd2341, %rd2340, %rd2733;
mul.lo.s64 %rd2342, %rd2341, %rd2727;
shr.u64 %rd2343, %rd2342, 32;
shr.u64 %rd2344, %rd2338, 32;
and.b64 %rd2345, %rd2734, 4294967295;
xor.b64 %rd2346, %rd2345, %rd2344;
xor.b64 %rd2347, %rd2346, %rd2735;
mul.lo.s64 %rd2348, %rd2347, %rd2727;
and.b64 %rd2349, %rd2348, 4294967295;
xor.b64 %rd2350, %rd2349, %rd2343;
xor.b64 %rd2351, %rd2350, %rd2736;
mul.lo.s64 %rd2352, %rd2351, %rd2725;
shr.u64 %rd2353, %rd2352, 32;
shr.u64 %rd2354, %rd2348, 32;
and.b64 %rd2355, %rd2322, 4294967295;
xor.b64 %rd2356, %rd2355, %rd2354;
xor.b64 %rd2357, %rd2356, %rd2737;
mul.lo.s64 %rd2358, %rd2357, %rd2725;
and.b64 %rd2359, %rd2358, 4294967295;
xor.b64 %rd2360, %rd2359, %rd2353;
xor.b64 %rd2361, %rd2360, %rd2738;
mul.lo.s64 %rd2362, %rd2361, %rd2727;
shr.u64 %rd2363, %rd2362, 32;
shr.u64 %rd2364, %rd2358, 32;
and.b64 %rd2365, %rd2332, 4294967295;
xor.b64 %rd2366, %rd2365, %rd2364;
xor.b64 %rd2367, %rd2366, %rd2739;
mul.lo.s64 %rd2368, %rd2367, %rd2727;
and.b64 %rd2369, %rd2368, 4294967295;
xor.b64 %rd2370, %rd2369, %rd2363;
xor.b64 %rd2371, %rd2370, %rd2740;
mul.lo.s64 %rd2372, %rd2371, %rd2725;
shr.u64 %rd2373, %rd2372, 32;
shr.u64 %rd2374, %rd2368, 32;
xor.b64 %rd2375, %rd2342, %rd2374;
xor.b64 %rd2376, %rd2375, %rd2741;
mul.lo.s64 %rd2377, %rd2376, %rd2725;
xor.b64 %rd2378, %rd2373, %rd2377;
cvt.u32.u64 %r265, %rd2378;
xor.b32 %r266, %r348, %r265;
mul.lo.s32 %r267, %r266, %r349;
shr.u32 %r268, %r267, 9;
cvt.rn.f32.u32 %f250, %r268;
mul.rn.f32 %f251, %f250, 0f34000000;
cvt.rn.f16.f32 %h136, %f251;
mov.b16 %h137, 0x2E66;
setp.ge.f16 %p64, %h136, %h137;
ld.global.nc.b16 %h138, [%rd45+1794];
ld.global.nc.f32 %f252, [%rd46+3588];
cvt.rn.f16.f32 %h139, %f252;
add.rn.f16 %h140, %h138, %h139;
mov.b16 %h141, 0x3C72;
mul.rn.f16 %h142, %h140, %h141;
selp.b16 %h143, %h142, 0x0000, %p64;
cvt.f32.f16 %f253, %h143;
ld.global.nc.b16 %h144, [%rd47+1794];
cvt.f32.f16 %f254, %h144;
ld.global.nc.f32 %f255, [%rd48+3588];
mul.rn.f32 %f256, %f1, %f255;
mul.rn.f32 %f257, %f256, %f254;
ld.global.nc.f32 %f258, [%rd49+3588];
mul.rn.f32 %f259, %f2, %f256;
sub.rn.f32 %f260, %f258, %f259;
add.rn.f32 %f261, %f257, %f260;
add.rn.f32 %f262, %f261, %f253;
sub.rn.f32 %f263, %f262, %f3;
mul.rn.f32 %f264, %f263, %f263;
add.rn.f32 %f265, %f18, %f264;
and.b32 %r46, %r1, 31;
shfl.sync.down.b32 %f266, %f265, 16, 31, -1;
add.rn.f32 %f267, %f266, %f265;
shfl.sync.down.b32 %f268, %f267, 8, 31, -1;
add.rn.f32 %f269, %f268, %f267;
shfl.sync.down.b32 %f270, %f269, 4, 31, -1;
add.rn.f32 %f271, %f270, %f269;
shfl.sync.down.b32 %f272, %f271, 2, 31, -1;
add.rn.f32 %f273, %f272, %f271;
shfl.sync.down.b32 %f274, %f273, 1, 31, -1;
shr.u32 %r47, %r1, 5;
setp.ne.s32 %p65, %r46, 0;
mov.u64 %rd2381, shared_cache_019;
@%p65 bra LBB80_2;
mul.wide.u32 %rd2380, %r47, 4;
add.s64 %rd462, %rd2381, %rd2380;
add.rn.f32 %f19, %f274, %f273;
st.shared.f32 [%rd462], %f19;
LBB80_2:
bar.sync 0;
setp.eq.s32 %p66, %r47, 0;
@%p66 bra LBB80_52;
bra.uni LBB80_3;
LBB80_52:
add.u64 %rd474, %SP, 0;
add.u64 %rd11, %SPL, 0;
mul.wide.u32 %rd2382, %r46, 4;
add.s64 %rd463, %rd2381, %rd2382;
cvta.shared.u64 %rd2384, %rd463;
mov.u32 %r269, 0;
st.local.u32 [%rd11], %r269;
setp.lt.u32 %p67, %r1, 2;
selp.b64 %rd2386, %rd2384, %rd474, %p67;
ld.f32 %f275, [%rd2386];
shfl.sync.down.b32 %f276, %f275, 16, 31, -1;
add.rn.f32 %f277, %f275, %f276;
shfl.sync.down.b32 %f278, %f277, 8, 31, -1;
add.rn.f32 %f279, %f277, %f278;
shfl.sync.down.b32 %f280, %f279, 4, 31, -1;
add.rn.f32 %f281, %f279, %f280;
shfl.sync.down.b32 %f282, %f281, 2, 31, -1;
add.rn.f32 %f283, %f281, %f282;
shfl.sync.down.b32 %f284, %f283, 1, 31, -1;
add.rn.f32 %f285, %f283, %f284;
st.f32 [%rd2386], %f285;
setp.ne.s32 %p68, %r1, 0;
@%p68 bra LBB80_3;
ld.param.u64 %rd470, [fusion_2178_param_3];
cvt.u64.u32 %rd44, %r2;
cvta.to.global.u64 %rd7, %rd470;
shl.b64 %rd2379, %rd44, 2;
add.s64 %rd461, %rd7, %rd2379;
ld.shared.f32 %f286, [%rd463];
atom.global.add.f32 %f287, [%rd461], %f286;
LBB80_3:
ret;
}
// .globl fusion_2175
// fusion_2175 -- element-wise kernel. .reqntid fixes 256 threads per CTA and
// each thread produces four consecutive f16 outputs:
//   element index e = ctaid.x * 1024 + tid.x * 4   (lanes e .. e+3)
//   column        c = tid.x * 4 + lane
//
// Pointer parameters, as used by this body:
//   param_0      out: f16, one v4 store at element e
//   param_1      in : f16, one v4 load at element e
//   param_2      in : f16, one v4 load at element e
//   param_3      in : two u64 words, treated as a 128-bit value that is
//                     incremented by e/4 to seed the random-bit chain
//   param_4..7   in : f32, one value per CTA (indexed by ctaid.x)
//   param_8..12  in : f32, indexed by column c
//   param_13     declared but never loaded in this body
//
// Per element the body computes (f32 unless noted):
//   d   = keep ? f32(f16 (param_1[e] + f16(param_11[c])) * 0x3C72) : 0
//   s1  = rsqrt(param_6[cta] * 2^-10 + eps) * param_10[c]
//   y   = s1 * param_2[e] + (param_9[c] - s1 * (param_7[cta] * 2^-10)) + d
//   s2  = rsqrt(param_4[cta] * 2^-10 + eps) * param_8[c]
//   out = f16((param_12[c] - s2 * (param_5[cta] * 2^-10)) + s2 * y)
// with keep = (f16(rand23 * 2^-23) >= 0x2E66), eps = 0f2B8CBCCC (~1e-12),
// 0x2E66 ~= 0.1 and 0x3C72 ~= 1.111 as f16 bit patterns.
// NOTE(review): this looks like dropout (rate ~0.1, scale ~1/0.9) fused with
// two layer-norm style affine steps over rows of 1024 -- confirm against the
// HLO this kernel was generated from.
.visible .entry fusion_2175(
.param .u64 fusion_2175_param_0,
.param .u64 fusion_2175_param_1,
.param .u64 fusion_2175_param_2,
.param .u64 fusion_2175_param_3,
.param .u64 fusion_2175_param_4,
.param .u64 fusion_2175_param_5,
.param .u64 fusion_2175_param_6,
.param .u64 fusion_2175_param_7,
.param .u64 fusion_2175_param_8,
.param .u64 fusion_2175_param_9,
.param .u64 fusion_2175_param_10,
.param .u64 fusion_2175_param_11,
.param .u64 fusion_2175_param_12,
.param .u64 fusion_2175_param_13
)
.reqntid 256, 1, 1
{
.reg .pred %p<6>;
.reg .b16 %h<39>;
.reg .b32 %hh<5>;
.reg .f32 %f<97>;
.reg .b32 %r<31>;
.reg .b64 %rd<162>;
// Load params 0..12 and convert each generic pointer to a global address.
// Resulting bases: %rd26=param_0, %rd25=param_1, %rd24=param_2, %rd23=param_3,
// %rd22=param_4, %rd21=param_5, %rd20=param_6, %rd18=param_7, %rd15=param_8,
// %rd12=param_9, %rd9=param_10, %rd6=param_11, %rd3=param_12.
ld.param.u64 %rd1, [fusion_2175_param_0];
ld.param.u64 %rd2, [fusion_2175_param_12];
cvta.to.global.u64 %rd3, %rd2;
ld.param.u64 %rd4, [fusion_2175_param_1];
ld.param.u64 %rd5, [fusion_2175_param_11];
cvta.to.global.u64 %rd6, %rd5;
ld.param.u64 %rd7, [fusion_2175_param_2];
ld.param.u64 %rd8, [fusion_2175_param_10];
cvta.to.global.u64 %rd9, %rd8;
ld.param.u64 %rd10, [fusion_2175_param_3];
ld.param.u64 %rd11, [fusion_2175_param_9];
cvta.to.global.u64 %rd12, %rd11;
ld.param.u64 %rd13, [fusion_2175_param_4];
ld.param.u64 %rd14, [fusion_2175_param_8];
cvta.to.global.u64 %rd15, %rd14;
ld.param.u64 %rd16, [fusion_2175_param_5];
ld.param.u64 %rd17, [fusion_2175_param_7];
cvta.to.global.u64 %rd18, %rd17;
ld.param.u64 %rd19, [fusion_2175_param_6];
cvta.to.global.u64 %rd20, %rd19;
cvta.to.global.u64 %rd21, %rd16;
cvta.to.global.u64 %rd22, %rd13;
cvta.to.global.u64 %rd23, %rd10;
cvta.to.global.u64 %rd24, %rd7;
cvta.to.global.u64 %rd25, %rd4;
cvta.to.global.u64 %rd26, %rd1;
// Indexing: %r5 = e = ctaid.x*1024 | tid.x*4; %r4 = column of lane 0;
// %r6 / %r7 = columns of lanes 1 / 2; %r8 = e / 4.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
shl.b32 %r3, %r1, 10;
shl.b32 %r4, %r2, 2;
or.b32 %r5, %r4, %r3;
or.b32 %r6, %r4, 1;
or.b32 %r7, %r4, 2;
shr.u32 %r8, %r5, 2;
// ---- Random-bit generation ------------------------------------------------
// The two u64 words at param_3 form a 128-bit value: the low word gets e/4
// added, and the carry (detected by the unsigned wrap test) goes into the
// high word.
// NOTE(review): the multipliers 3528531795 (0xD2511F53) / 3449720151
// (0xCD9E8D57) and the XOR constants starting at 2654435769 (0x9E3779B9) /
// 3144134277 (0xBB67AE85) match the published Philox4x32 constants, so this
// is presumably an unrolled Philox round sequence -- confirm.
// Several intermediates (%rd72, %rd82, %rd88, %rd94, %rd96, %rd104, %rd109,
// %rd114) are reused further down by lanes 1..3; do not reorder.
ld.global.nc.v2.u64 {%rd27, %rd28}, [%rd23];
cvt.u64.u32 %rd29, %r8;
add.s64 %rd30, %rd27, %rd29;
setp.lt.u64 %p1, %rd30, %rd27;
and.b64 %rd31, %rd30, 4294967295;
mul.lo.s64 %rd32, %rd31, 3528531795;
selp.u64 %rd33, 1, 0, %p1;
add.s64 %rd34, %rd28, %rd33;
xor.b64 %rd35, %rd34, %rd32;
shr.u64 %rd36, %rd35, 32;
mul.lo.s64 %rd37, %rd36, 3449720151;
shr.u64 %rd38, %rd37, 32;
and.b64 %rd39, %rd34, 4294967295;
mul.lo.s64 %rd40, %rd39, 3449720151;
and.b64 %rd41, %rd40, 4294967295;
xor.b64 %rd42, %rd41, %rd38;
xor.b64 %rd43, %rd42, 2654435769;
mul.lo.s64 %rd44, %rd43, 3528531795;
shr.u64 %rd45, %rd44, 32;
xor.b64 %rd46, %rd40, %rd30;
shr.u64 %rd47, %rd46, 32;
mul.lo.s64 %rd48, %rd47, 3528531795;
and.b64 %rd49, %rd48, 4294967295;
xor.b64 %rd50, %rd49, %rd45;
xor.b64 %rd51, %rd50, 1993301258;
mul.lo.s64 %rd52, %rd51, 3449720151;
shr.u64 %rd53, %rd52, 32;
shr.u64 %rd54, %rd48, 32;
and.b64 %rd55, %rd32, 4294967295;
xor.b64 %rd56, %rd55, %rd54;
xor.b64 %rd57, %rd56, 3144134277;
mul.lo.s64 %rd58, %rd57, 3449720151;
and.b64 %rd59, %rd58, 4294967295;
xor.b64 %rd60, %rd59, %rd53;
xor.b64 %rd61, %rd60, 3668340011;
mul.lo.s64 %rd62, %rd61, 3528531795;
shr.u64 %rd63, %rd62, 32;
shr.u64 %rd64, %rd58, 32;
and.b64 %rd65, %rd37, 4294967295;
xor.b64 %rd66, %rd65, %rd64;
xor.b64 %rd67, %rd66, 1013904242;
mul.lo.s64 %rd68, %rd67, 3528531795;
and.b64 %rd69, %rd68, 4294967295;
xor.b64 %rd70, %rd69, %rd63;
xor.b64 %rd71, %rd70, 3986602516;
mul.lo.s64 %rd72, %rd71, 3449720151;
shr.u64 %rd73, %rd72, 32;
shr.u64 %rd74, %rd68, 32;
and.b64 %rd75, %rd44, 4294967295;
xor.b64 %rd76, %rd75, %rd74;
xor.b64 %rd77, %rd76, 842468239;
mul.lo.s64 %rd78, %rd77, 3449720151;
and.b64 %rd79, %rd78, 4294967295;
xor.b64 %rd80, %rd79, %rd73;
xor.b64 %rd81, %rd80, 387276957;
mul.lo.s64 %rd82, %rd81, 3528531795;
shr.u64 %rd83, %rd82, 32;
shr.u64 %rd84, %rd78, 32;
and.b64 %rd85, %rd52, 4294967295;
xor.b64 %rd86, %rd85, %rd84;
xor.b64 %rd87, %rd86, 2027808484;
mul.lo.s64 %rd88, %rd87, 3528531795;
and.b64 %rd89, %rd88, 4294967295;
shr.u64 %rd90, %rd88, 32;
and.b64 %rd91, %rd62, 4294967295;
xor.b64 %rd92, %rd91, %rd90;
xor.b64 %rd93, %rd92, 2835769497;
mul.lo.s64 %rd94, %rd93, 3449720151;
and.b64 %rd95, %rd94, 4294967295;
shr.u64 %rd96, %rd94, 32;
and.b64 %rd97, %rd72, 4294967295;
xor.b64 %rd98, %rd97, %rd96;
xor.b64 %rd99, %rd98, 3041712726;
mul.lo.s64 %rd100, %rd99, 3528531795;
and.b64 %rd101, %rd100, 4294967295;
xor.b64 %rd102, %rd89, %rd83;
xor.b64 %rd103, %rd102, 1684936478;
mul.lo.s64 %rd104, %rd103, 3449720151;
shr.u64 %rd105, %rd104, 32;
xor.b64 %rd106, %rd95, %rd105;
xor.b64 %rd107, %rd106, 1401181199;
mul.lo.s64 %rd108, %rd107, 3528531795;
shr.u64 %rd109, %rd108, 32;
xor.b64 %rd110, %rd101, %rd109;
xor.b64 %rd111, %rd110, 3678237736;
mul.lo.s64 %rd112, %rd111, 3449720151;
shr.u64 %rd113, %rd112, 32;
// ---- Lane 0 ---------------------------------------------------------------
// Fold to 32 random bits, keep the top 23 (>> 9), scale by 0f34000000 (2^-23)
// to get a value in [0, 1), round to f16 and compare with 0x2E66 (~0.1).
// %p2 is the keep predicate for lane 0; %h2 holds the threshold for all lanes.
cvt.u32.u64 %r9, %rd113;
shr.u64 %rd114, %rd100, 32;
xor.b64 %rd115, %rd114, %rd82;
cvt.u32.u64 %r10, %rd115;
xor.b32 %r11, %r10, 534103459;
mul.lo.s32 %r12, %r11, -845247145;
xor.b32 %r13, %r12, %r9;
shr.u32 %r14, %r13, 9;
xor.b32 %r15, %r14, 4716963;
cvt.rn.f32.u32 %f1, %r15;
mul.rn.f32 %f2, %f1, 0f34000000;
cvt.rn.f16.f32 %h1, %f2;
mov.b16 %h2, 0x2E66;
setp.ge.f16 %p2, %h1, %h2;
// param_1[e .. e+3] as four f16 values; after the pack/unpack moves they are
// %h7, %h8, %h9, %h10 for lanes 0..3. %rd116 = e * 2 (byte offset for f16).
mul.wide.u32 %rd116, %r5, 2;
add.s64 %rd117, %rd25, %rd116;
ld.global.nc.v4.b16 {%h3, %h4, %h5, %h6}, [%rd117];
mov.b32 %hh1, {%h5, %h6};
mov.b32 %hh2, {%h3, %h4};
mov.b32 {%h7, %h8}, %hh2;
mov.b32 {%h9, %h10}, %hh1;
// %rd118 = c * 4 (byte offset into the per-column f32 arrays).
// d = keep ? (param_1[e] + f16(param_11[c])) * 0x3C72 : 0; %h13 holds the
// 0x3C72 scale and is reused by every lane.
mul.wide.u32 %rd118, %r4, 4;
add.s64 %rd119, %rd6, %rd118;
ld.global.nc.f32 %f3, [%rd119];
cvt.rn.f16.f32 %h11, %f3;
add.rn.f16 %h12, %h7, %h11;
mov.b16 %h13, 0x3C72;
mul.rn.f16 %h14, %h12, %h13;
cvt.f32.f16 %f4, %h14;
selp.f32 %f5, %f4, 0f00000000, %p2;
// param_2[e .. e+3] as four f16 values: %h19, %h20, %h21, %h22 for lanes 0..3.
add.s64 %rd120, %rd24, %rd116;
ld.global.nc.v4.b16 {%h15, %h16, %h17, %h18}, [%rd120];
mov.b32 %hh3, {%h17, %h18};
mov.b32 %hh4, {%h15, %h16};
mov.b32 {%h19, %h20}, %hh4;
mov.b32 {%h21, %h22}, %hh3;
cvt.f32.f16 %f6, %h19;
// %rd121 = ctaid.x * 4 (byte offset into the per-CTA f32 arrays).
// %f10 = rsqrt(param_6[cta] * 2^-10 + eps); shared by all four lanes.
mul.wide.u32 %rd121, %r1, 4;
add.s64 %rd122, %rd20, %rd121;
ld.global.nc.f32 %f7, [%rd122];
mul.rn.f32 %f8, %f7, 0f3A800000;
add.rn.f32 %f9, %f8, 0f2B8CBCCC;
rsqrt.approx.f32 %f10, %f9;
// s1 = %f10 * param_10[c]; %f13 = s1 * param_2[e].
add.s64 %rd123, %rd9, %rd118;
ld.global.nc.f32 %f11, [%rd123];
mul.rn.f32 %f12, %f10, %f11;
mul.rn.f32 %f13, %f12, %f6;
// %f16 = param_7[cta] * 2^-10 (shared by all lanes);
// y = s1 * x + (param_9[c] - s1 * %f16) + d  -> %f20.
add.s64 %rd124, %rd12, %rd118;
ld.global.nc.f32 %f14, [%rd124];
add.s64 %rd125, %rd18, %rd121;
ld.global.nc.f32 %f15, [%rd125];
mul.rn.f32 %f16, %f15, 0f3A800000;
mul.rn.f32 %f17, %f12, %f16;
sub.rn.f32 %f18, %f14, %f17;
add.rn.f32 %f19, %f13, %f18;
add.rn.f32 %f20, %f19, %f5;
// %f24 = rsqrt(param_4[cta] * 2^-10 + eps); shared by all four lanes.
add.s64 %rd126, %rd22, %rd121;
ld.global.nc.f32 %f21, [%rd126];
mul.rn.f32 %f22, %f21, 0f3A800000;
add.rn.f32 %f23, %f22, 0f2B8CBCCC;
rsqrt.approx.f32 %f24, %f23;
// s2 = %f24 * param_8[c]; %f30 = param_5[cta] * 2^-10 (shared by all lanes);
// out0 = f16((param_12[c] - s2 * %f30) + s2 * y)  -> %h23.
add.s64 %rd127, %rd15, %rd118;
ld.global.nc.f32 %f25, [%rd127];
mul.rn.f32 %f26, %f24, %f25;
mul.rn.f32 %f27, %f26, %f20;
add.s64 %rd128, %rd3, %rd118;
ld.global.nc.f32 %f28, [%rd128];
add.s64 %rd129, %rd21, %rd121;
ld.global.nc.f32 %f29, [%rd129];
mul.rn.f32 %f30, %f29, 0f3A800000;
mul.rn.f32 %f31, %f26, %f30;
sub.rn.f32 %f32, %f28, %f31;
add.rn.f32 %f33, %f32, %f27;
cvt.rn.f16.f32 %h23, %f33;
// %rd130 = output address for the final v4 store (param_0 + e * 2).
add.s64 %rd130, %rd26, %rd116;
// ---- Lane 1 ---------------------------------------------------------------
// Random bits come from intermediates of the chain above (%rd72, %rd96,
// %rd109); the keep predicate is %p3. Column is %r6 = c + 1.
xor.b64 %rd131, %rd72, %rd96;
xor.b64 %rd132, %rd131, 3041712726;
mul.lo.s64 %rd133, %rd132, 3528531795;
xor.b64 %rd134, %rd109, %rd133;
cvt.u32.u64 %r16, %rd134;
xor.b32 %r17, %r16, -616729560;
mul.lo.s32 %r18, %r17, -845247145;
shr.u32 %r19, %r18, 9;
cvt.rn.f32.u32 %f34, %r19;
mul.rn.f32 %f35, %f34, 0f34000000;
cvt.rn.f16.f32 %h24, %f35;
setp.ge.f16 %p3, %h24, %h2;
mul.wide.u32 %rd135, %r6, 4;
add.s64 %rd136, %rd6, %rd135;
ld.global.nc.f32 %f36, [%rd136];
cvt.rn.f16.f32 %h25, %f36;
add.rn.f16 %h26, %h8, %h25;
mul.rn.f16 %h27, %h26, %h13;
cvt.f32.f16 %f37, %h27;
selp.f32 %f38, %f37, 0f00000000, %p3;
cvt.f32.f16 %f39, %h20;
add.s64 %rd137, %rd9, %rd135;
ld.global.nc.f32 %f40, [%rd137];
mul.rn.f32 %f41, %f10, %f40;
mul.rn.f32 %f42, %f41, %f39;
add.s64 %rd138, %rd12, %rd135;
ld.global.nc.f32 %f43, [%rd138];
mul.rn.f32 %f44, %f16, %f41;
sub.rn.f32 %f45, %f43, %f44;
add.rn.f32 %f46, %f42, %f45;
add.rn.f32 %f47, %f46, %f38;
add.s64 %rd139, %rd15, %rd135;
ld.global.nc.f32 %f48, [%rd139];
mul.rn.f32 %f49, %f24, %f48;
mul.rn.f32 %f50, %f49, %f47;
add.s64 %rd140, %rd3, %rd135;
ld.global.nc.f32 %f51, [%rd140];
mul.rn.f32 %f52, %f30, %f49;
sub.rn.f32 %f53, %f51, %f52;
add.rn.f32 %f54, %f53, %f50;
cvt.rn.f16.f32 %h28, %f54;
// ---- Lane 2 ---------------------------------------------------------------
// Random bits come from %rd104, %rd82, %rd114, %rd105, %rd94; the keep
// predicate is %p4. Column is %r7 = c + 2. %rd146 is reused by lane 3.
and.b64 %rd141, %rd104, 4294967295;
and.b64 %rd142, %rd82, 4294967295;
xor.b64 %rd143, %rd142, %rd114;
xor.b64 %rd144, %rd143, 534103459;
mul.lo.s64 %rd145, %rd144, 3449720151;
shr.u64 %rd146, %rd145, 32;
xor.b64 %rd147, %rd141, %rd146;
xor.b64 %rd148, %rd147, 4055616968;
mul.lo.s64 %rd149, %rd148, 3528531795;
shr.u64 %rd150, %rd149, 32;
cvt.u32.u64 %r20, %rd150;
xor.b64 %rd151, %rd105, %rd94;
cvt.u32.u64 %r21, %rd151;
xor.b32 %r22, %r21, 1401181199;
mul.lo.s32 %r23, %r22, -766435501;
xor.b32 %r24, %r23, %r20;
shr.u32 %r25, %r24, 9;
xor.b32 %r26, %r25, 4936337;
cvt.rn.f32.u32 %f55, %r26;
mul.rn.f32 %f56, %f55, 0f34000000;
cvt.rn.f16.f32 %h29, %f56;
setp.ge.f16 %p4, %h29, %h2;
mul.wide.u32 %rd152, %r7, 4;
add.s64 %rd153, %rd6, %rd152;
ld.global.nc.f32 %f57, [%rd153];
cvt.rn.f16.f32 %h30, %f57;
add.rn.f16 %h31, %h9, %h30;
mul.rn.f16 %h32, %h31, %h13;
cvt.f32.f16 %f58, %h32;
selp.f32 %f59, %f58, 0f00000000, %p4;
cvt.f32.f16 %f60, %h21;
add.s64 %rd154, %rd9, %rd152;
ld.global.nc.f32 %f61, [%rd154];
mul.rn.f32 %f62, %f10, %f61;
mul.rn.f32 %f63, %f62, %f60;
add.s64 %rd155, %rd12, %rd152;
ld.global.nc.f32 %f64, [%rd155];
mul.rn.f32 %f65, %f16, %f62;
sub.rn.f32 %f66, %f64, %f65;
add.rn.f32 %f67, %f63, %f66;
add.rn.f32 %f68, %f67, %f59;
add.s64 %rd156, %rd15, %rd152;
ld.global.nc.f32 %f69, [%rd156];
mul.rn.f32 %f70, %f24, %f69;
mul.rn.f32 %f71, %f70, %f68;
add.s64 %rd157, %rd3, %rd152;
ld.global.nc.f32 %f72, [%rd157];
mul.rn.f32 %f73, %f30, %f70;
sub.rn.f32 %f74, %f72, %f73;
add.rn.f32 %f75, %f74, %f71;
cvt.rn.f16.f32 %h33, %f75;
// ---- Lane 3 ---------------------------------------------------------------
// Random bits come from %rd83, %rd88 and lane 2's %rd146; the keep predicate
// is %p5. Column c + 3 is addressed as the lane-0 pointers plus 12 bytes.
xor.b64 %rd158, %rd83, %rd88;
xor.b64 %rd159, %rd158, 1684936478;
mul.lo.s64 %rd160, %rd159, 3449720151;
xor.b64 %rd161, %rd146, %rd160;
cvt.u32.u64 %r27, %rd161;
xor.b32 %r28, %r27, -239350328;
mul.lo.s32 %r29, %r28, -766435501;
shr.u32 %r30, %r29, 9;
cvt.rn.f32.u32 %f76, %r30;
mul.rn.f32 %f77, %f76, 0f34000000;
cvt.rn.f16.f32 %h34, %f77;
setp.ge.f16 %p5, %h34, %h2;
ld.global.nc.f32 %f78, [%rd119+12];
cvt.rn.f16.f32 %h35, %f78;
add.rn.f16 %h36, %h10, %h35;
mul.rn.f16 %h37, %h36, %h13;
cvt.f32.f16 %f79, %h37;
selp.f32 %f80, %f79, 0f00000000, %p5;
cvt.f32.f16 %f81, %h22;
ld.global.nc.f32 %f82, [%rd123+12];
mul.rn.f32 %f83, %f10, %f82;
mul.rn.f32 %f84, %f83, %f81;
ld.global.nc.f32 %f85, [%rd124+12];
mul.rn.f32 %f86, %f16, %f83;
sub.rn.f32 %f87, %f85, %f86;
add.rn.f32 %f88, %f84, %f87;
add.rn.f32 %f89, %f88, %f80;
ld.global.nc.f32 %f90, [%rd127+12];
mul.rn.f32 %f91, %f24, %f90;
mul.rn.f32 %f92, %f91, %f89;
ld.global.nc.f32 %f93, [%rd128+12];
mul.rn.f32 %f94, %f30, %f91;
sub.rn.f32 %f95, %f93, %f94;
add.rn.f32 %f96, %f95, %f92;
cvt.rn.f16.f32 %h38, %f96;
// Single 8-byte store of the four f16 results to param_0[e .. e+3].
st.global.v4.b16 [%rd130], {%h23, %h28, %h33, %h38};
ret;
}
// .globl fusion_2698
// fusion_2698 -- converts four consecutive f32 values per thread to f16.
//   param_0 : source, read as f32 (one 16-byte vector load per thread)
//   param_1 : destination, written as f16 (one 8-byte vector store per thread)
//   param_2 : declared but not referenced by this body
// Launch shape: 256 threads per CTA (.reqntid); element index is
// ctaid.x * 1024 + tid.x * 4.
.visible .entry fusion_2698(
.param .u64 fusion_2698_param_0,
.param .u64 fusion_2698_param_1,
.param .u64 fusion_2698_param_2
)
.reqntid 256, 1, 1
{
.reg .f32 %src_f<4>;
.reg .b16 %dst_h<4>;
.reg .b32 %cta_base, %tid_off, %elem;
.reg .b64 %src, %dst;

// elem = (ctaid.x << 10) | (tid.x << 2)
mov.u32 %tid_off, %tid.x;
shl.b32 %tid_off, %tid_off, 2;
mov.u32 %cta_base, %ctaid.x;
shl.b32 %cta_base, %cta_base, 10;
or.b32 %elem, %tid_off, %cta_base;

// Load four f32 values from param_0 + elem * 4.
ld.param.u64 %src, [fusion_2698_param_0];
cvta.to.global.u64 %src, %src;
mad.wide.u32 %src, %elem, 4, %src;
ld.global.nc.v4.f32 {%src_f0, %src_f1, %src_f2, %src_f3}, [%src];

// Round each value to f16 (round-to-nearest-even).
cvt.rn.f16.f32 %dst_h0, %src_f0;
cvt.rn.f16.f32 %dst_h1, %src_f1;
cvt.rn.f16.f32 %dst_h2, %src_f2;
cvt.rn.f16.f32 %dst_h3, %src_f3;

// Store four f16 values to param_1 + elem * 2.
ld.param.u64 %dst, [fusion_2698_param_1];
cvta.to.global.u64 %dst, %dst;
mad.wide.u32 %dst, %elem, 2, %dst;
st.global.v4.b16 [%dst], {%dst_h0, %dst_h1, %dst_h2, %dst_h3};
ret;
}
// .globl fusion_2171
// fusion_2171 -- per thread: gathers four f16 values from param_0, adds an
// f32 value from param_1 (rounded to f16 first) to each, and stores the four
// f16 sums with one vector store into param_3 at a fixed byte offset.
//   param_0 : f16 source, addressed as row * 2048 bytes + index * 2 bytes
//   param_1 : f32 addend, addressed as group * 256 bytes + column * 4 bytes
//   param_2 : declared but not referenced by this body
//   param_3 : f16 destination, written at elem * 2 + 444893184 bytes
// Launch shape: 256 threads per CTA (.reqntid).
.visible .entry fusion_2171(
.param .u64 fusion_2171_param_0,
.param .u64 fusion_2171_param_1,
.param .u64 fusion_2171_param_2,
.param .u64 fusion_2171_param_3
)
.reqntid 256, 1, 1
{
.reg .b16 %val<4>;
.reg .b16 %addend<4>;
.reg .b16 %sum<4>;
.reg .f32 %addend_f<4>;
.reg .b32 %cta, %tid4, %elem, %grp, %grp_base, %row;
.reg .b32 %col<4>;
.reg .b32 %idx<4>;
.reg .b64 %row_ptr, %addend_ptr, %dst_ptr, %addr;

// Index decomposition.
//   elem     = (ctaid.x << 10) | (tid.x << 2)
//   grp      = ctaid.x >> 5, grp_base = grp << 6
//   row      = bits [6, 15) of elem
//   col<k>   = ((tid.x << 2) | k) & (60 + k)
mov.u32 %cta, %ctaid.x;
mov.u32 %tid4, %tid.x;
shl.b32 %tid4, %tid4, 2;
shl.b32 %elem, %cta, 10;
or.b32 %elem, %tid4, %elem;
shr.u32 %grp, %cta, 5;
shl.b32 %grp_base, %grp, 6;
bfe.u32 %row, %elem, 6, 9;
and.b32 %col0, %tid4, 60;
or.b32 %col1, %tid4, 1;
and.b32 %col1, %col1, 61;
or.b32 %col2, %tid4, 2;
and.b32 %col2, %col2, 62;
or.b32 %col3, %tid4, 3;
and.b32 %col3, %col3, 63;

// Base pointers.
ld.param.u64 %row_ptr, [fusion_2171_param_0];
cvta.to.global.u64 %row_ptr, %row_ptr;
mad.wide.u32 %row_ptr, %row, 2048, %row_ptr;
ld.param.u64 %addend_ptr, [fusion_2171_param_1];
cvta.to.global.u64 %addend_ptr, %addend_ptr;
mad.wide.u32 %addend_ptr, %grp, 256, %addend_ptr;
ld.param.u64 %dst_ptr, [fusion_2171_param_3];
cvta.to.global.u64 %dst_ptr, %dst_ptr;
mad.wide.u32 %dst_ptr, %elem, 2, %dst_ptr;

// Lane 0.
or.b32 %idx0, %col0, %grp_base;
mad.wide.u32 %addr, %idx0, 2, %row_ptr;
ld.global.nc.b16 %val0, [%addr];
mad.wide.u32 %addr, %col0, 4, %addend_ptr;
ld.global.nc.f32 %addend_f0, [%addr];
cvt.rn.f16.f32 %addend0, %addend_f0;
add.rn.f16 %sum0, %val0, %addend0;

// Lane 1.
or.b32 %idx1, %col1, %grp_base;
mad.wide.u32 %addr, %idx1, 2, %row_ptr;
ld.global.nc.b16 %val1, [%addr];
mad.wide.u32 %addr, %col1, 4, %addend_ptr;
ld.global.nc.f32 %addend_f1, [%addr];
cvt.rn.f16.f32 %addend1, %addend_f1;
add.rn.f16 %sum1, %val1, %addend1;

// Lane 2.
or.b32 %idx2, %col2, %grp_base;
mad.wide.u32 %addr, %idx2, 2, %row_ptr;
ld.global.nc.b16 %val2, [%addr];
mad.wide.u32 %addr, %col2, 4, %addend_ptr;
ld.global.nc.f32 %addend_f2, [%addr];
cvt.rn.f16.f32 %addend2, %addend_f2;
add.rn.f16 %sum2, %val2, %addend2;

// Lane 3.
or.b32 %idx3, %col3, %grp_base;
mad.wide.u32 %addr, %idx3, 2, %row_ptr;
ld.global.nc.b16 %val3, [%addr];
mad.wide.u32 %addr, %col3, 4, %addend_ptr;
ld.global.nc.f32 %addend_f3, [%addr];
cvt.rn.f16.f32 %addend3, %addend_f3;
add.rn.f16 %sum3, %val3, %addend3;

// One 8-byte store of the four sums at the fixed output offset.
st.global.v4.b16 [%dst_ptr+444893184], {%sum0, %sum1, %sum2, %sum3};
ret;
}
// .globl fusion_2699
// fusion_2699 -- converts four consecutive f32 values per thread to f16.
//   param_0 : source, read as f32 (one 16-byte vector load per thread)
//   param_1 : destination, written as f16 (one 8-byte vector store per thread)
//   param_2 : declared but not referenced by this body
// Launch shape: 256 threads per CTA (.reqntid); element index is
// ctaid.x * 1024 + tid.x * 4.
.visible .entry fusion_2699(
.param .u64 fusion_2699_param_0,
.param .u64 fusion_2699_param_1,
.param .u64 fusion_2699_param_2
)
.reqntid 256, 1, 1
{
.reg .f32 %src_f<4>;
.reg .b16 %dst_h<4>;
.reg .b32 %cta_base, %tid_off, %elem;
.reg .b64 %src, %dst;

// elem = (ctaid.x << 10) | (tid.x << 2)
mov.u32 %tid_off, %tid.x;
shl.b32 %tid_off, %tid_off, 2;
mov.u32 %cta_base, %ctaid.x;
shl.b32 %cta_base, %cta_base, 10;
or.b32 %elem, %tid_off, %cta_base;

// Load four f32 values from param_0 + elem * 4.
ld.param.u64 %src, [fusion_2699_param_0];
cvta.to.global.u64 %src, %src;
mad.wide.u32 %src, %elem, 4, %src;
ld.global.nc.v4.f32 {%src_f0, %src_f1, %src_f2, %src_f3}, [%src];

// Round each value to f16 (round-to-nearest-even).
cvt.rn.f16.f32 %dst_h0, %src_f0;
cvt.rn.f16.f32 %dst_h1, %src_f1;
cvt.rn.f16.f32 %dst_h2, %src_f2;
cvt.rn.f16.f32 %dst_h3, %src_f3;

// Store four f16 values to param_1 + elem * 2.
ld.param.u64 %dst, [fusion_2699_param_1];
cvta.to.global.u64 %dst, %dst;
mad.wide.u32 %dst, %elem, 2, %dst;
st.global.v4.b16 [%dst], {%dst_h0, %dst_h1, %dst_h2, %dst_h3};
ret;
}
// .globl fusion_2172
// fusion_2172 -- per thread: gathers four f16 values from param_0, adds an
// f32 value from param_1 (rounded to f16 first) to each, and stores the four
// f16 sums with one vector store into param_3 at a fixed byte offset.
//   param_0 : f16 source, addressed as row * 2048 bytes + index * 2 bytes
//   param_1 : f32 addend, addressed as group * 256 bytes + column * 4 bytes
//   param_2 : declared but not referenced by this body
//   param_3 : f16 destination, written at elem * 2 + 445941760 bytes
// Launch shape: 256 threads per CTA (.reqntid).
.visible .entry fusion_2172(
.param .u64 fusion_2172_param_0,
.param .u64 fusion_2172_param_1,
.param .u64 fusion_2172_param_2,
.param .u64 fusion_2172_param_3
)
.reqntid 256, 1, 1
{
.reg .b16 %val<4>;
.reg .b16 %addend<4>;
.reg .b16 %sum<4>;
.reg .f32 %addend_f<4>;
.reg .b32 %cta, %tid4, %elem, %grp, %grp_base, %row;
.reg .b32 %col<4>;
.reg .b32 %idx<4>;
.reg .b64 %row_ptr, %addend_ptr, %dst_ptr, %addr;

// Index decomposition.
//   elem     = (ctaid.x << 10) | (tid.x << 2)
//   grp      = ctaid.x >> 5, grp_base = grp << 6
//   row      = bits [6, 15) of elem
//   col<k>   = ((tid.x << 2) | k) & (60 + k)
mov.u32 %cta, %ctaid.x;
mov.u32 %tid4, %tid.x;
shl.b32 %tid4, %tid4, 2;
shl.b32 %elem, %cta, 10;
or.b32 %elem, %tid4, %elem;
shr.u32 %grp, %cta, 5;
shl.b32 %grp_base, %grp, 6;
bfe.u32 %row, %elem, 6, 9;
and.b32 %col0, %tid4, 60;
or.b32 %col1, %tid4, 1;
and.b32 %col1, %col1, 61;
or.b32 %col2, %tid4, 2;
and.b32 %col2, %col2, 62;
or.b32 %col3, %tid4, 3;
and.b32 %col3, %col3, 63;

// Base pointers.
ld.param.u64 %row_ptr, [fusion_2172_param_0];
cvta.to.global.u64 %row_ptr, %row_ptr;
mad.wide.u32 %row_ptr, %row, 2048, %row_ptr;
ld.param.u64 %addend_ptr, [fusion_2172_param_1];
cvta.to.global.u64 %addend_ptr, %addend_ptr;
mad.wide.u32 %addend_ptr, %grp, 256, %addend_ptr;
ld.param.u64 %dst_ptr, [fusion_2172_param_3];
cvta.to.global.u64 %dst_ptr, %dst_ptr;
mad.wide.u32 %dst_ptr, %elem, 2, %dst_ptr;

// Lane 0.
or.b32 %idx0, %col0, %grp_base;
mad.wide.u32 %addr, %idx0, 2, %row_ptr;
ld.global.nc.b16 %val0, [%addr];
mad.wide.u32 %addr, %col0, 4, %addend_ptr;
ld.global.nc.f32 %addend_f0, [%addr];
cvt.rn.f16.f32 %addend0, %addend_f0;
add.rn.f16 %sum0, %val0, %addend0;

// Lane 1.
or.b32 %idx1, %col1, %grp_base;
mad.wide.u32 %addr, %idx1, 2, %row_ptr;
ld.global.nc.b16 %val1, [%addr];
mad.wide.u32 %addr, %col1, 4, %addend_ptr;
ld.global.nc.f32 %addend_f1, [%addr];
cvt.rn.f16.f32 %addend1, %addend_f1;
add.rn.f16 %sum1, %val1, %addend1;

// Lane 2.
or.b32 %idx2, %col2, %grp_base;
mad.wide.u32 %addr, %idx2, 2, %row_ptr;
ld.global.nc.b16 %val2, [%addr];
mad.wide.u32 %addr, %col2, 4, %addend_ptr;
ld.global.nc.f32 %addend_f2, [%addr];
cvt.rn.f16.f32 %addend2, %addend_f2;
add.rn.f16 %sum2, %val2, %addend2;

// Lane 3.
or.b32 %idx3, %col3, %grp_base;
mad.wide.u32 %addr, %idx3, 2, %row_ptr;
ld.global.nc.b16 %val3, [%addr];
mad.wide.u32 %addr, %col3, 4, %addend_ptr;
ld.global.nc.f32 %addend_f3, [%addr];
cvt.rn.f16.f32 %addend3, %addend_f3;
add.rn.f16 %sum3, %val3, %addend3;

// One 8-byte store of the four sums at the fixed output offset.
st.global.v4.b16 [%dst_ptr+445941760], {%sum0, %sum1, %sum2, %sum3};
ret;
}
// .globl fusion_2169
.visible .entry fusion_2169(
.param .u64 fusion_2169_param_0,
.param .u64 fusion_2169_param_1,
.param .u64 fusion_2169_param_2,
.param .u64 fusion_2169_param_3
)
.reqntid 32, 1, 1
{
.local .align 4 .b8 __local_depot86[4];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<4>;
.reg .b16 %h<83>;
.reg .b32 %hh<9>;
.reg .f32 %f<57>;
.reg .b32 %r<37>;
.reg .b64 %rd<37>;
mov.u64 %SPL, __local_depot86;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd4, [fusion_2169_param_0];
ld.param.u64 %rd5, [fusion_2169_param_2];
cvta.to.global.u64 %rd6, %rd5;
cvta.to.global.u64 %rd9, %rd4;
add.u64 %rd10, %SP, 0;
add.u64 %rd1, %SPL, 0;
mov.u32 %r1, %tid.x;
mov.u32 %r5, %ctaid.x;
shl.b32 %r6, %r1, 1;
shl.b32 %r7, %r5, 9;
or.b32 %r8, %r7, %r6;
mul.wide.u32 %rd11, %r8, 2;
add.s64 %rd12, %rd9, %rd11;
ld.global.nc.b32 %hh1, [%rd12];
mov.b32 {%h1, %h2}, %hh1;
mul.wide.u32 %rd13, %r6, 4;
add.s64 %rd14, %rd6, %rd13;
ld.global.nc.v2.u32 {%r9, %r10}, [%rd14];
cvt.rn.f16.s32 %h3, %r9;
mov.b16 %h4, 0x3C00;
sub.rn.f16 %h5, %h4, %h3;
mov.b16 %h6, 0x70E2;
mul.rn.f16 %h7, %h5, %h6;
sub.rn.f16 %h8, %h1, %h7;
cvt.f32.f16 %f2, %h8;
max.f32 %f3, %f2, 0fFF800000;
cvt.rn.f16.s32 %h9, %r10;
sub.rn.f16 %h10, %h4, %h9;
mul.rn.f16 %h11, %h10, %h6;
sub.rn.f16 %h12, %h2, %h11;
cvt.f32.f16 %f4, %h12;
max.f32 %f5, %f3, %f4;
or.b32 %r11, %r6, 64;
ld.global.nc.b32 %hh2, [%rd12+128];
mov.b32 {%h13, %h14}, %hh2;
mul.wide.u32 %rd15, %r11, 4;
add.s64 %rd16, %rd6, %rd15;
ld.global.nc.u32 %r12, [%rd16];
cvt.rn.f16.s32 %h15, %r12;
sub.rn.f16 %h16, %h4, %h15;
mul.rn.f16 %h17, %h16, %h6;
sub.rn.f16 %h18, %h13, %h17;
cvt.f32.f16 %f6, %h18;
max.f32 %f7, %f5, %f6;
ld.global.nc.u32 %r13, [%rd14+260];
cvt.rn.f16.s32 %h19, %r13;
sub.rn.f16 %h20, %h4, %h19;
mul.rn.f16 %h21, %h20, %h6;
sub.rn.f16 %h22, %h14, %h21;
cvt.f32.f16 %f8, %h22;
max.f32 %f9, %f7, %f8;
or.b32 %r14, %r6, 128;
ld.global.nc.b32 %hh3, [%rd12+256];
mov.b32 {%h23, %h24}, %hh3;
mul.wide.u32 %rd17, %r14, 4;
add.s64 %rd18, %rd6, %rd17;
ld.global.nc.u32 %r15, [%rd18];
cvt.rn.f16.s32 %h25, %r15;
sub.rn.f16 %h26, %h4, %h25;
mul.rn.f16 %h27, %h26, %h6;
sub.rn.f16 %h28, %h23, %h27;
cvt.f32.f16 %f10, %h28;
max.f32 %f11, %f9, %f10;
ld.global.nc.u32 %r16, [%rd14+516];
cvt.rn.f16.s32 %h29, %r16;
sub.rn.f16 %h30, %h4, %h29;
mul.rn.f16 %h31, %h30, %h6;
sub.rn.f16 %h32, %h24, %h31;
cvt.f32.f16 %f12, %h32;
max.f32 %f13, %f11, %f12;
or.b32 %r17, %r6, 192;
ld.global.nc.b32 %hh4, [%rd12+384];
mov.b32 {%h33, %h34}, %hh4;
mul.wide.u32 %rd19, %r17, 4;
add.s64 %rd20, %rd6, %rd19;
ld.global.nc.u32 %r18, [%rd20];
cvt.rn.f16.s32 %h35, %r18;
sub.rn.f16 %h36, %h4, %h35;
mul.rn.f16 %h37, %h36, %h6;
sub.rn.f16 %h38, %h33, %h37;
cvt.f32.f16 %f14, %h38;
max.f32 %f15, %f13, %f14;
ld.global.nc.u32 %r19, [%rd14+772];
cvt.rn.f16.s32 %h39, %r19;
sub.rn.f16 %h40, %h4, %h39;
mul.rn.f16 %h41, %h40, %h6;
sub.rn.f16 %h42, %h34, %h41;
cvt.f32.f16 %f16, %h42;
max.f32 %f17, %f15, %f16;
or.b32 %r20, %r6, 256;
ld.global.nc.b32 %hh5, [%rd12+512];
mov.b32 {%h43, %h44}, %hh5;
mul.wide.u32 %rd21, %r20, 4;
add.s64 %rd22, %rd6, %rd21;
ld.global.nc.u32 %r21, [%rd22];
cvt.rn.f16.s32 %h45, %r21;
sub.rn.f16 %h46, %h4, %h45;
mul.rn.f16 %h47, %h46, %h6;
sub.rn.f16 %h48, %h43, %h47;
cvt.f32.f16 %f18, %h48;
max.f32 %f19, %f17, %f18;
ld.global.nc.u32 %r22, [%rd14+1028];
cvt.rn.f16.s32 %h49, %r22;
sub.rn.f16 %h50, %h4, %h49;
mul.rn.f16 %h51, %h50, %h6;
sub.rn.f16 %h52, %h44, %h51;
cvt.f32.f16 %f20, %h52;
max.f32 %f21, %f19, %f20;
or.b32 %r23, %r6, 320;
ld.global.nc.b32 %hh6, [%rd12+640];
mov.b32 {%h53, %h54}, %hh6;
mul.wide.u32 %rd23, %r23, 4;
add.s64 %rd24, %rd6, %rd23;
ld.global.nc.u32 %r24, [%rd24];
cvt.rn.f16.s32 %h55, %r24;
sub.rn.f16 %h56, %h4, %h55;
mul.rn.f16 %h57, %h56, %h6;
sub.rn.f16 %h58, %h53, %h57;
cvt.f32.f16 %f22, %h58;
max.f32 %f23, %f21, %f22;
ld.global.nc.u32 %r25, [%rd14+1284];
cvt.rn.f16.s32 %h59, %r25;
sub.rn.f16 %h60, %h4, %h59;
mul.rn.f16 %h61, %h60, %h6;
sub.rn.f16 %h62, %h54, %h61;
cvt.f32.f16 %f24, %h62;
max.f32 %f25, %f23, %f24;
or.b32 %r26, %r6, 384;
ld.global.nc.b32 %hh7, [%rd12+768];
mov.b32 {%h63, %h64}, %hh7;
mul.wide.u32 %rd25, %r26, 4;
add.s64 %rd26, %rd6, %rd25;
ld.global.nc.u32 %r27, [%rd26];
cvt.rn.f16.s32 %h65, %r27;
sub.rn.f16 %h66, %h4, %h65;
mul.rn.f16 %h67, %h66, %h6;
sub.rn.f16 %h68, %h63, %h67;
cvt.f32.f16 %f26, %h68;
max.f32 %f27, %f25, %f26;
ld.global.nc.u32 %r28, [%rd14+1540];
cvt.rn.f16.s32 %h69, %r28;
sub.rn.f16 %h70, %h4, %h69;
mul.rn.f16 %h71, %h70, %h6;
sub.rn.f16 %h72, %h64, %h71;
cvt.f32.f16 %f28, %h72;
max.f32 %f29, %f27, %f28;
or.b32 %r29, %r6, 448;
ld.global.nc.b32 %hh8, [%rd12+896];
mov.b32 {%h73, %h74}, %hh8;
mul.wide.u32 %rd27, %r29, 4;
add.s64 %rd28, %rd6, %rd27;
ld.global.nc.u32 %r30, [%rd28];
cvt.rn.f16.s32 %h75, %r30;
sub.rn.f16 %h76, %h4, %h75;
mul.rn.f16 %h77, %h76, %h6;
sub.rn.f16 %h78, %h73, %h77;
cvt.f32.f16 %f30, %h78;
max.f32 %f31, %f29, %f30;
ld.global.nc.u32 %r31, [%rd14+1796];
cvt.rn.f16.s32 %h79, %r31;
sub.rn.f16 %h80, %h4, %h79;
mul.rn.f16 %h81, %h80, %h6;
sub.rn.f16 %h82, %h74, %h81;
cvt.f32.f16 %f32, %h82;
max.f32 %f33, %f31, %f32;
shfl.sync.down.b32 %f34, %f33, 16, 31, -1;
max.f32 %f35, %f33, %f34;
shfl.sync.down.b32 %f36, %f35, 8, 31, -1;
max.f32 %f37, %f35, %f36;
shfl.sync.down.b32 %f38, %f37, 4, 31, -1;
m
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment