Last active
December 6, 2017 19:42
-
-
Save kaushikcfd/a7ad1c5c9314e22f4b107665736f6838 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* lid(N): the value of get_local_id(N), cast to int. */
#define lid(N) ((int) get_local_id(N)) | |
/* gid(N): the value of get_group_id(N), cast to int.
 * Neither macro is referenced by the kernels visible in this file. */
#define gid(N) ((int) get_group_id(N)) | |
/* Enable the 64-bit base atomics extension: the accumulation kernel in this
 * file calls atom_cmpxchg on long operands. */
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable | |
/* Request the fp64 extension only when compiling as OpenCL C older than 1.2
 * (__OPENCL_C_VERSION__ < 120); for newer versions no pragma is emitted. */
#if __OPENCL_C_VERSION__ < 120 | |
#pragma OPENCL EXTENSION cl_khr_fp64: enable | |
#endif | |
/* 12 x 6 table, read as cnst[6 * row + col].
 * The accumulation kernel uses it twice: to contract the six w_0 coefficients
 * of an element into one value per row, and to weight those per-row values
 * when forming the A0 contribution.
 * NOTE(review): presumably basis-function values tabulated at 12 quadrature
 * points of the reference element - confirm against the generator. */
__constant double const cnst[12 * 6] = { -0.05512856699248408, 0.6533077030470595, -0.055128566992484106, 0.2205142679699365, 0.015920894998035683, 0.2205142679699366, -0.055128566992484036, -0.05512856699248409, 0.6533077030470597, 0.22051426796993637, 0.2205142679699362, 0.015920894998035957, 0.6533077030470594, -0.055128566992484154, -0.055128566992484106, 0.01592089499803581, 0.2205142679699364, 0.2205142679699366, -0.1249989825350975, 0.0014305795177887936, -0.12499898253509754, 0.49999593014038923, 0.24857552527162577, 0.4999959301403913, -0.1249989825350975, -0.12499898253509759, 0.0014305795177888207, 0.49999593014038923, 0.4999959301403911, 0.24857552527162594, 0.0014305795177898299, -0.12499898253509759, -0.12499898253509754, 0.24857552527162485, 0.4999959301403901, 0.4999959301403903, -0.04749625719880001, 0.17376836365417403, -0.11771516330842915, 0.790160442765823, 0.06597478591860526, 0.1353078281686269, -0.1177151633084291, 0.17376836365417417, -0.04749625719880001, 0.13530782816862674, 0.06597478591860512, 0.7901604427658232, -0.04749625719879997, -0.1177151633084292, 0.17376836365417409, 0.790160442765823, 0.13530782816862671, 0.06597478591860538, 0.1737683636541741, -0.11771516330842913, -0.04749625719880001, 0.06597478591860524, 0.1353078281686266, 0.7901604427658232, -0.11771516330842914, -0.047496257198800095, 0.17376836365417409, 0.13530782816862683, 0.7901604427658229, 0.06597478591860538, 0.17376836365417406, -0.04749625719880015, -0.11771516330842915, 0.06597478591860535, 0.7901604427658229, 0.1353078281686269 }; | |
/* 12 per-row weights, read as cnst_0[row]. Every term of the A0 and A1 sums
 * in the accumulation kernel is multiplied by the weight of its row.
 * NOTE(review): presumably quadrature weights (the entries add up to about
 * 0.5) - confirm against the generator. */
__constant double const cnst_0[12] = { 0.0254224531851035, 0.0254224531851035, 0.0254224531851035, 0.0583931378631895, 0.0583931378631895, 0.0583931378631895, 0.041425537809187, 0.041425537809187, 0.041425537809187, 0.041425537809187, 0.041425537809187, 0.041425537809187 }; | |
/* 12 x 6 table, read as cnst_1[6 * row + col].
 * The accumulation kernel contracts it with the six w_0 coefficients of an
 * element (result stored in sum_tmp_0_0) and uses it again as one of the
 * three weighting tables of the A0 contribution.
 * NOTE(review): presumably reference-space derivatives of the basis functions
 * along the first reference direction - confirm. */
__constant double const cnst_1[12 * 6] = { 0.7476439420339936, 2.4952878840679835, 0.0, 0.25235605796600685, -0.25235605796600685, -3.2429318261019766, 0.7476439420339914, -0.747643942033992, 0.0, 3.4952878840679777, -3.4952878840679777, 0.0, -2.495287884067985, -0.7476439420339958, 0.0, 0.25235605796600624, -0.25235605796600624, 3.2429318261019806, 0.0028530193163572646, 1.0057060386327157, 0.0, 0.9971469806836384, -0.9971469806836384, -1.0085590579490729, 0.0028530193163565083, -0.0028530193163601946, 0.0, 2.0057060386327135, -2.0057060386327135, 0.0, -1.0057060386327197, -0.0028530193163612467, 0.0, 0.9971469806836384, -0.9971469806836384, 1.0085590579490813, 0.7874198006207375, 1.546009996485596, 0.0, 1.2414098041351382, -1.2414098041351382, -2.333429797106333, -0.24140980413513827, 1.5460099964855956, 0.0, 0.2125801993792626, -0.2125801993792626, -1.304600192350457, 0.7874198006207367, 0.24140980413514002, 0.0, 2.5460099964855925, -2.5460099964855925, -1.0288296047558765, -1.5460099964855953, 0.2414098041351382, 0.0, 0.21258019937926237, -0.21258019937926237, 1.3046001923504573, -0.24140980413514057, -0.7874198006207365, 0.0, 2.546009996485593, -2.546009996485593, 1.0288296047558774, -1.546009996485597, -0.787419800620738, 0.0, 1.2414098041351387, -1.2414098041351387, 2.333429797106335 }; | |
/* 12 x 6 table, read as cnst_2[6 * row + col].
 * The accumulation kernel contracts it with the six w_0 coefficients of an
 * element (result stored in sum_tmp_1_0) and uses it again as one of the
 * three weighting tables of the A0 contribution.
 * NOTE(review): presumably reference-space derivatives of the basis functions
 * along the second reference direction - confirm. */
__constant double const cnst_2[12 * 6] = { 0.7476439420339873, 0.0, -0.7476439420339914, 3.4952878840679706, 1.1215314724299662e-14, -3.4952878840679857, 0.7476439420339868, 0.0, 2.495287884067986, 0.25235605796599214, -3.242931826101967, -0.25235605796600574, -2.4952878840679884, 0.0, -0.7476439420339918, 0.2523560579659977, 3.2429318261019837, -0.2523560579660064, 0.0028530193163513367, 0.0, -0.002853019316358805, 2.005706038632703, 1.428660823968979e-14, -2.0057060386327175, 0.0028530193163510222, 0.0, 1.0057060386327183, 0.9971469806836263, -1.0085590579490629, -0.99714698068364, -1.005706038632725, 0.0, -0.0028530193163588775, 0.9971469806836281, 1.0085590579490893, -0.9971469806836403, 0.7874198006207309, 0.0, 0.24140980413514163, 2.5460099964855814, -1.0288296047558645, -2.5460099964855973, -0.2414098041351445, 0.0, -0.7874198006207358, 2.5460099964855836, 1.0288296047558865, -2.5460099964855973, 0.7874198006207308, 0.0, 1.5460099964855984, 1.2414098041351245, -2.333429797106322, -1.2414098041351396, -1.5460099964856004, 0.0, -0.7874198006207358, 1.241409804135129, 2.333429797106341, -1.2414098041351402, -0.24140980413514518, 0.0, 1.5460099964855982, 0.21258019937925107, -1.304600192350448, -0.21258019937926315, -1.5460099964856011, 0.0, 0.2414098041351414, 0.21258019937925327, 1.304600192350464, -0.21258019937926348 }; | |
/* 12 x 10 table, read as cnst_3[10 * row + col].
 * The accumulation kernel contracts it with the ten w_1 coefficients of an
 * element (separately for each of the two interleaved w_1 components) and
 * uses it again to weight the per-row values when forming the A1 contribution.
 * NOTE(review): presumably values of a 10-function basis tabulated at the
 * same 12 points as the other tables - confirm. */
__constant double const cnst_3[12 * 10] = { 0.0463079953908666, 0.440268993398561, 0.04630799539086663, 0.40225091496147397, -0.20112545748073685, -0.01452104355632575, -0.014521043556325792, -0.201125457480737, 0.40225091496147386, 0.09390618797088351, 0.04630799539086659, 0.04630799539086662, 0.44026899339856124, -0.20112545748073674, 0.40225091496147364, -0.20112545748073687, 0.40225091496147347, -0.014521043556325774, -0.014521043556325662, 0.09390618797088346, 0.44026899339856096, 0.04630799539086654, 0.046307995390866656, -0.014521043556325686, -0.014521043556325821, 0.40225091496147364, -0.2011254574807369, 0.40225091496147364, -0.20112545748073685, 0.09390618797088351, 0.03935168581745833, -0.0626737220523999, 0.03935168581745875, 0.2836549261579288, -0.14182746307896527, -0.07051024619919724, -0.0705102461991982, -0.14182746307896418, 0.2836549261579299, 0.8413359166579489, 0.0393516858174583, 0.0393516858174587, -0.06267372205239984, -0.14182746307896513, 0.2836549261579287, -0.1418274630789641, 0.28365492615792975, -0.07051024619919738, -0.07051024619919796, 0.8413359166579489, -0.06267372205240003, 0.03935168581745864, 0.039351685817458765, -0.07051024619919775, -0.0705102461991979, 0.2836549261579309, -0.1418274630789656, 0.28365492615793075, -0.1418274630789653, 0.8413359166579472, 0.04111072846643509, -0.02619322659935562, 0.011435826065379854, 0.8084889526681459, -0.06128522144874131, -0.062388096817690426, -0.005117035916025209, -0.12795187989530318, 0.1384464196928617, 0.2834535337842932, 0.01143582606537985, -0.02619322659935547, 0.041110728466435105, 0.13844641969286162, -0.1279518798953029, -0.005117035916025165, -0.062388096817690385, -0.06128522144874131, 0.8084889526681459, 0.2834535337842927, 0.04111072846643504, 0.011435826065379873, -0.026193226599355485, -0.0612852214487412, 0.8084889526681459, -0.12795187989530296, 0.13844641969286167, -0.06238809681769047, -0.005117035916025135, 0.28345353378429305, -0.026193226599355555, 
0.011435826065379759, 0.04111072846643509, -0.005117035916025144, -0.062388096817690426, 0.1384464196928615, -0.127951879895303, 0.8084889526681458, -0.061285221448741094, 0.2834535337842927, 0.01143582606537988, 0.04111072846643508, -0.026193226599355596, -0.127951879895303, 0.13844641969286164, -0.06128522144874138, 0.8084889526681458, -0.005117035916025219, -0.06238809681769036, 0.28345353378429317, -0.026193226599355586, 0.041110728466435056, 0.011435826065379875, -0.06238809681769045, -0.005117035916025181, 0.8084889526681456, -0.061285221448741406, 0.13844641969286173, -0.127951879895303, 0.28345353378429317 }; | |
/*
 * Zero-initialisation kernel for the two output arrays.
 *
 * Writes 0.0 to A1_global[0 .. 2 * A1_size - 1] (two consecutive doubles per
 * entry) and to A0_global[0 .. A0_size - 1]. A non-positive size leaves the
 * corresponding array untouched.
 *
 * All other parameters are not referenced here; the parameter list is the
 * same as that of the accumulation kernel defined later in this file. The
 * body does not query any work-item id, so every work-item that runs the
 * kernel performs the complete initialisation.
 */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel(int const w_1_global_len, __global double const *__restrict__ w_1_global, __global int const *__restrict__ ltg_2, int const nelements, int const w_0_global_len, __global double const *__restrict__ w_0_global, __global int const *__restrict__ ltg_1, int const coords_global_len, __global double const *__restrict__ coords_global, __global int const *__restrict__ ltg_0, __global volatile double *__restrict__ A0_global, int const A0_size, __global volatile double *__restrict__ A1_global, int const A1_size)
{
    /* A1 stores two doubles per entry: clear both of them. */
    for (int entry = 0; entry < A1_size; ++entry)
    {
        __global volatile double *pair = A1_global + 2 * entry;

        pair[0] = 0.0;
        pair[1] = 0.0;
    }

    /* A0 stores one double per entry. */
    for (int entry = 0; entry < A0_size; ++entry)
    {
        A0_global[entry] = 0.0;
    }
}
/*
 * Atomically performs *target += increment on a double in global memory.
 *
 * Implemented as a compare-and-swap retry loop on the 64-bit pattern of the
 * value: read the current value, compute the sum, and publish it with
 * atom_cmpxchg; if another update changed the location in between, the
 * exchange fails and the loop starts over with the fresh value.
 *
 * The double <-> long reinterpretation goes through a union rather than
 * through casts between incompatible pointer types.
 */
void loopy_atomic_add_f64(__global volatile double *target, double const increment)
{
    union
    {
        double f64;
        long bits;
    } expected, desired;

    do
    {
        expected.f64 = *target;
        desired.f64 = expected.f64 + increment;
    }
    while (atom_cmpxchg((__global volatile long *) target, expected.bits, desired.bits) != expected.bits);
}

/*
 * Element loop that accumulates per-element contributions into A0_global and
 * A1_global.
 *
 * For every element iel in [0, nelements):
 *   - three vertex indices are read from ltg_0[3 * iel + {0, 1, 2}], and the
 *     vertex coordinates from coords_global (two doubles per vertex index);
 *   - six coefficients are read from w_0_global via ltg_1[6 * iel + k];
 *   - ten two-component coefficients are read from w_1_global via
 *     ltg_2[10 * iel + k] (component c at 2 * index + c);
 *   - ten values per component are added to A1_global[2 * ltg_2[...] + c];
 *   - six values are added to A0_global[ltg_1[...]].
 * All additions to A0_global / A1_global use loopy_atomic_add_f64.
 *
 * w_1_global_len, w_0_global_len, coords_global_len, A0_size and A1_size are
 * not referenced by this kernel.
 *
 * The body does not query any work-item id: every work-item that runs the
 * kernel processes all elements, so launching more than one work-item would
 * accumulate every contribution more than once.
 *
 * NOTE(review): the structure matches the action of "gradient-gradient plus
 * mass" on w_0 (into A0) and of a mass operator on w_1 (into A1) for
 * triangles with affine geometry - presumably what the form compiler was
 * asked for; confirm against the generating script.
 * NOTE(review): there is no guard against a zero determinant (degenerate
 * element); the divisions below then produce inf/NaN.
 *
 * Relative to the generated original, the vertex loads, edge differences and
 * determinant are evaluated once per element instead of being re-spelled in
 * every inner-loop expression. Each floating-point sum keeps the original
 * operation order; "* 1.0" factors are dropped and "-1.0 *" factors are
 * written as negations, both of which are exact.
 */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0(int const w_1_global_len, __global double const *__restrict__ w_1_global, __global int const *__restrict__ ltg_2, int const nelements, int const w_0_global_len, __global double const *__restrict__ w_0_global, __global int const *__restrict__ ltg_1, int const coords_global_len, __global double const *__restrict__ coords_global, __global int const *__restrict__ ltg_0, __global volatile double *__restrict__ A0_global, int const A0_size, __global volatile double *__restrict__ A1_global, int const A1_size)
{
    for (int iel = 0; iel < nelements; ++iel)
    {
        /* Per-row contraction of the w_1 coefficients with cnst_3, one set
         * for each of the two interleaved components. */
        double w1_rows[2][12];
        for (int comp = 1; comp >= 0; --comp)
        {
            for (int row = 0; row < 12; ++row)
            {
                double acc = 0.0;
                for (int k = 0; k < 10; ++k)
                    acc = acc + cnst_3[10 * row + k] * w_1_global[2 * ltg_2[10 * iel + k] + comp];
                w1_rows[comp][row] = acc;
            }
        }

        /* Per-row contractions of the w_0 coefficients with cnst_1, cnst_2
         * and cnst. */
        double w0_rows_c1[12];
        double w0_rows_c2[12];
        double w0_rows_c[12];
        for (int row = 0; row < 12; ++row)
        {
            double acc_c1 = 0.0;
            double acc_c2 = 0.0;
            double acc_c = 0.0;
            for (int k = 0; k < 6; ++k)
            {
                double const coeff = w_0_global[ltg_1[6 * iel + k]];

                acc_c2 = acc_c2 + cnst_2[6 * row + k] * coeff;
                acc_c1 = acc_c1 + cnst_1[6 * row + k] * coeff;
                acc_c = acc_c + cnst[6 * row + k] * coeff;
            }
            w0_rows_c2[row] = acc_c2;
            w0_rows_c1[row] = acc_c1;
            w0_rows_c[row] = acc_c;
        }

        /* Vertex coordinates of the element, edge differences relative to
         * the first vertex, and the 2x2 determinant they form. */
        double const x0 = coords_global[2 * ltg_0[3 * iel]];
        double const y0 = coords_global[2 * ltg_0[3 * iel] + 1];
        double const x1 = coords_global[2 * ltg_0[3 * iel + 1]];
        double const y1 = coords_global[2 * ltg_0[3 * iel + 1] + 1];
        double const x2 = coords_global[2 * ltg_0[3 * iel + 2]];
        double const y2 = coords_global[2 * ltg_0[3 * iel + 2] + 1];
        double const dx1 = x1 - x0;
        double const dy1 = y1 - y0;
        double const dx2 = x2 - x0;
        double const dy2 = y2 - y0;
        double const det = dx1 * dy2 - dx2 * dy1;
        double const abs_det = fabs(det);

        /* A1 contribution, component 1 first and then component 0 (the
         * order used by the generated code). */
        for (int comp = 1; comp >= 0; --comp)
        {
            double a1_local[10];
            for (int j = 0; j < 10; ++j)
            {
                double acc = 0.0;
                for (int row = 0; row < 12; ++row)
                    acc = acc + cnst_3[10 * row + j] * w1_rows[comp][row] * cnst_0[row] * abs_det;
                a1_local[j] = acc;
            }
            for (int j = 0; j < 10; ++j)
                loopy_atomic_add_f64(&A1_global[2 * ltg_2[10 * iel + j] + comp], a1_local[j]);
        }

        /* Geometry-transformed per-row factors for the A0 contribution:
         * factor_c1 is paired with cnst_1 and factor_c2 with cnst_2.
         * NOTE(review): g_x / g_y look like the physical-space gradient of
         * w_0, and the factors like that gradient pulled back to the
         * reference directions - confirm. */
        double factor_c1[12];
        double factor_c2[12];
        for (int row = 0; row < 12; ++row)
        {
            double const s0 = w0_rows_c1[row];
            double const s1 = w0_rows_c2[row];
            double const g_x = s0 * dy2 / det - s1 / det * dy1;
            double const g_y = -(s0 / det) * dx2 + s1 * dx1 / det;

            factor_c2[row] = -(g_x / det) * dy1 + g_y * dx1 / det;
            factor_c1[row] = g_x * dy2 / det - g_y / det * dx2;
        }

        /* A0 contribution. The three terms are added to the accumulator in
         * the original left-to-right order. */
        double a0_local[6];
        for (int j = 0; j < 6; ++j)
        {
            double acc = 0.0;
            for (int row = 0; row < 12; ++row)
                acc = acc
                    + cnst_2[6 * row + j] * factor_c2[row] * cnst_0[row] * abs_det
                    + cnst[6 * row + j] * w0_rows_c[row] * cnst_0[row] * abs_det
                    + cnst_1[6 * row + j] * factor_c1[row] * cnst_0[row] * abs_det;
            a0_local[j] = acc;
        }
        for (int j = 0; j < 6; ++j)
            loopy_atomic_add_f64(&A0_global[ltg_1[6 * iel + j]], a0_local[j]);
    }
}
Unrolled dim_init_1 and added a few assumptions.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is the kernel without any iname splitting.