Skip to content

Instantly share code, notes, and snippets.

@kaushikcfd
Last active December 6, 2017 19:42
Show Gist options
  • Save kaushikcfd/a7ad1c5c9314e22f4b107665736f6838 to your computer and use it in GitHub Desktop.
Save kaushikcfd/a7ad1c5c9314e22f4b107665736f6838 to your computer and use it in GitHub Desktop.
/* Shorthands for the work-item's local id and the work-group id along
 * dimension N, converted to int. Neither macro is referenced by the kernels
 * visible in this file. */
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
/* The accumulation kernel below calls atom_cmpxchg on 64-bit (long) values,
 * which is what this extension is enabled for. */
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
/* Double precision is used throughout this file; the fp64 extension is only
 * requested explicitly when compiling as OpenCL C older than 1.2. */
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
/* 12 x 6 table stored row-major: the kernels read it as cnst[6 * row + col]
 * with row in 0..11 and col in 0..5. It is contracted with six w_0_global
 * entries per element and reused as a factor when accumulating into A0_global.
 * NOTE(review): presumably basis-function values tabulated at 12 quadrature
 * points -- confirm against the code generator. */
__constant double const cnst[12 * 6] = { -0.05512856699248408, 0.6533077030470595, -0.055128566992484106, 0.2205142679699365, 0.015920894998035683, 0.2205142679699366, -0.055128566992484036, -0.05512856699248409, 0.6533077030470597, 0.22051426796993637, 0.2205142679699362, 0.015920894998035957, 0.6533077030470594, -0.055128566992484154, -0.055128566992484106, 0.01592089499803581, 0.2205142679699364, 0.2205142679699366, -0.1249989825350975, 0.0014305795177887936, -0.12499898253509754, 0.49999593014038923, 0.24857552527162577, 0.4999959301403913, -0.1249989825350975, -0.12499898253509759, 0.0014305795177888207, 0.49999593014038923, 0.4999959301403911, 0.24857552527162594, 0.0014305795177898299, -0.12499898253509759, -0.12499898253509754, 0.24857552527162485, 0.4999959301403901, 0.4999959301403903, -0.04749625719880001, 0.17376836365417403, -0.11771516330842915, 0.790160442765823, 0.06597478591860526, 0.1353078281686269, -0.1177151633084291, 0.17376836365417417, -0.04749625719880001, 0.13530782816862674, 0.06597478591860512, 0.7901604427658232, -0.04749625719879997, -0.1177151633084292, 0.17376836365417409, 0.790160442765823, 0.13530782816862671, 0.06597478591860538, 0.1737683636541741, -0.11771516330842913, -0.04749625719880001, 0.06597478591860524, 0.1353078281686266, 0.7901604427658232, -0.11771516330842914, -0.047496257198800095, 0.17376836365417409, 0.13530782816862683, 0.7901604427658229, 0.06597478591860538, 0.17376836365417406, -0.04749625719880015, -0.11771516330842915, 0.06597478591860535, 0.7901604427658229, 0.1353078281686269 };
/* 12 scalars, one per table row: every accumulated term in the second kernel
 * is multiplied by cnst_0[row] and by the absolute value of the element's
 * 2 x 2 coordinate determinant. The entries sum to approximately 0.5.
 * NOTE(review): presumably quadrature weights on a reference triangle --
 * confirm against the code generator. */
__constant double const cnst_0[12] = { 0.0254224531851035, 0.0254224531851035, 0.0254224531851035, 0.0583931378631895, 0.0583931378631895, 0.0583931378631895, 0.041425537809187, 0.041425537809187, 0.041425537809187, 0.041425537809187, 0.041425537809187, 0.041425537809187 };
/* 12 x 6 table stored row-major (read as cnst_1[6 * row + col]). It is
 * contracted with six w_0_global entries per element and reused as a factor
 * when accumulating into A0_global.
 * NOTE(review): presumably derivatives of the same six functions as `cnst`
 * along the first reference direction -- confirm against the code generator. */
__constant double const cnst_1[12 * 6] = { 0.7476439420339936, 2.4952878840679835, 0.0, 0.25235605796600685, -0.25235605796600685, -3.2429318261019766, 0.7476439420339914, -0.747643942033992, 0.0, 3.4952878840679777, -3.4952878840679777, 0.0, -2.495287884067985, -0.7476439420339958, 0.0, 0.25235605796600624, -0.25235605796600624, 3.2429318261019806, 0.0028530193163572646, 1.0057060386327157, 0.0, 0.9971469806836384, -0.9971469806836384, -1.0085590579490729, 0.0028530193163565083, -0.0028530193163601946, 0.0, 2.0057060386327135, -2.0057060386327135, 0.0, -1.0057060386327197, -0.0028530193163612467, 0.0, 0.9971469806836384, -0.9971469806836384, 1.0085590579490813, 0.7874198006207375, 1.546009996485596, 0.0, 1.2414098041351382, -1.2414098041351382, -2.333429797106333, -0.24140980413513827, 1.5460099964855956, 0.0, 0.2125801993792626, -0.2125801993792626, -1.304600192350457, 0.7874198006207367, 0.24140980413514002, 0.0, 2.5460099964855925, -2.5460099964855925, -1.0288296047558765, -1.5460099964855953, 0.2414098041351382, 0.0, 0.21258019937926237, -0.21258019937926237, 1.3046001923504573, -0.24140980413514057, -0.7874198006207365, 0.0, 2.546009996485593, -2.546009996485593, 1.0288296047558774, -1.546009996485597, -0.787419800620738, 0.0, 1.2414098041351387, -1.2414098041351387, 2.333429797106335 };
/* 12 x 6 table stored row-major (read as cnst_2[6 * row + col]). It is
 * contracted with six w_0_global entries per element and reused as a factor
 * when accumulating into A0_global.
 * NOTE(review): presumably derivatives of the same six functions as `cnst`
 * along the second reference direction -- confirm against the code generator. */
__constant double const cnst_2[12 * 6] = { 0.7476439420339873, 0.0, -0.7476439420339914, 3.4952878840679706, 1.1215314724299662e-14, -3.4952878840679857, 0.7476439420339868, 0.0, 2.495287884067986, 0.25235605796599214, -3.242931826101967, -0.25235605796600574, -2.4952878840679884, 0.0, -0.7476439420339918, 0.2523560579659977, 3.2429318261019837, -0.2523560579660064, 0.0028530193163513367, 0.0, -0.002853019316358805, 2.005706038632703, 1.428660823968979e-14, -2.0057060386327175, 0.0028530193163510222, 0.0, 1.0057060386327183, 0.9971469806836263, -1.0085590579490629, -0.99714698068364, -1.005706038632725, 0.0, -0.0028530193163588775, 0.9971469806836281, 1.0085590579490893, -0.9971469806836403, 0.7874198006207309, 0.0, 0.24140980413514163, 2.5460099964855814, -1.0288296047558645, -2.5460099964855973, -0.2414098041351445, 0.0, -0.7874198006207358, 2.5460099964855836, 1.0288296047558865, -2.5460099964855973, 0.7874198006207308, 0.0, 1.5460099964855984, 1.2414098041351245, -2.333429797106322, -1.2414098041351396, -1.5460099964856004, 0.0, -0.7874198006207358, 1.241409804135129, 2.333429797106341, -1.2414098041351402, -0.24140980413514518, 0.0, 1.5460099964855982, 0.21258019937925107, -1.304600192350448, -0.21258019937926315, -1.5460099964856011, 0.0, 0.2414098041351414, 0.21258019937925327, 1.304600192350464, -0.21258019937926348 };
/* 12 x 10 table stored row-major (read as cnst_3[10 * row + col] with row in
 * 0..11 and col in 0..9). It is contracted with ten two-component w_1_global
 * entries per element and reused as a factor when accumulating into A1_global.
 * NOTE(review): presumably values of ten basis functions at the same 12
 * points as the other tables -- confirm against the code generator. */
__constant double const cnst_3[12 * 10] = { 0.0463079953908666, 0.440268993398561, 0.04630799539086663, 0.40225091496147397, -0.20112545748073685, -0.01452104355632575, -0.014521043556325792, -0.201125457480737, 0.40225091496147386, 0.09390618797088351, 0.04630799539086659, 0.04630799539086662, 0.44026899339856124, -0.20112545748073674, 0.40225091496147364, -0.20112545748073687, 0.40225091496147347, -0.014521043556325774, -0.014521043556325662, 0.09390618797088346, 0.44026899339856096, 0.04630799539086654, 0.046307995390866656, -0.014521043556325686, -0.014521043556325821, 0.40225091496147364, -0.2011254574807369, 0.40225091496147364, -0.20112545748073685, 0.09390618797088351, 0.03935168581745833, -0.0626737220523999, 0.03935168581745875, 0.2836549261579288, -0.14182746307896527, -0.07051024619919724, -0.0705102461991982, -0.14182746307896418, 0.2836549261579299, 0.8413359166579489, 0.0393516858174583, 0.0393516858174587, -0.06267372205239984, -0.14182746307896513, 0.2836549261579287, -0.1418274630789641, 0.28365492615792975, -0.07051024619919738, -0.07051024619919796, 0.8413359166579489, -0.06267372205240003, 0.03935168581745864, 0.039351685817458765, -0.07051024619919775, -0.0705102461991979, 0.2836549261579309, -0.1418274630789656, 0.28365492615793075, -0.1418274630789653, 0.8413359166579472, 0.04111072846643509, -0.02619322659935562, 0.011435826065379854, 0.8084889526681459, -0.06128522144874131, -0.062388096817690426, -0.005117035916025209, -0.12795187989530318, 0.1384464196928617, 0.2834535337842932, 0.01143582606537985, -0.02619322659935547, 0.041110728466435105, 0.13844641969286162, -0.1279518798953029, -0.005117035916025165, -0.062388096817690385, -0.06128522144874131, 0.8084889526681459, 0.2834535337842927, 0.04111072846643504, 0.011435826065379873, -0.026193226599355485, -0.0612852214487412, 0.8084889526681459, -0.12795187989530296, 0.13844641969286167, -0.06238809681769047, -0.005117035916025135, 0.28345353378429305, -0.026193226599355555, 
0.011435826065379759, 0.04111072846643509, -0.005117035916025144, -0.062388096817690426, 0.1384464196928615, -0.127951879895303, 0.8084889526681458, -0.061285221448741094, 0.2834535337842927, 0.01143582606537988, 0.04111072846643508, -0.026193226599355596, -0.127951879895303, 0.13844641969286164, -0.06128522144874138, 0.8084889526681458, -0.005117035916025219, -0.06238809681769036, 0.28345353378429317, -0.026193226599355586, 0.041110728466435056, 0.011435826065379875, -0.06238809681769045, -0.005117035916025181, 0.8084889526681456, -0.061285221448741406, 0.13844641969286173, -0.127951879895303, 0.28345353378429317 };
/*
 * Zero-fills the two output buffers.
 *
 *   A1_global  receives 2 * A1_size zeros (two consecutive doubles per entry).
 *   A0_global  receives A0_size zeros.
 *
 * All other parameters are unused here; they mirror the signature of the
 * accumulation kernel that follows. The kernel does not consult any
 * work-item id, so each work-item that runs it clears both buffers in full.
 */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel(int const w_1_global_len, __global double const *__restrict__ w_1_global, __global int const *__restrict__ ltg_2, int const nelements, int const w_0_global_len, __global double const *__restrict__ w_0_global, __global int const *__restrict__ ltg_1, int const coords_global_len, __global double const *__restrict__ coords_global, __global int const *__restrict__ ltg_0, __global volatile double *__restrict__ A0_global, int const A0_size, __global volatile double *__restrict__ A1_global, int const A1_size)
{
  /* Clear both slots of every A1 entry, lower slot first. */
  int entry = 0;
  while (entry < A1_size)
  {
    __global volatile double *slot = A1_global + 2 * entry;
    slot[0] = 0.0;
    slot[1] = 0.0;
    ++entry;
  }

  /* Clear A0, one double per entry. */
  int pos = 0;
  while (pos < A0_size)
  {
    A0_global[pos] = 0.0;
    ++pos;
  }
}
/*
 * Atomically adds `increment` to the double stored at `target`.
 *
 * Implemented as a compare-and-swap retry loop: the current value is read,
 * the sum is formed, and atom_cmpxchg installs the new 64-bit pattern only if
 * the location still holds the pattern that was read; otherwise the loop
 * retries with the fresh value.
 */
void loopy_atomic_add_f64(__global volatile double *target, double const increment)
{
  double seen;
  double wanted;
  do
  {
    seen = *target;
    wanted = seen + increment;
  }
  while (atom_cmpxchg((__global volatile long *) target, *(long *) &seen, *(long *) &wanted) != *(long *) &seen);
}

/*
 * Per-element accumulation into A0_global and A1_global.
 *
 * For every element iel in [0, nelements):
 *   - ltg_0[3 * iel + k], k = 0..2, selects three points whose (x, y) pairs
 *     are stored interleaved in coords_global; their differences form a 2 x 2
 *     matrix whose determinant scales every contribution.
 *   - ltg_2[10 * iel + b], b = 0..9, selects ten entries of w_1_global and
 *     A1_global, each holding two interleaved components.
 *   - ltg_1[6 * iel + b], b = 0..5, selects six entries of w_0_global and
 *     A0_global.
 * Contributions are added with an atomic compare-and-swap loop, so the output
 * buffers must have been initialised beforehand (the zeroing kernel in this
 * file does that).
 *
 * w_1_global_len, w_0_global_len, coords_global_len, A0_size and A1_size are
 * not used. The kernel does not consult any work-item id: each work-item that
 * runs it processes all elements.
 *
 * NOTE(review): this looks like finite-element residual assembly on triangles
 * (the name mentions tsfc), with cnst_0 as quadrature weights and the other
 * cnst tables as basis tabulations -- confirm against the generator.
 *
 * Compared with the generated original, the element geometry and the per-row
 * transformed quantities are computed once instead of inside every innermost
 * iteration. The floating-point operation order of each expression is kept.
 * A zero determinant still produces inf/NaN, exactly as before.
 */
__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel_and_loopy_kernel_and_loopy_kernel_and_tsfc_kernel_and_loopy_kernel_0(int const w_1_global_len, __global double const *__restrict__ w_1_global, __global int const *__restrict__ ltg_2, int const nelements, int const w_0_global_len, __global double const *__restrict__ w_0_global, __global int const *__restrict__ ltg_1, int const coords_global_len, __global double const *__restrict__ coords_global, __global int const *__restrict__ ltg_0, __global volatile double *__restrict__ A0_global, int const A0_size, __global volatile double *__restrict__ A1_global, int const A1_size)
{
  for (int iel = 0; iel < nelements; ++iel)
  {
    /* ---- Element geometry: loaded and evaluated once per element. ---- */
    int const p0 = ltg_0[3 * iel];
    int const p1 = ltg_0[3 * iel + 1];
    int const p2 = ltg_0[3 * iel + 2];

    double const x0 = coords_global[2 * p0];
    double const y0 = coords_global[2 * p0 + 1];

    double const j00 = -1.0 * x0 + coords_global[2 * p1];      /* x1 - x0 */
    double const j01 = -1.0 * x0 + coords_global[2 * p2];      /* x2 - x0 */
    double const j10 = -1.0 * y0 + coords_global[2 * p1 + 1];  /* y1 - y0 */
    double const j11 = -1.0 * y0 + coords_global[2 * p2 + 1];  /* y2 - y0 */

    double const det = j00 * j11 + -1.0 * j01 * j10;
    double const abs_det = fabs(det);

    /* ---- Contract cnst_3 with both components of w_1 (12 rows x 10). ---- */
    double w1_hi[12];  /* component at offset +1 */
    double w1_lo[12];  /* component at offset +0 */
    for (int q = 0; q < 12; ++q)
    {
      double acc_hi = 0.0;
      double acc_lo = 0.0;
      for (int b = 0; b < 10; ++b)
      {
        int const node = ltg_2[10 * iel + b];
        acc_hi = acc_hi + cnst_3[10 * q + b] * w_1_global[2 * node + 1];
        acc_lo = acc_lo + cnst_3[10 * q + b] * w_1_global[2 * node];
      }
      w1_hi[q] = acc_hi;
      w1_lo[q] = acc_lo;
    }

    /* ---- Contract cnst, cnst_1, cnst_2 with w_0 (12 rows x 6) and apply
     *      the inverse 2 x 2 matrix to the two cnst_1/cnst_2 results. ---- */
    double w0_val[12];  /* cnst contraction */
    double flux_1[12];  /* factor that multiplies cnst_1 in the A0 sum */
    double flux_2[12];  /* factor that multiplies cnst_2 in the A0 sum */
    for (int q = 0; q < 12; ++q)
    {
      double d2 = 0.0;   /* cnst_2 contraction */
      double d1 = 0.0;   /* cnst_1 contraction */
      double val = 0.0;  /* cnst contraction */
      for (int b = 0; b < 6; ++b)
      {
        double const coeff = w_0_global[ltg_1[6 * iel + b]];
        d2 = d2 + cnst_2[6 * q + b] * coeff;
        d1 = d1 + cnst_1[6 * q + b] * coeff;
        val = val + cnst[6 * q + b] * coeff;
      }
      w0_val[q] = val;

      /* Same operand order as the generated expression. */
      double const g_a = d1 * j11 * 1.0 / det + d2 * 1.0 / det * -1.0 * j10;
      double const g_b = d1 * 1.0 / det * -1.0 * j01 + d2 * j00 * 1.0 / det;

      flux_2[q] = g_a * 1.0 / det * -1.0 * j10 + g_b * j00 * 1.0 / det;
      flux_1[q] = g_a * j11 * 1.0 / det + g_b * 1.0 / det * -1.0 * j01;
    }

    /* ---- Accumulate into A1, component +1. ---- */
    for (int b = 0; b < 10; ++b)
    {
      double acc = 0.0;
      for (int q = 0; q < 12; ++q)
        acc = acc + cnst_3[10 * q + b] * w1_hi[q] * cnst_0[q] * abs_det;
      loopy_atomic_add_f64(&A1_global[2 * ltg_2[10 * iel + b] + 1], acc);
    }

    /* ---- Accumulate into A1, component +0. ---- */
    for (int b = 0; b < 10; ++b)
    {
      double acc = 0.0;
      for (int q = 0; q < 12; ++q)
        acc = acc + cnst_3[10 * q + b] * w1_lo[q] * cnst_0[q] * abs_det;
      loopy_atomic_add_f64(&A1_global[2 * ltg_2[10 * iel + b]], acc);
    }

    /* ---- Accumulate into A0: cnst_2, cnst and cnst_1 terms, in that
     *      order, as in the generated expression. ---- */
    for (int b = 0; b < 6; ++b)
    {
      double acc = 0.0;
      for (int q = 0; q < 12; ++q)
        acc = acc + cnst_2[6 * q + b] * flux_2[q] * cnst_0[q] * abs_det + cnst[6 * q + b] * w0_val[q] * cnst_0[q] * abs_det + cnst_1[6 * q + b] * flux_1[q] * cnst_0[q] * abs_det;
      loopy_atomic_add_f64(&A0_global[ltg_1[6 * iel + b]], acc);
    }
  }
}
@kaushikcfd
Copy link
Author

Unrolled dim_init_1 and added a few assumptions.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment