Skip to content

Instantly share code, notes, and snippets.

@castano
Created April 26, 2020 16:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save castano/8fff1eed89eb1bb49a132c92f5e62879 to your computer and use it in GitHub Desktop.
Save castano/8fff1eed89eb1bb49a132c92f5e62879 to your computer and use it in GitHub Desktop.
// Per-lane search state: each of the 8 lanes tracks the lowest error it has
// seen and the endpoint pair that produced it. The endpoint vectors are left
// uninitialized on purpose; a lane only becomes meaningful once it has won.
Wide8 besterror8 = broadcast8(FLT_MAX);
Vector3_Wide8 beststart8;
Vector3_Wide8 bestend8;

// Evaluate every candidate cluster split for this total order, taking eight
// consecutive table entries per pass (one per lane).
// NOTE(review): the gather below always reads entries i..i+7, even when fewer
// than 8 remain before s_fourClusterTotal[count - 1]; presumably the table is
// padded to a multiple of 8 -- confirm.
for (int i = 0; i < s_fourClusterTotal[count - 1]; i += 8)
{
    // Copies one x_sat / w_sat entry into a single lane of a wide value.
    auto gatherLane = [&](Vector3_Wide8 & xs, Wide8 & ws, int lane, uint index) {
        xs.x.e[lane] = x_sat[index].x;
        xs.y.e[lane] = x_sat[index].y;
        xs.z.e[lane] = x_sat[index].z;
        ws.e[lane] = w_sat[index];
    };

    // Values of x_sat / w_sat at the three split indices of each candidate.
    Vector3_Wide8 x0;
    Wide8 w0;
    Vector3_Wide8 x1;
    Wide8 w1;
    Vector3_Wide8 x2;
    Wide8 w2;

    // @@ Scalar gather, lane by lane. Is there a better way to do this
    // (e.g. loading the packed cluster indices and using a vector gather)?
    for (int lane = 0; lane < 8; lane++) {
        const uint i0 = s_fourCluster[i + lane].c0;
        const uint i1 = s_fourCluster[i + lane].c1;
        const uint i2 = s_fourCluster[i + lane].c2;

        gatherLane(x0, w0, lane, i0);
        gatherLane(x1, w1, lane, i1);
        gatherLane(x2, w2, lane, i2);
    }

    // Turn the three gathered values into per-cluster values by differencing
    // neighbours. Order matters: x2/w2 must be reduced before x1/w1 change.
    x2 = x2 - x1;
    x1 = x1 - x0;
    w2 = w2 - w1;
    w1 = w1 - w0;

    // The fourth cluster's weight is whatever remains of the total.
    Wide8 w3 = broadcast8(m_wsum) - w0 - w1 - w2;

    // Constants used by the weighted sums below.
    // NOTE(review): these look like products of 1/3 and 2/3 interpolation
    // weights for a four-entry palette -- confirm against the scalar version.
    const Wide8 oneNinth = broadcast8(1.0f / 9.0f);
    const Wide8 fourNinths = broadcast8(4.0f / 9.0f);

    Wide8 alpha2_sum = mad8(w2, oneNinth, mad8(w1, fourNinths, w0));
    Wide8 beta2_sum = mad8(w1, oneNinth, mad8(w2, fourNinths, w3));
    Wide8 alphabeta_sum = (w1 + w2) * broadcast8(2.0f / 9.0f);

    // Reciprocal of the 2x2 determinant formed by the three sums above.
    Wide8 factor = rcp8(alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);

    Vector3_Wide8 alphax_sum = mad8(x2, broadcast8(1.0f / 3.0f), mad8(x1, broadcast8(2.0f / 3.0f), x0));
    Vector3_Wide8 betax_sum = broadcast8(m_xsum) - alphax_sum;

    // Solve for the two endpoints of every candidate.
    Vector3_Wide8 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
    Vector3_Wide8 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

    // Clamp to the grid: saturate first, then round with round_ept8.
    a = round_ept8(saturate8(a));
    b = round_ept8(saturate8(b));

    // Compute the error, evaluated as (quadratic part) + 2 * (cross part).
    // @@ Use fma here.
    Vector3_Wide8 quadratic = a * a * alpha2_sum + b * b * beta2_sum;
    Vector3_Wide8 cross = a * b * alphabeta_sum - a * alphax_sum - b * betax_sum;
    Vector3_Wide8 e1 = quadratic + cross * broadcast8(2.0f);

    // Plain sum of the three channels; the metric-weighted variant
    // dot8(e1, broadcast8(m_metricSqr)) is currently disabled.
    Wide8 error = e1.x + e1.y + e1.z;

    // Keep the candidate in every lane where the stored best is >= its error.
    // NOTE(review): this relies on select8(mask, p, q) yielding q where the
    // mask is set -- confirm against select8's definition.
    const auto wins = ge8(besterror8, error);
    besterror8 = select8(wins, besterror8, error); // @@ Use min?
    beststart8 = select8(wins, beststart8, a);
    bestend8 = select8(wins, bestend8, b);
}
// Is there a better way to do this reduction?
int bestindex;
for (int i = 0; i < 8; i++) {
if (besterror8.e[i]< besterror) {
besterror = besterror8.e[i];
bestindex = i;
}
}
beststart.x = beststart8.x.e[bestindex];
beststart.y = beststart8.y.e[bestindex];
beststart.z = beststart8.z.e[bestindex];
bestend.x = bestend8.x.e[bestindex];
bestend.y = bestend8.y.e[bestindex];
bestend.z = bestend8.z.e[bestindex];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment