castano/avx-first-try.txt

## avx-first-try.txt
    Wide8 besterror8 = broadcast8(FLT_MAX);
    Vector3_Wide8 beststart8;
    Vector3_Wide8 bestend8;

    // check all possible clusters for this total order
    for (int i = 0; i < s_fourClusterTotal[count - 1]; i += 8)
    {
        //uint c0 = s_fourCluster[i].c0;
        //uint c1 = s_fourCluster[i].c1;
        //uint c2 = s_fourCluster[i].c2;

        // Load 4 uint8 per lane.
        //__m256i packedClusterIndex = _mm256_load_si256((__m256i *)&s_fourCluster[i]);

        Vector3_Wide8 x0;// = x_sat[c0];
        Wide8 w0;// = w_sat[c0];

        Vector3_Wide8 x1;// = x_sat[c1];
        Wide8 w1;// = w_sat[c1];

        Vector3_Wide8 x2;// = x_sat[c2];
        Wide8 w2;// = w_sat[c2];

        // @@ Is there a better way to do this?
        for (int l = 0; l < 8; l++) {
            uint c0 = s_fourCluster[i+l].c0;
            uint c1 = s_fourCluster[i+l].c1;
            uint c2 = s_fourCluster[i+l].c2;

            x0.x.e[l] = x_sat[c0].x;
            x0.y.e[l] = x_sat[c0].y;
            x0.z.e[l] = x_sat[c0].z;
            w0.e[l] = w_sat[c0];

            x1.x.e[l] = x_sat[c1].x;
            x1.y.e[l] = x_sat[c1].y;
            x1.z.e[l] = x_sat[c1].z;
            w1.e[l] = w_sat[c1];

            x2.x.e[l] = x_sat[c2].x;
            x2.y.e[l] = x_sat[c2].y;
            x2.z.e[l] = x_sat[c2].z;
            w2.e[l] = w_sat[c2];
        }

        x2 = x2 - x1;
        x1 = x1 - x0;
        w2 = w2 - w1;
        w1 = w1 - w0;

        Wide8 w3 = broadcast8(m_wsum) - w0 - w1 - w2;

        Wide8 alpha2_sum = mad8(w2, broadcast8(1.0f / 9.0f), mad8(w1, broadcast8(4.0f/ 9.0f), w0));
        Wide8 beta2_sum  = mad8(w1, broadcast8(1.0f / 9.0f), mad8(w2, broadcast8(4.0f / 9.0f), w3));

        Wide8 alphabeta_sum = (w1 + w2) * broadcast8(2.0f / 9.0f);
        Wide8 factor = rcp8(alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);

        Vector3_Wide8 alphax_sum = mad8(x2, broadcast8(1.0f / 3.0f), mad8(x1, broadcast8(2.0f / 3.0f), x0));
        Vector3_Wide8 betax_sum = broadcast8(m_xsum) - alphax_sum;

        Vector3_Wide8 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
        Vector3_Wide8 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

        // clamp to the grid
        a = saturate8(a);
        b = saturate8(b);
        a = round_ept8(a);
        b = round_ept8(b);

        // compute the error @@ Use fma here.
        Vector3_Wide8 e1 = a * a * alpha2_sum + b * b * beta2_sum + (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum) * broadcast8(2.0f);

        // apply the metric to the error term
        //Wide8 error = dot8(e1, broadcast8(m_metricSqr));
        Wide8 error = e1.x + e1.y + e1.z;

        // keep the solution if it wins
        auto mask = ge8(besterror8, error);
        besterror8 = select8(mask, besterror8, error);  // @@ Use min?
        beststart8 = select8(mask, beststart8, a);
        bestend8 = select8(mask, bestend8, b);
    }

    // Is there a better way to do this reduction?
    int bestindex;
    for (int i = 0; i < 8; i++) {
        if (besterror8.e[i]< besterror) {
            besterror = besterror8.e[i];
            bestindex = i;
        }
    }
    beststart.x = beststart8.x.e[bestindex];
    beststart.y = beststart8.y.e[bestindex];
    beststart.z = beststart8.z.e[bestindex];
    bestend.x = bestend8.x.e[bestindex];
    bestend.y = bestend8.y.e[bestindex];
    bestend.z = bestend8.z.e[bestindex];
	Wide8 besterror8 = broadcast8(FLT_MAX);
	Vector3_Wide8 beststart8;
	Vector3_Wide8 bestend8;

	// check all possible clusters for this total order
	for (int i = 0; i < s_fourClusterTotal[count - 1]; i += 8)
	{
	//uint c0 = s_fourCluster[i].c0;
	//uint c1 = s_fourCluster[i].c1;
	//uint c2 = s_fourCluster[i].c2;

	// Load 4 uint8 per lane.
	//__m256i packedClusterIndex = _mm256_load_si256((__m256i *)&s_fourCluster[i]);

	Vector3_Wide8 x0;// = x_sat[c0];
	Wide8 w0;// = w_sat[c0];

	Vector3_Wide8 x1;// = x_sat[c1];
	Wide8 w1;// = w_sat[c1];

	Vector3_Wide8 x2;// = x_sat[c2];
	Wide8 w2;// = w_sat[c2];

	// @@ Is there a better way to do this?
	for (int l = 0; l < 8; l++) {
	uint c0 = s_fourCluster[i+l].c0;
	uint c1 = s_fourCluster[i+l].c1;
	uint c2 = s_fourCluster[i+l].c2;

	x0.x.e[l] = x_sat[c0].x;
	x0.y.e[l] = x_sat[c0].y;
	x0.z.e[l] = x_sat[c0].z;
	w0.e[l] = w_sat[c0];

	x1.x.e[l] = x_sat[c1].x;
	x1.y.e[l] = x_sat[c1].y;
	x1.z.e[l] = x_sat[c1].z;
	w1.e[l] = w_sat[c1];

	x2.x.e[l] = x_sat[c2].x;
	x2.y.e[l] = x_sat[c2].y;
	x2.z.e[l] = x_sat[c2].z;
	w2.e[l] = w_sat[c2];
	}

	x2 = x2 - x1;
	x1 = x1 - x0;
	w2 = w2 - w1;
	w1 = w1 - w0;

	Wide8 w3 = broadcast8(m_wsum) - w0 - w1 - w2;

	Wide8 alpha2_sum = mad8(w2, broadcast8(1.0f / 9.0f), mad8(w1, broadcast8(4.0f/ 9.0f), w0));
	Wide8 beta2_sum = mad8(w1, broadcast8(1.0f / 9.0f), mad8(w2, broadcast8(4.0f / 9.0f), w3));

	Wide8 alphabeta_sum = (w1 + w2) * broadcast8(2.0f / 9.0f);
	Wide8 factor = rcp8(alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);

	Vector3_Wide8 alphax_sum = mad8(x2, broadcast8(1.0f / 3.0f), mad8(x1, broadcast8(2.0f / 3.0f), x0));
	Vector3_Wide8 betax_sum = broadcast8(m_xsum) - alphax_sum;

	Vector3_Wide8 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
	Vector3_Wide8 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

	// clamp to the grid
	a = saturate8(a);
	b = saturate8(b);
	a = round_ept8(a);
	b = round_ept8(b);

	// compute the error @@ Use fma here.
	Vector3_Wide8 e1 = a * a * alpha2_sum + b * b * beta2_sum + (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum) * broadcast8(2.0f);

	// apply the metric to the error term
	//Wide8 error = dot8(e1, broadcast8(m_metricSqr));
	Wide8 error = e1.x + e1.y + e1.z;

	// keep the solution if it wins
	auto mask = ge8(besterror8, error);
	besterror8 = select8(mask, besterror8, error); // @@ Use min?
	beststart8 = select8(mask, beststart8, a);
	bestend8 = select8(mask, bestend8, b);
	}

	// Is there a better way to do this reduction?
	int bestindex;
	for (int i = 0; i < 8; i++) {
	if (besterror8.e[i]< besterror) {
	besterror = besterror8.e[i];
	bestindex = i;
	}
	}
	beststart.x = beststart8.x.e[bestindex];
	beststart.y = beststart8.y.e[bestindex];
	beststart.z = beststart8.z.e[bestindex];
	bestend.x = bestend8.x.e[bestindex];
	bestend.y = bestend8.y.e[bestindex];
	bestend.z = bestend8.z.e[bestindex];