@gut
Last active April 7, 2017 14:05
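This patch changes THDoubleVector_muls_VSX and THFloatVector_muls_VSX in torch/lib/TH/vector/VSX.c from an element-wise product (y[i] *= x[i]) to a scalar product (y[i] = c * x[i]): a const scalar c is added to each signature, the loads of y are dropped (y becomes write-only), and every vec_mul takes a broadcast vector holding c instead of the old y vector. The scalar reference implementations (standardDouble_mul and standardFloat_mul, renamed to *_muls) and the performance and correctness tests are updated to pass c as well. As a minimal sketch of the broadcast step, assuming a VSX-capable compiler and <altivec.h> (vec_splats is an alternative intrinsic, not what the patch itself uses for floats):

#include <altivec.h>

/* Broadcast a scalar into a VSX register: two equivalent approaches. */
static inline vector double splat_f64(double c)
{
    vector double v = {c, c};   /* brace initializer, as the patch does for doubles */
    return v;
}

static inline vector float splat_f32(float c)
{
    return vec_splats(c);       /* single intrinsic; the patch instead fills a float val[4] and loads it with vec_xl */
}
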
diff --git c/torch/lib/TH/vector/VSX.c w/torch/lib/TH/vector/VSX.c
index 796d3b8..04877e1 100644
--- c/torch/lib/TH/vector/VSX.c
+++ w/torch/lib/TH/vector/VSX.c
@@ -399,7 +399,7 @@ static void THDoubleVector_scale_VSX(double *y, const double c, const ptrdiff_t
}
-static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t n)
+static void THDoubleVector_muls_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
{
ptrdiff_t i;
@@ -407,23 +407,11 @@ static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t
vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+ vector double c_fp64vec2 = {c, c};
for (i = 0; i <= n-24; i += 24)
{
- y0_fp64vec2 = vec_xl(0, y+(i ));
- y1_fp64vec2 = vec_xl(0, y+(i+2 ));
- y2_fp64vec2 = vec_xl(0, y+(i+4 ));
- y3_fp64vec2 = vec_xl(0, y+(i+6 ));
- y4_fp64vec2 = vec_xl(0, y+(i+8 ));
- y5_fp64vec2 = vec_xl(0, y+(i+10));
- y6_fp64vec2 = vec_xl(0, y+(i+12));
- y7_fp64vec2 = vec_xl(0, y+(i+14));
- y8_fp64vec2 = vec_xl(0, y+(i+16));
- y9_fp64vec2 = vec_xl(0, y+(i+18));
- y10_fp64vec2 = vec_xl(0, y+(i+20));
- y11_fp64vec2 = vec_xl(0, y+(i+22));
-
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
@@ -437,18 +425,18 @@ static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t
x10_fp64vec2 = vec_xl(0, x+(i+20));
x11_fp64vec2 = vec_xl(0, x+(i+22));
- y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
- y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
- y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
- y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
- y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2);
- y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2);
- y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2);
- y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2);
- y8_fp64vec2 = vec_mul(y8_fp64vec2, x8_fp64vec2);
- y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2);
- y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2);
- y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2);
+ y0_fp64vec2 = vec_mul(c_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_mul(c_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_mul(c_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_mul(c_fp64vec2, x3_fp64vec2);
+ y4_fp64vec2 = vec_mul(c_fp64vec2, x4_fp64vec2);
+ y5_fp64vec2 = vec_mul(c_fp64vec2, x5_fp64vec2);
+ y6_fp64vec2 = vec_mul(c_fp64vec2, x6_fp64vec2);
+ y7_fp64vec2 = vec_mul(c_fp64vec2, x7_fp64vec2);
+ y8_fp64vec2 = vec_mul(c_fp64vec2, x8_fp64vec2);
+ y9_fp64vec2 = vec_mul(c_fp64vec2, x9_fp64vec2);
+ y10_fp64vec2 = vec_mul(c_fp64vec2, x10_fp64vec2);
+ y11_fp64vec2 = vec_mul(c_fp64vec2, x11_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
@@ -465,20 +453,15 @@ static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t
}
for (; i <= n-8; i += 8)
{
- y0_fp64vec2 = vec_xl(0, y+(i ));
- y1_fp64vec2 = vec_xl(0, y+(i+2 ));
- y2_fp64vec2 = vec_xl(0, y+(i+4 ));
- y3_fp64vec2 = vec_xl(0, y+(i+6 ));
-
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
- y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
- y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
- y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
- y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
+ y0_fp64vec2 = vec_mul(c_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_mul(c_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_mul(c_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_mul(c_fp64vec2, x3_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
@@ -487,13 +470,12 @@ static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t
}
for (; i <= n-2; i += 2)
{
- y0_fp64vec2 = vec_xl(0, y+(i ));
x0_fp64vec2 = vec_xl(0, x+(i ));
- y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ y0_fp64vec2 = vec_mul(c_fp64vec2, x0_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
}
for (; i < n; i++)
- y[i] = y[i] * x[i];
+ y[i] = c * x[i];
}
@@ -885,7 +867,7 @@ static void THFloatVector_scale_VSX(float *y, const float c, const ptrdiff_t n)
-static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
+static void THFloatVector_muls_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
{
ptrdiff_t i;
@@ -893,23 +875,13 @@ static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+ vector float c_fp32vec4;
+ float val[4] = {c, c, c, c};
+ c_fp32vec4 = vec_xl(0, val);
for (i = 0; i <= n-48; i += 48)
{
- y0_fp32vec4 = vec_xl(0, y+(i ));
- y1_fp32vec4 = vec_xl(0, y+(i+4 ));
- y2_fp32vec4 = vec_xl(0, y+(i+8 ));
- y3_fp32vec4 = vec_xl(0, y+(i+12));
- y4_fp32vec4 = vec_xl(0, y+(i+16));
- y5_fp32vec4 = vec_xl(0, y+(i+20));
- y6_fp32vec4 = vec_xl(0, y+(i+24));
- y7_fp32vec4 = vec_xl(0, y+(i+28));
- y8_fp32vec4 = vec_xl(0, y+(i+32));
- y9_fp32vec4 = vec_xl(0, y+(i+36));
- y10_fp32vec4 = vec_xl(0, y+(i+40));
- y11_fp32vec4 = vec_xl(0, y+(i+44));
-
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
@@ -923,18 +895,18 @@ static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
x10_fp32vec4 = vec_xl(0, x+(i+40));
x11_fp32vec4 = vec_xl(0, x+(i+44));
- y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
- y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
- y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
- y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
- y4_fp32vec4 = vec_mul(y4_fp32vec4, x4_fp32vec4);
- y5_fp32vec4 = vec_mul(y5_fp32vec4, x5_fp32vec4);
- y6_fp32vec4 = vec_mul(y6_fp32vec4, x6_fp32vec4);
- y7_fp32vec4 = vec_mul(y7_fp32vec4, x7_fp32vec4);
- y8_fp32vec4 = vec_mul(y8_fp32vec4, x8_fp32vec4);
- y9_fp32vec4 = vec_mul(y9_fp32vec4, x9_fp32vec4);
- y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4);
- y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4);
+ y0_fp32vec4 = vec_mul(c_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_mul(c_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_mul(c_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_mul(c_fp32vec4, x3_fp32vec4);
+ y4_fp32vec4 = vec_mul(c_fp32vec4, x4_fp32vec4);
+ y5_fp32vec4 = vec_mul(c_fp32vec4, x5_fp32vec4);
+ y6_fp32vec4 = vec_mul(c_fp32vec4, x6_fp32vec4);
+ y7_fp32vec4 = vec_mul(c_fp32vec4, x7_fp32vec4);
+ y8_fp32vec4 = vec_mul(c_fp32vec4, x8_fp32vec4);
+ y9_fp32vec4 = vec_mul(c_fp32vec4, x9_fp32vec4);
+ y10_fp32vec4 = vec_mul(c_fp32vec4, x10_fp32vec4);
+ y11_fp32vec4 = vec_mul(c_fp32vec4, x11_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
@@ -951,20 +923,15 @@ static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
}
for (; i <= n-16; i += 16)
{
- y0_fp32vec4 = vec_xl(0, y+(i ));
- y1_fp32vec4 = vec_xl(0, y+(i+4 ));
- y2_fp32vec4 = vec_xl(0, y+(i+8 ));
- y3_fp32vec4 = vec_xl(0, y+(i+12));
-
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12));
- y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
- y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
- y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
- y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
+ y0_fp32vec4 = vec_mul(c_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_mul(c_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_mul(c_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_mul(c_fp32vec4, x3_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
@@ -973,13 +940,12 @@ static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
}
for (; i <= n-4; i += 4)
{
- y0_fp32vec4 = vec_xl(0, y+(i ));
x0_fp32vec4 = vec_xl(0, x+(i ));
- y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ y0_fp32vec4 = vec_mul(c_fp32vec4, x0_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
}
for (; i < n; i++)
- y[i] = y[i] * x[i];
+ y[i] = c * x[i];
}
@@ -1105,16 +1071,16 @@ static void standardFloat_scale(float *y, const float c, const ptrdiff_t n)
y[i] *= c;
}
-static void standardDouble_mul(double *y, const double *x, const ptrdiff_t n)
+static void standardDouble_muls(double *y, const double *x, const double c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- y[i] *= x[i];
+ y[i] = c * x[i];
}
-static void standardFloat_mul(float *y, const float *x, const ptrdiff_t n)
+static void standardFloat_muls(float *y, const float *x, const float c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
- y[i] *= x[i];
+ y[i] = c * x[i];
}
double randDouble()
@@ -1721,6 +1687,7 @@ void test_THDoubleVector_muls_VSX()
double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double c = randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
@@ -1736,20 +1703,20 @@ void test_THDoubleVector_muls_VSX()
// Performance Test
//-------------------------------------------------
start = clock();
- standardDouble_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS );
- standardDouble_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardDouble_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardDouble_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardDouble_muls() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THDoubleVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS );
- THDoubleVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THDoubleVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THDoubleVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
@@ -1759,17 +1726,17 @@ void test_THDoubleVector_muls_VSX()
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardDouble_muls( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THDoubleVector_muls_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardDouble_muls( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THDoubleVector_muls_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardDouble_muls( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THDoubleVector_muls_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardDouble_muls( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THDoubleVector_muls_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardDouble_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardDouble_muls( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THDoubleVector_muls_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardDouble_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
@@ -1793,6 +1760,7 @@ void test_THFloatVector_muls_VSX()
float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float c = (float)randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
@@ -1808,20 +1776,20 @@ void test_THFloatVector_muls_VSX()
// Performance Test
//-------------------------------------------------
start = clock();
- standardFloat_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS );
- standardFloat_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- standardFloat_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- standardFloat_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardFloat_muls() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
- THFloatVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS );
- THFloatVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
- THFloatVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
- THFloatVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
@@ -1831,17 +1799,17 @@ void test_THFloatVector_muls_VSX()
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
- standardFloat_muls( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- THFloatVector_muls_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
- standardFloat_muls( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- THFloatVector_muls_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
- standardFloat_muls( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- THFloatVector_muls_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
- standardFloat_muls( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
- THFloatVector_muls_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ standardFloat_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
- standardFloat_muls( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
- THFloatVector_muls_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ standardFloat_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
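
The correctness loop compares the VSX kernel against the scalar reference element by element through a near() helper that is not shown in these hunks. Its definition lives elsewhere in the file; a typical relative-tolerance comparison would look like the sketch below (the tolerance and formula here are assumptions, not the gist's code):

#include <math.h>

/* Hypothetical near(): values match when their difference is small
   relative to their magnitude (or absolutely tiny). Tolerances are
   illustrative only; the real near() in VSX.c may differ. */
static int near(double a, double b)
{
    double diff  = fabs(a - b);
    double scale = fmax(fabs(a), fabs(b));
    return diff <= 1e-12 * scale || diff <= 1e-15;
}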