Last active
April 7, 2017 14:05
-
-
Save gut/0bac048d1539b26d326c7eb231e92df9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git c/torch/lib/TH/vector/VSX.c w/torch/lib/TH/vector/VSX.c
index 796d3b8..04877e1 100644
--- c/torch/lib/TH/vector/VSX.c
+++ w/torch/lib/TH/vector/VSX.c
@@ -399,7 +399,7 @@ static void THDoubleVector_scale_VSX(double *y, const double c, const ptrdiff_t
 }
-static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t n)
+static void THDoubleVector_muls_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
 {
     ptrdiff_t i;
@@ -407,23 +407,11 @@ static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t
     vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
     vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
     vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+    vector double c_fp64vec2 = {c, c};
     for (i = 0; i <= n-24; i += 24)
     {
-        y0_fp64vec2  = vec_xl(0, y+(i   ));
-        y1_fp64vec2  = vec_xl(0, y+(i+2 ));
-        y2_fp64vec2  = vec_xl(0, y+(i+4 ));
-        y3_fp64vec2  = vec_xl(0, y+(i+6 ));
-        y4_fp64vec2  = vec_xl(0, y+(i+8 ));
-        y5_fp64vec2  = vec_xl(0, y+(i+10));
-        y6_fp64vec2  = vec_xl(0, y+(i+12));
-        y7_fp64vec2  = vec_xl(0, y+(i+14));
-        y8_fp64vec2  = vec_xl(0, y+(i+16));
-        y9_fp64vec2  = vec_xl(0, y+(i+18));
-        y10_fp64vec2 = vec_xl(0, y+(i+20));
-        y11_fp64vec2 = vec_xl(0, y+(i+22));
-
         x0_fp64vec2  = vec_xl(0, x+(i   ));
         x1_fp64vec2  = vec_xl(0, x+(i+2 ));
         x2_fp64vec2  = vec_xl(0, x+(i+4 ));
@@ -437,18 +425,18 @@ static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t
         x10_fp64vec2 = vec_xl(0, x+(i+20));
         x11_fp64vec2 = vec_xl(0, x+(i+22));
-        y0_fp64vec2  = vec_mul(y0_fp64vec2,  x0_fp64vec2);
-        y1_fp64vec2  = vec_mul(y1_fp64vec2,  x1_fp64vec2);
-        y2_fp64vec2  = vec_mul(y2_fp64vec2,  x2_fp64vec2);
-        y3_fp64vec2  = vec_mul(y3_fp64vec2,  x3_fp64vec2);
-        y4_fp64vec2  = vec_mul(y4_fp64vec2,  x4_fp64vec2);
-        y5_fp64vec2  = vec_mul(y5_fp64vec2,  x5_fp64vec2);
-        y6_fp64vec2  = vec_mul(y6_fp64vec2,  x6_fp64vec2);
-        y7_fp64vec2  = vec_mul(y7_fp64vec2,  x7_fp64vec2);
-        y8_fp64vec2  = vec_mul(y8_fp64vec2,  x8_fp64vec2);
-        y9_fp64vec2  = vec_mul(y9_fp64vec2,  x9_fp64vec2);
-        y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2);
-        y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2);
+        y0_fp64vec2  = vec_mul(c_fp64vec2, x0_fp64vec2);
+        y1_fp64vec2  = vec_mul(c_fp64vec2, x1_fp64vec2);
+        y2_fp64vec2  = vec_mul(c_fp64vec2, x2_fp64vec2);
+        y3_fp64vec2  = vec_mul(c_fp64vec2, x3_fp64vec2);
+        y4_fp64vec2  = vec_mul(c_fp64vec2, x4_fp64vec2);
+        y5_fp64vec2  = vec_mul(c_fp64vec2, x5_fp64vec2);
+        y6_fp64vec2  = vec_mul(c_fp64vec2, x6_fp64vec2);
+        y7_fp64vec2  = vec_mul(c_fp64vec2, x7_fp64vec2);
+        y8_fp64vec2  = vec_mul(c_fp64vec2, x8_fp64vec2);
+        y9_fp64vec2  = vec_mul(c_fp64vec2, x9_fp64vec2);
+        y10_fp64vec2 = vec_mul(c_fp64vec2, x10_fp64vec2);
+        y11_fp64vec2 = vec_mul(c_fp64vec2, x11_fp64vec2);
         vec_xst(y0_fp64vec2,  0, y+(i   ));
         vec_xst(y1_fp64vec2,  0, y+(i+2 ));
@@ -465,20 +453,15 @@ static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t
     }
     for (; i <= n-8; i += 8)
     {
-        y0_fp64vec2  = vec_xl(0, y+(i   ));
-        y1_fp64vec2  = vec_xl(0, y+(i+2 ));
-        y2_fp64vec2  = vec_xl(0, y+(i+4 ));
-        y3_fp64vec2  = vec_xl(0, y+(i+6 ));
-
         x0_fp64vec2  = vec_xl(0, x+(i   ));
         x1_fp64vec2  = vec_xl(0, x+(i+2 ));
         x2_fp64vec2  = vec_xl(0, x+(i+4 ));
         x3_fp64vec2  = vec_xl(0, x+(i+6 ));
-        y0_fp64vec2  = vec_mul(y0_fp64vec2, x0_fp64vec2);
-        y1_fp64vec2  = vec_mul(y1_fp64vec2, x1_fp64vec2);
-        y2_fp64vec2  = vec_mul(y2_fp64vec2, x2_fp64vec2);
-        y3_fp64vec2  = vec_mul(y3_fp64vec2, x3_fp64vec2);
+        y0_fp64vec2  = vec_mul(c_fp64vec2, x0_fp64vec2);
+        y1_fp64vec2  = vec_mul(c_fp64vec2, x1_fp64vec2);
+        y2_fp64vec2  = vec_mul(c_fp64vec2, x2_fp64vec2);
+        y3_fp64vec2  = vec_mul(c_fp64vec2, x3_fp64vec2);
         vec_xst(y0_fp64vec2, 0, y+(i   ));
         vec_xst(y1_fp64vec2, 0, y+(i+2 ));
@@ -487,13 +470,12 @@ static void THDoubleVector_muls_VSX(double *y, const double *x, const ptrdiff_t
     }
     for (; i <= n-2; i += 2)
     {
-        y0_fp64vec2 = vec_xl(0, y+(i   ));
         x0_fp64vec2 = vec_xl(0, x+(i   ));
-        y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+        y0_fp64vec2 = vec_mul(c_fp64vec2, x0_fp64vec2);
         vec_xst(y0_fp64vec2, 0, y+(i   ));
     }
     for (; i < n; i++)
-        y[i] = y[i] * x[i];
+        y[i] = c * x[i];
 }
@@ -885,7 +867,7 @@ static void THFloatVector_scale_VSX(float *y, const float c, const ptrdiff_t n)
-static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
+static void THFloatVector_muls_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
 {
     ptrdiff_t i;
@@ -893,23 +875,13 @@ static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
     vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
     vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
     vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+    vector float c_fp32vec4;
+    float val[4] = {c, c, c, c};
+    c_fp32vec4 = vec_xl(0, val);
     for (i = 0; i <= n-48; i += 48)
     {
-        y0_fp32vec4  = vec_xl(0, y+(i   ));
-        y1_fp32vec4  = vec_xl(0, y+(i+4 ));
-        y2_fp32vec4  = vec_xl(0, y+(i+8 ));
-        y3_fp32vec4  = vec_xl(0, y+(i+12));
-        y4_fp32vec4  = vec_xl(0, y+(i+16));
-        y5_fp32vec4  = vec_xl(0, y+(i+20));
-        y6_fp32vec4  = vec_xl(0, y+(i+24));
-        y7_fp32vec4  = vec_xl(0, y+(i+28));
-        y8_fp32vec4  = vec_xl(0, y+(i+32));
-        y9_fp32vec4  = vec_xl(0, y+(i+36));
-        y10_fp32vec4 = vec_xl(0, y+(i+40));
-        y11_fp32vec4 = vec_xl(0, y+(i+44));
-
         x0_fp32vec4  = vec_xl(0, x+(i   ));
         x1_fp32vec4  = vec_xl(0, x+(i+4 ));
         x2_fp32vec4  = vec_xl(0, x+(i+8 ));
@@ -923,18 +895,18 @@ static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
         x10_fp32vec4 = vec_xl(0, x+(i+40));
         x11_fp32vec4 = vec_xl(0, x+(i+44));
-        y0_fp32vec4  = vec_mul(y0_fp32vec4,  x0_fp32vec4);
-        y1_fp32vec4  = vec_mul(y1_fp32vec4,  x1_fp32vec4);
-        y2_fp32vec4  = vec_mul(y2_fp32vec4,  x2_fp32vec4);
-        y3_fp32vec4  = vec_mul(y3_fp32vec4,  x3_fp32vec4);
-        y4_fp32vec4  = vec_mul(y4_fp32vec4,  x4_fp32vec4);
-        y5_fp32vec4  = vec_mul(y5_fp32vec4,  x5_fp32vec4);
-        y6_fp32vec4  = vec_mul(y6_fp32vec4,  x6_fp32vec4);
-        y7_fp32vec4  = vec_mul(y7_fp32vec4,  x7_fp32vec4);
-        y8_fp32vec4  = vec_mul(y8_fp32vec4,  x8_fp32vec4);
-        y9_fp32vec4  = vec_mul(y9_fp32vec4,  x9_fp32vec4);
-        y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4);
-        y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4);
+        y0_fp32vec4  = vec_mul(c_fp32vec4, x0_fp32vec4);
+        y1_fp32vec4  = vec_mul(c_fp32vec4, x1_fp32vec4);
+        y2_fp32vec4  = vec_mul(c_fp32vec4, x2_fp32vec4);
+        y3_fp32vec4  = vec_mul(c_fp32vec4, x3_fp32vec4);
+        y4_fp32vec4  = vec_mul(c_fp32vec4, x4_fp32vec4);
+        y5_fp32vec4  = vec_mul(c_fp32vec4, x5_fp32vec4);
+        y6_fp32vec4  = vec_mul(c_fp32vec4, x6_fp32vec4);
+        y7_fp32vec4  = vec_mul(c_fp32vec4, x7_fp32vec4);
+        y8_fp32vec4  = vec_mul(c_fp32vec4, x8_fp32vec4);
+        y9_fp32vec4  = vec_mul(c_fp32vec4, x9_fp32vec4);
+        y10_fp32vec4 = vec_mul(c_fp32vec4, x10_fp32vec4);
+        y11_fp32vec4 = vec_mul(c_fp32vec4, x11_fp32vec4);
         vec_xst(y0_fp32vec4,  0, y+(i   ));
         vec_xst(y1_fp32vec4,  0, y+(i+4 ));
@@ -951,20 +923,15 @@ static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
     }
     for (; i <= n-16; i += 16)
     {
-        y0_fp32vec4  = vec_xl(0, y+(i   ));
-        y1_fp32vec4  = vec_xl(0, y+(i+4 ));
-        y2_fp32vec4  = vec_xl(0, y+(i+8 ));
-        y3_fp32vec4  = vec_xl(0, y+(i+12));
-
         x0_fp32vec4  = vec_xl(0, x+(i   ));
         x1_fp32vec4  = vec_xl(0, x+(i+4 ));
         x2_fp32vec4  = vec_xl(0, x+(i+8 ));
         x3_fp32vec4  = vec_xl(0, x+(i+12));
-        y0_fp32vec4  = vec_mul(y0_fp32vec4, x0_fp32vec4);
-        y1_fp32vec4  = vec_mul(y1_fp32vec4, x1_fp32vec4);
-        y2_fp32vec4  = vec_mul(y2_fp32vec4, x2_fp32vec4);
-        y3_fp32vec4  = vec_mul(y3_fp32vec4, x3_fp32vec4);
+        y0_fp32vec4  = vec_mul(c_fp32vec4, x0_fp32vec4);
+        y1_fp32vec4  = vec_mul(c_fp32vec4, x1_fp32vec4);
+        y2_fp32vec4  = vec_mul(c_fp32vec4, x2_fp32vec4);
+        y3_fp32vec4  = vec_mul(c_fp32vec4, x3_fp32vec4);
         vec_xst(y0_fp32vec4, 0, y+(i   ));
         vec_xst(y1_fp32vec4, 0, y+(i+4 ));
@@ -973,13 +940,12 @@ static void THFloatVector_muls_VSX(float *y, const float *x, const ptrdiff_t n)
     }
     for (; i <= n-4; i += 4)
     {
-        y0_fp32vec4 = vec_xl(0, y+(i   ));
         x0_fp32vec4 = vec_xl(0, x+(i   ));
-        y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+        y0_fp32vec4 = vec_mul(c_fp32vec4, x0_fp32vec4);
         vec_xst(y0_fp32vec4, 0, y+(i   ));
     }
     for (; i < n; i++)
-        y[i] = y[i] * x[i];
+        y[i] = c * x[i];
 }
@@ -1105,16 +1071,16 @@ static void standardFloat_scale(float *y, const float c, const ptrdiff_t n)
         y[i] *= c;
 }
-static void standardDouble_mul(double *y, const double *x, const ptrdiff_t n)
+static void standardDouble_muls(double *y, const double *x, const double c, const ptrdiff_t n)
 {
     for (ptrdiff_t i = 0; i < n; i++)
-        y[i] *= x[i];
+        y[i] = c * x[i];
 }
-static void standardFloat_mul(float *y, const float *x, const ptrdiff_t n)
+static void standardFloat_muls(float *y, const float *x, const float c, const ptrdiff_t n)
 {
     for (ptrdiff_t i = 0; i < n; i++)
-        y[i] *= x[i];
+        y[i] = c * x[i];
 }
 double randDouble()
@@ -1721,6 +1687,7 @@ void test_THDoubleVector_muls_VSX()
     double *y_standard  = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
     double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
     double *x           = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+    double c            = randDouble();
     // Initialize randomly
     for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
@@ -1736,20 +1703,20 @@ void test_THDoubleVector_muls_VSX()
     // Performance Test
     //-------------------------------------------------
     start = clock();
-    standardDouble_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS  );
-    standardDouble_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
-    standardDouble_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
-    standardDouble_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+    standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS  );
+    standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+    standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+    standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
     end = clock();
     elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
     printf("standardDouble_muls() test took %.5lf seconds\n", elapsedSeconds_standard);
     start = clock();
-    THDoubleVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS  );
-    THDoubleVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
-    THDoubleVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
-    THDoubleVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+    THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS  );
+    THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+    THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+    THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
     end = clock();
     elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
@@ -1759,17 +1726,17 @@ void test_THDoubleVector_muls_VSX()
     //-------------------------------------------------
     // Correctness Test
     //-------------------------------------------------
-    standardDouble_muls(    y_standard+1,  x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
-    THDoubleVector_muls_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
-    standardDouble_muls(    y_standard+2,  x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
-    THDoubleVector_muls_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
-    standardDouble_muls(    y_standard+3,  x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
-    THDoubleVector_muls_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
-    standardDouble_muls(    y_standard+517,  x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
-    THDoubleVector_muls_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+    standardDouble_muls(    y_standard+1,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+    THDoubleVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+    standardDouble_muls(    y_standard+2,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+    THDoubleVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+    standardDouble_muls(    y_standard+3,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+    THDoubleVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+    standardDouble_muls(    y_standard+517,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+    THDoubleVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
     int r = rand() % 258;
-    standardDouble_muls(    y_standard+517+r,  x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
-    THDoubleVector_muls_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+    standardDouble_muls(    y_standard+517+r,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+    THDoubleVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
     for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
     {
         if(!near(y_optimized[i], y_standard[i]))
@@ -1793,6 +1760,7 @@ void test_THFloatVector_muls_VSX()
     float *y_standard  = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
    float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
     float *x           = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+    float c            = (float)randDouble();
     // Initialize randomly
     for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
@@ -1808,20 +1776,20 @@ void test_THFloatVector_muls_VSX()
     // Performance Test
     //-------------------------------------------------
     start = clock();
-    standardFloat_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS  );
-    standardFloat_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
-    standardFloat_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
-    standardFloat_muls(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+    standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS  );
+    standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+    standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+    standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
    end = clock();
     elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
     printf("standardFloat_muls() test took %.5lf seconds\n", elapsedSeconds_standard);
     start = clock();
-    THFloatVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS  );
-    THFloatVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
-    THFloatVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
-    THFloatVector_muls_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+    THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS  );
+    THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+    THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+    THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
     end = clock();
     elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
@@ -1831,17 +1799,17 @@ void test_THFloatVector_muls_VSX()
     //-------------------------------------------------
     // Correctness Test
     //-------------------------------------------------
-    standardFloat_muls(    y_standard+1,  x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
-    THFloatVector_muls_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
-    standardFloat_muls(    y_standard+2,  x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
-    THFloatVector_muls_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
-    standardFloat_muls(    y_standard+3,  x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
-    THFloatVector_muls_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
-    standardFloat_muls(    y_standard+517,  x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
-    THFloatVector_muls_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+    standardFloat_muls(    y_standard+1,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+    THFloatVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+    standardFloat_muls(    y_standard+2,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+    THFloatVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+    standardFloat_muls(    y_standard+3,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+    THFloatVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+    standardFloat_muls(    y_standard+517,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+    THFloatVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
     int r = rand() % 258;
-    standardFloat_muls(    y_standard+517+r,  x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
-    THFloatVector_muls_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+    standardFloat_muls(    y_standard+517+r,  x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+    THFloatVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
    for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
     {
         if(!near(y_optimized[i], y_standard[i]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment