-
-
Save dev-zzo/142abea8476cc5309db1d6462ef91ac6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void matrix_multiply(const data_t *M, const data_t *x, data_t *y, size_t width, size_t height) | |
{ | |
// assert(height > 0); | |
// assert(width >= 4); | |
// assert((width & 3) == 0); | |
IACA_START; | |
#if 1 | |
if (height >= 4) { | |
const data_t *M2 = M + width * 2; | |
size_t counter = height >> 2; | |
height &= 3; | |
do { | |
size_t i; | |
__m128 r; | |
__m128 accum0 = _mm_setzero_ps(); | |
__m128 accum1 = _mm_setzero_ps(); | |
__m128 accum2 = _mm_setzero_ps(); | |
__m128 accum3 = _mm_setzero_ps(); | |
for (i = 0; i < width; i += 4) { | |
__m128 a = _mm_load_ps(&x[i]); | |
accum0 = _mm_add_ps(accum0, _mm_mul_ps(a, _mm_load_ps(M))); | |
accum1 = _mm_add_ps(accum1, _mm_mul_ps(a, _mm_load_ps(M + width))); | |
accum2 = _mm_add_ps(accum2, _mm_mul_ps(a, _mm_load_ps(M2))); | |
accum3 = _mm_add_ps(accum3, _mm_mul_ps(a, _mm_load_ps(M2 + width))); | |
M += 4; | |
M2 += 4; | |
} | |
M += width * 3; | |
M2 += width * 3; | |
_MM_TRANSPOSE4_PS(accum0, accum1, accum2, accum3); | |
r = _mm_add_ps(_mm_add_ps(accum0, accum1), _mm_add_ps(accum2, accum3)); | |
_mm_store_ps(y, r); | |
y += 4; | |
} while (--counter > 0); | |
} | |
#endif | |
IACA_END; | |
while (height-- > 0) { | |
size_t i; | |
__m128 accum = _mm_setzero_ps(); | |
for (i = 0; i < width; i += 4) { | |
__m128 a = _mm_load_ps(&x[i]); | |
__m128 b = _mm_load_ps(&M[i]); | |
accum = _mm_add_ps(accum, _mm_mul_ps(a, b)); | |
} | |
M += width; | |
// 3+0, 2+1, 1+2, 0+3 | |
// (3+0)+(1+2), (2+1)+(0+3), (1+2)+(3+0), (0+3)+(2+1) | |
accum = _mm_add_ps(accum, _mm_shuffle_ps(accum, accum, _MM_SHUFFLE(0, 1, 2, 3))); | |
accum = _mm_add_ps(accum, _mm_movehl_ps(accum, accum)); | |
_mm_store_ss(y++, accum); | |
} | |
} | |
#if 0 | |
Multiplication result with the block multiplication disabled: | |
[0] 0.22798002 float | |
[1] 1.7965736 float | |
[2] -1.0273601 float | |
[3] 2.0052950 float | |
[4] -0.18169379 float | |
[5] -1.1956211 float | |
[6] -1.3414007 float | |
[7] -0.52710074 float | |
[8] 0.83735448 float | |
[9] 0.18121457 float | |
[10] 1.1256711 float | |
[11] -1.5259976 float | |
[12] 0.72421610 float | |
[13] 0.096263677 float | |
[14] -3.0765963 float | |
[15] -1.8290104 float | |
[16] 1.3304036 float | |
[17] -0.43048298 float | |
[18] 2.5751204 float | |
[19] -1.5781865 float | |
[20] 0.52282488 float | |
[21] -0.58924145 float | |
[22] -0.31996733 float | |
[23] 1.1963365 float | |
[24] 1.6498970 float | |
[25] 0.75210774 float | |
[26] -2.0755475 float | |
[27] 0.58657491 float | |
[28] 0.00000000 float | |
[29] 0.00000000 float | |
[30] 0.00000000 float | |
[31] 0.00000000 float | |
Whole computation results: | |
30,11,506,61.47,61.47 | |
31,11,597,56.93,56.93 | |
32,11,542,56.93,56.93 | |
33,11,537,59.52,59.52 | |
34,11,585,62.34,62.34 | |
35,11,594,59.09,59.09 | |
36,11,740,58.44,58.44 | |
#endif | |
#if 0 | |
Multiplication result with the block multiplication enabled: | |
[0] 0.22797999 float | |
[1] 1.7965735 float | |
[2] -1.0273601 float | |
[3] 2.0052950 float | |
[4] -0.18169381 float | |
[5] -1.1956210 float | |
[6] -1.3414009 float | |
[7] -0.52710068 float | |
[8] 0.83735454 float | |
[9] 0.18121463 float | |
[10] 1.1256711 float | |
[11] -1.5259976 float | |
[12] 0.72421610 float | |
[13] 0.096263707 float | |
[14] -3.0765963 float | |
[15] -1.8290104 float | |
[16] 1.3304036 float | |
[17] -0.43048298 float | |
[18] 2.5751204 float | |
[19] -1.5781866 float | |
[20] 0.52282488 float | |
[21] -0.58924139 float | |
[22] -0.31996733 float | |
[23] 1.1963365 float | |
[24] 1.6498970 float | |
[25] 0.75210774 float | |
[26] -2.0755475 float | |
[27] 0.58657491 float | |
[28] 0.00000000 float | |
[29] 0.00000000 float | |
[30] 0.00000000 float | |
[31] 0.00000000 float | |
Whole computation results: | |
30,11,524,64.07,64.07 | |
31,11,651,60.39,60.39 | |
32,11,477,56.93,56.93 | |
33,11,593,58.87,58.87 | |
34,11,535,61.69,61.69 | |
35,11,509,59.96,59.96 | |
36,11,745,59.31,59.31 | |
#endif | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Apparently, error control is required. :(