Skip to content

Instantly share code, notes, and snippets.

@dev-zzo
Last active June 1, 2016 21:38
Show Gist options
  • Save dev-zzo/142abea8476cc5309db1d6462ef91ac6 to your computer and use it in GitHub Desktop.
Save dev-zzo/142abea8476cc5309db1d6462ef91ac6 to your computer and use it in GitHub Desktop.
void matrix_multiply(const data_t *M, const data_t *x, data_t *y, size_t width, size_t height)
{
// assert(height > 0);
// assert(width >= 4);
// assert((width & 3) == 0);
IACA_START;
#if 1
if (height >= 4) {
const data_t *M2 = M + width * 2;
size_t counter = height >> 2;
height &= 3;
do {
size_t i;
__m128 r;
__m128 accum0 = _mm_setzero_ps();
__m128 accum1 = _mm_setzero_ps();
__m128 accum2 = _mm_setzero_ps();
__m128 accum3 = _mm_setzero_ps();
for (i = 0; i < width; i += 4) {
__m128 a = _mm_load_ps(&x[i]);
accum0 = _mm_add_ps(accum0, _mm_mul_ps(a, _mm_load_ps(M)));
accum1 = _mm_add_ps(accum1, _mm_mul_ps(a, _mm_load_ps(M + width)));
accum2 = _mm_add_ps(accum2, _mm_mul_ps(a, _mm_load_ps(M2)));
accum3 = _mm_add_ps(accum3, _mm_mul_ps(a, _mm_load_ps(M2 + width)));
M += 4;
M2 += 4;
}
M += width * 3;
M2 += width * 3;
_MM_TRANSPOSE4_PS(accum0, accum1, accum2, accum3);
r = _mm_add_ps(_mm_add_ps(accum0, accum1), _mm_add_ps(accum2, accum3));
_mm_store_ps(y, r);
y += 4;
} while (--counter > 0);
}
#endif
IACA_END;
while (height-- > 0) {
size_t i;
__m128 accum = _mm_setzero_ps();
for (i = 0; i < width; i += 4) {
__m128 a = _mm_load_ps(&x[i]);
__m128 b = _mm_load_ps(&M[i]);
accum = _mm_add_ps(accum, _mm_mul_ps(a, b));
}
M += width;
// 3+0, 2+1, 1+2, 0+3
// (3+0)+(1+2), (2+1)+(0+3), (1+2)+(3+0), (0+3)+(2+1)
accum = _mm_add_ps(accum, _mm_shuffle_ps(accum, accum, _MM_SHUFFLE(0, 1, 2, 3)));
accum = _mm_add_ps(accum, _mm_movehl_ps(accum, accum));
_mm_store_ss(y++, accum);
}
}
#if 0
Multiplication result with the block multiplication disabled:
[0] 0.22798002 float
[1] 1.7965736 float
[2] -1.0273601 float
[3] 2.0052950 float
[4] -0.18169379 float
[5] -1.1956211 float
[6] -1.3414007 float
[7] -0.52710074 float
[8] 0.83735448 float
[9] 0.18121457 float
[10] 1.1256711 float
[11] -1.5259976 float
[12] 0.72421610 float
[13] 0.096263677 float
[14] -3.0765963 float
[15] -1.8290104 float
[16] 1.3304036 float
[17] -0.43048298 float
[18] 2.5751204 float
[19] -1.5781865 float
[20] 0.52282488 float
[21] -0.58924145 float
[22] -0.31996733 float
[23] 1.1963365 float
[24] 1.6498970 float
[25] 0.75210774 float
[26] -2.0755475 float
[27] 0.58657491 float
[28] 0.00000000 float
[29] 0.00000000 float
[30] 0.00000000 float
[31] 0.00000000 float
Whole computation results:
30,11,506,61.47,61.47
31,11,597,56.93,56.93
32,11,542,56.93,56.93
33,11,537,59.52,59.52
34,11,585,62.34,62.34
35,11,594,59.09,59.09
36,11,740,58.44,58.44
#endif
#if 0
Multiplication result with the block multiplication enabled:
[0] 0.22797999 float
[1] 1.7965735 float
[2] -1.0273601 float
[3] 2.0052950 float
[4] -0.18169381 float
[5] -1.1956210 float
[6] -1.3414009 float
[7] -0.52710068 float
[8] 0.83735454 float
[9] 0.18121463 float
[10] 1.1256711 float
[11] -1.5259976 float
[12] 0.72421610 float
[13] 0.096263707 float
[14] -3.0765963 float
[15] -1.8290104 float
[16] 1.3304036 float
[17] -0.43048298 float
[18] 2.5751204 float
[19] -1.5781866 float
[20] 0.52282488 float
[21] -0.58924139 float
[22] -0.31996733 float
[23] 1.1963365 float
[24] 1.6498970 float
[25] 0.75210774 float
[26] -2.0755475 float
[27] 0.58657491 float
[28] 0.00000000 float
[29] 0.00000000 float
[30] 0.00000000 float
[31] 0.00000000 float
Whole computation results:
30,11,524,64.07,64.07
31,11,651,60.39,60.39
32,11,477,56.93,56.93
33,11,593,58.87,58.87
34,11,535,61.69,61.69
35,11,509,59.96,59.96
36,11,745,59.31,59.31
#endif
@dev-zzo
Copy link
Author

dev-zzo commented Jun 1, 2016

Apparently, error control is required. :(

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment