Created Apr 29, 2014
/*
 * Dot product of two fixed-length integer vectors using SSE intrinsics.
 *
 * Exactly two 128-bit vectors are read from each input, i.e. 32 bytes per
 * array (8 elements, assuming a 4-byte int -- TODO confirm for the target).
 * No length is passed, so both arrays must provide at least that much data.
 *
 * F  first operand; read with the aligned load, so it must be 16-byte aligned.
 * S  second operand; read with the unaligned load, any alignment is accepted.
 *
 * Returns the sum of the element-wise products. Multiplication and addition
 * are done in 32-bit lanes, so results wider than 32 bits wrap around.
 *
 * NOTE(review): _mm_mullo_epi32 is an SSE4.1 intrinsic (<smmintrin.h>); the
 * other intrinsics used here are SSE2. No #include is visible in this snippet.
 */
int dotproduct_sse(int*F, int*S) {
// The cast binds tighter than "+", so "+1" advances by one whole __m128i
// (16 bytes), selecting the second group of four elements.
__m128i F1 = _mm_load_si128((__m128i*)F); // aligned load: first 16 bytes of F
__m128i F2 = _mm_load_si128((__m128i*)F+1); // aligned load: next 16 bytes of F
__m128i S1 = _mm_loadu_si128((__m128i*)S); // unaligned load: first 16 bytes of S
__m128i S2 = _mm_loadu_si128((__m128i*)S+1); // unaligned load: next 16 bytes of S
// Lane-wise products (low 32 bits of each) of both halves, added together:
// four partial sums, one per 32-bit lane.
__m128i res = _mm_add_epi32(_mm_mullo_epi32(F1, S1), _mm_mullo_epi32(F2, S2));
// Horizontal reduction, step 1: shift the register right by 8 bytes so lanes
// 2 and 3 line up with lanes 0 and 1, then add.
res = _mm_add_epi32(res, _mm_srli_si128(res, 8));
// Step 2: shift right by 4 bytes so lane 1 lines up with lane 0, then add.
// Lane 0 now holds the sum of all four partial sums.
res = _mm_add_epi32(res, _mm_srli_si128(res, 4));
// Extract lane 0 (the low 32 bits) as the scalar result.
return _mm_cvtsi128_si32(res);
