Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#include "stdio.h"
#include "stdlib.h"
#include "emmintrin.h"
#include "time.h"
#include <errno.h>
#include <stdint.h>
#include <sys/time.h>
/*
 * Sums a[0..n-1] with SSE using aligned loads, repeating the whole pass
 * 100 times so that the work is long enough to time.
 *
 * a    - array of n floats; must be 16-byte aligned because full vectors
 *        are fetched with _mm_load_ps.
 * n    - number of elements; does not have to be a multiple of four.
 * time - out: elapsed wall-clock time, in seconds, for all 100 passes.
 *
 * Returns the accumulated total of all 100 passes (about 100 times the
 * array sum, subject to float rounding).
 */
float ssesum_aligned(float* a, size_t n, double* time) {
    int i;
    float* p;
    float* vec_end = a + (n - n % 4);  /* end of the last full 4-float group */
    float* arr_end = a + n;
    float result;
    __m128 sum = _mm_set1_ps(0.0f);
    struct timeval start, end;

    gettimeofday(&start, NULL);
    for (i = 0; i < 100; i++) {
        for (p = a; p < vec_end; p += 4) {
            sum = _mm_add_ps(sum, _mm_load_ps(p));
        }
        /* The 0-3 leftover elements are added to lane 0 one at a time, so
         * nothing at or beyond a + n is ever read. */
        for (; p < arr_end; p++) {
            sum = _mm_add_ss(sum, _mm_load_ss(p));
        }
    }
    /* Horizontal reduction: fold the upper two lanes onto the lower two,
     * then lane 1 onto lane 0. */
    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 1));
    _mm_store_ss(&result, sum);
    gettimeofday(&end, NULL);

    /* Subtract before scaling so the seconds field is never multiplied
     * by one million in integer arithmetic. */
    *time = (double)(end.tv_sec - start.tv_sec)
          + (double)(end.tv_usec - start.tv_usec) / 1000000.0;
    return result;
}
/*
 * Sums a[0..n-1] with SSE using unaligned loads, repeating the whole pass
 * 100 times so that the work is long enough to time.
 *
 * a    - array of n floats; no alignment requirement, since full vectors
 *        are fetched with _mm_loadu_ps.
 * n    - number of elements; does not have to be a multiple of four.
 * time - out: elapsed wall-clock time, in seconds, for all 100 passes.
 *
 * Returns the accumulated total of all 100 passes (about 100 times the
 * array sum, subject to float rounding).
 */
float ssesum_unaligned(float* a, size_t n, double* time) {
    int i;
    float* p;
    float* vec_end = a + (n - n % 4);  /* end of the last full 4-float group */
    float* arr_end = a + n;
    float result;
    __m128 sum = _mm_set1_ps(0.0f);
    struct timeval start, end;

    gettimeofday(&start, NULL);
    for (i = 0; i < 100; i++) {
        for (p = a; p < vec_end; p += 4) {
            sum = _mm_add_ps(sum, _mm_loadu_ps(p));
        }
        /* The 0-3 leftover elements are added to lane 0 one at a time, so
         * nothing at or beyond a + n is ever read. */
        for (; p < arr_end; p++) {
            sum = _mm_add_ss(sum, _mm_load_ss(p));
        }
    }
    /* Horizontal reduction: fold the upper two lanes onto the lower two,
     * then lane 1 onto lane 0. */
    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 1));
    _mm_store_ss(&result, sum);
    gettimeofday(&end, NULL);

    /* Subtract before scaling so the seconds field is never multiplied
     * by one million in integer arithmetic. */
    *time = (double)(end.tv_sec - start.tv_sec)
          + (double)(end.tv_usec - start.tv_usec) / 1000000.0;
    return result;
}
/*
 * Benchmark driver.
 *
 * Usage: <program> <length>
 *
 * Allocates a 16-byte-aligned float array, then runs five trials. Each
 * trial times ssesum_aligned on the array, ssesum_unaligned on the same
 * array, and ssesum_unaligned on the array shifted by one float (which is
 * therefore not 16-byte aligned), and prints the three timings.
 *
 * Exits with EXIT_FAILURE on a bad argument or allocation failure.
 */
int main(int argc, char* argv[]) {
    void* raw = NULL;
    float* arr;
    long parsed;
    int ierr;
    int i;
    double t1, t2, t3;
    /* Results are stored through a volatile object so an optimizing
     * compiler cannot discard the summation work being timed. */
    volatile float sink;
    size_t length, count, k;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <length>", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* strtol, unlike atoi, reports out-of-range input; checking the signed
     * value before converting keeps a negative argument from turning into
     * a huge size_t. */
    errno = 0;
    parsed = strtol(argv[1], NULL, 10);
    if (parsed <= 0 || errno == ERANGE) {
        fprintf(stderr, "length must be positive");
        exit(EXIT_FAILURE);
    }

    /* Reject lengths whose padded byte size would overflow size_t. */
    if ((unsigned long)parsed > SIZE_MAX / sizeof(float) - 8) {
        fprintf(stderr, "Memory allocation failure.");
        exit(EXIT_FAILURE);
    }
    length = (size_t)parsed;

    /* Round the element count up to a multiple of four and add four more,
     * so that four-wide loads starting at arr or at arr + 1 stay inside
     * the allocation even when length is not a multiple of four. */
    count = length + (4 - length % 4) % 4 + 4;
    ierr = posix_memalign(&raw, 16, count * sizeof(float));
    if (ierr != 0) {
        fprintf(stderr, "Memory allocation failure.");
        exit(EXIT_FAILURE);
    }
    arr = raw;

    /* Give every element a defined value before it is read. */
    for (k = 0; k < count; k++) {
        arr[k] = 1.0f;
    }

    printf("Array Size: %.3f MB\n", length*sizeof(float) / 1048576.0);
    for (i = 0; i < 5; i++) {
        printf("Trial %d\n", i+1);
        sink = ssesum_aligned(arr, length, &t1);
        sink = ssesum_unaligned(arr, length, &t2);
        sink = ssesum_unaligned(arr+1, length, &t3);
        printf("_mm_load_ps with aligned memory: %f\n", t1);
        printf("_mm_loadu_ps with aligned memory: %f\n", t2);
        printf("_mm_loadu_ps with unaligned memory: %f\n", t3);
    }
    (void)sink;
    free(arr);
    return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment