Last active
April 15, 2018 19:52
-
-
Save rmcgibbo/7689820 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "stdio.h"
#include "stdlib.h"
#include "emmintrin.h"
#include "time.h"
#include <errno.h>
#include <stdint.h>
#include <sys/time.h>
/*
 * Benchmark kernel: sum the floats in a[0..n) using aligned SSE loads.
 *
 * The array is traversed 100 times and the accumulators are never reset
 * between passes, so the returned value is the running total over all
 * passes (100 times the array sum, up to float rounding).
 *
 * a    - input array; must be 16-byte aligned because whole vectors are
 *        fetched with _mm_load_ps.
 * n    - number of floats to sum; does not have to be a multiple of 4.
 * time - out: elapsed time in seconds, as measured by gettimeofday, for
 *        all 100 passes plus the final horizontal reduction.
 *
 * Returns the accumulated sum.
 */
float ssesum_aligned(float* a, size_t n, double* time) {
    const int passes = 100;
    /* Only whole groups of 4 floats may be fetched as a vector; loading a
     * partial group would read past a + n. */
    float* const vec_end = a + (n - n % 4);
    float* const arr_end = a + n;
    float* p;
    float result;
    float tail = 0.0f;
    __m128 sum = _mm_set1_ps(0.0f);
    struct timeval start, end;
    int i;

    gettimeofday(&start, NULL);
    for (i = 0; i < passes; i++) {
        for (p = a; p < vec_end; p += 4) {
            sum = _mm_add_ps(sum, _mm_load_ps(p));
        }
        /* Up to 3 leftover elements are added one at a time. */
        for (; p < arr_end; p++) {
            tail += *p;
        }
    }

    /* Horizontal reduction: fold the upper two lanes onto the lower two,
     * then fold lane 1 onto lane 0. */
    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 1));
    _mm_store_ss(&result, sum);
    result += tail;
    gettimeofday(&end, NULL);

    /* Subtract before scaling so large tv_sec values cannot overflow;
     * tv_usec is in microseconds, hence the 1e6 divisor. */
    *time = (double)(end.tv_sec - start.tv_sec)
          + (double)(end.tv_usec - start.tv_usec) / 1000000.0;
    return result;
}
/*
 * Benchmark kernel: sum the floats in a[0..n) using unaligned SSE loads.
 *
 * The array is traversed 100 times and the accumulators are never reset
 * between passes, so the returned value is the running total over all
 * passes (100 times the array sum, up to float rounding).
 *
 * a    - input array; any float alignment is accepted because vectors
 *        are fetched with _mm_loadu_ps.
 * n    - number of floats to sum; does not have to be a multiple of 4.
 * time - out: elapsed time in seconds, as measured by gettimeofday, for
 *        all 100 passes plus the final horizontal reduction.
 *
 * Returns the accumulated sum.
 */
float ssesum_unaligned(float* a, size_t n, double* time) {
    const int passes = 100;
    /* Only whole groups of 4 floats may be fetched as a vector; loading a
     * partial group would read past a + n. */
    float* const vec_end = a + (n - n % 4);
    float* const arr_end = a + n;
    float* p;
    float result;
    float tail = 0.0f;
    __m128 sum = _mm_set1_ps(0.0f);
    struct timeval start, end;
    int i;

    gettimeofday(&start, NULL);
    for (i = 0; i < passes; i++) {
        for (p = a; p < vec_end; p += 4) {
            sum = _mm_add_ps(sum, _mm_loadu_ps(p));
        }
        /* Up to 3 leftover elements are added one at a time. */
        for (; p < arr_end; p++) {
            tail += *p;
        }
    }

    /* Horizontal reduction: fold the upper two lanes onto the lower two,
     * then fold lane 1 onto lane 0. */
    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 1));
    _mm_store_ss(&result, sum);
    result += tail;
    gettimeofday(&end, NULL);

    /* Subtract before scaling so large tv_sec values cannot overflow;
     * tv_usec is in microseconds, hence the 1e6 divisor. */
    *time = (double)(end.tv_sec - start.tv_sec)
          + (double)(end.tv_usec - start.tv_usec) / 1000000.0;
    return result;
}
/*
 * Benchmark driver.
 *
 * Usage: <program> <length>
 *
 * Allocates a 16-byte aligned float array, then runs five trials. Each
 * trial times ssesum_aligned on the array, ssesum_unaligned on the same
 * array, and ssesum_unaligned on the array shifted by one float (so the
 * data is no longer 16-byte aligned), and prints the three timings.
 *
 * Exits with EXIT_FAILURE on a bad argument or allocation failure.
 */
int main(int argc, char* argv[]) {
    /* Floats allocated beyond `length`: one for the shifted view that
     * starts at arr + 1, plus slack so that a whole 4-float vector load
     * issued at the tail of either view stays inside the allocation. */
    const size_t padding = 4;
    void* mem = NULL;
    float* arr;
    char* parse_end = NULL;
    long long parsed;
    size_t length;
    size_t total;
    size_t k;
    int ierr;
    int i;
    double t1, t2, t3;
    float r1, r2, r3;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <length>", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* strtoll instead of atoi: negative and non-numeric input is rejected
     * here rather than wrapping to a huge size_t. Trailing characters
     * after the digits are ignored. */
    errno = 0;
    parsed = strtoll(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || parsed <= 0) {
        fprintf(stderr, "length must be positive");
        exit(EXIT_FAILURE);
    }
    /* Reject lengths whose byte size would overflow size_t. */
    if (errno == ERANGE
        || (unsigned long long)parsed > SIZE_MAX / sizeof(float) - padding) {
        fprintf(stderr, "Memory allocation failure.");
        exit(EXIT_FAILURE);
    }
    length = (size_t)parsed;
    total = length + padding;

    /* Allocate through a void* so no pointer type-punning cast is needed. */
    ierr = posix_memalign(&mem, 16, total * sizeof(float));
    if (ierr != 0) {
        fprintf(stderr, "Memory allocation failure.");
        exit(EXIT_FAILURE);
    }
    arr = mem;

    /* Give every element a defined value: indices 0..length are covered
     * by the two views being summed; the remaining slack is zeroed. */
    for (k = 0; k < total; k++) {
        arr[k] = (k <= length) ? 1.0f : 0.0f;
    }

    printf("Array Size: %.3f MB\n", length*sizeof(float) / 1048576.0);
    for (i = 0; i < 5; i++) {
        printf("Trial %d\n", i+1);
        r1 = ssesum_aligned(arr, length, &t1);
        r2 = ssesum_unaligned(arr, length, &t2);
        r3 = ssesum_unaligned(arr+1, length, &t3);
        printf("_mm_load_ps with aligned memory: %f\n", t1);
        printf("_mm_loadu_ps with aligned memory: %f\n", t2);
        printf("_mm_loadu_ps with unaligned memory: %f\n", t3);
        /* Consume the sums so an optimizer cannot discard the timed loops
         * as dead code. */
        printf("sums: %f %f %f\n", r1, r2, r3);
    }

    free(arr);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment