-
-
Save anonymous/7dca6d4b7dbd12c19fb4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <xmmintrin.h> | |
#include <immintrin.h> | |
#include <time.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <malloc.h> | |
/*
 * Number of floats in each benchmark array (2^28 elements).
 * The expansion is parenthesized so that N behaves as a single value in
 * any expression (e.g. `x / N` or `x % N`).  It is a multiple of 8, which
 * the 8-floats-per-iteration AVX loop relies on.
 */
#define N (32 * 1024 * 1024 * 8)

/*
 * Explicit prototype for C11 aligned_alloc, kept so the call sites compile
 * even when <stdlib.h> does not expose the declaration.
 */
void *aligned_alloc(size_t alignment, size_t size);
void __attribute__ ((noinline)) __attribute__((optimize("no-tree-vectorize"))) | |
uno(float * restrict res, float * restrict uno, float * restrict due) { | |
for (int i = 0; i < N; i++) | |
res[i] = uno[i] + due[i]; | |
} | |
void __attribute__ ((noinline)) | |
due(float * restrict res, float * restrict uno, float * restrict due) { | |
for (int i = 0; i < N; i+=8) { | |
__m256 a1 = _mm256_load_ps(uno + i); | |
__m256 a2 = _mm256_load_ps(due + i); | |
_mm256_store_ps(res + i, _mm256_add_ps(a1, a2)); | |
} | |
} | |
void main() { | |
srand((unsigned)time(NULL)); | |
float *a = aligned_alloc (32, N*sizeof(float)); | |
float *b = aligned_alloc (32, N*sizeof(float)); | |
float *c = aligned_alloc (32, N*sizeof(float)); | |
float *d = aligned_alloc (32, N*sizeof(float)); | |
for (int i = 0; i < N; i++) { | |
a[N] = ((float)rand()/(float)(RAND_MAX)); | |
b[N] = ((float)rand()/(float)(RAND_MAX)); | |
c[N] = ((float)rand()/(float)(RAND_MAX)); | |
d[N] = ((float)rand()/(float)(RAND_MAX)); | |
} | |
void diff(struct timespec *res, struct timespec *start, struct timespec *end) | |
{ | |
if ((end->tv_nsec-start->tv_nsec)<0) { | |
res->tv_sec = end->tv_sec-start->tv_sec-1; | |
res->tv_nsec = 1000000000+end->tv_nsec-start->tv_nsec; | |
} else { | |
res->tv_sec = end->tv_sec-start->tv_sec; | |
res->tv_nsec = end->tv_nsec-start->tv_nsec; | |
} | |
} | |
struct timespec ts1, ts2, diffe; | |
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1); | |
uno(a,b,c); | |
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2); | |
diff(&diffe, &ts1, &ts2); | |
printf("Non optimized, passed %d sec and %d nanosec\n", diffe.tv_sec, diffe.tv_nsec); | |
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1); | |
due(d,b,c); | |
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2); | |
diff(&diffe, &ts1, &ts2); | |
printf("Optimized, passed %d sec and %d nanosec\n", diffe.tv_sec, diffe.tv_nsec); | |
for (int i = 0; i < N; i++) | |
if (a[i] != d[i]) | |
exit(-1); | |
puts("Result checked!\n"); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment