/* Created September 7, 2015 13:46 */
#include <xmmintrin.h>
#include <immintrin.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
/*
 * Number of float elements in each benchmark array (2^28).
 * The expansion is parenthesized so that N behaves as a single value inside
 * larger expressions such as `x % N` or `x / N`.
 */
#define N (32*1024*1024*8)
/*
 * Explicit prototype for the C11 allocator used by main().
 * NOTE(review): presumably kept for toolchains whose <stdlib.h> does not
 * expose aligned_alloc by default -- confirm before removing.
 */
void *aligned_alloc(size_t alignment, size_t size);
/*
 * Scalar kernel: stores uno[i] + due[i] into res[i] for every i in [0, N).
 *
 * The attributes ask GCC not to inline this function and not to apply tree
 * vectorization to it, presumably so it stays a plain element-by-element
 * loop that can be compared against the hand-written AVX version.
 *
 * res, uno and due must each point to at least N floats and must not alias
 * one another (they are restrict-qualified).
 */
void __attribute__ ((noinline)) __attribute__((optimize("no-tree-vectorize")))
uno(float * restrict res, float * restrict uno, float * restrict due) {
    for (int i = 0; i < N; i++) {
        res[i] = uno[i] + due[i];
    }
}
/*
 * AVX kernel: stores uno[i] + due[i] into res[i] for every i in [0, N),
 * processing eight floats per iteration with 256-bit vectors.
 *
 * Requirements imposed by the code below:
 *   - res, uno and due must be 32-byte aligned, because the aligned
 *     load/store intrinsics are used;
 *   - N must be a multiple of 8, since the loop advances by 8 with no
 *     scalar tail;
 *   - the three buffers must not alias (restrict-qualified).
 *
 * noinline keeps the function as a separate call.
 */
void __attribute__ ((noinline))
due(float * restrict res, float * restrict uno, float * restrict due) {
    for (int i = 0; i < N; i += 8) {
        __m256 a1 = _mm256_load_ps(uno + i);
        __m256 a2 = _mm256_load_ps(due + i);
        _mm256_store_ps(res + i, _mm256_add_ps(a1, a2));
    }
}
void main() {
float *a = aligned_alloc (32, N*sizeof(float));
float *b = aligned_alloc (32, N*sizeof(float));
float *c = aligned_alloc (32, N*sizeof(float));
float *d = aligned_alloc (32, N*sizeof(float));
for (int i = 0; i < N; i++) {
a[N] = ((float)rand()/(float)(RAND_MAX));
b[N] = ((float)rand()/(float)(RAND_MAX));
c[N] = ((float)rand()/(float)(RAND_MAX));
d[N] = ((float)rand()/(float)(RAND_MAX));
void diff(struct timespec *res, struct timespec *start, struct timespec *end)
if ((end->tv_nsec-start->tv_nsec)<0) {
res->tv_sec = end->tv_sec-start->tv_sec-1;
res->tv_nsec = 1000000000+end->tv_nsec-start->tv_nsec;
} else {
res->tv_sec = end->tv_sec-start->tv_sec;
res->tv_nsec = end->tv_nsec-start->tv_nsec;
struct timespec ts1, ts2, diffe;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2);
diff(&diffe, &ts1, &ts2);
printf("Non optimized, passed %d sec and %d nanosec\n", diffe.tv_sec, diffe.tv_nsec);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2);
diff(&diffe, &ts1, &ts2);
printf("Optimized, passed %d sec and %d nanosec\n", diffe.tv_sec, diffe.tv_nsec);
for (int i = 0; i < N; i++)
if (a[i] != d[i])
puts("Result checked!\n");
