14roiron/code.c

## code.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <immintrin.h>
#include <sys/time.h> // for timing
#include <pthread.h>
#define VEC_SIZE  2147483647

double t1,t2;
float vecteurBase[VEC_SIZE] __attribute__((aligned(32)));
__m256 array;

struct thread_data{
  unsigned int thread_id;
  float *U;
  int n;
  int k;
  float a;
  float result;
};


double now(){
   // Retourne l'heure actuelle en secondes
   struct timeval t; double f_t;
   gettimeofday(&t, NULL);
   f_t = t.tv_usec; f_t = f_t/1000000.0; f_t +=t.tv_sec;
   return f_t;
}


void initVect(float *vecteurBase)
{
    srand(time(0));
    for(int i=0;i<VEC_SIZE;i++)
    {
        vecteurBase[i] = (float) i;//rand();
    }
    return;
}
float* copyVec(float *vecteurBase)
{
    float* copy;
    copy = (float*) _mm_malloc(sizeof(float)*VEC_SIZE,32);
    for(int i =0;i<VEC_SIZE;i++)
    {
        copy[i]=vecteurBase[i];
    }
    return copy;
}

//first naive implementation of the function, double loop
float gm(float *U, float a, int k, int n){
    float result,p;
    result=0;
    for(int i=0;i<n;i++){
        p=1;
        for(int j=0;j<k;j++)
            p*=U[i]-a; //do the exponentiation
        result+=p;//sum all the steps
    }
    return result;
}

/*second implementation of the function*/
/*float gm2(float *U, float a, int k, int n){*/
    /*float result;*/
    /*result=0;*/
    /*for(int i=0;i<n;i++){*/
        /*result+=(float) exp(k* log((double)(U[i]-a)));*/
    /*}*/
    /*return result;*/
/*}*/

/**
 * doing the vector function
 */
float gm_vec(float* vec,float a, int k, int n){
    float result[8] __attribute__((aligned(32)));
    float *rest;
    int sizeMod8 = n / 8; //we are going to loop around sizeMod8, and then we are going to finish
    __m256 vector, vectA,accumulator,temp;

    const vectA = _mm256_set1_ps(a);//populate vectA with 8 times a to do the vector sub
    accumulator = _mm256_setzero_ps();
    for(int i=0;i<sizeMod8;i++){

        vector = _mm256_load_ps(&(vec[8*i])); // we load 8 floats, starting from the 8 * k
        vector = _mm256_sub_ps(vector,vectA);
        temp = _mm256_set1_ps(1.); //init to 1
            for(int j=0;j<k;j++)
            {
                temp = _mm256_mul_ps(temp,vector);
            }
        accumulator = _mm256_add_ps(accumulator,temp);
    }
    accumulator = _mm256_hadd_ps(accumulator,accumulator);
    accumulator = _mm256_hadd_ps(accumulator,accumulator);
    accumulator = _mm256_hadd_ps(accumulator,accumulator);
    //after three times all the cells equals to the sum of the vector, we wan to extract the first one

    _mm256_store_ps(result,accumulator);

    rest = (&(vec[sizeMod8*8])); // the modulo 8 array we can't vectorize
    result[0] = result[0] + gm(rest,a,k,n-sizeMod8*8);//we add the end that we can't vectorize, using previous function
    return result[0];
}
float variance(float* U, int n, int mode)
{
    float mean;
    if(mode)
    {
        mean = gm_vec(U,0.,1,n)/(float) n;
        return gm_vec(U,mean,2,n)/(float) n;
    }
    else{
        mean = gm(U,0.,1,n)/(float) n;
        return gm(U,mean,2,n)/(float) n;
    }
}
/*-------------------------------------------------------------*/
/* This is the thread version of the selected code portion     */
/*-------------------------------------------------------------*/
void *thread_function(void *threadarg){
 /* Local variables */
    int s;

    /* Association between shared variables and their correspondances */
    struct thread_data *thread_pointer_data;
    thread_pointer_data = (struct thread_data *)threadarg;
    thread_pointer_data->result =
        gm_vec(thread_pointer_data->U,thread_pointer_data->a,thread_pointer_data->k,thread_pointer_data->n);
    //send the result of the vect function to the structure back
    pthread_exit(NULL);
    return 0;
}

float create_thread(float* U, float a,int k,int n, int nb_threads)
{
    float result=0;
    int i;
    int split_size = n/nb_threads;
    struct thread_data *threads_data =   malloc(nb_threads*sizeof(struct thread_data));
    pthread_t *(thread_ptr) = malloc(nb_threads*sizeof(pthread_t *));
    //create pointers for threads and data
    //we split the datas before launching each thread
    for(i=0;i<nb_threads-1;i++){ // last thread is different
        threads_data[i].thread_id = i;
        threads_data[i].n = split_size;
        threads_data[i].U = &(U[i*split_size]);
        threads_data[i].result=0;
        threads_data[i].a = a;
        threads_data[i].k = k;
        pthread_create(&(thread_ptr[i]), NULL, thread_function, (void *) &threads_data[i]);
   }
    //last thread
    i=nb_threads - 1;
    threads_data[i].thread_id = i;
    threads_data[i].n = n - i*split_size;
    threads_data[i].U = &(U[i*split_size]);
    threads_data[i].result=0;
    threads_data[i].a = a;
    threads_data[i].k = k;
    pthread_create(&thread_ptr[i], NULL, thread_function, (void *) &threads_data[i]);

    /* Wait for every thread to complete  */
    for(i=0;i<nb_threads;i++) pthread_join(thread_ptr[i], NULL);
    for(i=0;i<nb_threads;i++) result += threads_data[i].result;//add all the results
    return result/(float) n;
}
float variance_thread(float* U, int n, int nb_threads)
{
    float mean;
    mean = create_thread(U,0.,1,n,nb_threads);
    return create_thread(U,mean,2,n,nb_threads);

}
int main (int argc, char *argv[]){
    double t1,t2,scalaire,vect,thread;
    float result,a;
    int k, nb_threads;
    initVect(vecteurBase);
    if(argc > 1)
        nb_threads = atoi(argv[1]);
    else
        nb_threads = 2;

    printf("Calcul scalaire\n");
    t1=now();
    result = variance(vecteurBase,VEC_SIZE,0);
    t2=now();
    scalaire = t2-t1;
    printf("Resultat: %.3e, time: %.4e\n", result,t2-t1);


    printf ("\n ---------- \n");
    printf("Calcul vectoriel\n");
    t1=now();
    result = variance(vecteurBase,VEC_SIZE,1);
    t2=now();
    vect=t2-t1;
    printf("Resultat: %.3e, time: %.4e\n", result,t2-t1);
    //*/
    //
    printf ("\n ---------- \n");
    printf("Calcul Vectoriel Multi-Thread\n");
    printf("Nombre de threads: %i\n", nb_threads);
    t1=now();
    result = variance_thread(vecteurBase,VEC_SIZE,nb_threads);
    t2=now();
    thread = t2-t1;
    printf("Resultat: %.3e, time: %.4e\n", result,t2-t1);
    //*/

    printf ("\n ---------- \n");

    printf ("Acceleration Vectoriel: %f %%\n", (scalaire-vect)/vect*100);
    printf ("Acceleration Thread : %f %%\n", (vect-thread)/thread*100);
    printf ("Acceleration Thread Vectoriel: %f %%\n", (scalaire-thread)/thread*100);
    printf ("\n ---------- \n");

}
	#include <stdio.h>
	#include <stdlib.h>
	#include <math.h>
	#include <immintrin.h>
	#include <sys/time.h> // for timing
	#include <pthread.h>
	#define VEC_SIZE 2147483647

	double t1,t2;
	float vecteurBase[VEC_SIZE] __attribute__((aligned(32)));
	__m256 array;

	struct thread_data{
	unsigned int thread_id;
	float *U;
	int n;
	int k;
	float a;
	float result;
	};



	double now(){
	// Retourne l'heure actuelle en secondes
	struct timeval t; double f_t;
	gettimeofday(&t, NULL);
	f_t = t.tv_usec; f_t = f_t/1000000.0; f_t +=t.tv_sec;
	return f_t;
	}



	void initVect(float *vecteurBase)
	{
	srand(time(0));
	for(int i=0;i<VEC_SIZE;i++)
	{
	vecteurBase[i] = (float) i;//rand();
	}
	return;
	}
	float* copyVec(float *vecteurBase)
	{
	float* copy;
	copy = (float) _mm_malloc(sizeof(float)VEC_SIZE,32);
	for(int i =0;i<VEC_SIZE;i++)
	{
	copy[i]=vecteurBase[i];
	}
	return copy;
	}

	//first naive implementation of the function, double loop
	float gm(float *U, float a, int k, int n){
	float result,p;
	result=0;
	for(int i=0;i<n;i++){
	p=1;
	for(int j=0;j<k;j++)
	p*=U[i]-a; //do the exponentiation
	result+=p;//sum all the steps
	}
	return result;
	}

	/second implementation of the function/
	/float gm2(float U, float a, int k, int n){*/
	/float result;/
	/result=0;/
	/for(int i=0;i<n;i++){/
	/result+=(float) exp(k log((double)(U[i]-a)));*/
	/}/
	/return result;/
	/}/

	/**
	* doing the vector function
	*/
	float gm_vec(float* vec,float a, int k, int n){
	float result[8] __attribute__((aligned(32)));
	float *rest;
	int sizeMod8 = n / 8; //we are going to loop around sizeMod8, and then we are going to finish
	__m256 vector, vectA,accumulator,temp;

	const vectA = _mm256_set1_ps(a);//populate vectA with 8 times a to do the vector sub
	accumulator = _mm256_setzero_ps();
	for(int i=0;i<sizeMod8;i++){

	vector = _mm256_load_ps(&(vec[8i])); // we load 8 floats, starting from the 8 k
	vector = _mm256_sub_ps(vector,vectA);
	temp = _mm256_set1_ps(1.); //init to 1
	for(int j=0;j<k;j++)
	{
	temp = _mm256_mul_ps(temp,vector);
	}
	accumulator = _mm256_add_ps(accumulator,temp);
	}
	accumulator = _mm256_hadd_ps(accumulator,accumulator);
	accumulator = _mm256_hadd_ps(accumulator,accumulator);
	accumulator = _mm256_hadd_ps(accumulator,accumulator);
	//after three times all the cells equals to the sum of the vector, we wan to extract the first one

	_mm256_store_ps(result,accumulator);

	rest = (&(vec[sizeMod8*8])); // the modulo 8 array we can't vectorize
	result[0] = result[0] + gm(rest,a,k,n-sizeMod8*8);//we add the end that we can't vectorize, using previous function
	return result[0];
	}
	float variance(float* U, int n, int mode)
	{
	float mean;
	if(mode)
	{
	mean = gm_vec(U,0.,1,n)/(float) n;
	return gm_vec(U,mean,2,n)/(float) n;
	}
	else{
	mean = gm(U,0.,1,n)/(float) n;
	return gm(U,mean,2,n)/(float) n;
	}
	}
	/-------------------------------------------------------------/
	/* This is the thread version of the selected code portion */
	/-------------------------------------------------------------/
	void thread_function(void threadarg){
	/* Local variables */
	int s;

	/* Association between shared variables and their correspondances */
	struct thread_data *thread_pointer_data;
	thread_pointer_data = (struct thread_data *)threadarg;
	thread_pointer_data->result =
	gm_vec(thread_pointer_data->U,thread_pointer_data->a,thread_pointer_data->k,thread_pointer_data->n);
	//send the result of the vect function to the structure back
	pthread_exit(NULL);
	return 0;
	}

	float create_thread(float* U, float a,int k,int n, int nb_threads)
	{
	float result=0;
	int i;
	int split_size = n/nb_threads;
	struct thread_data threads_data = malloc(nb_threadssizeof(struct thread_data));
	pthread_t (thread_ptr) = malloc(nb_threadssizeof(pthread_t *));
	//create pointers for threads and data
	//we split the datas before launching each thread
	for(i=0;i<nb_threads-1;i++){ // last thread is different
	threads_data[i].thread_id = i;
	threads_data[i].n = split_size;
	threads_data[i].U = &(U[i*split_size]);
	threads_data[i].result=0;
	threads_data[i].a = a;
	threads_data[i].k = k;
	pthread_create(&(thread_ptr[i]), NULL, thread_function, (void *) &threads_data[i]);
	}
	//last thread
	i=nb_threads - 1;
	threads_data[i].thread_id = i;
	threads_data[i].n = n - i*split_size;
	threads_data[i].U = &(U[i*split_size]);
	threads_data[i].result=0;
	threads_data[i].a = a;
	threads_data[i].k = k;
	pthread_create(&thread_ptr[i], NULL, thread_function, (void *) &threads_data[i]);

	/* Wait for every thread to complete */
	for(i=0;i<nb_threads;i++) pthread_join(thread_ptr[i], NULL);
	for(i=0;i<nb_threads;i++) result += threads_data[i].result;//add all the results
	return result/(float) n;
	}
	float variance_thread(float* U, int n, int nb_threads)
	{
	float mean;
	mean = create_thread(U,0.,1,n,nb_threads);
	return create_thread(U,mean,2,n,nb_threads);

	}
	int main (int argc, char *argv[]){
	double t1,t2,scalaire,vect,thread;
	float result,a;
	int k, nb_threads;
	initVect(vecteurBase);
	if(argc > 1)
	nb_threads = atoi(argv[1]);
	else
	nb_threads = 2;

	printf("Calcul scalaire\n");
	t1=now();
	result = variance(vecteurBase,VEC_SIZE,0);
	t2=now();
	scalaire = t2-t1;
	printf("Resultat: %.3e, time: %.4e\n", result,t2-t1);


	printf ("\n ---------- \n");
	printf("Calcul vectoriel\n");
	t1=now();
	result = variance(vecteurBase,VEC_SIZE,1);
	t2=now();
	vect=t2-t1;
	printf("Resultat: %.3e, time: %.4e\n", result,t2-t1);
	//*/
	//
	printf ("\n ---------- \n");
	printf("Calcul Vectoriel Multi-Thread\n");
	printf("Nombre de threads: %i\n", nb_threads);
	t1=now();
	result = variance_thread(vecteurBase,VEC_SIZE,nb_threads);
	t2=now();
	thread = t2-t1;
	printf("Resultat: %.3e, time: %.4e\n", result,t2-t1);
	//*/

	printf ("\n ---------- \n");

	printf ("Acceleration Vectoriel: %f %%\n", (scalaire-vect)/vect*100);
	printf ("Acceleration Thread : %f %%\n", (vect-thread)/thread*100);
	printf ("Acceleration Thread Vectoriel: %f %%\n", (scalaire-thread)/thread*100);
	printf ("\n ---------- \n");

	}