okeefm/mm.c

## mm.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <mpi.h>
#include <string.h>

#if defined(__i386__)

/*
 * Read the processor's time-stamp counter (32-bit x86 build).
 *
 * Returns the raw 64-bit counter value; callers in this file divide tick
 * differences by clock_rate to report times.
 */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned long long int x;
  /* Bytes 0x0f 0x31 encode the RDTSC instruction; the "=A" constraint binds
     the 64-bit result held in the EDX:EAX register pair to x. */
  __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
  return x;
}
#elif defined(__x86_64__)


/*
 * Read the processor's time-stamp counter (x86-64 build).
 *
 * The "rdtsc" instruction's result is taken from two registers: "=a" binds
 * the low half to lo and "=d" binds the high half to hi.  The halves are
 * then recombined into a single 64-bit tick count.
 */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned hi, lo;
  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
  return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

#elif defined(__powerpc__)
/*
 * Read the PowerPC time base and return it as one 64-bit tick count.
 *
 * The assembly reads the upper half (mftbu), the lower half (mftb) and the
 * upper half again, then branches back to label 0 when the two upper reads
 * differ, so the lower half is never paired with an upper half that changed
 * in between.
 *
 * NOTE(review): cmpw writes a condition-register field but the asm statement
 * lists no clobbers -- confirm whether "cr0" should be declared.
 */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned long long int result=0;
  unsigned long int upper, lower,tmp;
  __asm__ volatile(
                "0:                  \n"
                "\tmftbu   %0           \n"
                "\tmftb    %1           \n"
                "\tmftbu   %2           \n"
                "\tcmpw    %2,%0        \n"
                "\tbne     0b         \n"
                : "=r"(upper),"=r"(lower),"=r"(tmp)
		   );
  /* Combine the halves: upper goes to bits 63..32, lower is OR-ed below it. */
  result = upper;
  result = result<<32;
  result = result|lower;

  return(result);
}
#endif

/***********************************************************************/
/* START: MT 19937******************************************************/
/***********************************************************************/

/* Period parameters */
/* N: number of words in the generator state mt[]. */
#define N 624
/* M: offset of the partner word combined with each state word when the
   state is regenerated in genrand_int32(). */
#define M 397
#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

/* Generator state shared by every init_* and genrand_* function below. */
static unsigned long mt[N]; /* the array for the state vector  */
/* Index of the next word of mt[] to hand out; a value >= N makes
   genrand_int32() regenerate the state first. */
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */

/*
 * Seed the generator state mt[] from a single integer.
 *
 * s: seed; only its low 32 bits are used.
 *
 * On return every word of mt[] is filled and mti equals N, so the next call
 * to genrand_int32() regenerates the state before producing output.
 */
void init_genrand(unsigned long s)
{
    unsigned long prev = s & 0xffffffffUL;
    int idx;

    mt[0] = prev;
    for (idx = 1; idx < N; idx++) {
        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
        /* The mask keeps each word to 32 bits on machines where */
        /* unsigned long is wider.                               */
        prev = (1812433253UL * (prev ^ (prev >> 30)) + idx) & 0xffffffffUL;
        mt[idx] = prev;
    }
    mti = N;
}

/*
 * Seed the generator state from an array of keys.
 *
 * init_key:   array of seed words
 * key_length: number of entries in init_key
 *
 * The state is first filled by init_genrand(19650218), then mixed with the
 * key in max(N, key_length) steps, then mixed for N-1 further steps without
 * the key.  mt[0] is finally forced to 0x80000000.
 */
void init_by_array(unsigned long init_key[], int key_length)
{
    int pos = 1;        /* slot of mt[] being rewritten; cycles through 1..N-1 */
    int key_pos = 0;    /* slot of init_key[] being consumed; wraps at key_length */
    int rounds;

    init_genrand(19650218UL);

    /* First pass: fold the key words into the state. */
    rounds = (N > key_length) ? N : key_length;
    while (rounds != 0) {
        unsigned long prev = mt[pos-1];

        mt[pos] = ((mt[pos] ^ ((prev ^ (prev >> 30)) * 1664525UL))
                   + init_key[key_pos] + key_pos) /* non linear */
                  & 0xffffffffUL;                 /* for WORDSIZE > 32 machines */
        pos++;
        key_pos++;
        if (pos >= N) {
            mt[0] = mt[N-1];
            pos = 1;
        }
        if (key_pos >= key_length)
            key_pos = 0;
        rounds--;
    }

    /* Second pass: N-1 mixing steps that no longer read the key. */
    for (rounds = N-1; rounds != 0; rounds--) {
        unsigned long prev = mt[pos-1];

        mt[pos] = ((mt[pos] ^ ((prev ^ (prev >> 30)) * 1566083941UL))
                   - pos)           /* non linear */
                  & 0xffffffffUL;   /* for WORDSIZE > 32 machines */
        pos++;
        if (pos >= N) {
            mt[0] = mt[N-1];
            pos = 1;
        }
    }

    mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
}

/*
 * Return the next random number on the [0,0xffffffff] interval.
 *
 * When all N state words have been handed out (mti >= N) the whole state is
 * regenerated first; if the generator was never seeded (mti == N+1) it is
 * seeded with the default value 5489 before that.
 */
unsigned long genrand_int32(void)
{
    unsigned long word;

    if (mti >= N) { /* generate N words at one time */
        int idx;

        if (mti == N+1)           /* if init_genrand() has not been called, */
            init_genrand(5489UL); /* a default initial seed is used */

        for (idx = 0; idx < N; idx++) {
            /* Neighbour and partner indices wrap around the end of mt[]. */
            int next = (idx == N-1) ? 0 : idx+1;
            int partner = (idx < N-M) ? idx+M : idx+(M-N);

            word = (mt[idx] & UPPER_MASK) | (mt[next] & LOWER_MASK);
            /* XOR in MATRIX_A only when the lowest bit of word is set. */
            mt[idx] = mt[partner] ^ (word >> 1)
                      ^ ((word & 0x1UL) ? MATRIX_A : 0x0UL);
        }

        mti = 0;
    }

    word = mt[mti++];

    /* Tempering */
    word ^= (word >> 11);
    word ^= (word << 7) & 0x9d2c5680UL;
    word ^= (word << 15) & 0xefc60000UL;
    word ^= (word >> 18);

    return word;
}

/* Return a random number on the [0,0x7fffffff] interval. */
long genrand_int31(void)
{
    /* Shift the 32-bit output right by one to keep 31 bits. */
    unsigned long raw = genrand_int32();

    return (long)(raw >> 1);
}

/* Return a random number on the closed [0,1] real interval. */
double genrand_real1(void)
{
    /* divided by 2^32-1 */
    const double scale = 1.0/4294967295.0;

    return genrand_int32() * scale;
}

/* Return a random number on the half-open [0,1) real interval. */
double genrand_real2(void)
{
    /* divided by 2^32 */
    const double scale = 1.0/4294967296.0;

    return genrand_int32() * scale;
}

/* Return a random number on the open (0,1) real interval. */
double genrand_real3(void)
{
    /* Adding 0.5 before dividing by 2^32 keeps the result away from 0. */
    const double scale = 1.0/4294967296.0;
    double shifted = ((double)genrand_int32()) + 0.5;

    return shifted * scale;
}

/* Return a random number on [0,1) with 53-bit resolution. */
double genrand_res53(void)
{
    unsigned long hi_bits;  /* first draw with its 5 low bits dropped */
    unsigned long lo_bits;  /* second draw with its 6 low bits dropped */

    /* The two draws must happen in this order. */
    hi_bits = genrand_int32() >> 5;
    lo_bits = genrand_int32() >> 6;

    /* 67108864 = 2^26 places hi_bits above lo_bits; 9007199254740992 = 2^53. */
    return (hi_bits*67108864.0 + lo_bits) * (1.0/9007199254740992.0);
}
/* These real versions are due to Isaku Wada, 2002/01/09 added */

/***********************************************************************/
/* END: MT 19937 *******************************************************/
/***********************************************************************/

/* Standard matrix multiplication */
/* Arrays start at 0 */

/* Debug verbosity, set from argv[4] in main(): any non-zero value prints C
   after every round, values above 1 also print the sizes and the A and B
   inputs. */
char DEBUG_ON = 0;

/* This rank's rows of A; main() allocates NP row pointers of Nc doubles. */
double **A=NULL;
/* This rank's rows of the result C; allocated with the same shape as A. */
double **C=NULL;
/* Block of B currently being multiplied: sizeB doubles indexed [k*NP + j]. */
double *B_buf_in = NULL;
/* Copy of the current B block that main() passes to MPI_Isend. */
double *B_buf_out = NULL;
/* Matrix dimension; main() replaces the default with argv[3] when given. */
unsigned int Nc=8000;
/* Key passed to init_by_array(); main() overwrites element 0 with the rank. */
unsigned long rng_init_seeds[6]={0x0, 0x123, 0x234, 0x345, 0x456, 0x789};
/* Number of entries of rng_init_seeds passed to init_by_array(). */
unsigned long rng_init_length=6;
/* Tick rates by which rdtsc() differences are divided to report times
   (presumably ticks per second -- confirm for the target machines). */
double clock_rateK=2666700000.0;	// Kratos
double clock_rateBGL=700000000.0;	// Blue Gene/L
/* Rate actually in use; main() assigns one of the two values above. */
double clock_rate = 0;	// set for BGL or Kratos

/*
 * Multiply this rank's rows of A by one column block of B and accumulate the
 * products into the matching columns of C.
 *
 * A:       NP rows of Nc doubles (array of row pointers)
 * B:       Nc x NP block stored row by row, element (k,j) at B[k*NP + j]
 * C:       NP rows; columns B_start .. B_start+NP-1 are updated in place
 * B_start: first column of C that corresponds to column 0 of B
 * NP:      number of local rows and number of columns in the B block
 *
 * Returns the time spent, i.e. the rdtsc() tick difference divided by the
 * global clock_rate.  The subtraction is done on the 64-bit counters before
 * converting to double so that large counter values lose no precision.
 */
double matrix_multiply( double **A, double *B, double **C, int B_start, int NP )
{
	int i, j;
	unsigned int k;		/* same type as Nc: no signed/unsigned comparison */
	unsigned long long start, end;

	start = rdtsc();
	for (i = 0; i < NP; i++) {
		const double *a_row = A[i];
		double *c_row = C[i] + B_start;

		for (j = 0; j < NP; j++) {
			/* Accumulate in a local, starting from the value already
			   in C, and store once per output element. */
			double acc = c_row[j];

			for (k = 0; k < Nc; k++) {
				/* size_t index: k*NP can exceed INT_MAX for large Nc */
				acc += a_row[k] * B[(size_t)k * (size_t)NP + (size_t)j];
			}
			c_row[j] = acc;
		}
	}
	end = rdtsc();
	return (double)(end - start)/clock_rate;
}

void main( int argc, char* argv[])
{
	int i, j;
	int taskid, numtasks, dest;
	int intsize,dbsize;
	int P,NP,sizeB;
	int count = 0;
	double mult_time_l = 0, send_time_l = 0;
	double mult_time[1024];
	double send_time[1024];

	double mult[3];
	double send[3];
	double total_time = 0;
	double data;

	int Use_nodes = 0; //max nodes to use for a job, use all if 0

	unsigned long long start,start_a, end,end_a;

	MPI_Status status;
	MPI_Request recv_req[16], send_req[16];
	int recv_index[16], recv_count = 0;

	intsize = sizeof(int);
	dbsize = sizeof(double);

	start_a = rdtsc();

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
	MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

	clock_rate = clock_rateBGL;	// set for BGL

	if(argc > 1) {
		Use_nodes = atoi(argv[1]);
		numtasks = Use_nodes
	}
	if(argc > 2) {
		if (argv[2] == "k" || argv[2] == "K")
			clock_rate = clock_rateK;	// set for Kratos
	}
	if(argc > 3) {
		Nc = atoi(argv[3]);
	}
	if(argc > 4) {
		DEBUG_ON = (char)atoi(argv[4]);
	}

	if (Use_nodes > 0 && taskid < Use_nodes) {

		P = numtasks;
		NP = Nc/P;
		sizeB = NP*Nc;
		if(taskid==0 && DEBUG_ON > 1) {
			printf("sizeB:%d\n",sizeB);
			printf("taskid:%d\n",taskid);
			printf("NP:%d\n", NP);
			printf("P:%d\n",P);
		}
		dest = (taskid+1)%numtasks;

		// WHEN USING MPI DO: rng_init_seeds[0] = my_rank;
		rng_init_seeds[0] = taskid;
		init_by_array(rng_init_seeds, rng_init_length);

		//Allocate space
		B_buf_in = (double*)calloc(sizeB, dbsize);
		B_buf_out = (double*)calloc(sizeB, dbsize);

		A = (double **)calloc( NP, sizeof(double*));
		for( i = 0; i < NP; i++ )
			A[i] = (double *)calloc( Nc, sizeof(double));

		C = (double **)calloc( NP, sizeof(double*));
		for( i = 0; i < NP; i++ )
			C[i] = (double *)calloc( Nc, sizeof(double));

		//Initialize A and B
		for( i = 0; i < NP; i++ ) {
			for( j = 0; j < Nc; j++ ) {
				A[i][j] = genrand_res53();
				B_buf_in[(j*NP)+i] = genrand_res53();
			}
		}
		if(DEBUG_ON > 1) {
			printf("\nA:\n");
			for( i=0; i<NP; i++) {
				for (j=0; j<Nc; j++) {
					printf("%f ",A[i][j]);
				}
				printf("--%d\n",taskid);
			}
			printf("\nB:\n");
			for( i=0; i<Nc; i++) {
				for (j=0; j<NP; j++) {
					printf("%f ",B_buf_in[i*NP + j]);
				}
				printf("--%d\n",taskid);
			}
		}

		for (count = 0; count < P; count++) {
			//move B to sending buffer
			memcpy(B_buf_out, B_buf_in, sizeB*dbsize);
			//send B to next process
			MPI_Isend(B_buf_out, sizeB, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD, &(send_req[0]));
			//perform multiplication
			mult_time_l += matrix_multiply( A, B_buf_in, C, ((count+taskid)%P)*NP, NP );
			//receive B from previous process
			start = rdtsc();
			MPI_Irecv(B_buf_in, sizeB, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &(recv_req[0]));
			//wait for new B
			while (recv_count == 0) {
				MPI_Testsome(0, recv_req, &recv_count, recv_index, MPI_STATUSES_IGNORE);
			}
			recv_count = 0;
			end = rdtsc();
			send_time_l += ((double)end - (double)start)/clock_rate;

			if(DEBUG_ON) {
				printf("\nC(%d):\n",count);
				for( i=0; i<NP; i++) {
					for (j=0; j<Nc; j++) {
						printf("%d ",(int)C[i][j]);
					}
					printf("--%d\n",taskid);
				}
			}
		}

		// Collect stats
		MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
		MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
		mult[0] = mult_time[0];
		send[0] = send_time[0];
		MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
		MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
		mult[1] = mult_time[0];
		send[1] = send_time[0];
		MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
		MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
		mult[1] = mult_time[0];
		send[1] = send_time[0];
	}

	//Finalize
	MPI_Finalize();

	//report stats
	if (taskid == 0) {
		end_a = rdtsc();
		total_time = ((double)end_a - (double)start_a)/clock_rate;
		data = sizeB*P;
		printf("NUM CORES --- AVG MULT --- AVG SEND --- MIN MULT --- MIN SEND --- MAX MULT --- MAX SEND --- TOTAL EXE\n");
		printf("%d \t%lf \t%f \t%lf \t%lf \t%lf \t%lf \t%lf\n",P,data/mult[0],data/send[0],data/mult[1],data/send[0],data/mult[1],data/send[0],total_time);
	}

}
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <mpi.h>
	#include <string.h>

	#if defined(__i386__)

	/*
	 * Read the processor's time-stamp counter (32-bit x86 build).
	 *
	 * Returns the raw 64-bit counter value; callers in this file divide tick
	 * differences by clock_rate to report times.
	 */
	static __inline__ unsigned long long rdtsc(void)
	{
	unsigned long long int x;
	/* Bytes 0x0f 0x31 encode the RDTSC instruction; the "=A" constraint binds
	   the 64-bit result held in the EDX:EAX register pair to x. */
	__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
	return x;
	}
	#elif defined(__x86_64__)


	/*
	 * Read the processor's time-stamp counter (x86-64 build).
	 *
	 * The "rdtsc" instruction's result is taken from two registers: "=a" binds
	 * the low half to lo and "=d" binds the high half to hi.  The halves are
	 * then recombined into a single 64-bit tick count.
	 */
	static __inline__ unsigned long long rdtsc(void)
	{
	    unsigned hi, lo;

	    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
	    /* Plain bitwise OR; the escaped "\|" form does not compile. */
	    return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
	}

	#elif defined(__powerpc__)
	/*
	 * Read the PowerPC time base and return it as one 64-bit tick count.
	 *
	 * The assembly reads the upper half (mftbu), the lower half (mftb) and the
	 * upper half again, then branches back to label 0 when the two upper reads
	 * differ, so the lower half is never paired with an upper half that changed
	 * in between.
	 *
	 * NOTE(review): cmpw writes a condition-register field but the asm statement
	 * lists no clobbers -- confirm whether "cr0" should be declared.
	 */
	static __inline__ unsigned long long rdtsc(void)
	{
	    unsigned long long int result = 0;
	    unsigned long int upper, lower, tmp;

	    __asm__ volatile(
	        "0: \n"
	        "\tmftbu %0 \n"
	        "\tmftb %1 \n"
	        "\tmftbu %2 \n"
	        "\tcmpw %2,%0 \n"
	        "\tbne 0b \n"
	        : "=r"(upper),"=r"(lower),"=r"(tmp)
	    );
	    /* Combine the halves: upper goes to bits 63..32, lower is OR-ed below
	       it (plain "|"; the escaped "\|" form does not compile). */
	    result = upper;
	    result = result << 32;
	    result = result | lower;

	    return result;
	}
	#endif

	/***********************************************************************/
	/* START: MT 19937******************************************************/
	/***********************************************************************/

	/* Period parameters */
	/* N: number of words in the generator state mt[]. */
	#define N 624
	/* M: offset of the partner word combined with each state word when the
	   state is regenerated in genrand_int32(). */
	#define M 397
	#define MATRIX_A 0x9908b0dfUL /* constant vector a */
	#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
	#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

	/* Generator state shared by every init_* and genrand_* function below. */
	static unsigned long mt[N]; /* the array for the state vector */
	/* Index of the next word of mt[] to hand out; a value >= N makes
	   genrand_int32() regenerate the state first. */
	static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */

	/*
	 * Seed the generator state mt[] from a single integer.
	 *
	 * s: seed; only its low 32 bits are used.
	 *
	 * On return every word of mt[] is filled and mti equals N, so the next call
	 * to genrand_int32() regenerates the state before producing output.
	 */
	void init_genrand(unsigned long s)
	{
	    unsigned long prev = s & 0xffffffffUL;
	    int idx;

	    mt[0] = prev;
	    for (idx = 1; idx < N; idx++) {
	        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
	        /* The mask keeps each word to 32 bits on machines where */
	        /* unsigned long is wider.                               */
	        prev = (1812433253UL * (prev ^ (prev >> 30)) + idx) & 0xffffffffUL;
	        mt[idx] = prev;
	    }
	    mti = N;
	}

	/*
	 * Seed the generator state from an array of keys.
	 *
	 * init_key:   array of seed words
	 * key_length: number of entries in init_key
	 *
	 * The state is first filled by init_genrand(19650218), then mixed with the
	 * key in max(N, key_length) steps, then mixed for N-1 further steps without
	 * the key.  mt[0] is finally forced to 0x80000000.
	 */
	void init_by_array(unsigned long init_key[], int key_length)
	{
	    int pos = 1;        /* slot of mt[] being rewritten; cycles through 1..N-1 */
	    int key_pos = 0;    /* slot of init_key[] being consumed; wraps at key_length */
	    int rounds;

	    init_genrand(19650218UL);

	    /* First pass: fold the key words into the state. */
	    rounds = (N > key_length) ? N : key_length;
	    while (rounds != 0) {
	        unsigned long prev = mt[pos-1];

	        mt[pos] = ((mt[pos] ^ ((prev ^ (prev >> 30)) * 1664525UL))
	                   + init_key[key_pos] + key_pos) /* non linear */
	                  & 0xffffffffUL;                 /* for WORDSIZE > 32 machines */
	        pos++;
	        key_pos++;
	        if (pos >= N) {
	            mt[0] = mt[N-1];
	            pos = 1;
	        }
	        if (key_pos >= key_length)
	            key_pos = 0;
	        rounds--;
	    }

	    /* Second pass: N-1 mixing steps that no longer read the key. */
	    for (rounds = N-1; rounds != 0; rounds--) {
	        unsigned long prev = mt[pos-1];

	        mt[pos] = ((mt[pos] ^ ((prev ^ (prev >> 30)) * 1566083941UL))
	                   - pos)           /* non linear */
	                  & 0xffffffffUL;   /* for WORDSIZE > 32 machines */
	        pos++;
	        if (pos >= N) {
	            mt[0] = mt[N-1];
	            pos = 1;
	        }
	    }

	    mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
	}

	/*
	 * Return the next random number on the [0,0xffffffff] interval.
	 *
	 * When all N state words have been handed out (mti >= N) the whole state is
	 * regenerated first; if the generator was never seeded (mti == N+1) it is
	 * seeded with the default value 5489 before that.
	 */
	unsigned long genrand_int32(void)
	{
	    unsigned long word;

	    if (mti >= N) { /* generate N words at one time */
	        int idx;

	        if (mti == N+1)           /* if init_genrand() has not been called, */
	            init_genrand(5489UL); /* a default initial seed is used */

	        for (idx = 0; idx < N; idx++) {
	            /* Neighbour and partner indices wrap around the end of mt[]. */
	            int next = (idx == N-1) ? 0 : idx+1;
	            int partner = (idx < N-M) ? idx+M : idx+(M-N);

	            /* Plain bitwise OR; the escaped "\|" form does not compile. */
	            word = (mt[idx] & UPPER_MASK) | (mt[next] & LOWER_MASK);
	            /* XOR in MATRIX_A only when the lowest bit of word is set. */
	            mt[idx] = mt[partner] ^ (word >> 1)
	                      ^ ((word & 0x1UL) ? MATRIX_A : 0x0UL);
	        }

	        mti = 0;
	    }

	    word = mt[mti++];

	    /* Tempering */
	    word ^= (word >> 11);
	    word ^= (word << 7) & 0x9d2c5680UL;
	    word ^= (word << 15) & 0xefc60000UL;
	    word ^= (word >> 18);

	    return word;
	}

	/* Return a random number on the [0,0x7fffffff] interval. */
	long genrand_int31(void)
	{
	    /* Shift the 32-bit output right by one to keep 31 bits. */
	    unsigned long raw = genrand_int32();

	    return (long)(raw >> 1);
	}

	/* Return a random number on the closed [0,1] real interval. */
	double genrand_real1(void)
	{
	    /* divided by 2^32-1 */
	    const double scale = 1.0/4294967295.0;

	    return genrand_int32() * scale;
	}

	/* Return a random number on the half-open [0,1) real interval. */
	double genrand_real2(void)
	{
	    /* divided by 2^32 */
	    const double scale = 1.0/4294967296.0;

	    return genrand_int32() * scale;
	}

	/* Return a random number on the open (0,1) real interval. */
	double genrand_real3(void)
	{
	    /* Adding 0.5 before dividing by 2^32 keeps the result away from 0. */
	    const double scale = 1.0/4294967296.0;
	    double shifted = ((double)genrand_int32()) + 0.5;

	    return shifted * scale;
	}

	/* Return a random number on [0,1) with 53-bit resolution. */
	double genrand_res53(void)
	{
	    unsigned long hi_bits;  /* first draw with its 5 low bits dropped */
	    unsigned long lo_bits;  /* second draw with its 6 low bits dropped */

	    /* The two draws must happen in this order. */
	    hi_bits = genrand_int32() >> 5;
	    lo_bits = genrand_int32() >> 6;

	    /* Both multiplications restored (the "*" operators had been lost):
	       67108864 = 2^26 places hi_bits above lo_bits; 9007199254740992 = 2^53. */
	    return (hi_bits*67108864.0 + lo_bits) * (1.0/9007199254740992.0);
	}
	/* These real versions are due to Isaku Wada, 2002/01/09 added */

	/***********************************************************************/
	/* END: MT 19937 *******************************************************/
	/***********************************************************************/

	/* Standard matrix multiplication */
	/* Arrays start at 0 */

	/* Debug verbosity, set from argv[4] in main(): any non-zero value prints C
	   after every round, values above 1 also print the sizes and the A and B
	   inputs. */
	char DEBUG_ON = 0;

	/* This rank's rows of A, allocated and filled in main(). */
	double **A=NULL;
	/* This rank's rows of the result C, allocated in main(). */
	double **C=NULL;
	/* Block of B currently being multiplied: sizeB doubles indexed [k*NP + j]. */
	double *B_buf_in = NULL;
	/* Copy of the current B block that main() passes to MPI_Isend. */
	double *B_buf_out = NULL;
	/* Matrix dimension; main() replaces the default with argv[3] when given. */
	unsigned int Nc=8000;
	/* Key passed to init_by_array(); main() overwrites element 0 with the rank. */
	unsigned long rng_init_seeds[6]={0x0, 0x123, 0x234, 0x345, 0x456, 0x789};
	/* Number of entries of rng_init_seeds passed to init_by_array(). */
	unsigned long rng_init_length=6;
	/* Tick rates by which rdtsc() differences are divided to report times
	   (presumably ticks per second -- confirm for the target machines). */
	double clock_rateK=2666700000.0; // Kratos
	double clock_rateBGL=700000000.0; // Blue Gene/L
	/* Rate actually in use; main() assigns one of the two values above. */
	double clock_rate = 0; // set for BGL or Kratos

	/*
	 * Multiply this rank's rows of A by one column block of B and accumulate the
	 * products into the matching columns of C.
	 *
	 * A:       NP rows of Nc doubles (array of row pointers)
	 * B:       Nc x NP block stored row by row, element (k,j) at B[k*NP + j]
	 * C:       NP rows; columns B_start .. B_start+NP-1 are updated in place
	 * B_start: first column of C that corresponds to column 0 of B
	 * NP:      number of local rows and number of columns in the B block
	 *
	 * The parameter types are the ones the body needs (A and C are indexed
	 * twice, B once); the pointer levels had been lost from the signature.
	 *
	 * Returns the time spent, i.e. the rdtsc() tick difference divided by the
	 * global clock_rate.  The subtraction is done on the 64-bit counters before
	 * converting to double so that large counter values lose no precision.
	 */
	double matrix_multiply( double **A, double *B, double **C, int B_start, int NP )
	{
		int i, j;
		unsigned int k;		/* same type as Nc: no signed/unsigned comparison */
		unsigned long long start, end;

		start = rdtsc();
		for (i = 0; i < NP; i++) {
			const double *a_row = A[i];
			double *c_row = C[i] + B_start;

			for (j = 0; j < NP; j++) {
				/* Accumulate in a local, starting from the value already
				   in C, and store once per output element. */
				double acc = c_row[j];

				for (k = 0; k < Nc; k++) {
					/* size_t index: k*NP can exceed INT_MAX for large Nc */
					acc += a_row[k] * B[(size_t)k * (size_t)NP + (size_t)j];
				}
				c_row[j] = acc;
			}
		}
		end = rdtsc();
		return (double)(end - start)/clock_rate;
	}

	void main( int argc, char* argv[])
	{
	int i, j;
	int taskid, numtasks, dest;
	int intsize,dbsize;
	int P,NP,sizeB;
	int count = 0;
	double mult_time_l = 0, send_time_l = 0;
	double mult_time[1024];
	double send_time[1024];

	double mult[3];
	double send[3];
	double total_time = 0;
	double data;

	int Use_nodes = 0; //max nodes to use for a job, use all if 0

	unsigned long long start,start_a, end,end_a;

	MPI_Status status;
	MPI_Request recv_req[16], send_req[16];
	int recv_index[16], recv_count = 0;

	intsize = sizeof(int);
	dbsize = sizeof(double);

	start_a = rdtsc();

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
	MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

	clock_rate = clock_rateBGL; // set for BGL

	if(argc > 1) {
	Use_nodes = atoi(argv[1]);
	numtasks = Use_nodes
	}
	if(argc > 2) {
	if (argv[2] == "k" \|\| argv[2] == "K")
	clock_rate = clock_rateK; // set for Kratos
	}
	if(argc > 3) {
	Nc = atoi(argv[3]);
	}
	if(argc > 4) {
	DEBUG_ON = (char)atoi(argv[4]);
	}

	if (Use_nodes > 0 && taskid < Use_nodes) {

	P = numtasks;
	NP = Nc/P;
	sizeB = NP*Nc;
	if(taskid==0 && DEBUG_ON > 1) {
	printf("sizeB:%d\n",sizeB);
	printf("taskid:%d\n",taskid);
	printf("NP:%d\n", NP);
	printf("P:%d\n",P);
	}
	dest = (taskid+1)%numtasks;

	// WHEN USING MPI DO: rng_init_seeds[0] = my_rank;
	rng_init_seeds[0] = taskid;
	init_by_array(rng_init_seeds, rng_init_length);

	//Allocate space
	B_buf_in = (double*)calloc(sizeB, dbsize);
	B_buf_out = (double*)calloc(sizeB, dbsize);

	A = (double *)calloc( NP, sizeof(double));
	for( i = 0; i < NP; i++ )
	A[i] = (double *)calloc( Nc, sizeof(double));

	C = (double *)calloc( NP, sizeof(double));
	for( i = 0; i < NP; i++ )
	C[i] = (double *)calloc( Nc, sizeof(double));

	//Initialize A and B
	for( i = 0; i < NP; i++ ) {
	for( j = 0; j < Nc; j++ ) {
	A[i][j] = genrand_res53();
	B_buf_in[(j*NP)+i] = genrand_res53();
	}
	}
	if(DEBUG_ON > 1) {
	printf("\nA:\n");
	for( i=0; i<NP; i++) {
	for (j=0; j<Nc; j++) {
	printf("%f ",A[i][j]);
	}
	printf("--%d\n",taskid);
	}
	printf("\nB:\n");
	for( i=0; i<Nc; i++) {
	for (j=0; j<NP; j++) {
	printf("%f ",B_buf_in[i*NP + j]);
	}
	printf("--%d\n",taskid);
	}
	}

	for (count = 0; count < P; count++) {
	//move B to sending buffer
	memcpy(B_buf_out, B_buf_in, sizeB*dbsize);
	//send B to next process
	MPI_Isend(B_buf_out, sizeB, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD, &(send_req[0]));
	//perform multiplication
	mult_time_l += matrix_multiply( A, B_buf_in, C, ((count+taskid)%P)*NP, NP );
	//receive B from previous process
	start = rdtsc();
	MPI_Irecv(B_buf_in, sizeB, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &(recv_req[0]));
	//wait for new B
	while (recv_count == 0) {
	MPI_Testsome(0, recv_req, &recv_count, recv_index, MPI_STATUSES_IGNORE);
	}
	recv_count = 0;
	end = rdtsc();
	send_time_l += ((double)end - (double)start)/clock_rate;

	if(DEBUG_ON) {
	printf("\nC(%d):\n",count);
	for( i=0; i<NP; i++) {
	for (j=0; j<Nc; j++) {
	printf("%d ",(int)C[i][j]);
	}
	printf("--%d\n",taskid);
	}
	}
	}

	// Collect stats
	MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	mult[0] = mult_time[0];
	send[0] = send_time[0];
	MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
	MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
	mult[1] = mult_time[0];
	send[1] = send_time[0];
	MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
	MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
	mult[1] = mult_time[0];
	send[1] = send_time[0];
	}

	//Finalize
	MPI_Finalize();

	//report stats
	if (taskid == 0) {
	end_a = rdtsc();
	total_time = ((double)end_a - (double)start_a)/clock_rate;
	data = sizeB*P;
	printf("NUM CORES --- AVG MULT --- AVG SEND --- MIN MULT --- MIN SEND --- MAX MULT --- MAX SEND --- TOTAL EXE\n");
	printf("%d \t%lf \t%f \t%lf \t%lf \t%lf \t%lf \t%lf\n",P,data/mult[0],data/send[0],data/mult[1],data/send[0],data/mult[1],data/send[0],total_time);
	}

	}