nattoheaven/gist:5420160

## gistfile1.c
/********************************************************************

 This benchmark test program is measuring a cpu performance
 of floating point operation by a Poisson equation solver.

 If you have any question, please ask me via email.
 written by Ryutaro HIMENO, November 26, 2001.
 Version 3.0
 ----------------------------------------------
 Ryutaro Himeno, Dr. of Eng.
 Head of Computer Information Division,
 RIKEN (The Institute of Pysical and Chemical Research)
 Email : himeno@postman.riken.go.jp
 ---------------------------------------------------------------
 You can adjust the size of this benchmark code to fit your target
 computer. In that case, please chose following sets of
 [mimax][mjmax][mkmax]:
 small : 33,33,65
 small : 65,65,129
 midium: 129,129,257
 large : 257,257,513
 ext.large: 513,513,1025
 This program is to measure a computer performance in MFLOPS
 by using a kernel which appears in a linear solver of pressure
 Poisson eq. which appears in an incompressible Navier-Stokes solver.
 A point-Jacobi method is employed in this solver as this method can
 be easyly vectrized and be parallelized.
 ------------------
 Finite-difference method, curvilinear coodinate system
 Vectorizable and parallelizable on each grid point
 No. of grid points : imax x jmax x kmax including boundaries
 ------------------
 A,B,C:coefficient matrix, wrk1: source term of Poisson equation
 wrk2 : working area, OMEGA : relaxation parameter
 BND:control variable for boundaries and objects ( = 0 or 1)
 P: pressure

 AVX implementation for the Dr. Himeno's program
 is written by NISHIMURA Ryohei.
********************************************************************/

#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <immintrin.h>

#define MR(mt,n,r,c,d)  mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)]

#ifdef HALF
typedef short mat_t;
#define LOAD(mt,n,r,c,d)  _mm256_cvtph_ps(_mm_load_si128((const __m128i *)(mt->m + (n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d))))
#define LOADU(mt,n,r,c,d)  _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(mt->m + (n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d))))
#define STORE(mt,n,r,c,d,x)  _mm_store_si128((__m128i *)(mt->m + (n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)), _mm256_cvtps_ph((x), 0))
#define STORE1(mt,n,r,c,d,x)  (mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)] = _cvtss_sh((x), 0))
#else
typedef float mat_t;
#define LOAD(mt,n,r,c,d)  _mm256_load_ps(mt->m + (n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d))
#define LOADU(mt,n,r,c,d)  _mm256_loadu_ps(mt->m + (n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d))
#define STORE(mt,n,r,c,d,x)  _mm256_store_ps(mt->m + (n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d), (x))
#define STORE1(mt,n,r,c,d,x)  (mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)] = (x))
#endif

struct Mat {
  mat_t* m;
  int mnums;
  int mrows;
  int mcols;
  int mdeps;
};

/* prototypes */
typedef struct Mat Matrix;

int newMat(Matrix* Mat, int mnums, int mrows, int mcols, int mdeps);
void clearMat(Matrix* Mat);
void set_param(int i[],char *size);
void mat_set(Matrix* Mat,int l,float z);
void mat_set_init(Matrix* Mat);
float jacobi(int n,Matrix* M1,Matrix* M2,Matrix* M3,
             Matrix* M4,Matrix* M5,Matrix* M6,Matrix* M7);
double second();

__m256   omega;
Matrix  a,b,c,p,bnd,wrk1,wrk2;

int
main(int argc, char *argv[])
{
  int    nn;
  int    imax,jmax,kmax,mimax,mjmax,mkmax,msize[3];
  float  gosa,target;
  double  cpu0,cpu1,cpu,xmflops2,score,flop;
  char   size[10];

  if(argc == 2){
    strcpy(size,argv[1]);
  } else {
    printf("For example: \n");
    printf(" Grid-size= XS (32x32x64)\n");
    printf("\t    S  (64x64x128)\n");
    printf("\t    M  (128x128x256)\n");
    printf("\t    L  (256x256x512)\n");
    printf("\t    XL (512x512x1024)\n\n");
    printf("Grid-size = ");
    scanf("%s",size);
    printf("\n");
  }

  omega=_mm256_set1_ps(0.8);
  set_param(msize,size);

  mimax= msize[0];
  mjmax= msize[1];
  mkmax= msize[2];
  imax= mimax-1;
  jmax= mjmax-1;
  kmax= mkmax-1;

  target = 60.0;

  printf("mimax = %d mjmax = %d mkmax = %d\n",mimax,mjmax,mkmax);
  printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax);

  /*
   *    Initializing matrixes
   */
  newMat(&p,1,mimax,mjmax,mkmax);
  newMat(&bnd,1,mimax,mjmax,mkmax);
  newMat(&wrk1,1,mimax,mjmax,mkmax);
  newMat(&wrk2,1,mimax,mjmax,mkmax);
  newMat(&a,4,mimax,mjmax,mkmax);
  newMat(&b,3,mimax,mjmax,mkmax);
  newMat(&c,3,mimax,mjmax,mkmax);

  mat_set_init(&p);
  mat_set(&bnd,0,1.0);
  mat_set(&wrk1,0,0.0);
  mat_set(&wrk2,0,0.0);
  mat_set(&a,0,1.0);
  mat_set(&a,1,1.0);
  mat_set(&a,2,1.0);
  mat_set(&a,3,1.0/6.0);
  mat_set(&b,0,0.0);
  mat_set(&b,1,0.0);
  mat_set(&b,2,0.0);
  mat_set(&c,0,1.0);
  mat_set(&c,1,1.0);
  mat_set(&c,2,1.0);

  /*
   *    Start measuring
   */
  nn= 3;
  printf(" Start rehearsal measurement process.\n");
  printf(" Measure the performance in %d times.\n\n",nn);

  cpu0= second();
  gosa= jacobi(nn,&a,&b,&c,&p,&bnd,&wrk1,&wrk2);
  cpu1= second();
  cpu= cpu1 - cpu0;
  flop = (double)(kmax-1)*(double)(jmax-1)*(double)(imax-1)*34.0;

  if(cpu != 0.0)
    xmflops2= flop/cpu*1.e-6*nn;

  printf(" MFLOPS: %f time(s): %f %e\n\n",xmflops2,cpu,gosa);

  nn= (int)(target/(cpu/3.0));

  printf(" Now, start the actual measurement process.\n");
  printf(" The loop will be excuted in %d times\n",nn);
  printf(" This will take about one minute.\n");
  printf(" Wait for a while\n\n");

  cpu0 = second();
  gosa = jacobi(nn,&a,&b,&c,&p,&bnd,&wrk1,&wrk2);
  cpu1 = second();
  cpu = cpu1 - cpu0;

  if(cpu != 0.0)
    xmflops2 = (double)flop/cpu*1.0e-6*nn;

  printf("cpu : %f sec.\n", cpu);
  printf("Loop executed for %d times\n",nn);
  printf("Gosa : %e \n",gosa);
  printf("MFLOPS measured : %f\n",xmflops2);
  score = xmflops2/82.84;
  printf("Score based on Pentium III 600MHz using Fortran 77: %f\n",score);

  /*
   *   Matrix free
   */
  clearMat(&p);
  clearMat(&bnd);
  clearMat(&wrk1);
  clearMat(&wrk2);
  clearMat(&a);
  clearMat(&b);
  clearMat(&c);

  return (0);
}

void
set_param(int is[],char *size)
{
  if(!strcmp(size,"XS") || !strcmp(size,"xs")){
    is[0]= 32;
    is[1]= 32;
    is[2]= 64;
    return;
  }
  if(!strcmp(size,"S") || !strcmp(size,"s")){
    is[0]= 64;
    is[1]= 64;
    is[2]= 128;
    return;
  }
  if(!strcmp(size,"M") || !strcmp(size,"m")){
    is[0]= 128;
    is[1]= 128;
    is[2]= 256;
    return;
  }
  if(!strcmp(size,"L") || !strcmp(size,"l")){
    is[0]= 256;
    is[1]= 256;
    is[2]= 512;
    return;
  }
  if(!strcmp(size,"Xl") || !strcmp(size,"xl")){
    is[0]= 512;
    is[1]= 512;
    is[2]= 1024;
    return;
  }
}

int
newMat(Matrix* Mat, int mnums,int mrows, int mcols, int mdeps)
{
  Mat->mnums= mnums;
  Mat->mrows= mrows;
  Mat->mcols= mcols;
  Mat->mdeps= mdeps;
  Mat->m= NULL;
  Mat->m= (mat_t*)
    _mm_malloc((mnums * mrows * mcols * mdeps + 2 * 8) * sizeof(mat_t),
               8 * sizeof(mat_t)) + 8;
  if (Mat->m != NULL) {
    memset(Mat->m - 8, 0, (mnums * mrows * mcols * mdeps + 2 * 8) * sizeof(mat_t));
  }

  return(Mat->m != NULL) ? 1:0;
}

void
clearMat(Matrix* Mat)
{
  if(Mat->m)
    _mm_free(Mat->m - 8);
  Mat->m= NULL;
  Mat->mnums= 0;
  Mat->mcols= 0;
  Mat->mrows= 0;
  Mat->mdeps= 0;
}

void
mat_set(Matrix* Mat, int l, float val)
{
  int i,j,k;

    for(i=1; i<Mat->mrows-1; i++)
      for(j=1; j<Mat->mcols-1; j++)
        for(k=1; k<Mat->mdeps-1; k++)
          STORE1(Mat,l,i,j,k,val);
}

void
mat_set_init(Matrix* Mat)
{
  int  i,j,k,l;
  float tt;

  for(i=0; i<Mat->mrows; i++)
    for(j=0; j<Mat->mcols; j++)
      for(k=0; k<Mat->mdeps; k++)
        STORE1(Mat,0,i,j,k,(float)(i*i)
          /(float)((Mat->mrows - 1)*(Mat->mrows - 1)));
}

float
jacobi(int nn, Matrix* a,Matrix* b,Matrix* c,
       Matrix* p,Matrix* bnd,Matrix* wrk1,Matrix* wrk2)
{
  int    i,j,k,n,imax,jmax,kmax;
  float  gosa;
  __m256 gosa1,s0,ss;

  imax= p->mrows-1;
  jmax= p->mcols-1;
  kmax= p->mdeps-1;
#pragma omp parallel shared(a,p,b,c,bnd,wrk1,wrk2,nn,imax,jmax,kmax,omega,gosa) private(i,j,k,s0,ss,gosa1,n)
{
  for(n=0 ; n<nn ; n++){
#pragma omp barrier
#pragma omp master
    {
      gosa = 0.0;
    }
    gosa1= _mm256_setzero_ps();
#pragma omp for nowait
    for(i=1 ; i<imax; i++)
      for(j=1 ; j<jmax ; j++)
        for(k=0 ; k<=kmax ; k += 8){
          s0= _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(LOAD(a,0,i,j,k),LOAD(p,0,i+1,j,  k)),
            _mm256_mul_ps(LOAD(a,1,i,j,k),LOAD(p,0,i,  j+1,k))),
            _mm256_mul_ps(LOAD(a,2,i,j,k),LOADU(p,0,i,  j,  k+1))),
            _mm256_mul_ps(LOAD(b,0,i,j,k),
             _mm256_add_ps(_mm256_sub_ps(_mm256_sub_ps(LOAD(p,0,i+1,j+1,k),LOAD(p,0,i+1,j-1,k)),
              LOAD(p,0,i-1,j+1,k)),LOAD(p,0,i-1,j-1,k)))),
            _mm256_mul_ps(LOAD(b,1,i,j,k),
             _mm256_add_ps(_mm256_sub_ps(_mm256_sub_ps(LOADU(p,0,i,j+1,k+1),LOADU(p,0,i,j-1,k+1)),
              LOADU(p,0,i,j+1,k-1)),LOADU(p,0,i,j-1,k-1)))),
            _mm256_mul_ps(LOAD(b,2,i,j,k),
             _mm256_add_ps(_mm256_sub_ps(_mm256_sub_ps(LOADU(p,0,i+1,j,k+1),LOADU(p,0,i-1,j,k+1)),
              LOADU(p,0,i+1,j,k-1)),LOADU(p,0,i-1,j,k-1)))),
            _mm256_mul_ps(LOAD(c,0,i,j,k),LOAD(p,0,i-1,j,  k))),
            _mm256_mul_ps(LOAD(c,1,i,j,k),LOAD(p,0,i,  j-1,k))),
            _mm256_mul_ps(LOAD(c,2,i,j,k),LOADU(p,0,i,  j,  k-1))),
            LOAD(wrk1,0,i,j,k));

          ss= _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s0,LOAD(a,3,i,j,k)),LOAD(p,0,i,j,k)),LOAD(bnd,0,i,j,k));

          gosa1= _mm256_add_ps(gosa1,_mm256_mul_ps(ss,ss));

          STORE(wrk2,0,i,j,k,_mm256_add_ps(LOAD(p,0,i,j,k),_mm256_mul_ps(omega,ss)));
        }
#pragma omp barrier
#pragma omp for nowait
    for(i=1 ; i<imax ; i++)
      for(j=1 ; j<jmax ; j++)
        for(k=1 ; k<kmax ; k++)
          MR(p,0,i,j,k)= MR(wrk2,0,i,j,k);

#pragma omp critical
    {
      float gosa2;
      gosa1=_mm256_hadd_ps(gosa1, gosa1);
      gosa1=_mm256_hadd_ps(gosa1, gosa1);
      _MM_EXTRACT_FLOAT(gosa2, _mm_add_ps(_mm256_extractf128_ps(gosa1, 0), _mm256_extractf128_ps(gosa1, 1)), 0);
      gosa+= gosa2;
    }
  } /* end n loop */
}
  return(gosa);
}

double
second()
{

  struct timeval tm;
  double t ;

  static int base_sec = 0,base_usec = 0;

  gettimeofday(&tm, NULL);

  if(base_sec == 0 && base_usec == 0)
    {
      base_sec = tm.tv_sec;
      base_usec = tm.tv_usec;
      t = 0.0;
  } else {
    t = (double) (tm.tv_sec-base_sec) +
      ((double) (tm.tv_usec-base_usec))/1.0e6 ;
  }

  return t ;
}
	/********************************************************************

	This benchmark test program is measuring a cpu performance
	of floating point operation by a Poisson equation solver.

	If you have any question, please ask me via email.
	written by Ryutaro HIMENO, November 26, 2001.
	Version 3.0
	----------------------------------------------
	Ryutaro Himeno, Dr. of Eng.
	Head of Computer Information Division,
	RIKEN (The Institute of Pysical and Chemical Research)
	Email : himeno@postman.riken.go.jp
	---------------------------------------------------------------
	You can adjust the size of this benchmark code to fit your target
	computer. In that case, please chose following sets of
	[mimax][mjmax][mkmax]:
	small : 33,33,65
	small : 65,65,129
	midium: 129,129,257
	large : 257,257,513
	ext.large: 513,513,1025
	This program is to measure a computer performance in MFLOPS
	by using a kernel which appears in a linear solver of pressure
	Poisson eq. which appears in an incompressible Navier-Stokes solver.
	A point-Jacobi method is employed in this solver as this method can
	be easyly vectrized and be parallelized.
	------------------
	Finite-difference method, curvilinear coodinate system
	Vectorizable and parallelizable on each grid point
	No. of grid points : imax x jmax x kmax including boundaries
	------------------
	A,B,C:coefficient matrix, wrk1: source term of Poisson equation
	wrk2 : working area, OMEGA : relaxation parameter
	BND:control variable for boundaries and objects ( = 0 or 1)
	P: pressure

	AVX implementation for the Dr. Himeno's program
	is written by NISHIMURA Ryohei.
	********************************************************************/

	#include <stdio.h>
	#include <string.h>
	#include <sys/time.h>
	#include <immintrin.h>

	#define MR(mt,n,r,c,d) mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)]

	#ifdef HALF
	typedef short mat_t;
	#define LOAD(mt,n,r,c,d) _mm256_cvtph_ps(_mm_load_si128((const __m128i )(mt->m + (n) mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d))))
	#define LOADU(mt,n,r,c,d) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i )(mt->m + (n) mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d))))
	#define STORE(mt,n,r,c,d,x) _mm_store_si128((__m128i )(mt->m + (n) mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)), _mm256_cvtps_ph((x), 0))
	#define STORE1(mt,n,r,c,d,x) (mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)] = _cvtss_sh((x), 0))
	#else
	typedef float mat_t;
	#define LOAD(mt,n,r,c,d) _mm256_load_ps(mt->m + (n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d))
	#define LOADU(mt,n,r,c,d) _mm256_loadu_ps(mt->m + (n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d))
	#define STORE(mt,n,r,c,d,x) _mm256_store_ps(mt->m + (n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d), (x))
	#define STORE1(mt,n,r,c,d,x) (mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)] = (x))
	#endif

	struct Mat {
	mat_t* m;
	int mnums;
	int mrows;
	int mcols;
	int mdeps;
	};

	/* prototypes */
	typedef struct Mat Matrix;

	int newMat(Matrix* Mat, int mnums, int mrows, int mcols, int mdeps);
	void clearMat(Matrix* Mat);
	void set_param(int i[],char *size);
	void mat_set(Matrix* Mat,int l,float z);
	void mat_set_init(Matrix* Mat);
	float jacobi(int n,Matrix* M1,Matrix* M2,Matrix* M3,
	Matrix* M4,Matrix* M5,Matrix* M6,Matrix* M7);
	double second();

	__m256 omega;
	Matrix a,b,c,p,bnd,wrk1,wrk2;

	int
	main(int argc, char *argv[])
	{
	int nn;
	int imax,jmax,kmax,mimax,mjmax,mkmax,msize[3];
	float gosa,target;
	double cpu0,cpu1,cpu,xmflops2,score,flop;
	char size[10];

	if(argc == 2){
	strcpy(size,argv[1]);
	} else {
	printf("For example: \n");
	printf(" Grid-size= XS (32x32x64)\n");
	printf("\t S (64x64x128)\n");
	printf("\t M (128x128x256)\n");
	printf("\t L (256x256x512)\n");
	printf("\t XL (512x512x1024)\n\n");
	printf("Grid-size = ");
	scanf("%s",size);
	printf("\n");
	}

	omega=_mm256_set1_ps(0.8);
	set_param(msize,size);

	mimax= msize[0];
	mjmax= msize[1];
	mkmax= msize[2];
	imax= mimax-1;
	jmax= mjmax-1;
	kmax= mkmax-1;

	target = 60.0;

	printf("mimax = %d mjmax = %d mkmax = %d\n",mimax,mjmax,mkmax);
	printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax);

	/*
	* Initializing matrixes
	*/
	newMat(&p,1,mimax,mjmax,mkmax);
	newMat(&bnd,1,mimax,mjmax,mkmax);
	newMat(&wrk1,1,mimax,mjmax,mkmax);
	newMat(&wrk2,1,mimax,mjmax,mkmax);
	newMat(&a,4,mimax,mjmax,mkmax);
	newMat(&b,3,mimax,mjmax,mkmax);
	newMat(&c,3,mimax,mjmax,mkmax);

	mat_set_init(&p);
	mat_set(&bnd,0,1.0);
	mat_set(&wrk1,0,0.0);
	mat_set(&wrk2,0,0.0);
	mat_set(&a,0,1.0);
	mat_set(&a,1,1.0);
	mat_set(&a,2,1.0);
	mat_set(&a,3,1.0/6.0);
	mat_set(&b,0,0.0);
	mat_set(&b,1,0.0);
	mat_set(&b,2,0.0);
	mat_set(&c,0,1.0);
	mat_set(&c,1,1.0);
	mat_set(&c,2,1.0);

	/*
	* Start measuring
	*/
	nn= 3;
	printf(" Start rehearsal measurement process.\n");
	printf(" Measure the performance in %d times.\n\n",nn);

	cpu0= second();
	gosa= jacobi(nn,&a,&b,&c,&p,&bnd,&wrk1,&wrk2);
	cpu1= second();
	cpu= cpu1 - cpu0;
	flop = (double)(kmax-1)(double)(jmax-1)(double)(imax-1)*34.0;

	if(cpu != 0.0)
	xmflops2= flop/cpu1.e-6nn;

	printf(" MFLOPS: %f time(s): %f %e\n\n",xmflops2,cpu,gosa);

	nn= (int)(target/(cpu/3.0));

	printf(" Now, start the actual measurement process.\n");
	printf(" The loop will be excuted in %d times\n",nn);
	printf(" This will take about one minute.\n");
	printf(" Wait for a while\n\n");

	cpu0 = second();
	gosa = jacobi(nn,&a,&b,&c,&p,&bnd,&wrk1,&wrk2);
	cpu1 = second();
	cpu = cpu1 - cpu0;

	if(cpu != 0.0)
	xmflops2 = (double)flop/cpu1.0e-6nn;

	printf("cpu : %f sec.\n", cpu);
	printf("Loop executed for %d times\n",nn);
	printf("Gosa : %e \n",gosa);
	printf("MFLOPS measured : %f\n",xmflops2);
	score = xmflops2/82.84;
	printf("Score based on Pentium III 600MHz using Fortran 77: %f\n",score);

	/*
	* Matrix free
	*/
	clearMat(&p);
	clearMat(&bnd);
	clearMat(&wrk1);
	clearMat(&wrk2);
	clearMat(&a);
	clearMat(&b);
	clearMat(&c);

	return (0);
	}

	void
	set_param(int is[],char *size)
	{
	if(!strcmp(size,"XS") \|\| !strcmp(size,"xs")){
	is[0]= 32;
	is[1]= 32;
	is[2]= 64;
	return;
	}
	if(!strcmp(size,"S") \|\| !strcmp(size,"s")){
	is[0]= 64;
	is[1]= 64;
	is[2]= 128;
	return;
	}
	if(!strcmp(size,"M") \|\| !strcmp(size,"m")){
	is[0]= 128;
	is[1]= 128;
	is[2]= 256;
	return;
	}
	if(!strcmp(size,"L") \|\| !strcmp(size,"l")){
	is[0]= 256;
	is[1]= 256;
	is[2]= 512;
	return;
	}
	if(!strcmp(size,"Xl") \|\| !strcmp(size,"xl")){
	is[0]= 512;
	is[1]= 512;
	is[2]= 1024;
	return;
	}
	}

	int
	newMat(Matrix* Mat, int mnums,int mrows, int mcols, int mdeps)
	{
	Mat->mnums= mnums;
	Mat->mrows= mrows;
	Mat->mcols= mcols;
	Mat->mdeps= mdeps;
	Mat->m= NULL;
	Mat->m= (mat_t*)
	_mm_malloc((mnums * mrows * mcols * mdeps + 2 * 8) * sizeof(mat_t),
	8 * sizeof(mat_t)) + 8;
	if (Mat->m != NULL) {
	memset(Mat->m - 8, 0, (mnums * mrows * mcols * mdeps + 2 * 8) * sizeof(mat_t));
	}

	return(Mat->m != NULL) ? 1:0;
	}

	void
	clearMat(Matrix* Mat)
	{
	if(Mat->m)
	_mm_free(Mat->m - 8);
	Mat->m= NULL;
	Mat->mnums= 0;
	Mat->mcols= 0;
	Mat->mrows= 0;
	Mat->mdeps= 0;
	}

	void
	mat_set(Matrix* Mat, int l, float val)
	{
	int i,j,k;

	for(i=1; i<Mat->mrows-1; i++)
	for(j=1; j<Mat->mcols-1; j++)
	for(k=1; k<Mat->mdeps-1; k++)
	STORE1(Mat,l,i,j,k,val);
	}

	void
	mat_set_init(Matrix* Mat)
	{
	int i,j,k,l;
	float tt;

	for(i=0; i<Mat->mrows; i++)
	for(j=0; j<Mat->mcols; j++)
	for(k=0; k<Mat->mdeps; k++)
	STORE1(Mat,0,i,j,k,(float)(i*i)
	/(float)((Mat->mrows - 1)*(Mat->mrows - 1)));
	}

	float
	jacobi(int nn, Matrix* a,Matrix* b,Matrix* c,
	Matrix* p,Matrix* bnd,Matrix* wrk1,Matrix* wrk2)
	{
	int i,j,k,n,imax,jmax,kmax;
	float gosa;
	__m256 gosa1,s0,ss;

	imax= p->mrows-1;
	jmax= p->mcols-1;
	kmax= p->mdeps-1;
	#pragma omp parallel shared(a,p,b,c,bnd,wrk1,wrk2,nn,imax,jmax,kmax,omega,gosa) private(i,j,k,s0,ss,gosa1,n)
	{
	for(n=0 ; n<nn ; n++){
	#pragma omp barrier
	#pragma omp master
	{
	gosa = 0.0;
	}
	gosa1= _mm256_setzero_ps();
	#pragma omp for nowait
	for(i=1 ; i<imax; i++)
	for(j=1 ; j<jmax ; j++)
	for(k=0 ; k<=kmax ; k += 8){
	s0= _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(LOAD(a,0,i,j,k),LOAD(p,0,i+1,j, k)),
	_mm256_mul_ps(LOAD(a,1,i,j,k),LOAD(p,0,i, j+1,k))),
	_mm256_mul_ps(LOAD(a,2,i,j,k),LOADU(p,0,i, j, k+1))),
	_mm256_mul_ps(LOAD(b,0,i,j,k),
	_mm256_add_ps(_mm256_sub_ps(_mm256_sub_ps(LOAD(p,0,i+1,j+1,k),LOAD(p,0,i+1,j-1,k)),
	LOAD(p,0,i-1,j+1,k)),LOAD(p,0,i-1,j-1,k)))),
	_mm256_mul_ps(LOAD(b,1,i,j,k),
	_mm256_add_ps(_mm256_sub_ps(_mm256_sub_ps(LOADU(p,0,i,j+1,k+1),LOADU(p,0,i,j-1,k+1)),
	LOADU(p,0,i,j+1,k-1)),LOADU(p,0,i,j-1,k-1)))),
	_mm256_mul_ps(LOAD(b,2,i,j,k),
	_mm256_add_ps(_mm256_sub_ps(_mm256_sub_ps(LOADU(p,0,i+1,j,k+1),LOADU(p,0,i-1,j,k+1)),
	LOADU(p,0,i+1,j,k-1)),LOADU(p,0,i-1,j,k-1)))),
	_mm256_mul_ps(LOAD(c,0,i,j,k),LOAD(p,0,i-1,j, k))),
	_mm256_mul_ps(LOAD(c,1,i,j,k),LOAD(p,0,i, j-1,k))),
	_mm256_mul_ps(LOAD(c,2,i,j,k),LOADU(p,0,i, j, k-1))),
	LOAD(wrk1,0,i,j,k));

	ss= _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s0,LOAD(a,3,i,j,k)),LOAD(p,0,i,j,k)),LOAD(bnd,0,i,j,k));

	gosa1= _mm256_add_ps(gosa1,_mm256_mul_ps(ss,ss));

	STORE(wrk2,0,i,j,k,_mm256_add_ps(LOAD(p,0,i,j,k),_mm256_mul_ps(omega,ss)));
	}
	#pragma omp barrier
	#pragma omp for nowait
	for(i=1 ; i<imax ; i++)
	for(j=1 ; j<jmax ; j++)
	for(k=1 ; k<kmax ; k++)
	MR(p,0,i,j,k)= MR(wrk2,0,i,j,k);

	#pragma omp critical
	{
	float gosa2;
	gosa1=_mm256_hadd_ps(gosa1, gosa1);
	gosa1=_mm256_hadd_ps(gosa1, gosa1);
	_MM_EXTRACT_FLOAT(gosa2, _mm_add_ps(_mm256_extractf128_ps(gosa1, 0), _mm256_extractf128_ps(gosa1, 1)), 0);
	gosa+= gosa2;
	}
	} /* end n loop */
	}
	return(gosa);
	}

	double
	second()
	{

	struct timeval tm;
	double t ;

	static int base_sec = 0,base_usec = 0;

	gettimeofday(&tm, NULL);

	if(base_sec == 0 && base_usec == 0)
	{
	base_sec = tm.tv_sec;
	base_usec = tm.tv_usec;
	t = 0.0;
	} else {
	t = (double) (tm.tv_sec-base_sec) +
	((double) (tm.tv_usec-base_usec))/1.0e6 ;
	}

	return t ;
	}