elect-gombe/3tai.cpp

## 3tai.cpp
#include <omp.h>
#include <iostream>
#include <cmath>
#include <cstdlib>
#include <x86intrin.h>
#ifdef _USE_SDL
#include <SDL2/SDL.h>
#endif
const int w=1000;
const int h=1000;

unsigned char rgba[w*h*4];

/*max point number*/
#define N 4096*4

/*calculate gravity and potential energy*/
void gravity128(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){
#pragma omp parallel for
  for(int i=0;i<n;i++){
    //    p[i] = 0;
    //    ansx[i]=0;    ansy[i]=0;    ansz[i]=0;
    __m128 ofs=_mm_set1_ps(0.03125f*0.03125f*4.f); //softening
    __m128 ax,ay,az;
    ax = _mm_xor_ps(ax,ax);//ax=(0,0,0,0,0,0,0,0)
    ay = _mm_xor_ps(ay,ay);
    az = _mm_xor_ps(az,az);
    __m128 xi=_mm_broadcast_ss(&x[i]);
    __m128 yi=_mm_broadcast_ss(&y[i]);
    __m128 zi=_mm_broadcast_ss(&z[i]);
    for(int j=0;j<n;j+=4){
      __m128 xj=_mm_load_ps(&x[j]);
      __m128 yj=_mm_load_ps(&y[j]);
      __m128 zj=_mm_load_ps(&z[j]);
      //      double x_d=x[i]-x[j];
      //      double y_d=y[i]-y[j];
      //      double z_d=z[i]-z[j];
      __m128 dx=_mm_sub_ps(xi,xj);
      __m128 dy=_mm_sub_ps(yi,yj);
      __m128 dz=_mm_sub_ps(zi,zj);
      __m128 dot=_mm_mul_ps(dx,dx);
      dot=_mm_fmadd_ps(dy,dy,dot);
      dot=_mm_fmadd_ps(dz,dz,dot);
      dot=_mm_add_ps(ofs,dot);

      //      double r2=sqrt(1./(x_d*x_d+y_d*y_d+z_d*z_d+0.03125*0.03125));
      __m128 rsqdot = _mm_rsqrt_ps(dot);
      __m128 rdot = _mm_mul_ps(rsqdot,rsqdot);
      __m128 r2_3rddot = _mm_mul_ps(rsqdot,rdot);//r2*r2*r2
      __m128 mj=_mm_broadcast_ss(m+j);//load m[j]

      __m128 mj_r2_3rddot=_mm_mul_ps(mj,r2_3rddot);

      //      ansx[i] -= (m[j]*sqrt(r2*r2*r2))*x_d;
      //      ansy[i] -= (m[j]*sqrt(r2*r2*r2))*y_d;
      //      ansz[i] -= (m[j]*sqrt(r2*r2*r2))*z_d;

      ax=_mm_fmadd_ps(dx,mj_r2_3rddot,ax);
      ay=_mm_fmadd_ps(dy,mj_r2_3rddot,ay);
      az=_mm_fmadd_ps(dz,mj_r2_3rddot,az);
    }

    ansx[i]=0;
    ansy[i]=0;
    ansz[i]=0;
    for(int j=0;j<4;j++){
      ansx[i]-=ax[j];
      ansy[i]-=ay[j];
      ansz[i]-=az[j];
    }
  }
}

/*calculate gravity and potential energy*/
void gravity1(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){
#pragma omp parallel for
  for(int i=0;i<n;i++){
    //    p[i] = 0;
    //    ansx[i]=0;    ansy[i]=0;    ansz[i]=0;
    float ofs=(0.03125f*0.03125f); //softening
    float ax,ay,az;
    ax = 0;
    ay = 0;
    az = 0;
    for(int j=0;j<n;j++){
      float xj=*(&x[j]);
      float yj=*(&y[j]);
      float zj=*(&z[j]);
      //      double x_d=x[i]-x[j];
      //      double y_d=y[i]-y[j];
      //      double z_d=z[i]-z[j];
      float dx=(x[i]-xj);
      float dy=(y[i]-yj);
      float dz=(z[i]-zj);
      float dot=(dx*dx);
      dot=(dy*dy+dot);
      dot=(dz*dz+dot);
      dot=(ofs+dot);

      //      double r2=sqrt(1./(x_d*x_d+y_d*y_d+z_d*z_d+0.03125*0.03125));
      float rsqdot = 1.f/sqrtf(dot);
      float rdot = (rsqdot*rsqdot);
      float r2_3rddot = (rsqdot*rdot);//r2*r2*r2
      float mj=*(m+j);//load m[j]

      float mj_r2_3rddot=(mj,r2_3rddot);

      //      ansx[i] -= (m[j]*sqrt(r2*r2*r2))*x_d;
      //      ansy[i] -= (m[j]*sqrt(r2*r2*r2))*y_d;
      //      ansz[i] -= (m[j]*sqrt(r2*r2*r2))*z_d;

      ax=(dx*mj_r2_3rddot+ax);
      ay=(dy*mj_r2_3rddot+ay);
      az=(dz*mj_r2_3rddot+az);
    }

    ansx[i]=0;
    ansy[i]=0;
    ansz[i]=0;
    for(int j=0;j<1;j++){
      ansx[i]-=ax;
      ansy[i]-=ay;
      ansz[i]-=az;
    }
  }
}

/*calculate gravity and potential energy*/
void gravity(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){
#pragma omp parallel for
  for(int i=0;i<n;i++){
    //    p[i] = 0;
    //    ansx[i]=0;    ansy[i]=0;    ansz[i]=0;
    __m256 ofs=_mm256_set1_ps(0.03125f*0.03125f); //softening
    __m256 ax,ay,az;
    ax = _mm256_xor_ps(ax,ax);//ax=(0,0,0,0,0,0,0,0)
    ay = _mm256_xor_ps(ay,ay);
    az = _mm256_xor_ps(az,az);
    __m256 xi=_mm256_broadcast_ss(&x[i]);
    __m256 yi=_mm256_broadcast_ss(&y[i]);
    __m256 zi=_mm256_broadcast_ss(&z[i]);
    for(int j=0;j<n;j+=8){
      __m256 xj=_mm256_load_ps(&x[j]);
      __m256 yj=_mm256_load_ps(&y[j]);
      __m256 zj=_mm256_load_ps(&z[j]);
      //      double x_d=x[i]-x[j];
      //      double y_d=y[i]-y[j];
      //      double z_d=z[i]-z[j];
      __m256 dx=_mm256_sub_ps(xi,xj);
      __m256 dy=_mm256_sub_ps(yi,yj);
      __m256 dz=_mm256_sub_ps(zi,zj);
      __m256 dot=_mm256_mul_ps(dx,dx);
      dot=_mm256_fmadd_ps(dy,dy,dot);
      dot=_mm256_fmadd_ps(dz,dz,dot);
      dot=_mm256_add_ps(ofs,dot);

      //      double r2=sqrt(1./(x_d*x_d+y_d*y_d+z_d*z_d+0.03125*0.03125));
      __m256 rsqdot = _mm256_rsqrt_ps(dot);
      __m256 rdot = _mm256_mul_ps(rsqdot,rsqdot);
      __m256 r2_3rddot = _mm256_mul_ps(rsqdot,rdot);//r2*r2*r2
      __m256 mj=_mm256_broadcast_ss(m+j);//load m[j]

      __m256 mj_r2_3rddot=_mm256_mul_ps(mj,r2_3rddot);

      //      ansx[i] -= (m[j]*sqrt(r2*r2*r2))*x_d;
      //      ansy[i] -= (m[j]*sqrt(r2*r2*r2))*y_d;
      //      ansz[i] -= (m[j]*sqrt(r2*r2*r2))*z_d;

      ax=_mm256_fmadd_ps(dx,mj_r2_3rddot,ax);
      ay=_mm256_fmadd_ps(dy,mj_r2_3rddot,ay);
      az=_mm256_fmadd_ps(dz,mj_r2_3rddot,az);
    }

    ansx[i]=0;
    ansy[i]=0;
    ansz[i]=0;
    for(int j=0;j<8;j++){
      ansx[i]-=ax[j];
      ansy[i]-=ay[j];
      ansz[i]-=az[j];
    }
  }
}


void (*gsolver[3])(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n)={gravity1,gravity128,gravity};

float kx[5][N];
float lx[5][N];
float ky[5][N];
float ly[5][N];
float kz[5][N];
float lz[5][N];

/*runge-kutta 4th solver*/
void runge4( float x[],float y[],float z[],float vx[],float vy[],float vz[],float ax[],float ay[],float az[],float m[],float *p,const double dt,const int n,int solver){
  gsolver[solver](x,y,z,m,ax,ay,az,p,n);
  for(int j=0;j<n;j++){
    kx[1][j] = vx[j]*dt;
    lx[1][j] = ax[j]*dt;
    x[j] += kx[1][j]*(1./2.);//do
    ky[1][j] = vy[j]*dt;
    ly[1][j] = ay[j]*dt;
    y[j] += ky[1][j]*(1./2.);//do
    kz[1][j] = vz[j]*dt;
    lz[1][j] = az[j]*dt;
    z[j] += kz[1][j]*(1./2.);//do
  }
  gsolver[solver](x,y,z,m,ax,ay,az,p,n);
  for(int j=0;j<n;j++){
    x[j] -= kx[1][j]*(1./2.); //undo
    kx[2][j] = (vx[j]+lx[1][j]*0.5)*dt;
    lx[2][j] = (ax[j])*dt;
    x[j] += kx[2][j]*(1./2.);
    y[j] -= ky[1][j]*(1./2.); //undo
    ky[2][j] = (vy[j]+ly[1][j]*0.5)*dt;
    ly[2][j] = (ay[j])*dt;
    y[j] += ky[2][j]*(1./2.);
    z[j] -= kz[1][j]*(1./2.); //undo
    kz[2][j] = (vz[j]+lz[1][j]*0.5)*dt;
    lz[2][j] = (az[j])*dt;
    z[j] += kz[2][j]*(1./2.);
  }
  gsolver[solver](x,y,z,m,ax,ay,az,p,n);
  for(int j=0;j<n;j++){
    x[j] -= kx[2][j]*(1./2.); //undo
    kx[3][j] = (vx[j]+lx[2][j]*0.5)*dt;
    lx[3][j] = (ax[j])*dt;
    x[j] += kx[3][j];
    y[j] -= ky[2][j]*(1./2.); //undo
    ky[3][j] = (vy[j]+ly[2][j]*0.5)*dt;
    ly[3][j] = (ay[j])*dt;
    y[j] += ky[3][j];
    z[j] -= kz[2][j]*(1./2.); //undo
    kz[3][j] = (vz[j]+lz[2][j]*0.5)*dt;
    lz[3][j] = (az[j])*dt;
    z[j] += kz[3][j];
  }
  gsolver[solver](x,y,z,m,ax,ay,az,p,n);
  for(int j=0;j<n;j++){
    x[j] -= kx[3][j]; //undo
    kx[4][j] = (vx[j]+lx[3][j])*dt;
    lx[4][j] = (ax[j])*dt;
    kx[0][j] = (kx[1][j]+2*kx[2][j]+2*kx[3][j]+kx[4][j])*(1./6.);
    lx[0][j] = (lx[1][j]+2*lx[2][j]+2*lx[3][j]+lx[4][j])*(1./6.);
    x[j] += kx[0][j];
    vx[j] += lx[0][j];
    y[j] -= ky[3][j]; //undo
    ky[4][j] = (vy[j]+ly[3][j])*dt;
    ly[4][j] = (ay[j])*dt;
    ky[0][j] = (ky[1][j]+2*ky[2][j]+2*ky[3][j]+ky[4][j])*(1./6.);
    ly[0][j] = (ly[1][j]+2*ly[2][j]+2*ly[3][j]+ly[4][j])*(1./6.);
    y[j] += ky[0][j];
    vy[j] += ly[0][j];
    z[j] -= kz[3][j]; //undo
    kz[4][j] = (vz[j]+lz[3][j])*dt;
    lz[4][j] = (az[j])*dt;
    kz[0][j] = (kz[1][j]+2*kz[2][j]+2*kz[3][j]+kz[4][j])*(1./6.);
    lz[0][j] = (lz[1][j]+2*lz[2][j]+2*lz[3][j]+lz[4][j])*(1./6.);
    z[j] += kz[0][j];
    vz[j] += lz[0][j];
  }
}


double GetTimeStamp() {
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return (ts.tv_sec*1000000.+ts.tv_nsec/1000.);
}
float __attribute__((aligned(32))) x[N], vx[N], ax[N];
float __attribute__((aligned(32))) y[N], vy[N], ay[N];
float __attribute__((aligned(32))) z[N], vz[N], az[N];
float __attribute__((aligned(32))) m[N], p[N];

int main(int argc,const char **argv){
  int n,method;
#ifdef _USE_SDL
  SDL_Init(SDL_INIT_VIDEO);
  SDL_Window* sdlWindow;
  SDL_Event	ev;
  SDL_Texture *sdlTexture;
  SDL_Renderer *sdlRenderer;
  SDL_CreateWindowAndRenderer(w, h, SDL_WINDOW_OPENGL, &sdlWindow, &sdlRenderer);
  sdlTexture = SDL_CreateTexture(sdlRenderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_TARGET, w, h);
  //  SDL_Delay(5000);
#endif


  double dt,t=0.,tend,e0;
  int solver,numcore;

  //  std::cin >> n >> dt >> tend;
  if(argc!=4)std::cerr<<"usage:"<<std::endl<<argv[0]<<" <BODY COUNT> <SOLVER TYPE> <CORE COUNT>"<<std::endl;
  sscanf(argv[1],"%d",&n);
  sscanf(argv[2],"%d",&solver);
  sscanf(argv[3],"%d",&numcore);

  omp_set_num_threads(numcore);

  dt=0.2;
  tend=5;
  double msum;
  for(int i=0; i<n;i++){
    m[i]=((double)rand())/RAND_MAX+0.1;
    msum+=m[i];
    double r=((double)rand()+0.01)/RAND_MAX;
    r*=r;
    double uz =((double)rand())/RAND_MAX;
    double ruz = sqrt(1.-uz*uz);
    double phi =M_PI-(2.*M_PI*(double)rand())/RAND_MAX;
    x[i]=r*ruz*sin(phi);
    z[i]=r*uz;
    y[i]=r*ruz*cos(phi);//
    //  std::cin >> m[i]>>x[i]>>y[i]>>z[i]>>vx[i]>>vy[i]>>vz[i];
  }
  for(int i=0; i<n;i++){
    m[i]/=msum;
  }

  while(t<tend){
    for(int i=0;i<w*h;i++){
      rgba[i*4+3]=rgba[i*4+3]*0.9;
      rgba[i*4+2]=rgba[i*4+2]*0.9;
      rgba[i*4+1]=rgba[i*4+1]*0.9;
    }
    double mindis=dt;//at least dt
    static double prtime;
    double rtime;
    rtime = GetTimeStamp();
    //    std::cerr<<1000000./(rtime-prtime);
    prtime = rtime;

    runge4(x,y,z,vx,vy,vz,ax,ay,az,m,p,dt,n,solver);
    t+=dt;
    static int count=0;
    //    std::cout << t;
    int len=0;
    for(int i=0;i<n;i++){
#ifndef _USE_SDL
      printf("%5f\t%5f\t%5f\n",x[i],y[i],z[i]);//better performance compared to std::cout
#else
      int idx=-1;
      int px=(x[i])*(w)+w/2,py=(y[i])*(h)+h/2;
      if(px<0||px>=w||py<0||py>=h)continue;
      idx=(px+py*w)*4;
      int r=64;
      int g=64;
      int b=64;
      rgba[idx+3]+=r;rgba[idx+2]+=g;rgba[idx+1]+=b;rgba[idx+0]=255;
      if(rgba[idx+3]>=190)rgba[idx+3]=190;
      if(rgba[idx+2]>=190)rgba[idx+2]=190;
      if(rgba[idx+1]>=190)rgba[idx+1]=190;
#endif
    }
#ifndef _USE_SDL
        puts("");
#else
       SDL_UpdateTexture(sdlTexture, NULL, rgba, w*4);
    {
      SDL_Rect dstrect;
      dstrect.x = 0;
      dstrect.y = 0;
      dstrect.w = w;
      dstrect.h = h;
      SDL_RenderCopy(sdlRenderer, sdlTexture, NULL, &dstrect);
    }
    SDL_RenderPresent(sdlRenderer);
    SDL_Delay(20);
    for(int i=0;i<64;i++){
      if(SDL_PollEvent(&ev)){
	if(ev.type == SDL_QUIT)	{
	  break;
	}
      }
    }
    if(ev.type == SDL_QUIT)	{
      break;
    }
#endif
    //    std::cerr<<"fps"<<std::endl;
  }

#ifdef _USE_SDL
  SDL_Quit();
#endif
  return 0;
}
	#include <omp.h>
	#include <iostream>
	#include <cmath>
	#include <cstdlib>
	#include <x86intrin.h>
	#ifdef _USE_SDL
	#include <SDL2/SDL.h>
	#endif
	const int w=1000;
	const int h=1000;

	unsigned char rgba[wh4];

	/max point number/
	#define N 4096*4

	/calculate gravity and potential energy/
	void gravity128(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){
	#pragma omp parallel for
	for(int i=0;i<n;i++){
	// p[i] = 0;
	// ansx[i]=0; ansy[i]=0; ansz[i]=0;
	__m128 ofs=_mm_set1_ps(0.03125f0.03125f4.f); //softening
	__m128 ax,ay,az;
	ax = _mm_xor_ps(ax,ax);//ax=(0,0,0,0,0,0,0,0)
	ay = _mm_xor_ps(ay,ay);
	az = _mm_xor_ps(az,az);
	__m128 xi=_mm_broadcast_ss(&x[i]);
	__m128 yi=_mm_broadcast_ss(&y[i]);
	__m128 zi=_mm_broadcast_ss(&z[i]);
	for(int j=0;j<n;j+=4){
	__m128 xj=_mm_load_ps(&x[j]);
	__m128 yj=_mm_load_ps(&y[j]);
	__m128 zj=_mm_load_ps(&z[j]);
	// double x_d=x[i]-x[j];
	// double y_d=y[i]-y[j];
	// double z_d=z[i]-z[j];
	__m128 dx=_mm_sub_ps(xi,xj);
	__m128 dy=_mm_sub_ps(yi,yj);
	__m128 dz=_mm_sub_ps(zi,zj);
	__m128 dot=_mm_mul_ps(dx,dx);
	dot=_mm_fmadd_ps(dy,dy,dot);
	dot=_mm_fmadd_ps(dz,dz,dot);
	dot=_mm_add_ps(ofs,dot);

	// double r2=sqrt(1./(x_dx_d+y_dy_d+z_dz_d+0.031250.03125));
	__m128 rsqdot = _mm_rsqrt_ps(dot);
	__m128 rdot = _mm_mul_ps(rsqdot,rsqdot);
	__m128 r2_3rddot = _mm_mul_ps(rsqdot,rdot);//r2r2r2
	__m128 mj=_mm_broadcast_ss(m+j);//load m[j]

	__m128 mj_r2_3rddot=_mm_mul_ps(mj,r2_3rddot);

	// ansx[i] -= (m[j]sqrt(r2r2r2))x_d;
	// ansy[i] -= (m[j]sqrt(r2r2r2))y_d;
	// ansz[i] -= (m[j]sqrt(r2r2r2))z_d;

	ax=_mm_fmadd_ps(dx,mj_r2_3rddot,ax);
	ay=_mm_fmadd_ps(dy,mj_r2_3rddot,ay);
	az=_mm_fmadd_ps(dz,mj_r2_3rddot,az);
	}

	ansx[i]=0;
	ansy[i]=0;
	ansz[i]=0;
	for(int j=0;j<4;j++){
	ansx[i]-=ax[j];
	ansy[i]-=ay[j];
	ansz[i]-=az[j];
	}
	}
	}

	/calculate gravity and potential energy/
	void gravity1(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){
	#pragma omp parallel for
	for(int i=0;i<n;i++){
	// p[i] = 0;
	// ansx[i]=0; ansy[i]=0; ansz[i]=0;
	float ofs=(0.03125f*0.03125f); //softening
	float ax,ay,az;
	ax = 0;
	ay = 0;
	az = 0;
	for(int j=0;j<n;j++){
	float xj=*(&x[j]);
	float yj=*(&y[j]);
	float zj=*(&z[j]);
	// double x_d=x[i]-x[j];
	// double y_d=y[i]-y[j];
	// double z_d=z[i]-z[j];
	float dx=(x[i]-xj);
	float dy=(y[i]-yj);
	float dz=(z[i]-zj);
	float dot=(dx*dx);
	dot=(dy*dy+dot);
	dot=(dz*dz+dot);
	dot=(ofs+dot);

	// double r2=sqrt(1./(x_dx_d+y_dy_d+z_dz_d+0.031250.03125));
	float rsqdot = 1.f/sqrtf(dot);
	float rdot = (rsqdot*rsqdot);
	float r2_3rddot = (rsqdotrdot);//r2r2*r2
	float mj=*(m+j);//load m[j]

	float mj_r2_3rddot=(mj,r2_3rddot);

	// ansx[i] -= (m[j]sqrt(r2r2r2))x_d;
	// ansy[i] -= (m[j]sqrt(r2r2r2))y_d;
	// ansz[i] -= (m[j]sqrt(r2r2r2))z_d;

	ax=(dx*mj_r2_3rddot+ax);
	ay=(dy*mj_r2_3rddot+ay);
	az=(dz*mj_r2_3rddot+az);
	}

	ansx[i]=0;
	ansy[i]=0;
	ansz[i]=0;
	for(int j=0;j<1;j++){
	ansx[i]-=ax;
	ansy[i]-=ay;
	ansz[i]-=az;
	}
	}
	}

	/calculate gravity and potential energy/
	void gravity(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){
	#pragma omp parallel for
	for(int i=0;i<n;i++){
	// p[i] = 0;
	// ansx[i]=0; ansy[i]=0; ansz[i]=0;
	__m256 ofs=_mm256_set1_ps(0.03125f*0.03125f); //softening
	__m256 ax,ay,az;
	ax = _mm256_xor_ps(ax,ax);//ax=(0,0,0,0,0,0,0,0)
	ay = _mm256_xor_ps(ay,ay);
	az = _mm256_xor_ps(az,az);
	__m256 xi=_mm256_broadcast_ss(&x[i]);
	__m256 yi=_mm256_broadcast_ss(&y[i]);
	__m256 zi=_mm256_broadcast_ss(&z[i]);
	for(int j=0;j<n;j+=8){
	__m256 xj=_mm256_load_ps(&x[j]);
	__m256 yj=_mm256_load_ps(&y[j]);
	__m256 zj=_mm256_load_ps(&z[j]);
	// double x_d=x[i]-x[j];
	// double y_d=y[i]-y[j];
	// double z_d=z[i]-z[j];
	__m256 dx=_mm256_sub_ps(xi,xj);
	__m256 dy=_mm256_sub_ps(yi,yj);
	__m256 dz=_mm256_sub_ps(zi,zj);
	__m256 dot=_mm256_mul_ps(dx,dx);
	dot=_mm256_fmadd_ps(dy,dy,dot);
	dot=_mm256_fmadd_ps(dz,dz,dot);
	dot=_mm256_add_ps(ofs,dot);

	// double r2=sqrt(1./(x_dx_d+y_dy_d+z_dz_d+0.031250.03125));
	__m256 rsqdot = _mm256_rsqrt_ps(dot);
	__m256 rdot = _mm256_mul_ps(rsqdot,rsqdot);
	__m256 r2_3rddot = _mm256_mul_ps(rsqdot,rdot);//r2r2r2
	__m256 mj=_mm256_broadcast_ss(m+j);//load m[j]

	__m256 mj_r2_3rddot=_mm256_mul_ps(mj,r2_3rddot);

	// ansx[i] -= (m[j]sqrt(r2r2r2))x_d;
	// ansy[i] -= (m[j]sqrt(r2r2r2))y_d;
	// ansz[i] -= (m[j]sqrt(r2r2r2))z_d;

	ax=_mm256_fmadd_ps(dx,mj_r2_3rddot,ax);
	ay=_mm256_fmadd_ps(dy,mj_r2_3rddot,ay);
	az=_mm256_fmadd_ps(dz,mj_r2_3rddot,az);
	}

	ansx[i]=0;
	ansy[i]=0;
	ansz[i]=0;
	for(int j=0;j<8;j++){
	ansx[i]-=ax[j];
	ansy[i]-=ay[j];
	ansz[i]-=az[j];
	}
	}
	}


	void (*gsolver[3])(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n)={gravity1,gravity128,gravity};

	float kx[5][N];
	float lx[5][N];
	float ky[5][N];
	float ly[5][N];
	float kz[5][N];
	float lz[5][N];

	/runge-kutta 4th solver/
	void runge4( float x[],float y[],float z[],float vx[],float vy[],float vz[],float ax[],float ay[],float az[],float m[],float *p,const double dt,const int n,int solver){
	gsolver[solver](x,y,z,m,ax,ay,az,p,n);
	for(int j=0;j<n;j++){
	kx[1][j] = vx[j]*dt;
	lx[1][j] = ax[j]*dt;
	x[j] += kx[1][j]*(1./2.);//do
	ky[1][j] = vy[j]*dt;
	ly[1][j] = ay[j]*dt;
	y[j] += ky[1][j]*(1./2.);//do
	kz[1][j] = vz[j]*dt;
	lz[1][j] = az[j]*dt;
	z[j] += kz[1][j]*(1./2.);//do
	}
	gsolver[solver](x,y,z,m,ax,ay,az,p,n);
	for(int j=0;j<n;j++){
	x[j] -= kx[1][j]*(1./2.); //undo
	kx[2][j] = (vx[j]+lx[1][j]0.5)dt;
	lx[2][j] = (ax[j])*dt;
	x[j] += kx[2][j]*(1./2.);
	y[j] -= ky[1][j]*(1./2.); //undo
	ky[2][j] = (vy[j]+ly[1][j]0.5)dt;
	ly[2][j] = (ay[j])*dt;
	y[j] += ky[2][j]*(1./2.);
	z[j] -= kz[1][j]*(1./2.); //undo
	kz[2][j] = (vz[j]+lz[1][j]0.5)dt;
	lz[2][j] = (az[j])*dt;
	z[j] += kz[2][j]*(1./2.);
	}
	gsolver[solver](x,y,z,m,ax,ay,az,p,n);
	for(int j=0;j<n;j++){
	x[j] -= kx[2][j]*(1./2.); //undo
	kx[3][j] = (vx[j]+lx[2][j]0.5)dt;
	lx[3][j] = (ax[j])*dt;
	x[j] += kx[3][j];
	y[j] -= ky[2][j]*(1./2.); //undo
	ky[3][j] = (vy[j]+ly[2][j]0.5)dt;
	ly[3][j] = (ay[j])*dt;
	y[j] += ky[3][j];
	z[j] -= kz[2][j]*(1./2.); //undo
	kz[3][j] = (vz[j]+lz[2][j]0.5)dt;
	lz[3][j] = (az[j])*dt;
	z[j] += kz[3][j];
	}
	gsolver[solver](x,y,z,m,ax,ay,az,p,n);
	for(int j=0;j<n;j++){
	x[j] -= kx[3][j]; //undo
	kx[4][j] = (vx[j]+lx[3][j])*dt;
	lx[4][j] = (ax[j])*dt;
	kx[0][j] = (kx[1][j]+2kx[2][j]+2kx[3][j]+kx[4][j])*(1./6.);
	lx[0][j] = (lx[1][j]+2lx[2][j]+2lx[3][j]+lx[4][j])*(1./6.);
	x[j] += kx[0][j];
	vx[j] += lx[0][j];
	y[j] -= ky[3][j]; //undo
	ky[4][j] = (vy[j]+ly[3][j])*dt;
	ly[4][j] = (ay[j])*dt;
	ky[0][j] = (ky[1][j]+2ky[2][j]+2ky[3][j]+ky[4][j])*(1./6.);
	ly[0][j] = (ly[1][j]+2ly[2][j]+2ly[3][j]+ly[4][j])*(1./6.);
	y[j] += ky[0][j];
	vy[j] += ly[0][j];
	z[j] -= kz[3][j]; //undo
	kz[4][j] = (vz[j]+lz[3][j])*dt;
	lz[4][j] = (az[j])*dt;
	kz[0][j] = (kz[1][j]+2kz[2][j]+2kz[3][j]+kz[4][j])*(1./6.);
	lz[0][j] = (lz[1][j]+2lz[2][j]+2lz[3][j]+lz[4][j])*(1./6.);
	z[j] += kz[0][j];
	vz[j] += lz[0][j];
	}
	}


	double GetTimeStamp() {
	struct timespec ts;
	clock_gettime(CLOCK_REALTIME, &ts);
	return (ts.tv_sec*1000000.+ts.tv_nsec/1000.);
	}
	float __attribute__((aligned(32))) x[N], vx[N], ax[N];
	float __attribute__((aligned(32))) y[N], vy[N], ay[N];
	float __attribute__((aligned(32))) z[N], vz[N], az[N];
	float __attribute__((aligned(32))) m[N], p[N];

	int main(int argc,const char **argv){
	int n,method;
	#ifdef _USE_SDL
	SDL_Init(SDL_INIT_VIDEO);
	SDL_Window* sdlWindow;
	SDL_Event ev;
	SDL_Texture *sdlTexture;
	SDL_Renderer *sdlRenderer;
	SDL_CreateWindowAndRenderer(w, h, SDL_WINDOW_OPENGL, &sdlWindow, &sdlRenderer);
	sdlTexture = SDL_CreateTexture(sdlRenderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_TARGET, w, h);
	// SDL_Delay(5000);
	#endif



	double dt,t=0.,tend,e0;
	int solver,numcore;

	// std::cin >> n >> dt >> tend;
	if(argc!=4)std::cerr<<"usage:"<<std::endl<<argv[0]<<" <BODY COUNT> <SOLVER TYPE> <CORE COUNT>"<<std::endl;
	sscanf(argv[1],"%d",&n);
	sscanf(argv[2],"%d",&solver);
	sscanf(argv[3],"%d",&numcore);

	omp_set_num_threads(numcore);

	dt=0.2;
	tend=5;
	double msum;
	for(int i=0; i<n;i++){
	m[i]=((double)rand())/RAND_MAX+0.1;
	msum+=m[i];
	double r=((double)rand()+0.01)/RAND_MAX;
	r*=r;
	double uz =((double)rand())/RAND_MAX;
	double ruz = sqrt(1.-uz*uz);
	double phi =M_PI-(2.M_PI(double)rand())/RAND_MAX;
	x[i]=rruzsin(phi);
	z[i]=r*uz;
	y[i]=rruzcos(phi);//
	// std::cin >> m[i]>>x[i]>>y[i]>>z[i]>>vx[i]>>vy[i]>>vz[i];
	}
	for(int i=0; i<n;i++){
	m[i]/=msum;
	}

	while(t<tend){
	for(int i=0;i<w*h;i++){
	rgba[i4+3]=rgba[i4+3]*0.9;
	rgba[i4+2]=rgba[i4+2]*0.9;
	rgba[i4+1]=rgba[i4+1]*0.9;
	}
	double mindis=dt;//at least dt
	static double prtime;
	double rtime;
	rtime = GetTimeStamp();
	// std::cerr<<1000000./(rtime-prtime);
	prtime = rtime;

	runge4(x,y,z,vx,vy,vz,ax,ay,az,m,p,dt,n,solver);
	t+=dt;
	static int count=0;
	// std::cout << t;
	int len=0;
	for(int i=0;i<n;i++){
	#ifndef _USE_SDL
	printf("%5f\t%5f\t%5f\n",x[i],y[i],z[i]);//better performance compared to std::cout
	#else
	int idx=-1;
	int px=(x[i])(w)+w/2,py=(y[i])(h)+h/2;
	if(px<0\|\|px>=w\|\|py<0\|\|py>=h)continue;
	idx=(px+pyw)4;
	int r=64;
	int g=64;
	int b=64;
	rgba[idx+3]+=r;rgba[idx+2]+=g;rgba[idx+1]+=b;rgba[idx+0]=255;
	if(rgba[idx+3]>=190)rgba[idx+3]=190;
	if(rgba[idx+2]>=190)rgba[idx+2]=190;
	if(rgba[idx+1]>=190)rgba[idx+1]=190;
	#endif
	}
	#ifndef _USE_SDL
	puts("");
	#else
	SDL_UpdateTexture(sdlTexture, NULL, rgba, w*4);
	{
	SDL_Rect dstrect;
	dstrect.x = 0;
	dstrect.y = 0;
	dstrect.w = w;
	dstrect.h = h;
	SDL_RenderCopy(sdlRenderer, sdlTexture, NULL, &dstrect);
	}
	SDL_RenderPresent(sdlRenderer);
	SDL_Delay(20);
	for(int i=0;i<64;i++){
	if(SDL_PollEvent(&ev)){
	if(ev.type == SDL_QUIT) {
	break;
	}
	}
	}
	if(ev.type == SDL_QUIT) {
	break;
	}
	#endif
	// std::cerr<<"fps"<<std::endl;
	}

	#ifdef _USE_SDL
	SDL_Quit();
	#endif
	return 0;
	}