Last active
February 15, 2020 02:32
-
-
Save elect-gombe/24a8664bcd124a259d1fc4c86fb55326 to your computer and use it in GitHub Desktop.
multi-body simulation using SIMD or SISD instruction modes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <omp.h> | |
#include <iostream> | |
#include <cmath> | |
#include <cstdlib> | |
#include <x86intrin.h> | |
#ifdef _USE_SDL | |
#include <SDL2/SDL.h> | |
#endif | |
const int w=1000; | |
const int h=1000; | |
unsigned char rgba[w*h*4]; | |
/*max point number*/ | |
#define N 4096*4 | |
/*calculate gravity and potential energy*/ | |
void gravity128(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){ | |
#pragma omp parallel for | |
for(int i=0;i<n;i++){ | |
// p[i] = 0; | |
// ansx[i]=0; ansy[i]=0; ansz[i]=0; | |
__m128 ofs=_mm_set1_ps(0.03125f*0.03125f*4.f); //softening | |
__m128 ax,ay,az; | |
ax = _mm_xor_ps(ax,ax);//ax=(0,0,0,0,0,0,0,0) | |
ay = _mm_xor_ps(ay,ay); | |
az = _mm_xor_ps(az,az); | |
__m128 xi=_mm_broadcast_ss(&x[i]); | |
__m128 yi=_mm_broadcast_ss(&y[i]); | |
__m128 zi=_mm_broadcast_ss(&z[i]); | |
for(int j=0;j<n;j+=4){ | |
__m128 xj=_mm_load_ps(&x[j]); | |
__m128 yj=_mm_load_ps(&y[j]); | |
__m128 zj=_mm_load_ps(&z[j]); | |
// double x_d=x[i]-x[j]; | |
// double y_d=y[i]-y[j]; | |
// double z_d=z[i]-z[j]; | |
__m128 dx=_mm_sub_ps(xi,xj); | |
__m128 dy=_mm_sub_ps(yi,yj); | |
__m128 dz=_mm_sub_ps(zi,zj); | |
__m128 dot=_mm_mul_ps(dx,dx); | |
dot=_mm_fmadd_ps(dy,dy,dot); | |
dot=_mm_fmadd_ps(dz,dz,dot); | |
dot=_mm_add_ps(ofs,dot); | |
// double r2=sqrt(1./(x_d*x_d+y_d*y_d+z_d*z_d+0.03125*0.03125)); | |
__m128 rsqdot = _mm_rsqrt_ps(dot); | |
__m128 rdot = _mm_mul_ps(rsqdot,rsqdot); | |
__m128 r2_3rddot = _mm_mul_ps(rsqdot,rdot);//r2*r2*r2 | |
__m128 mj=_mm_broadcast_ss(m+j);//load m[j] | |
__m128 mj_r2_3rddot=_mm_mul_ps(mj,r2_3rddot); | |
// ansx[i] -= (m[j]*sqrt(r2*r2*r2))*x_d; | |
// ansy[i] -= (m[j]*sqrt(r2*r2*r2))*y_d; | |
// ansz[i] -= (m[j]*sqrt(r2*r2*r2))*z_d; | |
ax=_mm_fmadd_ps(dx,mj_r2_3rddot,ax); | |
ay=_mm_fmadd_ps(dy,mj_r2_3rddot,ay); | |
az=_mm_fmadd_ps(dz,mj_r2_3rddot,az); | |
} | |
ansx[i]=0; | |
ansy[i]=0; | |
ansz[i]=0; | |
for(int j=0;j<4;j++){ | |
ansx[i]-=ax[j]; | |
ansy[i]-=ay[j]; | |
ansz[i]-=az[j]; | |
} | |
} | |
} | |
/*calculate gravity and potential energy*/ | |
void gravity1(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){ | |
#pragma omp parallel for | |
for(int i=0;i<n;i++){ | |
// p[i] = 0; | |
// ansx[i]=0; ansy[i]=0; ansz[i]=0; | |
float ofs=(0.03125f*0.03125f); //softening | |
float ax,ay,az; | |
ax = 0; | |
ay = 0; | |
az = 0; | |
for(int j=0;j<n;j++){ | |
float xj=*(&x[j]); | |
float yj=*(&y[j]); | |
float zj=*(&z[j]); | |
// double x_d=x[i]-x[j]; | |
// double y_d=y[i]-y[j]; | |
// double z_d=z[i]-z[j]; | |
float dx=(x[i]-xj); | |
float dy=(y[i]-yj); | |
float dz=(z[i]-zj); | |
float dot=(dx*dx); | |
dot=(dy*dy+dot); | |
dot=(dz*dz+dot); | |
dot=(ofs+dot); | |
// double r2=sqrt(1./(x_d*x_d+y_d*y_d+z_d*z_d+0.03125*0.03125)); | |
float rsqdot = 1.f/sqrtf(dot); | |
float rdot = (rsqdot*rsqdot); | |
float r2_3rddot = (rsqdot*rdot);//r2*r2*r2 | |
float mj=*(m+j);//load m[j] | |
float mj_r2_3rddot=(mj,r2_3rddot); | |
// ansx[i] -= (m[j]*sqrt(r2*r2*r2))*x_d; | |
// ansy[i] -= (m[j]*sqrt(r2*r2*r2))*y_d; | |
// ansz[i] -= (m[j]*sqrt(r2*r2*r2))*z_d; | |
ax=(dx*mj_r2_3rddot+ax); | |
ay=(dy*mj_r2_3rddot+ay); | |
az=(dz*mj_r2_3rddot+az); | |
} | |
ansx[i]=0; | |
ansy[i]=0; | |
ansz[i]=0; | |
for(int j=0;j<1;j++){ | |
ansx[i]-=ax; | |
ansy[i]-=ay; | |
ansz[i]-=az; | |
} | |
} | |
} | |
/*calculate gravity and potential energy*/ | |
void gravity(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){ | |
#pragma omp parallel for | |
for(int i=0;i<n;i++){ | |
// p[i] = 0; | |
// ansx[i]=0; ansy[i]=0; ansz[i]=0; | |
__m256 ofs=_mm256_set1_ps(0.03125f*0.03125f); //softening | |
__m256 ax,ay,az; | |
ax = _mm256_xor_ps(ax,ax);//ax=(0,0,0,0,0,0,0,0) | |
ay = _mm256_xor_ps(ay,ay); | |
az = _mm256_xor_ps(az,az); | |
__m256 xi=_mm256_broadcast_ss(&x[i]); | |
__m256 yi=_mm256_broadcast_ss(&y[i]); | |
__m256 zi=_mm256_broadcast_ss(&z[i]); | |
for(int j=0;j<n;j+=8){ | |
__m256 xj=_mm256_load_ps(&x[j]); | |
__m256 yj=_mm256_load_ps(&y[j]); | |
__m256 zj=_mm256_load_ps(&z[j]); | |
// double x_d=x[i]-x[j]; | |
// double y_d=y[i]-y[j]; | |
// double z_d=z[i]-z[j]; | |
__m256 dx=_mm256_sub_ps(xi,xj); | |
__m256 dy=_mm256_sub_ps(yi,yj); | |
__m256 dz=_mm256_sub_ps(zi,zj); | |
__m256 dot=_mm256_mul_ps(dx,dx); | |
dot=_mm256_fmadd_ps(dy,dy,dot); | |
dot=_mm256_fmadd_ps(dz,dz,dot); | |
dot=_mm256_add_ps(ofs,dot); | |
// double r2=sqrt(1./(x_d*x_d+y_d*y_d+z_d*z_d+0.03125*0.03125)); | |
__m256 rsqdot = _mm256_rsqrt_ps(dot); | |
__m256 rdot = _mm256_mul_ps(rsqdot,rsqdot); | |
__m256 r2_3rddot = _mm256_mul_ps(rsqdot,rdot);//r2*r2*r2 | |
__m256 mj=_mm256_broadcast_ss(m+j);//load m[j] | |
__m256 mj_r2_3rddot=_mm256_mul_ps(mj,r2_3rddot); | |
// ansx[i] -= (m[j]*sqrt(r2*r2*r2))*x_d; | |
// ansy[i] -= (m[j]*sqrt(r2*r2*r2))*y_d; | |
// ansz[i] -= (m[j]*sqrt(r2*r2*r2))*z_d; | |
ax=_mm256_fmadd_ps(dx,mj_r2_3rddot,ax); | |
ay=_mm256_fmadd_ps(dy,mj_r2_3rddot,ay); | |
az=_mm256_fmadd_ps(dz,mj_r2_3rddot,az); | |
} | |
ansx[i]=0; | |
ansy[i]=0; | |
ansz[i]=0; | |
for(int j=0;j<8;j++){ | |
ansx[i]-=ax[j]; | |
ansy[i]-=ay[j]; | |
ansz[i]-=az[j]; | |
} | |
} | |
} | |
void (*gsolver[3])(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n)={gravity1,gravity128,gravity}; | |
float kx[5][N]; | |
float lx[5][N]; | |
float ky[5][N]; | |
float ly[5][N]; | |
float kz[5][N]; | |
float lz[5][N]; | |
/*runge-kutta 4th solver*/ | |
void runge4( float x[],float y[],float z[],float vx[],float vy[],float vz[],float ax[],float ay[],float az[],float m[],float *p,const double dt,const int n,int solver){ | |
gsolver[solver](x,y,z,m,ax,ay,az,p,n); | |
for(int j=0;j<n;j++){ | |
kx[1][j] = vx[j]*dt; | |
lx[1][j] = ax[j]*dt; | |
x[j] += kx[1][j]*(1./2.);//do | |
ky[1][j] = vy[j]*dt; | |
ly[1][j] = ay[j]*dt; | |
y[j] += ky[1][j]*(1./2.);//do | |
kz[1][j] = vz[j]*dt; | |
lz[1][j] = az[j]*dt; | |
z[j] += kz[1][j]*(1./2.);//do | |
} | |
gsolver[solver](x,y,z,m,ax,ay,az,p,n); | |
for(int j=0;j<n;j++){ | |
x[j] -= kx[1][j]*(1./2.); //undo | |
kx[2][j] = (vx[j]+lx[1][j]*0.5)*dt; | |
lx[2][j] = (ax[j])*dt; | |
x[j] += kx[2][j]*(1./2.); | |
y[j] -= ky[1][j]*(1./2.); //undo | |
ky[2][j] = (vy[j]+ly[1][j]*0.5)*dt; | |
ly[2][j] = (ay[j])*dt; | |
y[j] += ky[2][j]*(1./2.); | |
z[j] -= kz[1][j]*(1./2.); //undo | |
kz[2][j] = (vz[j]+lz[1][j]*0.5)*dt; | |
lz[2][j] = (az[j])*dt; | |
z[j] += kz[2][j]*(1./2.); | |
} | |
gsolver[solver](x,y,z,m,ax,ay,az,p,n); | |
for(int j=0;j<n;j++){ | |
x[j] -= kx[2][j]*(1./2.); //undo | |
kx[3][j] = (vx[j]+lx[2][j]*0.5)*dt; | |
lx[3][j] = (ax[j])*dt; | |
x[j] += kx[3][j]; | |
y[j] -= ky[2][j]*(1./2.); //undo | |
ky[3][j] = (vy[j]+ly[2][j]*0.5)*dt; | |
ly[3][j] = (ay[j])*dt; | |
y[j] += ky[3][j]; | |
z[j] -= kz[2][j]*(1./2.); //undo | |
kz[3][j] = (vz[j]+lz[2][j]*0.5)*dt; | |
lz[3][j] = (az[j])*dt; | |
z[j] += kz[3][j]; | |
} | |
gsolver[solver](x,y,z,m,ax,ay,az,p,n); | |
for(int j=0;j<n;j++){ | |
x[j] -= kx[3][j]; //undo | |
kx[4][j] = (vx[j]+lx[3][j])*dt; | |
lx[4][j] = (ax[j])*dt; | |
kx[0][j] = (kx[1][j]+2*kx[2][j]+2*kx[3][j]+kx[4][j])*(1./6.); | |
lx[0][j] = (lx[1][j]+2*lx[2][j]+2*lx[3][j]+lx[4][j])*(1./6.); | |
x[j] += kx[0][j]; | |
vx[j] += lx[0][j]; | |
y[j] -= ky[3][j]; //undo | |
ky[4][j] = (vy[j]+ly[3][j])*dt; | |
ly[4][j] = (ay[j])*dt; | |
ky[0][j] = (ky[1][j]+2*ky[2][j]+2*ky[3][j]+ky[4][j])*(1./6.); | |
ly[0][j] = (ly[1][j]+2*ly[2][j]+2*ly[3][j]+ly[4][j])*(1./6.); | |
y[j] += ky[0][j]; | |
vy[j] += ly[0][j]; | |
z[j] -= kz[3][j]; //undo | |
kz[4][j] = (vz[j]+lz[3][j])*dt; | |
lz[4][j] = (az[j])*dt; | |
kz[0][j] = (kz[1][j]+2*kz[2][j]+2*kz[3][j]+kz[4][j])*(1./6.); | |
lz[0][j] = (lz[1][j]+2*lz[2][j]+2*lz[3][j]+lz[4][j])*(1./6.); | |
z[j] += kz[0][j]; | |
vz[j] += lz[0][j]; | |
} | |
} | |
double GetTimeStamp() { | |
struct timespec ts; | |
clock_gettime(CLOCK_REALTIME, &ts); | |
return (ts.tv_sec*1000000.+ts.tv_nsec/1000.); | |
} | |
float __attribute__((aligned(32))) x[N], vx[N], ax[N]; | |
float __attribute__((aligned(32))) y[N], vy[N], ay[N]; | |
float __attribute__((aligned(32))) z[N], vz[N], az[N]; | |
float __attribute__((aligned(32))) m[N], p[N]; | |
int main(int argc,const char **argv){ | |
int n,method; | |
#ifdef _USE_SDL | |
SDL_Init(SDL_INIT_VIDEO); | |
SDL_Window* sdlWindow; | |
SDL_Event ev; | |
SDL_Texture *sdlTexture; | |
SDL_Renderer *sdlRenderer; | |
SDL_CreateWindowAndRenderer(w, h, SDL_WINDOW_OPENGL, &sdlWindow, &sdlRenderer); | |
sdlTexture = SDL_CreateTexture(sdlRenderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_TARGET, w, h); | |
// SDL_Delay(5000); | |
#endif | |
double dt,t=0.,tend,e0; | |
int solver,numcore; | |
// std::cin >> n >> dt >> tend; | |
if(argc!=4)std::cerr<<"usage:"<<std::endl<<argv[0]<<" <BODY COUNT> <SOLVER TYPE> <CORE COUNT>"<<std::endl; | |
sscanf(argv[1],"%d",&n); | |
sscanf(argv[2],"%d",&solver); | |
sscanf(argv[3],"%d",&numcore); | |
omp_set_num_threads(numcore); | |
dt=0.2; | |
tend=5; | |
double msum; | |
for(int i=0; i<n;i++){ | |
m[i]=((double)rand())/RAND_MAX+0.1; | |
msum+=m[i]; | |
double r=((double)rand()+0.01)/RAND_MAX; | |
r*=r; | |
double uz =((double)rand())/RAND_MAX; | |
double ruz = sqrt(1.-uz*uz); | |
double phi =M_PI-(2.*M_PI*(double)rand())/RAND_MAX; | |
x[i]=r*ruz*sin(phi); | |
z[i]=r*uz; | |
y[i]=r*ruz*cos(phi);// | |
// std::cin >> m[i]>>x[i]>>y[i]>>z[i]>>vx[i]>>vy[i]>>vz[i]; | |
} | |
for(int i=0; i<n;i++){ | |
m[i]/=msum; | |
} | |
while(t<tend){ | |
for(int i=0;i<w*h;i++){ | |
rgba[i*4+3]=rgba[i*4+3]*0.9; | |
rgba[i*4+2]=rgba[i*4+2]*0.9; | |
rgba[i*4+1]=rgba[i*4+1]*0.9; | |
} | |
double mindis=dt;//at least dt | |
static double prtime; | |
double rtime; | |
rtime = GetTimeStamp(); | |
// std::cerr<<1000000./(rtime-prtime); | |
prtime = rtime; | |
runge4(x,y,z,vx,vy,vz,ax,ay,az,m,p,dt,n,solver); | |
t+=dt; | |
static int count=0; | |
// std::cout << t; | |
int len=0; | |
for(int i=0;i<n;i++){ | |
#ifndef _USE_SDL | |
printf("%5f\t%5f\t%5f\n",x[i],y[i],z[i]);//better performance compared to std::cout | |
#else | |
int idx=-1; | |
int px=(x[i])*(w)+w/2,py=(y[i])*(h)+h/2; | |
if(px<0||px>=w||py<0||py>=h)continue; | |
idx=(px+py*w)*4; | |
int r=64; | |
int g=64; | |
int b=64; | |
rgba[idx+3]+=r;rgba[idx+2]+=g;rgba[idx+1]+=b;rgba[idx+0]=255; | |
if(rgba[idx+3]>=190)rgba[idx+3]=190; | |
if(rgba[idx+2]>=190)rgba[idx+2]=190; | |
if(rgba[idx+1]>=190)rgba[idx+1]=190; | |
#endif | |
} | |
#ifndef _USE_SDL | |
puts(""); | |
#else | |
SDL_UpdateTexture(sdlTexture, NULL, rgba, w*4); | |
{ | |
SDL_Rect dstrect; | |
dstrect.x = 0; | |
dstrect.y = 0; | |
dstrect.w = w; | |
dstrect.h = h; | |
SDL_RenderCopy(sdlRenderer, sdlTexture, NULL, &dstrect); | |
} | |
SDL_RenderPresent(sdlRenderer); | |
SDL_Delay(20); | |
for(int i=0;i<64;i++){ | |
if(SDL_PollEvent(&ev)){ | |
if(ev.type == SDL_QUIT) { | |
break; | |
} | |
} | |
} | |
if(ev.type == SDL_QUIT) { | |
break; | |
} | |
#endif | |
// std::cerr<<"fps"<<std::endl; | |
} | |
#ifdef _USE_SDL | |
SDL_Quit(); | |
#endif | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment