Skip to content

Instantly share code, notes, and snippets.

@elect-gombe
Last active February 15, 2020 02:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save elect-gombe/24a8664bcd124a259d1fc4c86fb55326 to your computer and use it in GitHub Desktop.
Save elect-gombe/24a8664bcd124a259d1fc4c86fb55326 to your computer and use it in GitHub Desktop.
multi-body simulation using SIMD or SISD instruction modes.
#include <omp.h>
#include <iostream>
#include <cmath>
#include <cstdlib>
#include <x86intrin.h>
#ifdef _USE_SDL
#include <SDL2/SDL.h>
#endif
const int w=1000;
const int h=1000;
unsigned char rgba[w*h*4];
/*max point number*/
#define N 4096*4
/*calculate gravity and potential energy*/
void gravity128(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){
#pragma omp parallel for
for(int i=0;i<n;i++){
// p[i] = 0;
// ansx[i]=0; ansy[i]=0; ansz[i]=0;
__m128 ofs=_mm_set1_ps(0.03125f*0.03125f*4.f); //softening
__m128 ax,ay,az;
ax = _mm_xor_ps(ax,ax);//ax=(0,0,0,0,0,0,0,0)
ay = _mm_xor_ps(ay,ay);
az = _mm_xor_ps(az,az);
__m128 xi=_mm_broadcast_ss(&x[i]);
__m128 yi=_mm_broadcast_ss(&y[i]);
__m128 zi=_mm_broadcast_ss(&z[i]);
for(int j=0;j<n;j+=4){
__m128 xj=_mm_load_ps(&x[j]);
__m128 yj=_mm_load_ps(&y[j]);
__m128 zj=_mm_load_ps(&z[j]);
// double x_d=x[i]-x[j];
// double y_d=y[i]-y[j];
// double z_d=z[i]-z[j];
__m128 dx=_mm_sub_ps(xi,xj);
__m128 dy=_mm_sub_ps(yi,yj);
__m128 dz=_mm_sub_ps(zi,zj);
__m128 dot=_mm_mul_ps(dx,dx);
dot=_mm_fmadd_ps(dy,dy,dot);
dot=_mm_fmadd_ps(dz,dz,dot);
dot=_mm_add_ps(ofs,dot);
// double r2=sqrt(1./(x_d*x_d+y_d*y_d+z_d*z_d+0.03125*0.03125));
__m128 rsqdot = _mm_rsqrt_ps(dot);
__m128 rdot = _mm_mul_ps(rsqdot,rsqdot);
__m128 r2_3rddot = _mm_mul_ps(rsqdot,rdot);//r2*r2*r2
__m128 mj=_mm_broadcast_ss(m+j);//load m[j]
__m128 mj_r2_3rddot=_mm_mul_ps(mj,r2_3rddot);
// ansx[i] -= (m[j]*sqrt(r2*r2*r2))*x_d;
// ansy[i] -= (m[j]*sqrt(r2*r2*r2))*y_d;
// ansz[i] -= (m[j]*sqrt(r2*r2*r2))*z_d;
ax=_mm_fmadd_ps(dx,mj_r2_3rddot,ax);
ay=_mm_fmadd_ps(dy,mj_r2_3rddot,ay);
az=_mm_fmadd_ps(dz,mj_r2_3rddot,az);
}
ansx[i]=0;
ansy[i]=0;
ansz[i]=0;
for(int j=0;j<4;j++){
ansx[i]-=ax[j];
ansy[i]-=ay[j];
ansz[i]-=az[j];
}
}
}
/*calculate gravity and potential energy*/
void gravity1(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){
#pragma omp parallel for
for(int i=0;i<n;i++){
// p[i] = 0;
// ansx[i]=0; ansy[i]=0; ansz[i]=0;
float ofs=(0.03125f*0.03125f); //softening
float ax,ay,az;
ax = 0;
ay = 0;
az = 0;
for(int j=0;j<n;j++){
float xj=*(&x[j]);
float yj=*(&y[j]);
float zj=*(&z[j]);
// double x_d=x[i]-x[j];
// double y_d=y[i]-y[j];
// double z_d=z[i]-z[j];
float dx=(x[i]-xj);
float dy=(y[i]-yj);
float dz=(z[i]-zj);
float dot=(dx*dx);
dot=(dy*dy+dot);
dot=(dz*dz+dot);
dot=(ofs+dot);
// double r2=sqrt(1./(x_d*x_d+y_d*y_d+z_d*z_d+0.03125*0.03125));
float rsqdot = 1.f/sqrtf(dot);
float rdot = (rsqdot*rsqdot);
float r2_3rddot = (rsqdot*rdot);//r2*r2*r2
float mj=*(m+j);//load m[j]
float mj_r2_3rddot=(mj,r2_3rddot);
// ansx[i] -= (m[j]*sqrt(r2*r2*r2))*x_d;
// ansy[i] -= (m[j]*sqrt(r2*r2*r2))*y_d;
// ansz[i] -= (m[j]*sqrt(r2*r2*r2))*z_d;
ax=(dx*mj_r2_3rddot+ax);
ay=(dy*mj_r2_3rddot+ay);
az=(dz*mj_r2_3rddot+az);
}
ansx[i]=0;
ansy[i]=0;
ansz[i]=0;
for(int j=0;j<1;j++){
ansx[i]-=ax;
ansy[i]-=ay;
ansz[i]-=az;
}
}
}
/*calculate gravity and potential energy*/
void gravity(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n){
#pragma omp parallel for
for(int i=0;i<n;i++){
// p[i] = 0;
// ansx[i]=0; ansy[i]=0; ansz[i]=0;
__m256 ofs=_mm256_set1_ps(0.03125f*0.03125f); //softening
__m256 ax,ay,az;
ax = _mm256_xor_ps(ax,ax);//ax=(0,0,0,0,0,0,0,0)
ay = _mm256_xor_ps(ay,ay);
az = _mm256_xor_ps(az,az);
__m256 xi=_mm256_broadcast_ss(&x[i]);
__m256 yi=_mm256_broadcast_ss(&y[i]);
__m256 zi=_mm256_broadcast_ss(&z[i]);
for(int j=0;j<n;j+=8){
__m256 xj=_mm256_load_ps(&x[j]);
__m256 yj=_mm256_load_ps(&y[j]);
__m256 zj=_mm256_load_ps(&z[j]);
// double x_d=x[i]-x[j];
// double y_d=y[i]-y[j];
// double z_d=z[i]-z[j];
__m256 dx=_mm256_sub_ps(xi,xj);
__m256 dy=_mm256_sub_ps(yi,yj);
__m256 dz=_mm256_sub_ps(zi,zj);
__m256 dot=_mm256_mul_ps(dx,dx);
dot=_mm256_fmadd_ps(dy,dy,dot);
dot=_mm256_fmadd_ps(dz,dz,dot);
dot=_mm256_add_ps(ofs,dot);
// double r2=sqrt(1./(x_d*x_d+y_d*y_d+z_d*z_d+0.03125*0.03125));
__m256 rsqdot = _mm256_rsqrt_ps(dot);
__m256 rdot = _mm256_mul_ps(rsqdot,rsqdot);
__m256 r2_3rddot = _mm256_mul_ps(rsqdot,rdot);//r2*r2*r2
__m256 mj=_mm256_broadcast_ss(m+j);//load m[j]
__m256 mj_r2_3rddot=_mm256_mul_ps(mj,r2_3rddot);
// ansx[i] -= (m[j]*sqrt(r2*r2*r2))*x_d;
// ansy[i] -= (m[j]*sqrt(r2*r2*r2))*y_d;
// ansz[i] -= (m[j]*sqrt(r2*r2*r2))*z_d;
ax=_mm256_fmadd_ps(dx,mj_r2_3rddot,ax);
ay=_mm256_fmadd_ps(dy,mj_r2_3rddot,ay);
az=_mm256_fmadd_ps(dz,mj_r2_3rddot,az);
}
ansx[i]=0;
ansy[i]=0;
ansz[i]=0;
for(int j=0;j<8;j++){
ansx[i]-=ax[j];
ansy[i]-=ay[j];
ansz[i]-=az[j];
}
}
}
void (*gsolver[3])(const float x[],const float y[],const float z[],const float m[],float ansx[],float ansy[],float ansz[] ,float p[],const int n)={gravity1,gravity128,gravity};
float kx[5][N];
float lx[5][N];
float ky[5][N];
float ly[5][N];
float kz[5][N];
float lz[5][N];
/*runge-kutta 4th solver*/
void runge4( float x[],float y[],float z[],float vx[],float vy[],float vz[],float ax[],float ay[],float az[],float m[],float *p,const double dt,const int n,int solver){
gsolver[solver](x,y,z,m,ax,ay,az,p,n);
for(int j=0;j<n;j++){
kx[1][j] = vx[j]*dt;
lx[1][j] = ax[j]*dt;
x[j] += kx[1][j]*(1./2.);//do
ky[1][j] = vy[j]*dt;
ly[1][j] = ay[j]*dt;
y[j] += ky[1][j]*(1./2.);//do
kz[1][j] = vz[j]*dt;
lz[1][j] = az[j]*dt;
z[j] += kz[1][j]*(1./2.);//do
}
gsolver[solver](x,y,z,m,ax,ay,az,p,n);
for(int j=0;j<n;j++){
x[j] -= kx[1][j]*(1./2.); //undo
kx[2][j] = (vx[j]+lx[1][j]*0.5)*dt;
lx[2][j] = (ax[j])*dt;
x[j] += kx[2][j]*(1./2.);
y[j] -= ky[1][j]*(1./2.); //undo
ky[2][j] = (vy[j]+ly[1][j]*0.5)*dt;
ly[2][j] = (ay[j])*dt;
y[j] += ky[2][j]*(1./2.);
z[j] -= kz[1][j]*(1./2.); //undo
kz[2][j] = (vz[j]+lz[1][j]*0.5)*dt;
lz[2][j] = (az[j])*dt;
z[j] += kz[2][j]*(1./2.);
}
gsolver[solver](x,y,z,m,ax,ay,az,p,n);
for(int j=0;j<n;j++){
x[j] -= kx[2][j]*(1./2.); //undo
kx[3][j] = (vx[j]+lx[2][j]*0.5)*dt;
lx[3][j] = (ax[j])*dt;
x[j] += kx[3][j];
y[j] -= ky[2][j]*(1./2.); //undo
ky[3][j] = (vy[j]+ly[2][j]*0.5)*dt;
ly[3][j] = (ay[j])*dt;
y[j] += ky[3][j];
z[j] -= kz[2][j]*(1./2.); //undo
kz[3][j] = (vz[j]+lz[2][j]*0.5)*dt;
lz[3][j] = (az[j])*dt;
z[j] += kz[3][j];
}
gsolver[solver](x,y,z,m,ax,ay,az,p,n);
for(int j=0;j<n;j++){
x[j] -= kx[3][j]; //undo
kx[4][j] = (vx[j]+lx[3][j])*dt;
lx[4][j] = (ax[j])*dt;
kx[0][j] = (kx[1][j]+2*kx[2][j]+2*kx[3][j]+kx[4][j])*(1./6.);
lx[0][j] = (lx[1][j]+2*lx[2][j]+2*lx[3][j]+lx[4][j])*(1./6.);
x[j] += kx[0][j];
vx[j] += lx[0][j];
y[j] -= ky[3][j]; //undo
ky[4][j] = (vy[j]+ly[3][j])*dt;
ly[4][j] = (ay[j])*dt;
ky[0][j] = (ky[1][j]+2*ky[2][j]+2*ky[3][j]+ky[4][j])*(1./6.);
ly[0][j] = (ly[1][j]+2*ly[2][j]+2*ly[3][j]+ly[4][j])*(1./6.);
y[j] += ky[0][j];
vy[j] += ly[0][j];
z[j] -= kz[3][j]; //undo
kz[4][j] = (vz[j]+lz[3][j])*dt;
lz[4][j] = (az[j])*dt;
kz[0][j] = (kz[1][j]+2*kz[2][j]+2*kz[3][j]+kz[4][j])*(1./6.);
lz[0][j] = (lz[1][j]+2*lz[2][j]+2*lz[3][j]+lz[4][j])*(1./6.);
z[j] += kz[0][j];
vz[j] += lz[0][j];
}
}
double GetTimeStamp() {
struct timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
return (ts.tv_sec*1000000.+ts.tv_nsec/1000.);
}
float __attribute__((aligned(32))) x[N], vx[N], ax[N];
float __attribute__((aligned(32))) y[N], vy[N], ay[N];
float __attribute__((aligned(32))) z[N], vz[N], az[N];
float __attribute__((aligned(32))) m[N], p[N];
int main(int argc,const char **argv){
int n,method;
#ifdef _USE_SDL
SDL_Init(SDL_INIT_VIDEO);
SDL_Window* sdlWindow;
SDL_Event ev;
SDL_Texture *sdlTexture;
SDL_Renderer *sdlRenderer;
SDL_CreateWindowAndRenderer(w, h, SDL_WINDOW_OPENGL, &sdlWindow, &sdlRenderer);
sdlTexture = SDL_CreateTexture(sdlRenderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_TARGET, w, h);
// SDL_Delay(5000);
#endif
double dt,t=0.,tend,e0;
int solver,numcore;
// std::cin >> n >> dt >> tend;
if(argc!=4)std::cerr<<"usage:"<<std::endl<<argv[0]<<" <BODY COUNT> <SOLVER TYPE> <CORE COUNT>"<<std::endl;
sscanf(argv[1],"%d",&n);
sscanf(argv[2],"%d",&solver);
sscanf(argv[3],"%d",&numcore);
omp_set_num_threads(numcore);
dt=0.2;
tend=5;
double msum;
for(int i=0; i<n;i++){
m[i]=((double)rand())/RAND_MAX+0.1;
msum+=m[i];
double r=((double)rand()+0.01)/RAND_MAX;
r*=r;
double uz =((double)rand())/RAND_MAX;
double ruz = sqrt(1.-uz*uz);
double phi =M_PI-(2.*M_PI*(double)rand())/RAND_MAX;
x[i]=r*ruz*sin(phi);
z[i]=r*uz;
y[i]=r*ruz*cos(phi);//
// std::cin >> m[i]>>x[i]>>y[i]>>z[i]>>vx[i]>>vy[i]>>vz[i];
}
for(int i=0; i<n;i++){
m[i]/=msum;
}
while(t<tend){
for(int i=0;i<w*h;i++){
rgba[i*4+3]=rgba[i*4+3]*0.9;
rgba[i*4+2]=rgba[i*4+2]*0.9;
rgba[i*4+1]=rgba[i*4+1]*0.9;
}
double mindis=dt;//at least dt
static double prtime;
double rtime;
rtime = GetTimeStamp();
// std::cerr<<1000000./(rtime-prtime);
prtime = rtime;
runge4(x,y,z,vx,vy,vz,ax,ay,az,m,p,dt,n,solver);
t+=dt;
static int count=0;
// std::cout << t;
int len=0;
for(int i=0;i<n;i++){
#ifndef _USE_SDL
printf("%5f\t%5f\t%5f\n",x[i],y[i],z[i]);//better performance compared to std::cout
#else
int idx=-1;
int px=(x[i])*(w)+w/2,py=(y[i])*(h)+h/2;
if(px<0||px>=w||py<0||py>=h)continue;
idx=(px+py*w)*4;
int r=64;
int g=64;
int b=64;
rgba[idx+3]+=r;rgba[idx+2]+=g;rgba[idx+1]+=b;rgba[idx+0]=255;
if(rgba[idx+3]>=190)rgba[idx+3]=190;
if(rgba[idx+2]>=190)rgba[idx+2]=190;
if(rgba[idx+1]>=190)rgba[idx+1]=190;
#endif
}
#ifndef _USE_SDL
puts("");
#else
SDL_UpdateTexture(sdlTexture, NULL, rgba, w*4);
{
SDL_Rect dstrect;
dstrect.x = 0;
dstrect.y = 0;
dstrect.w = w;
dstrect.h = h;
SDL_RenderCopy(sdlRenderer, sdlTexture, NULL, &dstrect);
}
SDL_RenderPresent(sdlRenderer);
SDL_Delay(20);
for(int i=0;i<64;i++){
if(SDL_PollEvent(&ev)){
if(ev.type == SDL_QUIT) {
break;
}
}
}
if(ev.type == SDL_QUIT) {
break;
}
#endif
// std::cerr<<"fps"<<std::endl;
}
#ifdef _USE_SDL
SDL_Quit();
#endif
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment