Skip to content

Instantly share code, notes, and snippets.

@kaityo256
Forked from kohnakagawa/test_mdacp.cpp
Created March 31, 2017 14:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaityo256/76ba539bf87f95540c373e8a31f5c436 to your computer and use it in GitHub Desktop.
Save kaityo256/76ba539bf87f95540c373e8a31f5c436 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <immintrin.h>
#include <iostream>
#include <fstream>
#include <random>
#include <math.h>
#include <assert.h>
#include <sys/stat.h>
#include <sys/time.h>
const double density = 1.0;
const int N = 400000;
const int MAX_PAIRS = 100 * N;
double L = 50.0;
const double dt = 0.001;
const int D = 4;
enum {X, Y, Z};
double q[N][D];
double p[N][D];
int particle_number = 0;
int number_of_pairs = 0;
int number_of_partners[N];
int i_particles[MAX_PAIRS];
int j_particles[MAX_PAIRS];
int pointer[N], pointer2[N];
int sorted_list[MAX_PAIRS];
const double CUTOFF_LENGTH = 3.0;
const double SEARCH_LENGTH = 3.3;
const double CL2 = CUTOFF_LENGTH * CUTOFF_LENGTH;
//----------------------------------------------------------------------
typedef double v4df __attribute__((vector_size(32)));
//----------------------------------------------------------------------
void
print256(v4df r) {
double *a = (double*)(&r);
printf("%.10f %.10f %.10f %.10f\n", a[0], a[1], a[2], a[3]);
}
//----------------------------------------------------------------------
void
add_particle(double x, double y, double z) {
static std::mt19937 mt(2);
std::uniform_real_distribution<double> ud(0.0, 0.1);
q[particle_number][X] = x + ud(mt);
q[particle_number][Y] = y + ud(mt);
q[particle_number][Z] = z + ud(mt);
particle_number++;
}
//----------------------------------------------------------------------
double
myclock(void) {
struct timeval t;
gettimeofday(&t, NULL);
return t.tv_sec + t.tv_usec * 1e-6;
}
//----------------------------------------------------------------------
void
register_pair(int index1, int index2) {
int i, j;
if (index1 < index2) {
i = index1;
j = index2;
} else {
i = index2;
j = index1;
}
i_particles[number_of_pairs] = i;
j_particles[number_of_pairs] = j;
number_of_partners[i]++;
number_of_pairs++;
i_particles[number_of_pairs] = j;
j_particles[number_of_pairs] = i;
number_of_partners[j]++;
number_of_pairs++;
}
//----------------------------------------------------------------------
void
sortpair(void) {
const int pn = particle_number;
int pos = 0;
pointer[0] = 0;
for (int i = 0; i < pn - 1; i++) {
pos += number_of_partners[i];
pointer[i + 1] = pos;
}
for (int i = 0; i < pn; i++) {
pointer2[i] = 0;
}
const int s = number_of_pairs;
for (int k = 0; k < s; k++) {
int i = i_particles[k];
int j = j_particles[k];
int index = pointer[i] + pointer2[i];
sorted_list[index] = j;
pointer2[i] ++;
}
}
//----------------------------------------------------------------------
void
makepair(void) {
const double SL2 = SEARCH_LENGTH * SEARCH_LENGTH;
const int pn = particle_number;
for (int i = 0; i < pn; i++) {
number_of_partners[i] = 0;
}
for (int i = 0; i < particle_number - 1; i++) {
for (int j = i + 1; j < particle_number; j++) {
const double dx = q[i][X] - q[j][X];
const double dy = q[i][Y] - q[j][Y];
const double dz = q[i][Z] - q[j][Z];
const double r2 = dx * dx + dy * dy + dz * dz;
if (r2 < SL2) {
register_pair(i, j);
}
}
}
}
//----------------------------------------------------------------------
void
init(void) {
const double s = 1.0 / pow(density * 0.25, 1.0 / 3.0);
const double hs = s * 0.5;
int sx = static_cast<int>(L / s);
int sy = static_cast<int>(L / s);
int sz = static_cast<int>(L / s);
for (int iz = 0; iz < sz; iz++) {
for (int iy = 0; iy < sy; iy++) {
for (int ix = 0; ix < sx; ix++) {
double x = ix * s;
double y = iy * s;
double z = iz * s;
add_particle(x , y , z);
add_particle(x , y + hs, z + hs);
add_particle(x + hs , y , z + hs);
add_particle(x + hs , y + hs, z);
}
}
}
for (int i = 0; i < particle_number; i++) {
p[i][X] = 0.0;
p[i][Y] = 0.0;
p[i][Z] = 0.0;
}
}
//----------------------------------------------------------------------
__attribute__((noinline))
void
force_pair(void) {
for (int k = 0; k < number_of_pairs; k++) {
const int i = i_particles[k];
const int j = j_particles[k];
double dx = q[j][X] - q[i][X];
double dy = q[j][Y] - q[i][Y];
double dz = q[j][Z] - q[i][Z];
double r2 = (dx * dx + dy * dy + dz * dz);
if (r2 > CL2) continue;
double r6 = r2 * r2 * r2;
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
p[i][X] += df * dx;
p[i][Y] += df * dy;
p[i][Z] += df * dz;
}
}
//----------------------------------------------------------------------
__attribute__((noinline))
void
mdacp_intrin_reactless() {
#if 1
#pragma omp parallel
{
const v4df vc24 = _mm256_set1_pd(24.0 * dt);
const v4df vc48 = _mm256_set1_pd(48.0 * dt);
const v4df vcl2 = _mm256_set1_pd(CL2);
const v4df vzero = _mm256_setzero_pd();
const int pn = particle_number;
#pragma omp for nowait
for (int i = 0; i < pn; i++) {
const v4df vqi = _mm256_loadu_pd((double*)(q + i));
v4df vpi = _mm256_loadu_pd((double*)(p + i));
const int np = number_of_partners[i];
const int kp = pointer[i];
for (int k = 0; k < (np / 4) * 4; k += 4) {
const int j_a = sorted_list[kp + k];
const v4df vqj_a = _mm256_loadu_pd((double*)(q + j_a));
v4df vdq_a = vqj_a - vqi;
const int j_b = sorted_list[kp + k + 1];
const v4df vqj_b = _mm256_loadu_pd((double*)(q + j_b));
v4df vdq_b = vqj_b - vqi;
const int j_c = sorted_list[kp + k + 2];
const v4df vqj_c = _mm256_loadu_pd((double*)(q + j_c));
v4df vdq_c = vqj_c - vqi;
const int j_d = sorted_list[kp + k + 3];
const v4df vqj_d = _mm256_loadu_pd((double*)(q + j_d));
v4df vdq_d = vqj_d - vqi;
v4df tmp0 = _mm256_unpacklo_pd(vdq_a, vdq_b);
v4df tmp1 = _mm256_unpackhi_pd(vdq_a, vdq_b);
v4df tmp2 = _mm256_unpacklo_pd(vdq_c, vdq_d);
v4df tmp3 = _mm256_unpackhi_pd(vdq_c, vdq_d);
v4df vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
v4df vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
v4df vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
v4df vr2 = vdx * vdx + vdy * vdy + vdz * vdz;
v4df vr6 = vr2 * vr2 * vr2;
v4df vdf = (vc24 * vr6 - vc48) / (vr6 * vr6 * vr2);
v4df mask = vcl2 - vr2;
vdf = _mm256_blendv_pd(vdf, vzero, mask);
v4df vdf_a = _mm256_permute4x64_pd(vdf, 0);
v4df vdf_b = _mm256_permute4x64_pd(vdf, 85);
v4df vdf_c = _mm256_permute4x64_pd(vdf, 170);
v4df vdf_d = _mm256_permute4x64_pd(vdf, 255);
vpi += vdq_a * vdf_a;
vpi += vdq_b * vdf_b;
vpi += vdq_c * vdf_c;
vpi += vdq_d * vdf_d;
}
_mm256_storeu_pd((double*)(p + i), vpi);
double pfx = 0.0, pfy = 0.0, pfz = 0.0;
const double qix = q[i][X], qiy = q[i][Y], qiz = q[i][Z];
for (int k = (np / 4) * 4; k < np; k++) {
const int j = sorted_list[kp + k];
const double dx = q[j][X] - qix;
const double dy = q[j][Y] - qiy;
const double dz = q[j][Z] - qiz;
const double r2 = (dx * dx + dy * dy + dz * dz);
const double r6 = r2 * r2 * r2;
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
if (r2 > CL2) df = 0.0;
pfx += df * dx;
pfy += df * dy;
pfz += df * dz;
}
p[i][X] += pfx;
p[i][Y] += pfy;
p[i][Z] += pfz;
}
}
#else
#pragma omp parallel
{
const auto vc24 = _mm256_set1_pd(24.0 * dt);
const auto vc48 = _mm256_set1_pd(48.0 * dt);
const auto vcl2 = _mm256_set1_pd(CL2);
const auto vzero = _mm256_setzero_pd();
const auto pn = particle_number;
#pragma omp for nowait
for (int i = 0; i < pn; i++) {
const auto vqi = _mm256_loadu_pd((double*)(q + i));
auto vpi = _mm256_loadu_pd((double*)(p + i));
const auto np = number_of_partners[i];
const auto kp = pointer[i];
for (int k = 0; k < (np / 4) * 4; k += 4) {
const auto j_a = sorted_list[kp + k];
const auto vqj_a = _mm256_loadu_pd((double*)(q + j_a));
auto vdq_a = _mm256_sub_pd(vqj_a, vqi);
const auto j_b = sorted_list[kp + k + 1];
const auto vqj_b = _mm256_loadu_pd((double*)(q + j_b));
auto vdq_b = _mm256_sub_pd(vqj_b, vqi);
const auto j_c = sorted_list[kp + k + 2];
const auto vqj_c = _mm256_loadu_pd((double*)(q + j_c));
auto vdq_c = _mm256_sub_pd(vqj_c, vqi);
const auto j_d = sorted_list[kp + k + 3];
const auto vqj_d = _mm256_loadu_pd((double*)(q + j_d));
auto vdq_d = _mm256_sub_pd(vqj_d, vqi);
auto tmp0 = _mm256_unpacklo_pd(vdq_a, vdq_b);
auto tmp1 = _mm256_unpackhi_pd(vdq_a, vdq_b);
auto tmp2 = _mm256_unpacklo_pd(vdq_c, vdq_d);
auto tmp3 = _mm256_unpackhi_pd(vdq_c, vdq_d);
auto vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
auto vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
auto vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
auto vr2 = _mm256_fmadd_pd(vdz, vdz,
_mm256_fmadd_pd(vdy, vdy,
_mm256_mul_pd(vdx, vdx)));
auto vr6 = _mm256_mul_pd(_mm256_mul_pd(vr2, vr2), vr2);
auto vdf = _mm256_div_pd(_mm256_fmsub_pd(vc24, vr6, vc48),
_mm256_mul_pd(_mm256_mul_pd(vr6, vr6), vr2));
auto mask = vcl2 - vr2;
vdf = _mm256_blendv_pd(vdf, vzero, mask);
auto vdf_a = _mm256_permute4x64_pd(vdf, 0);
auto vdf_b = _mm256_permute4x64_pd(vdf, 85);
auto vdf_c = _mm256_permute4x64_pd(vdf, 170);
auto vdf_d = _mm256_permute4x64_pd(vdf, 255);
vpi += vdq_a * vdf_a;
vpi += vdq_b * vdf_b;
vpi += vdq_c * vdf_c;
vpi += vdq_d * vdf_d;
}
_mm256_storeu_pd((double*)(p + i), vpi);
double pfx = 0.0, pfy = 0.0, pfz = 0.0;
const double qix = q[i][X], qiy = q[i][Y], qiz = q[i][Z];
for (int k = (np / 4) * 4; k < np; k++) {
const int j = sorted_list[kp + k];
const double dx = q[j][X] - qix;
const double dy = q[j][Y] - qiy;
const double dz = q[j][Z] - qiz;
const double r2 = (dx * dx + dy * dy + dz * dz);
const double r6 = r2 * r2 * r2;
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
if (r2 > CL2) df = 0.0;
pfx += df * dx;
pfy += df * dy;
pfz += df * dz;
}
p[i][X] += pfx;
p[i][Y] += pfy;
p[i][Z] += pfz;
}
}
#endif
}
//----------------------------------------------------------------------
void
loadpair(void) {
std::ifstream ifs("pair.dat", std::ios::binary);
ifs.read((char*)&number_of_pairs, sizeof(int));
ifs.read((char*)number_of_partners, sizeof(int)*N);
ifs.read((char*)i_particles, sizeof(int)*MAX_PAIRS);
ifs.read((char*)j_particles, sizeof(int)*MAX_PAIRS);
}
//----------------------------------------------------------------------
void
savepair(void) {
makepair();
std::ofstream ofs("pair.dat", std::ios::binary);
ofs.write((char*)&number_of_pairs, sizeof(int));
ofs.write((char*)number_of_partners, sizeof(int)*N);
ofs.write((char*)i_particles, sizeof(int)*MAX_PAIRS);
ofs.write((char*)j_particles, sizeof(int)*MAX_PAIRS);
}
//----------------------------------------------------------------------
void
print_result(void) {
for (int i = 0; i < 5; i++) {
printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]);
}
for (int i = particle_number - 5; i < particle_number; i++) {
printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]);
}
}
//----------------------------------------------------------------------
int
main(void) {
init();
struct stat st;
int ret = stat("pair.dat", &st);
if (ret == 0) {
std::cerr << "A pair-file is found. I use it." << std::endl;
loadpair();
} else {
std::cerr << "Make pairlist." << std::endl;
savepair();
}
std::cerr << "Number of pairs: " << number_of_pairs << std::endl;
sortpair();
#ifdef MDACP_TEST
const auto beg = myclock();
for (int i = 0; i < 100; i++) {
mdacp_intrin_reactless();
}
const auto dur = myclock() - beg;
std::cout << dur << "[s]\n";
print_result();
#else
for (int i = 0; i < 100; i++) {
force_pair();
}
print_result();
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment