-
-
Save kaityo256/76ba539bf87f95540c373e8a31f5c436 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <immintrin.h> | |
#include <iostream> | |
#include <fstream> | |
#include <random> | |
#include <math.h> | |
#include <assert.h> | |
#include <sys/stat.h> | |
#include <sys/time.h> | |
const double density = 1.0; | |
const int N = 400000; | |
const int MAX_PAIRS = 100 * N; | |
double L = 50.0; | |
const double dt = 0.001; | |
const int D = 4; | |
enum {X, Y, Z}; | |
double q[N][D]; | |
double p[N][D]; | |
int particle_number = 0; | |
int number_of_pairs = 0; | |
int number_of_partners[N]; | |
int i_particles[MAX_PAIRS]; | |
int j_particles[MAX_PAIRS]; | |
int pointer[N], pointer2[N]; | |
int sorted_list[MAX_PAIRS]; | |
const double CUTOFF_LENGTH = 3.0; | |
const double SEARCH_LENGTH = 3.3; | |
const double CL2 = CUTOFF_LENGTH * CUTOFF_LENGTH; | |
//---------------------------------------------------------------------- | |
typedef double v4df __attribute__((vector_size(32))); | |
//---------------------------------------------------------------------- | |
void | |
print256(v4df r) { | |
double *a = (double*)(&r); | |
printf("%.10f %.10f %.10f %.10f\n", a[0], a[1], a[2], a[3]); | |
} | |
//---------------------------------------------------------------------- | |
void | |
add_particle(double x, double y, double z) { | |
static std::mt19937 mt(2); | |
std::uniform_real_distribution<double> ud(0.0, 0.1); | |
q[particle_number][X] = x + ud(mt); | |
q[particle_number][Y] = y + ud(mt); | |
q[particle_number][Z] = z + ud(mt); | |
particle_number++; | |
} | |
//---------------------------------------------------------------------- | |
double | |
myclock(void) { | |
struct timeval t; | |
gettimeofday(&t, NULL); | |
return t.tv_sec + t.tv_usec * 1e-6; | |
} | |
//---------------------------------------------------------------------- | |
void | |
register_pair(int index1, int index2) { | |
int i, j; | |
if (index1 < index2) { | |
i = index1; | |
j = index2; | |
} else { | |
i = index2; | |
j = index1; | |
} | |
i_particles[number_of_pairs] = i; | |
j_particles[number_of_pairs] = j; | |
number_of_partners[i]++; | |
number_of_pairs++; | |
i_particles[number_of_pairs] = j; | |
j_particles[number_of_pairs] = i; | |
number_of_partners[j]++; | |
number_of_pairs++; | |
} | |
//---------------------------------------------------------------------- | |
void | |
sortpair(void) { | |
const int pn = particle_number; | |
int pos = 0; | |
pointer[0] = 0; | |
for (int i = 0; i < pn - 1; i++) { | |
pos += number_of_partners[i]; | |
pointer[i + 1] = pos; | |
} | |
for (int i = 0; i < pn; i++) { | |
pointer2[i] = 0; | |
} | |
const int s = number_of_pairs; | |
for (int k = 0; k < s; k++) { | |
int i = i_particles[k]; | |
int j = j_particles[k]; | |
int index = pointer[i] + pointer2[i]; | |
sorted_list[index] = j; | |
pointer2[i] ++; | |
} | |
} | |
//---------------------------------------------------------------------- | |
void | |
makepair(void) { | |
const double SL2 = SEARCH_LENGTH * SEARCH_LENGTH; | |
const int pn = particle_number; | |
for (int i = 0; i < pn; i++) { | |
number_of_partners[i] = 0; | |
} | |
for (int i = 0; i < particle_number - 1; i++) { | |
for (int j = i + 1; j < particle_number; j++) { | |
const double dx = q[i][X] - q[j][X]; | |
const double dy = q[i][Y] - q[j][Y]; | |
const double dz = q[i][Z] - q[j][Z]; | |
const double r2 = dx * dx + dy * dy + dz * dz; | |
if (r2 < SL2) { | |
register_pair(i, j); | |
} | |
} | |
} | |
} | |
//---------------------------------------------------------------------- | |
void | |
init(void) { | |
const double s = 1.0 / pow(density * 0.25, 1.0 / 3.0); | |
const double hs = s * 0.5; | |
int sx = static_cast<int>(L / s); | |
int sy = static_cast<int>(L / s); | |
int sz = static_cast<int>(L / s); | |
for (int iz = 0; iz < sz; iz++) { | |
for (int iy = 0; iy < sy; iy++) { | |
for (int ix = 0; ix < sx; ix++) { | |
double x = ix * s; | |
double y = iy * s; | |
double z = iz * s; | |
add_particle(x , y , z); | |
add_particle(x , y + hs, z + hs); | |
add_particle(x + hs , y , z + hs); | |
add_particle(x + hs , y + hs, z); | |
} | |
} | |
} | |
for (int i = 0; i < particle_number; i++) { | |
p[i][X] = 0.0; | |
p[i][Y] = 0.0; | |
p[i][Z] = 0.0; | |
} | |
} | |
//---------------------------------------------------------------------- | |
__attribute__((noinline)) | |
void | |
force_pair(void) { | |
for (int k = 0; k < number_of_pairs; k++) { | |
const int i = i_particles[k]; | |
const int j = j_particles[k]; | |
double dx = q[j][X] - q[i][X]; | |
double dy = q[j][Y] - q[i][Y]; | |
double dz = q[j][Z] - q[i][Z]; | |
double r2 = (dx * dx + dy * dy + dz * dz); | |
if (r2 > CL2) continue; | |
double r6 = r2 * r2 * r2; | |
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt; | |
p[i][X] += df * dx; | |
p[i][Y] += df * dy; | |
p[i][Z] += df * dz; | |
} | |
} | |
//---------------------------------------------------------------------- | |
__attribute__((noinline)) | |
void | |
mdacp_intrin_reactless() { | |
#if 1 | |
#pragma omp parallel | |
{ | |
const v4df vc24 = _mm256_set1_pd(24.0 * dt); | |
const v4df vc48 = _mm256_set1_pd(48.0 * dt); | |
const v4df vcl2 = _mm256_set1_pd(CL2); | |
const v4df vzero = _mm256_setzero_pd(); | |
const int pn = particle_number; | |
#pragma omp for nowait | |
for (int i = 0; i < pn; i++) { | |
const v4df vqi = _mm256_loadu_pd((double*)(q + i)); | |
v4df vpi = _mm256_loadu_pd((double*)(p + i)); | |
const int np = number_of_partners[i]; | |
const int kp = pointer[i]; | |
for (int k = 0; k < (np / 4) * 4; k += 4) { | |
const int j_a = sorted_list[kp + k]; | |
const v4df vqj_a = _mm256_loadu_pd((double*)(q + j_a)); | |
v4df vdq_a = vqj_a - vqi; | |
const int j_b = sorted_list[kp + k + 1]; | |
const v4df vqj_b = _mm256_loadu_pd((double*)(q + j_b)); | |
v4df vdq_b = vqj_b - vqi; | |
const int j_c = sorted_list[kp + k + 2]; | |
const v4df vqj_c = _mm256_loadu_pd((double*)(q + j_c)); | |
v4df vdq_c = vqj_c - vqi; | |
const int j_d = sorted_list[kp + k + 3]; | |
const v4df vqj_d = _mm256_loadu_pd((double*)(q + j_d)); | |
v4df vdq_d = vqj_d - vqi; | |
v4df tmp0 = _mm256_unpacklo_pd(vdq_a, vdq_b); | |
v4df tmp1 = _mm256_unpackhi_pd(vdq_a, vdq_b); | |
v4df tmp2 = _mm256_unpacklo_pd(vdq_c, vdq_d); | |
v4df tmp3 = _mm256_unpackhi_pd(vdq_c, vdq_d); | |
v4df vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20); | |
v4df vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20); | |
v4df vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31); | |
v4df vr2 = vdx * vdx + vdy * vdy + vdz * vdz; | |
v4df vr6 = vr2 * vr2 * vr2; | |
v4df vdf = (vc24 * vr6 - vc48) / (vr6 * vr6 * vr2); | |
v4df mask = vcl2 - vr2; | |
vdf = _mm256_blendv_pd(vdf, vzero, mask); | |
v4df vdf_a = _mm256_permute4x64_pd(vdf, 0); | |
v4df vdf_b = _mm256_permute4x64_pd(vdf, 85); | |
v4df vdf_c = _mm256_permute4x64_pd(vdf, 170); | |
v4df vdf_d = _mm256_permute4x64_pd(vdf, 255); | |
vpi += vdq_a * vdf_a; | |
vpi += vdq_b * vdf_b; | |
vpi += vdq_c * vdf_c; | |
vpi += vdq_d * vdf_d; | |
} | |
_mm256_storeu_pd((double*)(p + i), vpi); | |
double pfx = 0.0, pfy = 0.0, pfz = 0.0; | |
const double qix = q[i][X], qiy = q[i][Y], qiz = q[i][Z]; | |
for (int k = (np / 4) * 4; k < np; k++) { | |
const int j = sorted_list[kp + k]; | |
const double dx = q[j][X] - qix; | |
const double dy = q[j][Y] - qiy; | |
const double dz = q[j][Z] - qiz; | |
const double r2 = (dx * dx + dy * dy + dz * dz); | |
const double r6 = r2 * r2 * r2; | |
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt; | |
if (r2 > CL2) df = 0.0; | |
pfx += df * dx; | |
pfy += df * dy; | |
pfz += df * dz; | |
} | |
p[i][X] += pfx; | |
p[i][Y] += pfy; | |
p[i][Z] += pfz; | |
} | |
} | |
#else | |
#pragma omp parallel | |
{ | |
const auto vc24 = _mm256_set1_pd(24.0 * dt); | |
const auto vc48 = _mm256_set1_pd(48.0 * dt); | |
const auto vcl2 = _mm256_set1_pd(CL2); | |
const auto vzero = _mm256_setzero_pd(); | |
const auto pn = particle_number; | |
#pragma omp for nowait | |
for (int i = 0; i < pn; i++) { | |
const auto vqi = _mm256_loadu_pd((double*)(q + i)); | |
auto vpi = _mm256_loadu_pd((double*)(p + i)); | |
const auto np = number_of_partners[i]; | |
const auto kp = pointer[i]; | |
for (int k = 0; k < (np / 4) * 4; k += 4) { | |
const auto j_a = sorted_list[kp + k]; | |
const auto vqj_a = _mm256_loadu_pd((double*)(q + j_a)); | |
auto vdq_a = _mm256_sub_pd(vqj_a, vqi); | |
const auto j_b = sorted_list[kp + k + 1]; | |
const auto vqj_b = _mm256_loadu_pd((double*)(q + j_b)); | |
auto vdq_b = _mm256_sub_pd(vqj_b, vqi); | |
const auto j_c = sorted_list[kp + k + 2]; | |
const auto vqj_c = _mm256_loadu_pd((double*)(q + j_c)); | |
auto vdq_c = _mm256_sub_pd(vqj_c, vqi); | |
const auto j_d = sorted_list[kp + k + 3]; | |
const auto vqj_d = _mm256_loadu_pd((double*)(q + j_d)); | |
auto vdq_d = _mm256_sub_pd(vqj_d, vqi); | |
auto tmp0 = _mm256_unpacklo_pd(vdq_a, vdq_b); | |
auto tmp1 = _mm256_unpackhi_pd(vdq_a, vdq_b); | |
auto tmp2 = _mm256_unpacklo_pd(vdq_c, vdq_d); | |
auto tmp3 = _mm256_unpackhi_pd(vdq_c, vdq_d); | |
auto vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20); | |
auto vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20); | |
auto vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31); | |
auto vr2 = _mm256_fmadd_pd(vdz, vdz, | |
_mm256_fmadd_pd(vdy, vdy, | |
_mm256_mul_pd(vdx, vdx))); | |
auto vr6 = _mm256_mul_pd(_mm256_mul_pd(vr2, vr2), vr2); | |
auto vdf = _mm256_div_pd(_mm256_fmsub_pd(vc24, vr6, vc48), | |
_mm256_mul_pd(_mm256_mul_pd(vr6, vr6), vr2)); | |
auto mask = vcl2 - vr2; | |
vdf = _mm256_blendv_pd(vdf, vzero, mask); | |
auto vdf_a = _mm256_permute4x64_pd(vdf, 0); | |
auto vdf_b = _mm256_permute4x64_pd(vdf, 85); | |
auto vdf_c = _mm256_permute4x64_pd(vdf, 170); | |
auto vdf_d = _mm256_permute4x64_pd(vdf, 255); | |
vpi += vdq_a * vdf_a; | |
vpi += vdq_b * vdf_b; | |
vpi += vdq_c * vdf_c; | |
vpi += vdq_d * vdf_d; | |
} | |
_mm256_storeu_pd((double*)(p + i), vpi); | |
double pfx = 0.0, pfy = 0.0, pfz = 0.0; | |
const double qix = q[i][X], qiy = q[i][Y], qiz = q[i][Z]; | |
for (int k = (np / 4) * 4; k < np; k++) { | |
const int j = sorted_list[kp + k]; | |
const double dx = q[j][X] - qix; | |
const double dy = q[j][Y] - qiy; | |
const double dz = q[j][Z] - qiz; | |
const double r2 = (dx * dx + dy * dy + dz * dz); | |
const double r6 = r2 * r2 * r2; | |
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt; | |
if (r2 > CL2) df = 0.0; | |
pfx += df * dx; | |
pfy += df * dy; | |
pfz += df * dz; | |
} | |
p[i][X] += pfx; | |
p[i][Y] += pfy; | |
p[i][Z] += pfz; | |
} | |
} | |
#endif | |
} | |
//---------------------------------------------------------------------- | |
void | |
loadpair(void) { | |
std::ifstream ifs("pair.dat", std::ios::binary); | |
ifs.read((char*)&number_of_pairs, sizeof(int)); | |
ifs.read((char*)number_of_partners, sizeof(int)*N); | |
ifs.read((char*)i_particles, sizeof(int)*MAX_PAIRS); | |
ifs.read((char*)j_particles, sizeof(int)*MAX_PAIRS); | |
} | |
//---------------------------------------------------------------------- | |
void | |
savepair(void) { | |
makepair(); | |
std::ofstream ofs("pair.dat", std::ios::binary); | |
ofs.write((char*)&number_of_pairs, sizeof(int)); | |
ofs.write((char*)number_of_partners, sizeof(int)*N); | |
ofs.write((char*)i_particles, sizeof(int)*MAX_PAIRS); | |
ofs.write((char*)j_particles, sizeof(int)*MAX_PAIRS); | |
} | |
//---------------------------------------------------------------------- | |
void | |
print_result(void) { | |
for (int i = 0; i < 5; i++) { | |
printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]); | |
} | |
for (int i = particle_number - 5; i < particle_number; i++) { | |
printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]); | |
} | |
} | |
//---------------------------------------------------------------------- | |
int | |
main(void) { | |
init(); | |
struct stat st; | |
int ret = stat("pair.dat", &st); | |
if (ret == 0) { | |
std::cerr << "A pair-file is found. I use it." << std::endl; | |
loadpair(); | |
} else { | |
std::cerr << "Make pairlist." << std::endl; | |
savepair(); | |
} | |
std::cerr << "Number of pairs: " << number_of_pairs << std::endl; | |
sortpair(); | |
#ifdef MDACP_TEST | |
const auto beg = myclock(); | |
for (int i = 0; i < 100; i++) { | |
mdacp_intrin_reactless(); | |
} | |
const auto dur = myclock() - beg; | |
std::cout << dur << "[s]\n"; | |
print_result(); | |
#else | |
for (int i = 0; i < 100; i++) { | |
force_pair(); | |
} | |
print_result(); | |
#endif | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment