Skip to content

Instantly share code, notes, and snippets.

@kaityo256
Created March 17, 2017 05:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaityo256/a8d6d0c974285e01fe6b04d6b1cccd0c to your computer and use it in GitHub Desktop.
Save kaityo256/a8d6d0c974285e01fe6b04d6b1cccd0c to your computer and use it in GitHub Desktop.
Haswell vs. Skylake
/*
# Copyright H. Watanabe 2017
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
*/
//----------------------------------------------------------------------
#include <stdio.h>
#include <immintrin.h>
#include <iostream>
#include <fstream>
#include <random>
#include <math.h>
#include <sys/time.h>
#include <sys/stat.h>
//----------------------------------------------------------------------
const double density = 1.0;
const int N = 400000;
const int MAX_PAIRS = 30 * N;
double L = 50.0;
const double dt = 0.001;
const int D = 4;
enum {X, Y, Z};
double q[N][D];
double p[N][D];
int particle_number = 0;
int number_of_pairs = 0;
int number_of_partners[N];
int i_particles[MAX_PAIRS];
int j_particles[MAX_PAIRS];
int pointer[N], pointer2[N];
int sorted_list[MAX_PAIRS];
const double CUTOFF_LENGTH = 3.0;
const double SEARCH_LENGTH = 3.3;
const double CL2 = CUTOFF_LENGTH * CUTOFF_LENGTH;
//----------------------------------------------------------------------
typedef double v4df __attribute__((vector_size(32)));
//----------------------------------------------------------------------
void
print256(v4df r) {
double *a = (double*)(&r);
printf("%.10f %.10f %.10f %.10f\n", a[0], a[1], a[2], a[3]);
}
//----------------------------------------------------------------------
void
add_particle(double x, double y, double z) {
static std::mt19937 mt(2);
std::uniform_real_distribution<double> ud(0.0, 0.1);
q[particle_number][X] = x + ud(mt);
q[particle_number][Y] = y + ud(mt);
q[particle_number][Z] = z + ud(mt);
particle_number++;
}
//----------------------------------------------------------------------
double
myclock(void) {
struct timeval t;
gettimeofday(&t, NULL);
return t.tv_sec + t.tv_usec * 1e-6;
}
//----------------------------------------------------------------------
void
register_pair(int index1, int index2) {
int i, j;
if (index1 < index2) {
i = index1;
j = index2;
} else {
i = index2;
j = index1;
}
i_particles[number_of_pairs] = i;
j_particles[number_of_pairs] = j;
number_of_partners[i]++;
number_of_pairs++;
}
//----------------------------------------------------------------------
void
sortpair(void) {
const int pn = particle_number;
int pos = 0;
pointer[0] = 0;
for (int i = 0; i < pn - 1; i++) {
pos += number_of_partners[i];
pointer[i + 1] = pos;
}
for (int i = 0; i < pn; i++) {
pointer2[i] = 0;
}
const int s = number_of_pairs;
for (int k = 0; k < s; k++) {
int i = i_particles[k];
int j = j_particles[k];
int index = pointer[i] + pointer2[i];
sorted_list[index] = j;
pointer2[i] ++;
}
}
//----------------------------------------------------------------------
void
makepair(void) {
const double SL2 = SEARCH_LENGTH * SEARCH_LENGTH;
const int pn = particle_number;
for (int i = 0; i < pn; i++) {
number_of_partners[i] = 0;
}
for (int i = 0; i < particle_number - 1; i++) {
for (int j = i + 1; j < particle_number; j++) {
const double dx = q[i][X] - q[j][X];
const double dy = q[i][Y] - q[j][Y];
const double dz = q[i][Z] - q[j][Z];
const double r2 = dx * dx + dy * dy + dz * dz;
if (r2 < SL2) {
register_pair(i, j);
}
}
}
}
//----------------------------------------------------------------------
void
init(void) {
const double s = 1.0 / pow(density * 0.25, 1.0 / 3.0);
const double hs = s * 0.5;
int sx = static_cast<int>(L / s);
int sy = static_cast<int>(L / s);
int sz = static_cast<int>(L / s);
for (int iz = 0; iz < sz; iz++) {
for (int iy = 0; iy < sy; iy++) {
for (int ix = 0; ix < sx; ix++) {
double x = ix * s;
double y = iy * s;
double z = iz * s;
add_particle(x , y , z);
add_particle(x , y + hs, z + hs);
add_particle(x + hs , y , z + hs);
add_particle(x + hs , y + hs, z);
}
}
}
for (int i = 0; i < particle_number; i++) {
p[i][X] = 0.0;
p[i][Y] = 0.0;
p[i][Z] = 0.0;
}
}
//----------------------------------------------------------------------
void
force_pair(void) {
for (int k = 0; k < number_of_pairs; k++) {
const int i = i_particles[k];
const int j = j_particles[k];
double dx = q[j][X] - q[i][X];
double dy = q[j][Y] - q[i][Y];
double dz = q[j][Z] - q[i][Z];
double r2 = (dx * dx + dy * dy + dz * dz);
if (r2 > CL2) continue;
double r6 = r2 * r2 * r2;
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
p[i][X] += df * dx;
p[i][Y] += df * dy;
p[i][Z] += df * dz;
p[j][X] -= df * dx;
p[j][Y] -= df * dy;
p[j][Z] -= df * dz;
}
}
//----------------------------------------------------------------------
void
force_sorted(void) {
const int pn = particle_number;
for (int i = 0; i < pn; i++) {
const double qx_key = q[i][X];
const double qy_key = q[i][Y];
const double qz_key = q[i][Z];
const int np = number_of_partners[i];
double pfx = 0;
double pfy = 0;
double pfz = 0;
const int kp = pointer[i];
for (int k = 0; k < np; k++) {
const int j = sorted_list[kp + k];
double dx = q[j][X] - qx_key;
double dy = q[j][Y] - qy_key;
double dz = q[j][Z] - qz_key;
double r2 = (dx * dx + dy * dy + dz * dz);
if (r2 > CL2) continue;
double r6 = r2 * r2 * r2;
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
pfx += df * dx;
pfy += df * dy;
pfz += df * dz;
p[j][X] -= df * dx;
p[j][Y] -= df * dy;
p[j][Z] -= df * dz;
}
p[i][X] += pfx;
p[i][Y] += pfy;
p[i][Z] += pfz;
}
}
//----------------------------------------------------------------------
void
force_sorted_swp(void) {
const int pn = particle_number;
for (int i = 0; i < pn; i++) {
const double qx_key = q[i][X];
const double qy_key = q[i][Y];
const double qz_key = q[i][Z];
double pfx = 0;
double pfy = 0;
double pfz = 0;
const int kp = pointer[i];
int ja = sorted_list[kp];
double dxa = q[ja][X] - qx_key;
double dya = q[ja][Y] - qy_key;
double dza = q[ja][Z] - qz_key;
double df = 0.0;
double dxb = 0.0, dyb = 0.0, dzb = 0.0;
int jb = 0;
const int np = number_of_partners[i];
for (int k = kp; k < np + kp; k++) {
const double dx = dxa;
const double dy = dya;
const double dz = dza;
double r2 = (dx * dx + dy * dy + dz * dz);
const int j = ja;
ja = sorted_list[k + 1];
dxa = q[ja][X] - qx_key;
dya = q[ja][Y] - qy_key;
dza = q[ja][Z] - qz_key;
if (r2 > CL2)continue;
pfx += df * dxb;
pfy += df * dyb;
pfz += df * dzb;
p[jb][X] -= df * dxb;
p[jb][Y] -= df * dyb;
p[jb][Z] -= df * dzb;
const double r6 = r2 * r2 * r2;
df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
jb = j;
dxb = dx;
dyb = dy;
dzb = dz;
}
p[jb][X] -= df * dxb;
p[jb][Y] -= df * dyb;
p[jb][Z] -= df * dzb;
p[i][X] += pfx + df * dxb;
p[i][Y] += pfy + df * dyb;
p[i][Z] += pfz + df * dzb;
}
}
//----------------------------------------------------------------------
void
force_sorted_swp_intrin(void) {
const int pn = particle_number;
const v4df vzero = _mm256_set_pd(0, 0, 0, 0);
const v4df vcl2 = _mm256_set_pd(CL2, CL2, CL2, CL2);
const v4df vc24 = _mm256_set_pd(24 * dt, 24 * dt, 24 * dt, 24 * dt);
const v4df vc48 = _mm256_set_pd(48 * dt, 48 * dt, 48 * dt, 48 * dt);
for (int i = 0; i < pn; i++) {
const v4df vqi = _mm256_load_pd((double*)(q + i));
v4df vpf = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
const int kp = pointer[i];
int ja_1 = sorted_list[kp];
int ja_2 = sorted_list[kp + 1];
int ja_3 = sorted_list[kp + 2];
int ja_4 = sorted_list[kp + 3];
v4df vqj_1 = _mm256_load_pd((double*)(q + ja_1));
v4df vdqa_1 = vqj_1 - vqi;
v4df vqj_2 = _mm256_load_pd((double*)(q + ja_2));
v4df vdqa_2 = vqj_2 - vqi;
v4df vqj_3 = _mm256_load_pd((double*)(q + ja_3));
v4df vdqa_3 = vqj_3 - vqi;
v4df vqj_4 = _mm256_load_pd((double*)(q + ja_4));
v4df vdqa_4 = vqj_4 - vqi;
v4df vdf = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
v4df vdqb_1 = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
v4df vdqb_2 = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
v4df vdqb_3 = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
v4df vdqb_4 = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
int jb_1 = 0, jb_2 = 0, jb_3 = 0, jb_4 = 0;
const int np = number_of_partners[i];
for (int k = 0; k < (np / 4) * 4; k += 4) {
const int j_1 = ja_1;
const int j_2 = ja_2;
const int j_3 = ja_3;
const int j_4 = ja_4;
v4df vdq_1 = vdqa_1;
v4df vdq_2 = vdqa_2;
v4df vdq_3 = vdqa_3;
v4df vdq_4 = vdqa_4;
ja_1 = sorted_list[kp + k + 4];
ja_2 = sorted_list[kp + k + 5];
ja_3 = sorted_list[kp + k + 6];
ja_4 = sorted_list[kp + k + 7];
v4df tmp0 = _mm256_unpacklo_pd(vdq_1, vdq_2);
v4df tmp1 = _mm256_unpackhi_pd(vdq_1, vdq_2);
v4df tmp2 = _mm256_unpacklo_pd(vdq_3, vdq_4);
v4df tmp3 = _mm256_unpackhi_pd(vdq_3, vdq_4);
v4df vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
v4df vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
v4df vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
v4df vdf_1 = _mm256_permute4x64_pd(vdf, 0);
v4df vdf_2 = _mm256_permute4x64_pd(vdf, 85);
v4df vdf_3 = _mm256_permute4x64_pd(vdf, 170);
v4df vdf_4 = _mm256_permute4x64_pd(vdf, 255);
vqj_1 = _mm256_load_pd((double*)(q + ja_1));
vdqa_1 = vqj_1 - vqi;
vpf += vdf_1 * vdqb_1;
v4df vpjb_1 = _mm256_load_pd((double*)(p + jb_1));
vpjb_1 -= vdf_1 * vdqb_1;
_mm256_store_pd((double*)(p + jb_1), vpjb_1);
vqj_2 = _mm256_load_pd((double*)(q + ja_2));
vdqa_2 = vqj_2 - vqi;
vpf += vdf_2 * vdqb_2;
v4df vpjb_2 = _mm256_load_pd((double*)(p + jb_2));
vpjb_2 -= vdf_2 * vdqb_2;
_mm256_store_pd((double*)(p + jb_2), vpjb_2);
vqj_3 = _mm256_load_pd((double*)(q + ja_3));
vdqa_3 = vqj_3 - vqi;
vpf += vdf_3 * vdqb_3;
v4df vpjb_3 = _mm256_load_pd((double*)(p + jb_3));
vpjb_3 -= vdf_3 * vdqb_3;
_mm256_store_pd((double*)(p + jb_3), vpjb_3);
vqj_4 = _mm256_load_pd((double*)(q + ja_4));
vdqa_4 = vqj_4 - vqi;
vpf += vdf_4 * vdqb_4;
v4df vpjb_4 = _mm256_load_pd((double*)(p + jb_4));
vpjb_4 -= vdf_4 * vdqb_4;
_mm256_store_pd((double*)(p + jb_4), vpjb_4);
v4df vr2 = vdx * vdx + vdy * vdy + vdz * vdz;
v4df vr6 = vr2 * vr2 * vr2;
vdf = (vc24 * vr6 - vc48) / (vr6 * vr6 * vr2);
v4df mask = vcl2 - vr2;
vdf = _mm256_blendv_pd(vdf, vzero, mask);
jb_1 = j_1;
jb_2 = j_2;
jb_3 = j_3;
jb_4 = j_4;
vdqb_1 = vdq_1;
vdqb_2 = vdq_2;
vdqb_3 = vdq_3;
vdqb_4 = vdq_4;
}
v4df vdf_1 = _mm256_permute4x64_pd(vdf, 0);
v4df vdf_2 = _mm256_permute4x64_pd(vdf, 85);
v4df vdf_3 = _mm256_permute4x64_pd(vdf, 170);
v4df vdf_4 = _mm256_permute4x64_pd(vdf, 255);
v4df vpjb_1 = _mm256_load_pd((double*)(p + jb_1));
vpjb_1 -= vdf_1 * vdqb_1;
_mm256_store_pd((double*)(p + jb_1), vpjb_1);
v4df vpjb_2 = _mm256_load_pd((double*)(p + jb_2));
vpjb_2 -= vdf_2 * vdqb_2;
_mm256_store_pd((double*)(p + jb_2), vpjb_2);
v4df vpjb_3 = _mm256_load_pd((double*)(p + jb_3));
vpjb_3 -= vdf_3 * vdqb_3;
_mm256_store_pd((double*)(p + jb_3), vpjb_3);
v4df vpjb_4 = _mm256_load_pd((double*)(p + jb_4));
vpjb_4 -= vdf_4 * vdqb_4;
_mm256_store_pd((double*)(p + jb_4), vpjb_4);
v4df vpi = _mm256_load_pd((double*)(p + i));
vpf += vdf_1 * vdqb_1;
vpf += vdf_2 * vdqb_2;
vpf += vdf_3 * vdqb_3;
vpf += vdf_4 * vdqb_4;
vpi += vpf;
_mm256_store_pd((double*)(p + i), vpi);
const double qix = q[i][X];
const double qiy = q[i][Y];
const double qiz = q[i][Z];
double pfx = 0.0;
double pfy = 0.0;
double pfz = 0.0;
for (int k = (np / 4) * 4; k < np; k++) {
const int j = sorted_list[k + kp];
double dx = q[j][X] - qix;
double dy = q[j][Y] - qiy;
double dz = q[j][Z] - qiz;
double r2 = (dx * dx + dy * dy + dz * dz);
double r6 = r2 * r2 * r2;
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
if (r2 > CL2) df = 0.0;
pfx += df * dx;
pfy += df * dy;
pfz += df * dz;
p[j][X] -= df * dx;
p[j][Y] -= df * dy;
p[j][Z] -= df * dz;
}
p[i][X] += pfx;
p[i][Y] += pfy;
p[i][Z] += pfz;
}
}
//----------------------------------------------------------------------
void
measure(void(*pfunc)(), const char *name) {
double st = myclock();
const int LOOP = 100;
//const int LOOP = 1;
for (int i = 0; i < LOOP; i++) {
pfunc();
}
double t = myclock() - st;
fprintf(stderr, "N=%d, %s %f [sec]\n", particle_number, name, t);
}
//----------------------------------------------------------------------
void
loadpair(void) {
std::ifstream ifs("pair.dat", std::ios::binary);
ifs.read((char*)&number_of_pairs, sizeof(int));
ifs.read((char*)number_of_partners, sizeof(int)*N);
ifs.read((char*)i_particles, sizeof(int)*MAX_PAIRS);
ifs.read((char*)j_particles, sizeof(int)*MAX_PAIRS);
}
//----------------------------------------------------------------------
void
savepair(void) {
makepair();
std::ofstream ofs("pair.dat", std::ios::binary);
ofs.write((char*)&number_of_pairs, sizeof(int));
ofs.write((char*)number_of_partners, sizeof(int)*N);
ofs.write((char*)i_particles, sizeof(int)*MAX_PAIRS);
ofs.write((char*)j_particles, sizeof(int)*MAX_PAIRS);
}
//----------------------------------------------------------------------
void
print_result(void) {
for (int i = 0; i < 5; i++) {
printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]);
}
for (int i = particle_number - 5; i < particle_number; i++) {
printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]);
}
}
//----------------------------------------------------------------------
int
main(void) {
init();
struct stat st;
int ret = stat("pair.dat", &st);
if (ret == 0) {
std::cerr << "A pair-file is found. I use it." << std::endl;
loadpair();
} else {
std::cerr << "Make pairlist." << std::endl;
savepair();
}
std::cerr << "Number of pairs: " << number_of_pairs << std::endl;
sortpair();
measure(&force_pair, "pair");
measure(&force_sorted, "sorted");
measure(&force_sorted_swp, "sorted_swp");
measure(&force_sorted_swp_intrin, "sorted_swp_intrin");
}
//----------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment