Created
March 17, 2017 05:00
-
-
Save kaityo256/a8d6d0c974285e01fe6b04d6b1cccd0c to your computer and use it in GitHub Desktop.
Haswell vs. Skylake
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
# Copyright H. Watanabe 2017 | |
# Distributed under the Boost Software License, Version 1.0. | |
# (See accompanying file LICENSE_1_0.txt or copy at | |
# http://www.boost.org/LICENSE_1_0.txt) | |
*/ | |
//---------------------------------------------------------------------- | |
#include <stdio.h> | |
#include <immintrin.h> | |
#include <iostream> | |
#include <fstream> | |
#include <random> | |
#include <math.h> | |
#include <sys/time.h> | |
#include <sys/stat.h> | |
//---------------------------------------------------------------------- | |
const double density = 1.0; | |
const int N = 400000; | |
const int MAX_PAIRS = 30 * N; | |
double L = 50.0; | |
const double dt = 0.001; | |
const int D = 4; | |
enum {X, Y, Z}; | |
double q[N][D]; | |
double p[N][D]; | |
int particle_number = 0; | |
int number_of_pairs = 0; | |
int number_of_partners[N]; | |
int i_particles[MAX_PAIRS]; | |
int j_particles[MAX_PAIRS]; | |
int pointer[N], pointer2[N]; | |
int sorted_list[MAX_PAIRS]; | |
const double CUTOFF_LENGTH = 3.0; | |
const double SEARCH_LENGTH = 3.3; | |
const double CL2 = CUTOFF_LENGTH * CUTOFF_LENGTH; | |
//---------------------------------------------------------------------- | |
typedef double v4df __attribute__((vector_size(32))); | |
//---------------------------------------------------------------------- | |
void | |
print256(v4df r) { | |
double *a = (double*)(&r); | |
printf("%.10f %.10f %.10f %.10f\n", a[0], a[1], a[2], a[3]); | |
} | |
//---------------------------------------------------------------------- | |
void | |
add_particle(double x, double y, double z) { | |
static std::mt19937 mt(2); | |
std::uniform_real_distribution<double> ud(0.0, 0.1); | |
q[particle_number][X] = x + ud(mt); | |
q[particle_number][Y] = y + ud(mt); | |
q[particle_number][Z] = z + ud(mt); | |
particle_number++; | |
} | |
//---------------------------------------------------------------------- | |
double | |
myclock(void) { | |
struct timeval t; | |
gettimeofday(&t, NULL); | |
return t.tv_sec + t.tv_usec * 1e-6; | |
} | |
//---------------------------------------------------------------------- | |
void | |
register_pair(int index1, int index2) { | |
int i, j; | |
if (index1 < index2) { | |
i = index1; | |
j = index2; | |
} else { | |
i = index2; | |
j = index1; | |
} | |
i_particles[number_of_pairs] = i; | |
j_particles[number_of_pairs] = j; | |
number_of_partners[i]++; | |
number_of_pairs++; | |
} | |
//---------------------------------------------------------------------- | |
void | |
sortpair(void) { | |
const int pn = particle_number; | |
int pos = 0; | |
pointer[0] = 0; | |
for (int i = 0; i < pn - 1; i++) { | |
pos += number_of_partners[i]; | |
pointer[i + 1] = pos; | |
} | |
for (int i = 0; i < pn; i++) { | |
pointer2[i] = 0; | |
} | |
const int s = number_of_pairs; | |
for (int k = 0; k < s; k++) { | |
int i = i_particles[k]; | |
int j = j_particles[k]; | |
int index = pointer[i] + pointer2[i]; | |
sorted_list[index] = j; | |
pointer2[i] ++; | |
} | |
} | |
//---------------------------------------------------------------------- | |
void | |
makepair(void) { | |
const double SL2 = SEARCH_LENGTH * SEARCH_LENGTH; | |
const int pn = particle_number; | |
for (int i = 0; i < pn; i++) { | |
number_of_partners[i] = 0; | |
} | |
for (int i = 0; i < particle_number - 1; i++) { | |
for (int j = i + 1; j < particle_number; j++) { | |
const double dx = q[i][X] - q[j][X]; | |
const double dy = q[i][Y] - q[j][Y]; | |
const double dz = q[i][Z] - q[j][Z]; | |
const double r2 = dx * dx + dy * dy + dz * dz; | |
if (r2 < SL2) { | |
register_pair(i, j); | |
} | |
} | |
} | |
} | |
//---------------------------------------------------------------------- | |
void | |
init(void) { | |
const double s = 1.0 / pow(density * 0.25, 1.0 / 3.0); | |
const double hs = s * 0.5; | |
int sx = static_cast<int>(L / s); | |
int sy = static_cast<int>(L / s); | |
int sz = static_cast<int>(L / s); | |
for (int iz = 0; iz < sz; iz++) { | |
for (int iy = 0; iy < sy; iy++) { | |
for (int ix = 0; ix < sx; ix++) { | |
double x = ix * s; | |
double y = iy * s; | |
double z = iz * s; | |
add_particle(x , y , z); | |
add_particle(x , y + hs, z + hs); | |
add_particle(x + hs , y , z + hs); | |
add_particle(x + hs , y + hs, z); | |
} | |
} | |
} | |
for (int i = 0; i < particle_number; i++) { | |
p[i][X] = 0.0; | |
p[i][Y] = 0.0; | |
p[i][Z] = 0.0; | |
} | |
} | |
//---------------------------------------------------------------------- | |
void | |
force_pair(void) { | |
for (int k = 0; k < number_of_pairs; k++) { | |
const int i = i_particles[k]; | |
const int j = j_particles[k]; | |
double dx = q[j][X] - q[i][X]; | |
double dy = q[j][Y] - q[i][Y]; | |
double dz = q[j][Z] - q[i][Z]; | |
double r2 = (dx * dx + dy * dy + dz * dz); | |
if (r2 > CL2) continue; | |
double r6 = r2 * r2 * r2; | |
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt; | |
p[i][X] += df * dx; | |
p[i][Y] += df * dy; | |
p[i][Z] += df * dz; | |
p[j][X] -= df * dx; | |
p[j][Y] -= df * dy; | |
p[j][Z] -= df * dz; | |
} | |
} | |
//---------------------------------------------------------------------- | |
void | |
force_sorted(void) { | |
const int pn = particle_number; | |
for (int i = 0; i < pn; i++) { | |
const double qx_key = q[i][X]; | |
const double qy_key = q[i][Y]; | |
const double qz_key = q[i][Z]; | |
const int np = number_of_partners[i]; | |
double pfx = 0; | |
double pfy = 0; | |
double pfz = 0; | |
const int kp = pointer[i]; | |
for (int k = 0; k < np; k++) { | |
const int j = sorted_list[kp + k]; | |
double dx = q[j][X] - qx_key; | |
double dy = q[j][Y] - qy_key; | |
double dz = q[j][Z] - qz_key; | |
double r2 = (dx * dx + dy * dy + dz * dz); | |
if (r2 > CL2) continue; | |
double r6 = r2 * r2 * r2; | |
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt; | |
pfx += df * dx; | |
pfy += df * dy; | |
pfz += df * dz; | |
p[j][X] -= df * dx; | |
p[j][Y] -= df * dy; | |
p[j][Z] -= df * dz; | |
} | |
p[i][X] += pfx; | |
p[i][Y] += pfy; | |
p[i][Z] += pfz; | |
} | |
} | |
//---------------------------------------------------------------------- | |
void | |
force_sorted_swp(void) { | |
const int pn = particle_number; | |
for (int i = 0; i < pn; i++) { | |
const double qx_key = q[i][X]; | |
const double qy_key = q[i][Y]; | |
const double qz_key = q[i][Z]; | |
double pfx = 0; | |
double pfy = 0; | |
double pfz = 0; | |
const int kp = pointer[i]; | |
int ja = sorted_list[kp]; | |
double dxa = q[ja][X] - qx_key; | |
double dya = q[ja][Y] - qy_key; | |
double dza = q[ja][Z] - qz_key; | |
double df = 0.0; | |
double dxb = 0.0, dyb = 0.0, dzb = 0.0; | |
int jb = 0; | |
const int np = number_of_partners[i]; | |
for (int k = kp; k < np + kp; k++) { | |
const double dx = dxa; | |
const double dy = dya; | |
const double dz = dza; | |
double r2 = (dx * dx + dy * dy + dz * dz); | |
const int j = ja; | |
ja = sorted_list[k + 1]; | |
dxa = q[ja][X] - qx_key; | |
dya = q[ja][Y] - qy_key; | |
dza = q[ja][Z] - qz_key; | |
if (r2 > CL2)continue; | |
pfx += df * dxb; | |
pfy += df * dyb; | |
pfz += df * dzb; | |
p[jb][X] -= df * dxb; | |
p[jb][Y] -= df * dyb; | |
p[jb][Z] -= df * dzb; | |
const double r6 = r2 * r2 * r2; | |
df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt; | |
jb = j; | |
dxb = dx; | |
dyb = dy; | |
dzb = dz; | |
} | |
p[jb][X] -= df * dxb; | |
p[jb][Y] -= df * dyb; | |
p[jb][Z] -= df * dzb; | |
p[i][X] += pfx + df * dxb; | |
p[i][Y] += pfy + df * dyb; | |
p[i][Z] += pfz + df * dzb; | |
} | |
} | |
//---------------------------------------------------------------------- | |
void | |
force_sorted_swp_intrin(void) { | |
const int pn = particle_number; | |
const v4df vzero = _mm256_set_pd(0, 0, 0, 0); | |
const v4df vcl2 = _mm256_set_pd(CL2, CL2, CL2, CL2); | |
const v4df vc24 = _mm256_set_pd(24 * dt, 24 * dt, 24 * dt, 24 * dt); | |
const v4df vc48 = _mm256_set_pd(48 * dt, 48 * dt, 48 * dt, 48 * dt); | |
for (int i = 0; i < pn; i++) { | |
const v4df vqi = _mm256_load_pd((double*)(q + i)); | |
v4df vpf = _mm256_set_pd(0.0, 0.0, 0.0, 0.0); | |
const int kp = pointer[i]; | |
int ja_1 = sorted_list[kp]; | |
int ja_2 = sorted_list[kp + 1]; | |
int ja_3 = sorted_list[kp + 2]; | |
int ja_4 = sorted_list[kp + 3]; | |
v4df vqj_1 = _mm256_load_pd((double*)(q + ja_1)); | |
v4df vdqa_1 = vqj_1 - vqi; | |
v4df vqj_2 = _mm256_load_pd((double*)(q + ja_2)); | |
v4df vdqa_2 = vqj_2 - vqi; | |
v4df vqj_3 = _mm256_load_pd((double*)(q + ja_3)); | |
v4df vdqa_3 = vqj_3 - vqi; | |
v4df vqj_4 = _mm256_load_pd((double*)(q + ja_4)); | |
v4df vdqa_4 = vqj_4 - vqi; | |
v4df vdf = _mm256_set_pd(0.0, 0.0, 0.0, 0.0); | |
v4df vdqb_1 = _mm256_set_pd(0.0, 0.0, 0.0, 0.0); | |
v4df vdqb_2 = _mm256_set_pd(0.0, 0.0, 0.0, 0.0); | |
v4df vdqb_3 = _mm256_set_pd(0.0, 0.0, 0.0, 0.0); | |
v4df vdqb_4 = _mm256_set_pd(0.0, 0.0, 0.0, 0.0); | |
int jb_1 = 0, jb_2 = 0, jb_3 = 0, jb_4 = 0; | |
const int np = number_of_partners[i]; | |
for (int k = 0; k < (np / 4) * 4; k += 4) { | |
const int j_1 = ja_1; | |
const int j_2 = ja_2; | |
const int j_3 = ja_3; | |
const int j_4 = ja_4; | |
v4df vdq_1 = vdqa_1; | |
v4df vdq_2 = vdqa_2; | |
v4df vdq_3 = vdqa_3; | |
v4df vdq_4 = vdqa_4; | |
ja_1 = sorted_list[kp + k + 4]; | |
ja_2 = sorted_list[kp + k + 5]; | |
ja_3 = sorted_list[kp + k + 6]; | |
ja_4 = sorted_list[kp + k + 7]; | |
v4df tmp0 = _mm256_unpacklo_pd(vdq_1, vdq_2); | |
v4df tmp1 = _mm256_unpackhi_pd(vdq_1, vdq_2); | |
v4df tmp2 = _mm256_unpacklo_pd(vdq_3, vdq_4); | |
v4df tmp3 = _mm256_unpackhi_pd(vdq_3, vdq_4); | |
v4df vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20); | |
v4df vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20); | |
v4df vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31); | |
v4df vdf_1 = _mm256_permute4x64_pd(vdf, 0); | |
v4df vdf_2 = _mm256_permute4x64_pd(vdf, 85); | |
v4df vdf_3 = _mm256_permute4x64_pd(vdf, 170); | |
v4df vdf_4 = _mm256_permute4x64_pd(vdf, 255); | |
vqj_1 = _mm256_load_pd((double*)(q + ja_1)); | |
vdqa_1 = vqj_1 - vqi; | |
vpf += vdf_1 * vdqb_1; | |
v4df vpjb_1 = _mm256_load_pd((double*)(p + jb_1)); | |
vpjb_1 -= vdf_1 * vdqb_1; | |
_mm256_store_pd((double*)(p + jb_1), vpjb_1); | |
vqj_2 = _mm256_load_pd((double*)(q + ja_2)); | |
vdqa_2 = vqj_2 - vqi; | |
vpf += vdf_2 * vdqb_2; | |
v4df vpjb_2 = _mm256_load_pd((double*)(p + jb_2)); | |
vpjb_2 -= vdf_2 * vdqb_2; | |
_mm256_store_pd((double*)(p + jb_2), vpjb_2); | |
vqj_3 = _mm256_load_pd((double*)(q + ja_3)); | |
vdqa_3 = vqj_3 - vqi; | |
vpf += vdf_3 * vdqb_3; | |
v4df vpjb_3 = _mm256_load_pd((double*)(p + jb_3)); | |
vpjb_3 -= vdf_3 * vdqb_3; | |
_mm256_store_pd((double*)(p + jb_3), vpjb_3); | |
vqj_4 = _mm256_load_pd((double*)(q + ja_4)); | |
vdqa_4 = vqj_4 - vqi; | |
vpf += vdf_4 * vdqb_4; | |
v4df vpjb_4 = _mm256_load_pd((double*)(p + jb_4)); | |
vpjb_4 -= vdf_4 * vdqb_4; | |
_mm256_store_pd((double*)(p + jb_4), vpjb_4); | |
v4df vr2 = vdx * vdx + vdy * vdy + vdz * vdz; | |
v4df vr6 = vr2 * vr2 * vr2; | |
vdf = (vc24 * vr6 - vc48) / (vr6 * vr6 * vr2); | |
v4df mask = vcl2 - vr2; | |
vdf = _mm256_blendv_pd(vdf, vzero, mask); | |
jb_1 = j_1; | |
jb_2 = j_2; | |
jb_3 = j_3; | |
jb_4 = j_4; | |
vdqb_1 = vdq_1; | |
vdqb_2 = vdq_2; | |
vdqb_3 = vdq_3; | |
vdqb_4 = vdq_4; | |
} | |
v4df vdf_1 = _mm256_permute4x64_pd(vdf, 0); | |
v4df vdf_2 = _mm256_permute4x64_pd(vdf, 85); | |
v4df vdf_3 = _mm256_permute4x64_pd(vdf, 170); | |
v4df vdf_4 = _mm256_permute4x64_pd(vdf, 255); | |
v4df vpjb_1 = _mm256_load_pd((double*)(p + jb_1)); | |
vpjb_1 -= vdf_1 * vdqb_1; | |
_mm256_store_pd((double*)(p + jb_1), vpjb_1); | |
v4df vpjb_2 = _mm256_load_pd((double*)(p + jb_2)); | |
vpjb_2 -= vdf_2 * vdqb_2; | |
_mm256_store_pd((double*)(p + jb_2), vpjb_2); | |
v4df vpjb_3 = _mm256_load_pd((double*)(p + jb_3)); | |
vpjb_3 -= vdf_3 * vdqb_3; | |
_mm256_store_pd((double*)(p + jb_3), vpjb_3); | |
v4df vpjb_4 = _mm256_load_pd((double*)(p + jb_4)); | |
vpjb_4 -= vdf_4 * vdqb_4; | |
_mm256_store_pd((double*)(p + jb_4), vpjb_4); | |
v4df vpi = _mm256_load_pd((double*)(p + i)); | |
vpf += vdf_1 * vdqb_1; | |
vpf += vdf_2 * vdqb_2; | |
vpf += vdf_3 * vdqb_3; | |
vpf += vdf_4 * vdqb_4; | |
vpi += vpf; | |
_mm256_store_pd((double*)(p + i), vpi); | |
const double qix = q[i][X]; | |
const double qiy = q[i][Y]; | |
const double qiz = q[i][Z]; | |
double pfx = 0.0; | |
double pfy = 0.0; | |
double pfz = 0.0; | |
for (int k = (np / 4) * 4; k < np; k++) { | |
const int j = sorted_list[k + kp]; | |
double dx = q[j][X] - qix; | |
double dy = q[j][Y] - qiy; | |
double dz = q[j][Z] - qiz; | |
double r2 = (dx * dx + dy * dy + dz * dz); | |
double r6 = r2 * r2 * r2; | |
double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt; | |
if (r2 > CL2) df = 0.0; | |
pfx += df * dx; | |
pfy += df * dy; | |
pfz += df * dz; | |
p[j][X] -= df * dx; | |
p[j][Y] -= df * dy; | |
p[j][Z] -= df * dz; | |
} | |
p[i][X] += pfx; | |
p[i][Y] += pfy; | |
p[i][Z] += pfz; | |
} | |
} | |
//---------------------------------------------------------------------- | |
void | |
measure(void(*pfunc)(), const char *name) { | |
double st = myclock(); | |
const int LOOP = 100; | |
//const int LOOP = 1; | |
for (int i = 0; i < LOOP; i++) { | |
pfunc(); | |
} | |
double t = myclock() - st; | |
fprintf(stderr, "N=%d, %s %f [sec]\n", particle_number, name, t); | |
} | |
//---------------------------------------------------------------------- | |
void | |
loadpair(void) { | |
std::ifstream ifs("pair.dat", std::ios::binary); | |
ifs.read((char*)&number_of_pairs, sizeof(int)); | |
ifs.read((char*)number_of_partners, sizeof(int)*N); | |
ifs.read((char*)i_particles, sizeof(int)*MAX_PAIRS); | |
ifs.read((char*)j_particles, sizeof(int)*MAX_PAIRS); | |
} | |
//---------------------------------------------------------------------- | |
void | |
savepair(void) { | |
makepair(); | |
std::ofstream ofs("pair.dat", std::ios::binary); | |
ofs.write((char*)&number_of_pairs, sizeof(int)); | |
ofs.write((char*)number_of_partners, sizeof(int)*N); | |
ofs.write((char*)i_particles, sizeof(int)*MAX_PAIRS); | |
ofs.write((char*)j_particles, sizeof(int)*MAX_PAIRS); | |
} | |
//---------------------------------------------------------------------- | |
void | |
print_result(void) { | |
for (int i = 0; i < 5; i++) { | |
printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]); | |
} | |
for (int i = particle_number - 5; i < particle_number; i++) { | |
printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]); | |
} | |
} | |
//---------------------------------------------------------------------- | |
int | |
main(void) { | |
init(); | |
struct stat st; | |
int ret = stat("pair.dat", &st); | |
if (ret == 0) { | |
std::cerr << "A pair-file is found. I use it." << std::endl; | |
loadpair(); | |
} else { | |
std::cerr << "Make pairlist." << std::endl; | |
savepair(); | |
} | |
std::cerr << "Number of pairs: " << number_of_pairs << std::endl; | |
sortpair(); | |
measure(&force_pair, "pair"); | |
measure(&force_sorted, "sorted"); | |
measure(&force_sorted_swp, "sorted_swp"); | |
measure(&force_sorted_swp_intrin, "sorted_swp_intrin"); | |
} | |
//---------------------------------------------------------------------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment