kaityo256/test_mdacp.cpp

## test_mdacp.cpp
#include <stdio.h>
#include <immintrin.h>
#include <iostream>
#include <fstream>
#include <random>
#include <math.h>
#include <assert.h>
#include <sys/stat.h>
#include <sys/time.h>
const double density = 1.0;
const int N = 400000;
const int MAX_PAIRS = 100 * N;
double L = 50.0;
const double dt = 0.001;
const int D = 4;
enum {X, Y, Z};
double q[N][D];
double p[N][D];

int particle_number = 0;
int number_of_pairs = 0;
int number_of_partners[N];
int i_particles[MAX_PAIRS];
int j_particles[MAX_PAIRS];
int pointer[N], pointer2[N];
int sorted_list[MAX_PAIRS];

const double CUTOFF_LENGTH = 3.0;
const double SEARCH_LENGTH = 3.3;
const double CL2 = CUTOFF_LENGTH * CUTOFF_LENGTH;
//----------------------------------------------------------------------
typedef double v4df __attribute__((vector_size(32)));
//----------------------------------------------------------------------
void
print256(v4df r) {
  double *a = (double*)(&r);
  printf("%.10f %.10f %.10f %.10f\n", a[0], a[1], a[2], a[3]);
}
//----------------------------------------------------------------------
void
add_particle(double x, double y, double z) {
  static std::mt19937 mt(2);
  std::uniform_real_distribution<double> ud(0.0, 0.1);
  q[particle_number][X] = x + ud(mt);
  q[particle_number][Y] = y + ud(mt);
  q[particle_number][Z] = z + ud(mt);
  particle_number++;
}
//----------------------------------------------------------------------
double
myclock(void) {
  struct timeval t;
  gettimeofday(&t, NULL);
  return t.tv_sec + t.tv_usec * 1e-6;
}
//----------------------------------------------------------------------
void
register_pair(int index1, int index2) {
  int i, j;
  if (index1 < index2) {
    i = index1;
    j = index2;
  } else {
    i = index2;
    j = index1;
  }
  i_particles[number_of_pairs] = i;
  j_particles[number_of_pairs] = j;
  number_of_partners[i]++;
  number_of_pairs++;

  i_particles[number_of_pairs] = j;
  j_particles[number_of_pairs] = i;
  number_of_partners[j]++;
  number_of_pairs++;
}
//----------------------------------------------------------------------
void
sortpair(void) {
  const int pn = particle_number;
  int pos = 0;
  pointer[0] = 0;
  for (int i = 0; i < pn - 1; i++) {
    pos += number_of_partners[i];
    pointer[i + 1] = pos;
  }
  for (int i = 0; i < pn; i++) {
    pointer2[i] = 0;
  }
  const int s = number_of_pairs;
  for (int k = 0; k < s; k++) {
    int i = i_particles[k];
    int j = j_particles[k];
    int index = pointer[i] + pointer2[i];
    sorted_list[index] = j;
    pointer2[i] ++;
  }
}
//----------------------------------------------------------------------
void
makepair(void) {
  const double SL2 = SEARCH_LENGTH * SEARCH_LENGTH;
  const int pn = particle_number;
  for (int i = 0; i < pn; i++) {
    number_of_partners[i] = 0;
  }
  for (int i = 0; i < particle_number - 1; i++) {
    for (int j = i + 1; j < particle_number; j++) {
      const double dx = q[i][X] - q[j][X];
      const double dy = q[i][Y] - q[j][Y];
      const double dz = q[i][Z] - q[j][Z];
      const double r2 = dx * dx + dy * dy + dz * dz;
      if (r2 < SL2) {
        register_pair(i, j);
      }
    }
  }
}
//----------------------------------------------------------------------
void
init(void) {
  const double s = 1.0 / pow(density * 0.25, 1.0 / 3.0);
  const double hs = s * 0.5;
  int sx = static_cast<int>(L / s);
  int sy = static_cast<int>(L / s);
  int sz = static_cast<int>(L / s);
  for (int iz = 0; iz < sz; iz++) {
    for (int iy = 0; iy < sy; iy++) {
      for (int ix = 0; ix < sx; ix++) {
        double x = ix * s;
        double y = iy * s;
        double z = iz * s;
        add_particle(x     , y   , z);
        add_particle(x     , y + hs, z + hs);
        add_particle(x + hs  , y   , z + hs);
        add_particle(x + hs  , y + hs, z);
      }
    }
  }
  for (int i = 0; i < particle_number; i++) {
    p[i][X] = 0.0;
    p[i][Y] = 0.0;
    p[i][Z] = 0.0;
  }
}
//----------------------------------------------------------------------
__attribute__((noinline))
void
force_pair(void) {
  for (int k = 0; k < number_of_pairs; k++) {
    const int i = i_particles[k];
    const int j = j_particles[k];
    double dx = q[j][X] - q[i][X];
    double dy = q[j][Y] - q[i][Y];
    double dz = q[j][Z] - q[i][Z];
    double r2 = (dx * dx + dy * dy + dz * dz);
    if (r2 > CL2) continue;
    double r6 = r2 * r2 * r2;
    double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
    p[i][X] += df * dx;
    p[i][Y] += df * dy;
    p[i][Z] += df * dz;
  }
}
//----------------------------------------------------------------------
__attribute__((noinline))
void
mdacp_intrin_reactless() {
#if 1
#pragma omp parallel
  {
    const v4df vc24  = _mm256_set1_pd(24.0 * dt);
    const v4df vc48  = _mm256_set1_pd(48.0 * dt);
    const v4df vcl2  = _mm256_set1_pd(CL2);
    const v4df vzero = _mm256_setzero_pd();
    const int pn    = particle_number;

#pragma omp for nowait
    for (int i = 0; i < pn; i++) {
      const v4df vqi = _mm256_loadu_pd((double*)(q + i));
      v4df vpi = _mm256_loadu_pd((double*)(p + i));
      const int np = number_of_partners[i];
      const int kp = pointer[i];
      for (int k = 0; k < (np / 4) * 4; k += 4) {
        const int j_a = sorted_list[kp + k];
        const v4df vqj_a = _mm256_loadu_pd((double*)(q + j_a));
        v4df vdq_a = vqj_a - vqi;

        const int j_b = sorted_list[kp + k + 1];
        const v4df vqj_b = _mm256_loadu_pd((double*)(q + j_b));
        v4df vdq_b = vqj_b - vqi;

        const int j_c = sorted_list[kp + k + 2];
        const v4df vqj_c = _mm256_loadu_pd((double*)(q + j_c));
        v4df vdq_c = vqj_c - vqi;

        const int j_d = sorted_list[kp + k + 3];
        const v4df vqj_d = _mm256_loadu_pd((double*)(q + j_d));
        v4df vdq_d = vqj_d - vqi;

        v4df tmp0 = _mm256_unpacklo_pd(vdq_a, vdq_b);
        v4df tmp1 = _mm256_unpackhi_pd(vdq_a, vdq_b);
        v4df tmp2 = _mm256_unpacklo_pd(vdq_c, vdq_d);
        v4df tmp3 = _mm256_unpackhi_pd(vdq_c, vdq_d);

        v4df vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
        v4df vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
        v4df vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);

        v4df vr2 = vdx * vdx + vdy * vdy + vdz * vdz;
        v4df vr6 = vr2 * vr2 * vr2;
        v4df vdf = (vc24 * vr6 - vc48) / (vr6 * vr6 * vr2);

        v4df mask = vcl2 - vr2;
        vdf = _mm256_blendv_pd(vdf, vzero, mask);

        v4df vdf_a = _mm256_permute4x64_pd(vdf, 0);
        v4df vdf_b = _mm256_permute4x64_pd(vdf, 85);
        v4df vdf_c = _mm256_permute4x64_pd(vdf, 170);
        v4df vdf_d = _mm256_permute4x64_pd(vdf, 255);

        vpi += vdq_a * vdf_a;
        vpi += vdq_b * vdf_b;
        vpi += vdq_c * vdf_c;
        vpi += vdq_d * vdf_d;
      }
      _mm256_storeu_pd((double*)(p + i), vpi);

      double pfx = 0.0, pfy = 0.0, pfz = 0.0;
      const double qix = q[i][X], qiy = q[i][Y], qiz = q[i][Z];
      for (int k = (np / 4) * 4; k < np; k++) {
        const int j = sorted_list[kp + k];
        const double dx = q[j][X] - qix;
        const double dy = q[j][Y] - qiy;
        const double dz = q[j][Z] - qiz;
        const double r2 = (dx * dx + dy * dy + dz * dz);
        const double r6 = r2 * r2 * r2;
        double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
        if (r2 > CL2) df = 0.0;
        pfx += df * dx;
        pfy += df * dy;
        pfz += df * dz;
      }
      p[i][X] += pfx;
      p[i][Y] += pfy;
      p[i][Z] += pfz;
    }
  }

#else

#pragma omp parallel
  {
    const auto vc24  = _mm256_set1_pd(24.0 * dt);
    const auto vc48  = _mm256_set1_pd(48.0 * dt);
    const auto vcl2  = _mm256_set1_pd(CL2);
    const auto vzero = _mm256_setzero_pd();
    const auto pn    = particle_number;

#pragma omp for nowait
    for (int i = 0; i < pn; i++) {
      const auto vqi = _mm256_loadu_pd((double*)(q + i));
      auto vpi = _mm256_loadu_pd((double*)(p + i));
      const auto np = number_of_partners[i];
      const auto kp = pointer[i];
      for (int k = 0; k < (np / 4) * 4; k += 4) {
        const auto j_a = sorted_list[kp + k];
        const auto vqj_a = _mm256_loadu_pd((double*)(q + j_a));
        auto vdq_a = _mm256_sub_pd(vqj_a, vqi);

        const auto j_b = sorted_list[kp + k + 1];
        const auto vqj_b = _mm256_loadu_pd((double*)(q + j_b));
        auto vdq_b = _mm256_sub_pd(vqj_b, vqi);

        const auto j_c = sorted_list[kp + k + 2];
        const auto vqj_c = _mm256_loadu_pd((double*)(q + j_c));
        auto vdq_c = _mm256_sub_pd(vqj_c, vqi);

        const auto j_d = sorted_list[kp + k + 3];
        const auto vqj_d = _mm256_loadu_pd((double*)(q + j_d));
        auto vdq_d = _mm256_sub_pd(vqj_d, vqi);

        auto tmp0 = _mm256_unpacklo_pd(vdq_a, vdq_b);
        auto tmp1 = _mm256_unpackhi_pd(vdq_a, vdq_b);
        auto tmp2 = _mm256_unpacklo_pd(vdq_c, vdq_d);
        auto tmp3 = _mm256_unpackhi_pd(vdq_c, vdq_d);

        auto vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
        auto vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
        auto vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);

        auto vr2 = _mm256_fmadd_pd(vdz, vdz,
                                   _mm256_fmadd_pd(vdy, vdy,
                                                   _mm256_mul_pd(vdx, vdx)));
        auto vr6 = _mm256_mul_pd(_mm256_mul_pd(vr2, vr2), vr2);

        auto vdf = _mm256_div_pd(_mm256_fmsub_pd(vc24, vr6, vc48),
                                 _mm256_mul_pd(_mm256_mul_pd(vr6, vr6), vr2));

        auto mask = vcl2 - vr2;
        vdf = _mm256_blendv_pd(vdf, vzero, mask);

        auto vdf_a = _mm256_permute4x64_pd(vdf, 0);
        auto vdf_b = _mm256_permute4x64_pd(vdf, 85);
        auto vdf_c = _mm256_permute4x64_pd(vdf, 170);
        auto vdf_d = _mm256_permute4x64_pd(vdf, 255);

        vpi += vdq_a * vdf_a;
        vpi += vdq_b * vdf_b;
        vpi += vdq_c * vdf_c;
        vpi += vdq_d * vdf_d;
      }
      _mm256_storeu_pd((double*)(p + i), vpi);

      double pfx = 0.0, pfy = 0.0, pfz = 0.0;
      const double qix = q[i][X], qiy = q[i][Y], qiz = q[i][Z];
      for (int k = (np / 4) * 4; k < np; k++) {
        const int j = sorted_list[kp + k];
        const double dx = q[j][X] - qix;
        const double dy = q[j][Y] - qiy;
        const double dz = q[j][Z] - qiz;
        const double r2 = (dx * dx + dy * dy + dz * dz);
        const double r6 = r2 * r2 * r2;
        double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
        if (r2 > CL2) df = 0.0;
        pfx += df * dx;
        pfy += df * dy;
        pfz += df * dz;
      }
      p[i][X] += pfx;
      p[i][Y] += pfy;
      p[i][Z] += pfz;
    }
  }
#endif
}
//----------------------------------------------------------------------
void
loadpair(void) {
  std::ifstream ifs("pair.dat", std::ios::binary);
  ifs.read((char*)&number_of_pairs, sizeof(int));
  ifs.read((char*)number_of_partners, sizeof(int)*N);
  ifs.read((char*)i_particles, sizeof(int)*MAX_PAIRS);
  ifs.read((char*)j_particles, sizeof(int)*MAX_PAIRS);
}
//----------------------------------------------------------------------
void
savepair(void) {
  makepair();
  std::ofstream ofs("pair.dat", std::ios::binary);
  ofs.write((char*)&number_of_pairs, sizeof(int));
  ofs.write((char*)number_of_partners, sizeof(int)*N);
  ofs.write((char*)i_particles, sizeof(int)*MAX_PAIRS);
  ofs.write((char*)j_particles, sizeof(int)*MAX_PAIRS);
}
//----------------------------------------------------------------------
void
print_result(void) {
  for (int i = 0; i < 5; i++) {
    printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]);
  }
  for (int i = particle_number - 5; i < particle_number; i++) {
    printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]);
  }
}
//----------------------------------------------------------------------
int
main(void) {
  init();
  struct stat st;
  int ret = stat("pair.dat", &st);
  if (ret == 0) {
    std::cerr << "A pair-file is found. I use it." << std::endl;
    loadpair();
  } else {
    std::cerr << "Make pairlist." << std::endl;
    savepair();
  }
  std::cerr << "Number of pairs: " << number_of_pairs << std::endl;
  sortpair();
#ifdef MDACP_TEST
  const auto beg = myclock();
  for (int i = 0; i < 100; i++) {
    mdacp_intrin_reactless();
  }
  const auto dur = myclock() - beg;
  std::cout << dur << "[s]\n";
  print_result();
#else
  for (int i = 0; i < 100; i++) {
    force_pair();
  }
  print_result();
#endif
}
	#include <stdio.h>
	#include <immintrin.h>
	#include <iostream>
	#include <fstream>
	#include <random>
	#include <math.h>
	#include <assert.h>
	#include <sys/stat.h>
	#include <sys/time.h>
	const double density = 1.0;
	const int N = 400000;
	const int MAX_PAIRS = 100 * N;
	double L = 50.0;
	const double dt = 0.001;
	const int D = 4;
	enum {X, Y, Z};
	double q[N][D];
	double p[N][D];

	int particle_number = 0;
	int number_of_pairs = 0;
	int number_of_partners[N];
	int i_particles[MAX_PAIRS];
	int j_particles[MAX_PAIRS];
	int pointer[N], pointer2[N];
	int sorted_list[MAX_PAIRS];

	const double CUTOFF_LENGTH = 3.0;
	const double SEARCH_LENGTH = 3.3;
	const double CL2 = CUTOFF_LENGTH * CUTOFF_LENGTH;
	//----------------------------------------------------------------------
	typedef double v4df __attribute__((vector_size(32)));
	//----------------------------------------------------------------------
	void
	print256(v4df r) {
	double a = (double)(&r);
	printf("%.10f %.10f %.10f %.10f\n", a[0], a[1], a[2], a[3]);
	}
	//----------------------------------------------------------------------
	void
	add_particle(double x, double y, double z) {
	static std::mt19937 mt(2);
	std::uniform_real_distribution<double> ud(0.0, 0.1);
	q[particle_number][X] = x + ud(mt);
	q[particle_number][Y] = y + ud(mt);
	q[particle_number][Z] = z + ud(mt);
	particle_number++;
	}
	//----------------------------------------------------------------------
	double
	myclock(void) {
	struct timeval t;
	gettimeofday(&t, NULL);
	return t.tv_sec + t.tv_usec * 1e-6;
	}
	//----------------------------------------------------------------------
	void
	register_pair(int index1, int index2) {
	int i, j;
	if (index1 < index2) {
	i = index1;
	j = index2;
	} else {
	i = index2;
	j = index1;
	}
	i_particles[number_of_pairs] = i;
	j_particles[number_of_pairs] = j;
	number_of_partners[i]++;
	number_of_pairs++;

	i_particles[number_of_pairs] = j;
	j_particles[number_of_pairs] = i;
	number_of_partners[j]++;
	number_of_pairs++;
	}
	//----------------------------------------------------------------------
	void
	sortpair(void) {
	const int pn = particle_number;
	int pos = 0;
	pointer[0] = 0;
	for (int i = 0; i < pn - 1; i++) {
	pos += number_of_partners[i];
	pointer[i + 1] = pos;
	}
	for (int i = 0; i < pn; i++) {
	pointer2[i] = 0;
	}
	const int s = number_of_pairs;
	for (int k = 0; k < s; k++) {
	int i = i_particles[k];
	int j = j_particles[k];
	int index = pointer[i] + pointer2[i];
	sorted_list[index] = j;
	pointer2[i] ++;
	}
	}
	//----------------------------------------------------------------------
	void
	makepair(void) {
	const double SL2 = SEARCH_LENGTH * SEARCH_LENGTH;
	const int pn = particle_number;
	for (int i = 0; i < pn; i++) {
	number_of_partners[i] = 0;
	}
	for (int i = 0; i < particle_number - 1; i++) {
	for (int j = i + 1; j < particle_number; j++) {
	const double dx = q[i][X] - q[j][X];
	const double dy = q[i][Y] - q[j][Y];
	const double dz = q[i][Z] - q[j][Z];
	const double r2 = dx * dx + dy * dy + dz * dz;
	if (r2 < SL2) {
	register_pair(i, j);
	}
	}
	}
	}
	//----------------------------------------------------------------------
	void
	init(void) {
	const double s = 1.0 / pow(density * 0.25, 1.0 / 3.0);
	const double hs = s * 0.5;
	int sx = static_cast<int>(L / s);
	int sy = static_cast<int>(L / s);
	int sz = static_cast<int>(L / s);
	for (int iz = 0; iz < sz; iz++) {
	for (int iy = 0; iy < sy; iy++) {
	for (int ix = 0; ix < sx; ix++) {
	double x = ix * s;
	double y = iy * s;
	double z = iz * s;
	add_particle(x , y , z);
	add_particle(x , y + hs, z + hs);
	add_particle(x + hs , y , z + hs);
	add_particle(x + hs , y + hs, z);
	}
	}
	}
	for (int i = 0; i < particle_number; i++) {
	p[i][X] = 0.0;
	p[i][Y] = 0.0;
	p[i][Z] = 0.0;
	}
	}
	//----------------------------------------------------------------------
	__attribute__((noinline))
	void
	force_pair(void) {
	for (int k = 0; k < number_of_pairs; k++) {
	const int i = i_particles[k];
	const int j = j_particles[k];
	double dx = q[j][X] - q[i][X];
	double dy = q[j][Y] - q[i][Y];
	double dz = q[j][Z] - q[i][Z];
	double r2 = (dx * dx + dy * dy + dz * dz);
	if (r2 > CL2) continue;
	double r6 = r2 * r2 * r2;
	double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
	p[i][X] += df * dx;
	p[i][Y] += df * dy;
	p[i][Z] += df * dz;
	}
	}
	//----------------------------------------------------------------------
	__attribute__((noinline))
	void
	mdacp_intrin_reactless() {
	#if 1
	#pragma omp parallel
	{
	const v4df vc24 = _mm256_set1_pd(24.0 * dt);
	const v4df vc48 = _mm256_set1_pd(48.0 * dt);
	const v4df vcl2 = _mm256_set1_pd(CL2);
	const v4df vzero = _mm256_setzero_pd();
	const int pn = particle_number;

	#pragma omp for nowait
	for (int i = 0; i < pn; i++) {
	const v4df vqi = _mm256_loadu_pd((double*)(q + i));
	v4df vpi = _mm256_loadu_pd((double*)(p + i));
	const int np = number_of_partners[i];
	const int kp = pointer[i];
	for (int k = 0; k < (np / 4) * 4; k += 4) {
	const int j_a = sorted_list[kp + k];
	const v4df vqj_a = _mm256_loadu_pd((double*)(q + j_a));
	v4df vdq_a = vqj_a - vqi;

	const int j_b = sorted_list[kp + k + 1];
	const v4df vqj_b = _mm256_loadu_pd((double*)(q + j_b));
	v4df vdq_b = vqj_b - vqi;

	const int j_c = sorted_list[kp + k + 2];
	const v4df vqj_c = _mm256_loadu_pd((double*)(q + j_c));
	v4df vdq_c = vqj_c - vqi;

	const int j_d = sorted_list[kp + k + 3];
	const v4df vqj_d = _mm256_loadu_pd((double*)(q + j_d));
	v4df vdq_d = vqj_d - vqi;

	v4df tmp0 = _mm256_unpacklo_pd(vdq_a, vdq_b);
	v4df tmp1 = _mm256_unpackhi_pd(vdq_a, vdq_b);
	v4df tmp2 = _mm256_unpacklo_pd(vdq_c, vdq_d);
	v4df tmp3 = _mm256_unpackhi_pd(vdq_c, vdq_d);

	v4df vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
	v4df vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
	v4df vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);

	v4df vr2 = vdx * vdx + vdy * vdy + vdz * vdz;
	v4df vr6 = vr2 * vr2 * vr2;
	v4df vdf = (vc24 * vr6 - vc48) / (vr6 * vr6 * vr2);

	v4df mask = vcl2 - vr2;
	vdf = _mm256_blendv_pd(vdf, vzero, mask);

	v4df vdf_a = _mm256_permute4x64_pd(vdf, 0);
	v4df vdf_b = _mm256_permute4x64_pd(vdf, 85);
	v4df vdf_c = _mm256_permute4x64_pd(vdf, 170);
	v4df vdf_d = _mm256_permute4x64_pd(vdf, 255);

	vpi += vdq_a * vdf_a;
	vpi += vdq_b * vdf_b;
	vpi += vdq_c * vdf_c;
	vpi += vdq_d * vdf_d;
	}
	_mm256_storeu_pd((double*)(p + i), vpi);

	double pfx = 0.0, pfy = 0.0, pfz = 0.0;
	const double qix = q[i][X], qiy = q[i][Y], qiz = q[i][Z];
	for (int k = (np / 4) * 4; k < np; k++) {
	const int j = sorted_list[kp + k];
	const double dx = q[j][X] - qix;
	const double dy = q[j][Y] - qiy;
	const double dz = q[j][Z] - qiz;
	const double r2 = (dx * dx + dy * dy + dz * dz);
	const double r6 = r2 * r2 * r2;
	double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
	if (r2 > CL2) df = 0.0;
	pfx += df * dx;
	pfy += df * dy;
	pfz += df * dz;
	}
	p[i][X] += pfx;
	p[i][Y] += pfy;
	p[i][Z] += pfz;
	}
	}

	#else

	#pragma omp parallel
	{
	const auto vc24 = _mm256_set1_pd(24.0 * dt);
	const auto vc48 = _mm256_set1_pd(48.0 * dt);
	const auto vcl2 = _mm256_set1_pd(CL2);
	const auto vzero = _mm256_setzero_pd();
	const auto pn = particle_number;

	#pragma omp for nowait
	for (int i = 0; i < pn; i++) {
	const auto vqi = _mm256_loadu_pd((double*)(q + i));
	auto vpi = _mm256_loadu_pd((double*)(p + i));
	const auto np = number_of_partners[i];
	const auto kp = pointer[i];
	for (int k = 0; k < (np / 4) * 4; k += 4) {
	const auto j_a = sorted_list[kp + k];
	const auto vqj_a = _mm256_loadu_pd((double*)(q + j_a));
	auto vdq_a = _mm256_sub_pd(vqj_a, vqi);

	const auto j_b = sorted_list[kp + k + 1];
	const auto vqj_b = _mm256_loadu_pd((double*)(q + j_b));
	auto vdq_b = _mm256_sub_pd(vqj_b, vqi);

	const auto j_c = sorted_list[kp + k + 2];
	const auto vqj_c = _mm256_loadu_pd((double*)(q + j_c));
	auto vdq_c = _mm256_sub_pd(vqj_c, vqi);

	const auto j_d = sorted_list[kp + k + 3];
	const auto vqj_d = _mm256_loadu_pd((double*)(q + j_d));
	auto vdq_d = _mm256_sub_pd(vqj_d, vqi);

	auto tmp0 = _mm256_unpacklo_pd(vdq_a, vdq_b);
	auto tmp1 = _mm256_unpackhi_pd(vdq_a, vdq_b);
	auto tmp2 = _mm256_unpacklo_pd(vdq_c, vdq_d);
	auto tmp3 = _mm256_unpackhi_pd(vdq_c, vdq_d);

	auto vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
	auto vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
	auto vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);

	auto vr2 = _mm256_fmadd_pd(vdz, vdz,
	_mm256_fmadd_pd(vdy, vdy,
	_mm256_mul_pd(vdx, vdx)));
	auto vr6 = _mm256_mul_pd(_mm256_mul_pd(vr2, vr2), vr2);

	auto vdf = _mm256_div_pd(_mm256_fmsub_pd(vc24, vr6, vc48),
	_mm256_mul_pd(_mm256_mul_pd(vr6, vr6), vr2));

	auto mask = vcl2 - vr2;
	vdf = _mm256_blendv_pd(vdf, vzero, mask);

	auto vdf_a = _mm256_permute4x64_pd(vdf, 0);
	auto vdf_b = _mm256_permute4x64_pd(vdf, 85);
	auto vdf_c = _mm256_permute4x64_pd(vdf, 170);
	auto vdf_d = _mm256_permute4x64_pd(vdf, 255);

	vpi += vdq_a * vdf_a;
	vpi += vdq_b * vdf_b;
	vpi += vdq_c * vdf_c;
	vpi += vdq_d * vdf_d;
	}
	_mm256_storeu_pd((double*)(p + i), vpi);

	double pfx = 0.0, pfy = 0.0, pfz = 0.0;
	const double qix = q[i][X], qiy = q[i][Y], qiz = q[i][Z];
	for (int k = (np / 4) * 4; k < np; k++) {
	const int j = sorted_list[kp + k];
	const double dx = q[j][X] - qix;
	const double dy = q[j][Y] - qiy;
	const double dz = q[j][Z] - qiz;
	const double r2 = (dx * dx + dy * dy + dz * dz);
	const double r6 = r2 * r2 * r2;
	double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt;
	if (r2 > CL2) df = 0.0;
	pfx += df * dx;
	pfy += df * dy;
	pfz += df * dz;
	}
	p[i][X] += pfx;
	p[i][Y] += pfy;
	p[i][Z] += pfz;
	}
	}
	#endif
	}
	//----------------------------------------------------------------------
	void
	loadpair(void) {
	std::ifstream ifs("pair.dat", std::ios::binary);
	ifs.read((char*)&number_of_pairs, sizeof(int));
	ifs.read((char)number_of_partners, sizeof(int)N);
	ifs.read((char)i_particles, sizeof(int)MAX_PAIRS);
	ifs.read((char)j_particles, sizeof(int)MAX_PAIRS);
	}
	//----------------------------------------------------------------------
	void
	savepair(void) {
	makepair();
	std::ofstream ofs("pair.dat", std::ios::binary);
	ofs.write((char*)&number_of_pairs, sizeof(int));
	ofs.write((char)number_of_partners, sizeof(int)N);
	ofs.write((char)i_particles, sizeof(int)MAX_PAIRS);
	ofs.write((char)j_particles, sizeof(int)MAX_PAIRS);
	}
	//----------------------------------------------------------------------
	void
	print_result(void) {
	for (int i = 0; i < 5; i++) {
	printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]);
	}
	for (int i = particle_number - 5; i < particle_number; i++) {
	printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]);
	}
	}
	//----------------------------------------------------------------------
	int
	main(void) {
	init();
	struct stat st;
	int ret = stat("pair.dat", &st);
	if (ret == 0) {
	std::cerr << "A pair-file is found. I use it." << std::endl;
	loadpair();
	} else {
	std::cerr << "Make pairlist." << std::endl;
	savepair();
	}
	std::cerr << "Number of pairs: " << number_of_pairs << std::endl;
	sortpair();
	#ifdef MDACP_TEST
	const auto beg = myclock();
	for (int i = 0; i < 100; i++) {
	mdacp_intrin_reactless();
	}
	const auto dur = myclock() - beg;
	std::cout << dur << "[s]\n";
	print_result();
	#else
	for (int i = 0; i < 100; i++) {
	force_pair();
	}
	print_result();
	#endif
	}