foota/simple_md_gpu.cu

## simple_md_gpu.cu
////////////////////////////////////////////////////////////////
// Simple Molecular Dynamics using GPU  by nox, 2009.12.10.
//
// Perform molecular dynamics for pseudo-atoms without internal
// interactions, such as bonds, angles, dihedrals. Only vdW and
// Coulomb interactions.

#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <cutil_inline.h>
using namespace std;

const int BLOCK = 256;

__device__ float Distance(float* crd, int i, int j)
{
    float dx = crd[i*3]   - crd[j*3];
    float dy = crd[i*3+1] - crd[j*3+1];
    float dz = crd[i*3+2] - crd[j*3+2];
    return sqrtf(dx * dx + dy * dy + dz * dz);
}

__global__ void Center(float* crd, float cx, float cy, float cz, int num_atoms)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= num_atoms) return;

    crd[i*3]   -= cx;
    crd[i*3+1] -= cy;
    crd[i*3+2] -= cz;
}

// Calculate energies and forces
// Total energy = vdW + Coulomb
// vdW
//  U = eps * [(Rij / r[i])^12 - 2 * (Rij / r[i])^6]
//  F = -12 * eps / Rij * [(Rij / r[i])^13 - (Rij / r[i])^7] * r_xyz / r[i]
// Coulomb
//  U = SUM_i>j qiqj / r[i]
//  F = SUM_j qiqj / r[i]^3 * r_xyz
__global__ void Calc(float* crd, float* f, float* ene, float* chg, int num_atoms)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= num_atoms) return;

    const float eps = 0.2f;
    const float Rij = 2.5f;
    float r, rij, r12, r6, r3, r_, q, x;

    float e0, f0, f1, f2;
    e0 = f0 = f1 = f2 = 0.0f;
    float c0 = crd[i*3];
    float c1 = crd[i*3+1];
    float c2 = crd[i*3+2];
    float q0 = chg[i];
    for (int j = 0; j < num_atoms; j++) {
        if (i == j) continue;
        r = Distance(crd, i, j);
        r_ = 1.0f / r;
        q = q0 * chg[j];
        rij = Rij * r_;
        r3 = rij * rij * rij;
        r6 = r3 * r3;
        r12 = r6 * r6;
        x = ((12 * eps) * (r12 - r6) + q * r_) * r_ * r_;
        e0 += eps * (r12 - 2.0f * r6) + q * r_;
        f0 += x * (c0 - crd[j*3]);
        f1 += x * (c1 - crd[j*3+1]);
        f2 += x * (c2 - crd[j*3+2]);
    }
    ene[i] = e0;
    f[i*3] = f0;
    f[i*3+1] = f1;
    f[i*3+2] = f2;
}

template<unsigned int blockSize>
__global__ void Energy(float* ene, float* oene, unsigned int n)
{
    extern __shared__ float sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockSize * 2) + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;

    while (i < n) { sdata[tid] += ene[i] + ene[i+blockSize];  i += gridSize; }
    __syncthreads();

    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid+256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid+128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid+ 64]; } __syncthreads(); }

    if (tid < 32) {
        if (blockSize >= 64) sdata[tid] += sdata[tid+32];
        if (blockSize >= 32) sdata[tid] += sdata[tid+16];
        if (blockSize >= 16) sdata[tid] += sdata[tid+ 8];
        if (blockSize >=  8) sdata[tid] += sdata[tid+ 4];
        if (blockSize >=  4) sdata[tid] += sdata[tid+ 2];
        if (blockSize >=  2) sdata[tid] += sdata[tid+ 1];
    }

    if (tid == 0) oene[blockIdx.x] = sdata[0];
}

__global__ void Move(float* crd, float* f, int num_atoms, float cap_range)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= num_atoms) return;

    crd[i*3]   += f[i*3]   * 0.01f;
    crd[i*3+1] += f[i*3+1] * 0.01f;
    crd[i*3+2] += f[i*3+2] * 0.01f;

    float r = crd[i*3] * crd[i*3] + crd[i*3+1] * crd[i*3+1] + crd[i*3+2] * crd[i*3+2];
    float dr = r - cap_range * cap_range;
    if (dr > 0.0f) {
        f[i*3]   = -crd[i*3]   / cap_range * dr * 0.01f;
        f[i*3+1] = -crd[i*3+1] / cap_range * dr * 0.01f;
        f[i*3+2] = -crd[i*3+2] / cap_range * dr * 0.01f;
    }
}

class SimpleMD {
private:
    dim3 grid;
    dim3 threads;
    int width;
    int num_steps;
    int num_atoms;
    int num_center;
    float cap_range;
    float *h_crd, *h_f, *h_ene, *h_chg;
    float *d_crd, *d_f, *d_ene, *d_chg, *d_oene;
    float tene;

public:
    SimpleMD(int n, int nc, char*);
    ~SimpleMD();
    void ReadCrd(char*);
    void PrintCrd();
    void CenterPosition();
    unsigned int NextPow2(unsigned int x);
    void GetBlocksAndThreads(int n, int& blocks, int& threads);
    void TotalEnergyReduce(int threads, int blocks);
    void TotalEnergy();
    void Run();
};

SimpleMD::SimpleMD(int n, int nc, char* fname) : num_steps(n), num_center(nc)
{
    ReadCrd(fname);

    h_f = new float[num_atoms * 3];
    fill(h_f, h_f + num_atoms * 3, 0.0f);
    h_ene = new float[NextPow2(num_atoms)];
    fill(h_ene, h_ene + NextPow2(num_atoms), 0.0f);

    width = (num_atoms - 1) / BLOCK + 1;
    grid.x = width;
    grid.y = 1;
    grid.z = 1;
    threads.x = BLOCK;
    threads.y = 1;
    threads.z = 1;

    cudaMalloc((void**)&d_crd, sizeof(float) * num_atoms * 3);
    cudaMalloc((void**)&d_f, sizeof(float) * num_atoms * 3);
    cudaMalloc((void**)&d_oene, sizeof(float) * num_atoms);
    cudaMalloc((void**)&d_ene, sizeof(float) * NextPow2(num_atoms));
    cudaMalloc((void**)&d_chg, sizeof(float) * num_atoms);
}

SimpleMD::~SimpleMD()
{
    cudaFree(d_chg);
    cudaFree(d_ene);
    cudaFree(d_f);
    cudaFree(d_crd);

    delete[] h_ene;
    delete[] h_f;
    delete[] h_chg;
    delete[] h_crd;
}

void SimpleMD::ReadCrd(char* fname)
{
    fstream fs(fname, ios_base::in);
    string line;
    stringstream ss;

    if (!fs.is_open()) {
        cerr << "File open error: " << fname << endl;
        exit(1);
    }

    getline(fs, line);  cout << line << endl;
    getline(fs, line);  ss.str(line);  ss >> num_atoms;  ss.clear();
    getline(fs, line);  ss.str(line);  ss >> cap_range;  ss.clear();

    h_crd = new float[num_atoms * 3];
    h_chg = new float[num_atoms];

    for (int i = 0; i < num_atoms; i++) {
        getline(fs, line);
        ss.str(line);
        ss >> h_crd[i*3] >> h_crd[i*3+1] >> h_crd[i*3+2] >> h_chg[i];
        ss.clear();
        ss.str("");
    }
    fs.close();
}

void SimpleMD::PrintCrd()
{
    cout << endl;
    cout << num_atoms << endl;
    cout << cap_range << endl;
    for (int i = 0; i < num_atoms; i++) {
        for (int j = 0; j < 3; j++) cout << " " << fixed << setw(10) << setprecision(6) << h_crd[i*3+j];
        cout << " " << fixed << setw(8) << setprecision(4) << h_chg[i];
        cout << endl;
    }
}

void SimpleMD::CenterPosition()
{
    float d[3];
    cudaMemcpy(h_crd, d_crd, sizeof(float) * num_atoms * 3, cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_atoms; i++)
        for (int j = 0; j < 3; j++) d[j] += h_crd[i*3+j];
    for (int i = 0; i < 3; i++) d[i] /= num_atoms;
    cudaMemcpy(d_crd, h_crd, sizeof(float) * num_atoms * 3, cudaMemcpyHostToDevice);
    Center<<<grid, threads>>>(d_crd, d[0], d[1], d[2], num_atoms);
    cudaThreadSynchronize();
}

unsigned int SimpleMD::NextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

void SimpleMD::GetBlocksAndThreads(int n, int& blocks, int& threads)
{
    threads = (n < BLOCK * 2) ? NextPow2((n + 1) / 2) : BLOCK;
    blocks = (n + (threads * 2 - 1)) / (threads * 2);
    blocks = min(width, blocks);
}

void SimpleMD::TotalEnergyReduce(int threads, int blocks)
{
    switch (threads) {
    case 512:
        Energy<512><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    case 256:
        Energy<256><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    case 128:
        Energy<128><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    case 64:
        Energy< 64><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    case 32:
        Energy< 32><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    case 16:
        Energy< 16><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    case 8:
        Energy<  8><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    case 4:
        Energy<  4><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    case 2:
        Energy<  2><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    case 1:
        Energy<  1><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms);  break;
    }
}

void SimpleMD::TotalEnergy()
{
    if (num_atoms > 1024) {
        int th, bl;
        GetBlocksAndThreads(num_atoms, bl, th);
        int s = bl;
        while (s > 1) {
            GetBlocksAndThreads(s, bl, th);
            TotalEnergyReduce(th, bl);
            s = (s + (th * 2 - 1)) / (th * 2);
        }
        cudaThreadSynchronize();
        cudaMemcpy(&tene, d_oene, sizeof(float), cudaMemcpyDeviceToHost);
        tene /= 2.0f;
    } else {
        cudaMemcpy(h_ene, d_ene, sizeof(float) * num_atoms, cudaMemcpyDeviceToHost);
        tene = accumulate(h_ene, h_ene + num_atoms, 0.0f) / 2.0f;
    }
}

void SimpleMD::Run()
{
    cudaMemcpy(d_crd, h_crd, sizeof(float) * num_atoms * 3, cudaMemcpyHostToDevice);
    cudaMemcpy(d_f, h_f, sizeof(float) * num_atoms * 3, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ene, h_ene, sizeof(float) * NextPow2(num_atoms), cudaMemcpyHostToDevice);
    cudaMemcpy(d_chg, h_chg, sizeof(float) * num_atoms, cudaMemcpyHostToDevice);

    for (int i = 0; i < num_steps; i++) {
        if (i % num_center == 0) CenterPosition();
        Calc<<<grid, threads>>>(d_crd, d_f, d_ene, d_chg, num_atoms);
        cudaThreadSynchronize();
        TotalEnergy();
        Move<<<grid, threads>>>(d_crd, d_f, num_atoms, cap_range);
        cudaThreadSynchronize();
        cout << "Energy (" << setw(7) << i + 1 << "): ";
        cout << fixed << setw(15) << setprecision(5) << tene << endl;
    }

    cudaMemcpy(h_crd, d_crd, sizeof(float) * num_atoms * 3, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_f, d_f, sizeof(float) * num_atoms * 3, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_ene, d_ene, sizeof(float) * num_atoms, cudaMemcpyDeviceToHost);
}

int main(int argc, char** argv)
{
    if (argc != 3) {
        cerr << "Usage: " << argv[0] << " input_file number_of_steps" << endl;
        cerr << "Input: line 1 : title" << endl;
        cerr << "       line 2 : number of atoms" << endl;
        cerr << "       line 3 : radius of droplet" << endl;
        cerr << "       line 4-: x-crd  y-crd  z-crd  charge" << endl;
        exit(1);
    }

    stringstream ss;
    int nstep;

    cutilDeviceInit(1, argv);

    ss.str(argv[2]);  ss >> nstep;
    int ncenter = 100;
    SimpleMD md(nstep, ncenter, argv[1]);

    md.PrintCrd();
    md.Run();
    md.PrintCrd();

    return 0;
}
	////////////////////////////////////////////////////////////////
	// Simple Molecular Dynamics using GPU by nox, 2009.12.10.
	//
	// Perform molecular dynamics for pseudo-atoms without internal
	// interactions, such as bonds, angles, dihedrals. Only vdW and
	// Coulomb interactions.

	#include <iostream>
	#include <iomanip>
	#include <fstream>
	#include <sstream>
	#include <algorithm>
	#include <numeric>
	#include <cmath>
	#include <cutil_inline.h>
	using namespace std;

	const int BLOCK = 256;

	__device__ float Distance(float* crd, int i, int j)
	{
	float dx = crd[i3] - crd[j3];
	float dy = crd[i3+1] - crd[j3+1];
	float dz = crd[i3+2] - crd[j3+2];
	return sqrtf(dx * dx + dy * dy + dz * dz);
	}

	__global__ void Center(float* crd, float cx, float cy, float cz, int num_atoms)
	{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= num_atoms) return;

	crd[i*3] -= cx;
	crd[i*3+1] -= cy;
	crd[i*3+2] -= cz;
	}

	// Calculate energies and forces
	// Total energy = vdW + Coulomb
	// vdW
	// U = eps * [(Rij / r[i])^12 - 2 * (Rij / r[i])^6]
	// F = -12 * eps / Rij * [(Rij / r[i])^13 - (Rij / r[i])^7] * r_xyz / r[i]
	// Coulomb
	// U = SUM_i>j qiqj / r[i]
	// F = SUM_j qiqj / r[i]^3 * r_xyz
	__global__ void Calc(float* crd, float* f, float* ene, float* chg, int num_atoms)
	{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= num_atoms) return;

	const float eps = 0.2f;
	const float Rij = 2.5f;
	float r, rij, r12, r6, r3, r_, q, x;

	float e0, f0, f1, f2;
	e0 = f0 = f1 = f2 = 0.0f;
	float c0 = crd[i*3];
	float c1 = crd[i*3+1];
	float c2 = crd[i*3+2];
	float q0 = chg[i];
	for (int j = 0; j < num_atoms; j++) {
	if (i == j) continue;
	r = Distance(crd, i, j);
	r_ = 1.0f / r;
	q = q0 * chg[j];
	rij = Rij * r_;
	r3 = rij * rij * rij;
	r6 = r3 * r3;
	r12 = r6 * r6;
	x = ((12 * eps) * (r12 - r6) + q * r_) * r_ * r_;
	e0 += eps * (r12 - 2.0f * r6) + q * r_;
	f0 += x * (c0 - crd[j*3]);
	f1 += x * (c1 - crd[j*3+1]);
	f2 += x * (c2 - crd[j*3+2]);
	}
	ene[i] = e0;
	f[i*3] = f0;
	f[i*3+1] = f1;
	f[i*3+2] = f2;
	}

	template<unsigned int blockSize>
	__global__ void Energy(float* ene, float* oene, unsigned int n)
	{
	extern __shared__ float sdata[];

	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x * (blockSize * 2) + tid;
	unsigned int gridSize = blockSize * 2 * gridDim.x;
	sdata[tid] = 0;

	while (i < n) { sdata[tid] += ene[i] + ene[i+blockSize]; i += gridSize; }
	__syncthreads();

	if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid+256]; } __syncthreads(); }
	if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid+128]; } __syncthreads(); }
	if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid+ 64]; } __syncthreads(); }

	if (tid < 32) {
	if (blockSize >= 64) sdata[tid] += sdata[tid+32];
	if (blockSize >= 32) sdata[tid] += sdata[tid+16];
	if (blockSize >= 16) sdata[tid] += sdata[tid+ 8];
	if (blockSize >= 8) sdata[tid] += sdata[tid+ 4];
	if (blockSize >= 4) sdata[tid] += sdata[tid+ 2];
	if (blockSize >= 2) sdata[tid] += sdata[tid+ 1];
	}

	if (tid == 0) oene[blockIdx.x] = sdata[0];
	}

	__global__ void Move(float* crd, float* f, int num_atoms, float cap_range)
	{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= num_atoms) return;

	crd[i3] += f[i3] * 0.01f;
	crd[i3+1] += f[i3+1] * 0.01f;
	crd[i3+2] += f[i3+2] * 0.01f;

	float r = crd[i3] crd[i3] + crd[i3+1] * crd[i3+1] + crd[i3+2] * crd[i*3+2];
	float dr = r - cap_range * cap_range;
	if (dr > 0.0f) {
	f[i3] = -crd[i3] / cap_range * dr * 0.01f;
	f[i3+1] = -crd[i3+1] / cap_range * dr * 0.01f;
	f[i3+2] = -crd[i3+2] / cap_range * dr * 0.01f;
	}
	}

	class SimpleMD {
	private:
	dim3 grid;
	dim3 threads;
	int width;
	int num_steps;
	int num_atoms;
	int num_center;
	float cap_range;
	float h_crd, h_f, h_ene, h_chg;
	float d_crd, d_f, d_ene, d_chg, *d_oene;
	float tene;

	public:
	SimpleMD(int n, int nc, char*);
	~SimpleMD();
	void ReadCrd(char*);
	void PrintCrd();
	void CenterPosition();
	unsigned int NextPow2(unsigned int x);
	void GetBlocksAndThreads(int n, int& blocks, int& threads);
	void TotalEnergyReduce(int threads, int blocks);
	void TotalEnergy();
	void Run();
	};

	SimpleMD::SimpleMD(int n, int nc, char* fname) : num_steps(n), num_center(nc)
	{
	ReadCrd(fname);

	h_f = new float[num_atoms * 3];
	fill(h_f, h_f + num_atoms * 3, 0.0f);
	h_ene = new float[NextPow2(num_atoms)];
	fill(h_ene, h_ene + NextPow2(num_atoms), 0.0f);

	width = (num_atoms - 1) / BLOCK + 1;
	grid.x = width;
	grid.y = 1;
	grid.z = 1;
	threads.x = BLOCK;
	threads.y = 1;
	threads.z = 1;

	cudaMalloc((void*)&d_crd, sizeof(float) num_atoms * 3);
	cudaMalloc((void*)&d_f, sizeof(float) num_atoms * 3);
	cudaMalloc((void*)&d_oene, sizeof(float) num_atoms);
	cudaMalloc((void*)&d_ene, sizeof(float) NextPow2(num_atoms));
	cudaMalloc((void*)&d_chg, sizeof(float) num_atoms);
	}

	SimpleMD::~SimpleMD()
	{
	cudaFree(d_chg);
	cudaFree(d_ene);
	cudaFree(d_f);
	cudaFree(d_crd);

	delete[] h_ene;
	delete[] h_f;
	delete[] h_chg;
	delete[] h_crd;
	}

	void SimpleMD::ReadCrd(char* fname)
	{
	fstream fs(fname, ios_base::in);
	string line;
	stringstream ss;

	if (!fs.is_open()) {
	cerr << "File open error: " << fname << endl;
	exit(1);
	}

	getline(fs, line); cout << line << endl;
	getline(fs, line); ss.str(line); ss >> num_atoms; ss.clear();
	getline(fs, line); ss.str(line); ss >> cap_range; ss.clear();

	h_crd = new float[num_atoms * 3];
	h_chg = new float[num_atoms];

	for (int i = 0; i < num_atoms; i++) {
	getline(fs, line);
	ss.str(line);
	ss >> h_crd[i3] >> h_crd[i3+1] >> h_crd[i*3+2] >> h_chg[i];
	ss.clear();
	ss.str("");
	}
	fs.close();
	}

	void SimpleMD::PrintCrd()
	{
	cout << endl;
	cout << num_atoms << endl;
	cout << cap_range << endl;
	for (int i = 0; i < num_atoms; i++) {
	for (int j = 0; j < 3; j++) cout << " " << fixed << setw(10) << setprecision(6) << h_crd[i*3+j];
	cout << " " << fixed << setw(8) << setprecision(4) << h_chg[i];
	cout << endl;
	}
	}

	void SimpleMD::CenterPosition()
	{
	float d[3];
	cudaMemcpy(h_crd, d_crd, sizeof(float) * num_atoms * 3, cudaMemcpyDeviceToHost);
	for (int i = 0; i < num_atoms; i++)
	for (int j = 0; j < 3; j++) d[j] += h_crd[i*3+j];
	for (int i = 0; i < 3; i++) d[i] /= num_atoms;
	cudaMemcpy(d_crd, h_crd, sizeof(float) * num_atoms * 3, cudaMemcpyHostToDevice);
	Center<<<grid, threads>>>(d_crd, d[0], d[1], d[2], num_atoms);
	cudaThreadSynchronize();
	}

	unsigned int SimpleMD::NextPow2(unsigned int x)
	{
	--x;
	x \|= x >> 1;
	x \|= x >> 2;
	x \|= x >> 4;
	x \|= x >> 8;
	x \|= x >> 16;
	return ++x;
	}

	void SimpleMD::GetBlocksAndThreads(int n, int& blocks, int& threads)
	{
	threads = (n < BLOCK * 2) ? NextPow2((n + 1) / 2) : BLOCK;
	blocks = (n + (threads * 2 - 1)) / (threads * 2);
	blocks = min(width, blocks);
	}

	void SimpleMD::TotalEnergyReduce(int threads, int blocks)
	{
	switch (threads) {
	case 512:
	Energy<512><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	case 256:
	Energy<256><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	case 128:
	Energy<128><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	case 64:
	Energy< 64><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	case 32:
	Energy< 32><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	case 16:
	Energy< 16><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	case 8:
	Energy< 8><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	case 4:
	Energy< 4><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	case 2:
	Energy< 2><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	case 1:
	Energy< 1><<<blocks, threads, sizeof(float) * threads>>>(d_ene, d_oene, num_atoms); break;
	}
	}

	void SimpleMD::TotalEnergy()
	{
	if (num_atoms > 1024) {
	int th, bl;
	GetBlocksAndThreads(num_atoms, bl, th);
	int s = bl;
	while (s > 1) {
	GetBlocksAndThreads(s, bl, th);
	TotalEnergyReduce(th, bl);
	s = (s + (th * 2 - 1)) / (th * 2);
	}
	cudaThreadSynchronize();
	cudaMemcpy(&tene, d_oene, sizeof(float), cudaMemcpyDeviceToHost);
	tene /= 2.0f;
	} else {
	cudaMemcpy(h_ene, d_ene, sizeof(float) * num_atoms, cudaMemcpyDeviceToHost);
	tene = accumulate(h_ene, h_ene + num_atoms, 0.0f) / 2.0f;
	}
	}

	void SimpleMD::Run()
	{
	cudaMemcpy(d_crd, h_crd, sizeof(float) * num_atoms * 3, cudaMemcpyHostToDevice);
	cudaMemcpy(d_f, h_f, sizeof(float) * num_atoms * 3, cudaMemcpyHostToDevice);
	cudaMemcpy(d_ene, h_ene, sizeof(float) * NextPow2(num_atoms), cudaMemcpyHostToDevice);
	cudaMemcpy(d_chg, h_chg, sizeof(float) * num_atoms, cudaMemcpyHostToDevice);

	for (int i = 0; i < num_steps; i++) {
	if (i % num_center == 0) CenterPosition();
	Calc<<<grid, threads>>>(d_crd, d_f, d_ene, d_chg, num_atoms);
	cudaThreadSynchronize();
	TotalEnergy();
	Move<<<grid, threads>>>(d_crd, d_f, num_atoms, cap_range);
	cudaThreadSynchronize();
	cout << "Energy (" << setw(7) << i + 1 << "): ";
	cout << fixed << setw(15) << setprecision(5) << tene << endl;
	}

	cudaMemcpy(h_crd, d_crd, sizeof(float) * num_atoms * 3, cudaMemcpyDeviceToHost);
	cudaMemcpy(h_f, d_f, sizeof(float) * num_atoms * 3, cudaMemcpyDeviceToHost);
	cudaMemcpy(h_ene, d_ene, sizeof(float) * num_atoms, cudaMemcpyDeviceToHost);
	}

	int main(int argc, char** argv)
	{
	if (argc != 3) {
	cerr << "Usage: " << argv[0] << " input_file number_of_steps" << endl;
	cerr << "Input: line 1 : title" << endl;
	cerr << " line 2 : number of atoms" << endl;
	cerr << " line 3 : radius of droplet" << endl;
	cerr << " line 4-: x-crd y-crd z-crd charge" << endl;
	exit(1);
	}

	stringstream ss;
	int nstep;

	cutilDeviceInit(1, argv);

	ss.str(argv[2]); ss >> nstep;
	int ncenter = 100;
	SimpleMD md(nstep, ncenter, argv[1]);

	md.PrintCrd();
	md.Run();
	md.PrintCrd();

	return 0;
	}