Created
October 29, 2013 11:06
-
-
Save saifthe1/7212656 to your computer and use it in GitHub Desktop.
Force calculation function containing additional modification for virial calculation and facilitates invocation using preprocessor flags
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifdef SUPPORTS_64_BIT_ATOMICS | |
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable | |
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable | |
#endif | |
#define TILE_SIZE 32 | |
// Cannot use float3 as OpenCL defines it to be 4 DWORD aligned. This would | |
// cause every element of array to have DWORD of padding to make it 4 DWORD | |
// aligned which wastes space and causes LDS bank conflicts as stride is no | |
// longer odd DWORDS. | |
typedef struct { | |
float x, y, z; | |
} UnalignedFloat3; | |
/** | |
* Through the code CALCULATE_VIRIAL preprocessor directive will be used | |
* to check if virial calculation functionality is included. | |
*/ | |
typedef struct { | |
float x, y, z; | |
float q; | |
float fx, fy, fz; | |
/** | |
* The 9 float elements below are added additionally to | |
* to accomodate virial calculation functionality | |
* by saif mulla | |
*/ | |
#ifdef CALCULATE_VIRIAL | |
float rf0,rf1,rf2,rf3,rf4,rf5,rf6,rf7,rf8; // virial tensor elements | |
#endif | |
ATOM_PARAMETER_DATA | |
#ifndef PARAMETER_SIZE_IS_EVEN | |
float padding; | |
#endif | |
} AtomData; | |
/** | |
* Compute nonbonded interactions. | |
*/ | |
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1))) | |
void computeNonbonded( | |
#ifdef SUPPORTS_64_BIT_ATOMICS | |
__global long* restrict forceBuffers, | |
#else | |
__global float4* restrict forceBuffers, | |
#endif | |
__global float* restrict energyBuffer, | |
__global const float4* restrict posq, | |
__global const unsigned int* restrict exclusions, | |
__global const unsigned int* restrict exclusionIndices, | |
__global const unsigned int* restrict exclusionRowIndices, | |
unsigned int startTileIndex, unsigned int endTileIndex, | |
#ifdef USE_CUTOFF | |
__global const ushort2* restrict tiles, | |
__global const unsigned int* restrict interactionCount, | |
float4 periodicBoxSize, float4 invPeriodicBoxSize, | |
unsigned int maxTiles, | |
__global const unsigned int* restrict interactionFlags | |
#else | |
unsigned int numTiles | |
#endif | |
#ifdef CALCULATE_VIRIAL | |
#ifdef SUPPORTS_64_BIT_ATOMICS | |
,__global long* restrict virialBuffers1, | |
__global long* restrict virialBuffers2, | |
__global long* restrict virialBuffers3, | |
#else | |
,__global float4* restrict virialBuffers1, | |
__global float4* restrict virialBuffers2, | |
__global float4* restrict virialBuffers3, | |
#endif | |
__global float4* restrict moleculeCentresOfMass, | |
__global int* restrict atomInMolecule | |
#endif | |
PARAMETER_ARGUMENTS) { | |
#ifdef USE_CUTOFF | |
unsigned int numTiles = interactionCount[0]; | |
unsigned int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*(endTileIndex-startTileIndex)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0)); | |
unsigned int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*(endTileIndex-startTileIndex)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0)); | |
#else | |
unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0); | |
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0); | |
#endif | |
float energy = 0.0f; | |
unsigned int lasty = 0xFFFFFFFF; | |
__local AtomData localData[TILE_SIZE]; | |
__local UnalignedFloat3 localForce[FORCE_WORK_GROUP_SIZE]; | |
#ifdef CALCULATE_VIRIAL | |
__local UnalignedFloat3 localVirial1[FORCE_WORK_GROUP_SIZE]; | |
__local UnalignedFloat3 localVirial2[FORCE_WORK_GROUP_SIZE]; | |
__local UnalignedFloat3 localVirial3[FORCE_WORK_GROUP_SIZE]; | |
#endif | |
#ifdef USE_EXCLUSIONS | |
__local unsigned int exclusionRange[2]; | |
__local int exclusionIndex[1]; | |
#endif | |
while (pos < end) { | |
// Extract the coordinates of this tile | |
unsigned int x, y; | |
#ifdef USE_CUTOFF | |
if (numTiles <= maxTiles) { | |
ushort2 tileIndices = tiles[pos]; | |
x = tileIndices.x; | |
y = tileIndices.y; | |
} | |
else | |
#endif | |
{ | |
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); | |
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); | |
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. | |
y += (x < y ? -1 : 1); | |
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); | |
} | |
} | |
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); | |
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); | |
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1); | |
unsigned int atom1 = x*TILE_SIZE + tgx; | |
float4 force = 0.0f; | |
#ifdef CALCULATE_VIRIAL | |
float4 virial1 = (float4) 0.0f; | |
float4 virial2 = (float4) 0.0f; | |
float4 virial3 = (float4) 0.0f; | |
#endif | |
float4 posq1 = posq[atom1]; | |
LOAD_ATOM1_PARAMETERS | |
// Locate the exclusion data for this tile. | |
#ifdef USE_EXCLUSIONS | |
if (get_local_id(0) < 2) | |
exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)]; | |
if (get_local_id(0) == 0) | |
exclusionIndex[0] = -1; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int i = exclusionRange[0]+get_local_id(0); i < exclusionRange[1]; i += FORCE_WORK_GROUP_SIZE) | |
if (exclusionIndices[i] == y) | |
exclusionIndex[0] = i*TILE_SIZE; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
bool hasExclusions = (exclusionIndex[0] > -1); | |
#endif | |
if (x == y) { | |
// This tile is on the diagonal. | |
if (get_local_id(0) < TILE_SIZE) { | |
const unsigned int localAtomIndex = tgx; | |
localData[localAtomIndex].x = posq1.x; | |
localData[localAtomIndex].y = posq1.y; | |
localData[localAtomIndex].z = posq1.z; | |
localData[localAtomIndex].q = posq1.w; | |
LOAD_LOCAL_PARAMETERS_FROM_1 | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
#ifdef USE_EXCLUSIONS | |
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom; | |
#endif | |
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { | |
#ifdef USE_EXCLUSIONS | |
bool isExcluded = !(excl & 0x1); | |
#endif | |
unsigned int atom2 = baseLocalAtom+j; | |
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); | |
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); | |
#ifdef USE_PERIODIC | |
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; | |
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; | |
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; | |
#endif | |
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; | |
float invR = RSQRT(r2); | |
float r = RECIP(invR); | |
LOAD_ATOM2_PARAMETERS | |
atom2 = y*TILE_SIZE+baseLocalAtom+j; | |
#ifdef USE_SYMMETRIC | |
float dEdR = 0.0f; | |
#else | |
float4 dEdR1 = (float4) 0.0f; | |
float4 dEdR2 = (float4) 0.0f; | |
#endif | |
#ifdef CALCULATE_VIRIAL | |
float4 forceContribution = (float4) 0.0f; | |
#endif | |
float tempEnergy = 0.0f; | |
COMPUTE_INTERACTION | |
energy += 0.5f*tempEnergy; | |
#ifdef USE_SYMMETRIC | |
#ifdef CALCULATE_VIRIAL | |
forceContribution.xyz = delta.xyz*dEdR; | |
force.xyz -= forceContribution.xyz; | |
#else | |
force.xyz -= delta.xyz*dEdR; | |
#endif | |
#else | |
#ifdef CALCULATE_VIRIAL | |
forceContribution.xyz = dEdR1.xyz; | |
force.xyz -= dEdR1.xyz; | |
#else | |
force.xyz -= dEdR1.xyz; | |
#endif | |
#endif | |
#ifdef CALCULATE_VIRIAL | |
if(!isExcluded) | |
{ | |
virial1 += (float4) (forceContribution.x*delta.x,forceContribution.x*delta.y,forceContribution.x*delta.z,0.0); | |
virial2 += (float4) (forceContribution.y*delta.x,forceContribution.y*delta.y,forceContribution.y*delta.z,0.0); | |
virial3 += (float4) (forceContribution.z*delta.x,forceContribution.z*delta.y,forceContribution.z*delta.z,0.0); | |
} | |
#endif | |
excl >>= 1; | |
} | |
// Sum the forces and write results. | |
if (get_local_id(0) >= TILE_SIZE) { | |
localData[tgx].fx = force.x; | |
localData[tgx].fy = force.y; | |
localData[tgx].fz = force.z; | |
#ifdef CALCULATE_VIRIAL | |
localData[tgx].rf0 = virial1.x; | |
localData[tgx].rf1 = virial1.y; | |
localData[tgx].rf2 = virial1.z; | |
localData[tgx].rf3 = virial2.x; | |
localData[tgx].rf4 = virial2.y; | |
localData[tgx].rf5 = virial2.z; | |
localData[tgx].rf6 = virial3.x; | |
localData[tgx].rf7 = virial3.y; | |
localData[tgx].rf8 = virial3.z; | |
#endif | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (get_local_id(0) < TILE_SIZE) { | |
#ifdef SUPPORTS_64_BIT_ATOMICS | |
const unsigned int offset = x*TILE_SIZE + tgx; | |
atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].fx)*0xFFFFFFFF)); | |
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0xFFFFFFFF)); | |
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0xFFFFFFFF)); | |
#ifdef CALCULATE_VIRIAL | |
atom_add(&virialBuffers1[offset], (long) ((virial1.x + localData[tgx].rf0)*0xFFFFFFFF)); | |
atom_add(&virialBuffers1[offset+PADDED_NUM_ATOMS], (long) ((virial1.y + localData[tgx].rf1)*0xFFFFFFFF)); | |
atom_add(&virialBuffers1[offset+2*PADDED_NUM_ATOMS],(long) ((virial1.z + localData[tgx].rf2)*0xFFFFFFFF)); | |
atom_add(&virialBuffers2[offset], (long) ((virial2.x + localData[tgx].rf3)*0xFFFFFFFF)); | |
atom_add(&virialBuffers2[offset+PADDED_NUM_ATOMS], (long) ((virial2.y + localData[tgx].rf4)*0xFFFFFFFF)); | |
atom_add(&virialBuffers2[offset+2*PADDED_NUM_ATOMS],(long) ((virial2.z + localData[tgx].rf5)*0xFFFFFFFF)); | |
atom_add(&virialBuffers3[offset], (long) ((virial3.x + localData[tgx].rf6)*0xFFFFFFFF)); | |
atom_add(&virialBuffers3[offset+PADDED_NUM_ATOMS], (long) ((virial3.y + localData[tgx].rf7)*0xFFFFFFFF)); | |
atom_add(&virialBuffers3[offset+2*PADDED_NUM_ATOMS],(long) ((virial3.z + localData[tgx].rf8)*0xFFFFFFFF)); | |
#endif | |
#else | |
force.x += localData[tgx].fx; | |
force.y += localData[tgx].fy; | |
force.z += localData[tgx].fz; | |
#ifdef CALCULATE_VIRIAL | |
virial1.x += localData[tgx].rf0; | |
virial1.y += localData[tgx].rf1; | |
virial1.z += localData[tgx].rf2; | |
virial2.x += localData[tgx].rf3; | |
virial2.y += localData[tgx].rf4; | |
virial2.z += localData[tgx].rf5; | |
virial3.x += localData[tgx].rf6; | |
virial3.y += localData[tgx].rf7; | |
virial3.z += localData[tgx].rf8; | |
#endif | |
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK | |
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; | |
#else | |
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; | |
#endif | |
// Cheaper to load/store float4 than float3. | |
float4 sum = forceBuffers[offset]; | |
sum.xyz += force.xyz; | |
forceBuffers[offset] = sum; | |
#ifdef CALCULATE_VIRIAL | |
sum = virialBuffers1[offset]; | |
sum.xyz += virial1.xyz; | |
virialBuffers1[offset] = sum; | |
sum = virialBuffers2[offset]; | |
sum.xyz += virial2.xyz; | |
virialBuffers2[offset] = sum; | |
sum = virialBuffers3[offset]; | |
sum.xyz += virial3.xyz; | |
virialBuffers3[offset] = sum; | |
#endif | |
#endif | |
} | |
// barrier not required here as localData[*].temp is not accessed before encountering another barrier. | |
} | |
else { | |
// This is an off-diagonal tile. | |
if (lasty != y && get_local_id(0) < TILE_SIZE) { | |
const unsigned int localAtomIndex = tgx; | |
unsigned int j = y*TILE_SIZE + tgx; | |
float4 tempPosq = posq[j]; | |
localData[localAtomIndex].x = tempPosq.x; | |
localData[localAtomIndex].y = tempPosq.y; | |
localData[localAtomIndex].z = tempPosq.z; | |
localData[localAtomIndex].q = tempPosq.w; | |
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL | |
} | |
localForce[get_local_id(0)].x = 0.0f; | |
localForce[get_local_id(0)].y = 0.0f; | |
localForce[get_local_id(0)].z = 0.0f; | |
#ifdef CALCULATE_VIRIAL | |
localVirial1[get_local_id(0)].x = 0.0f; | |
localVirial1[get_local_id(0)].y = 0.0f; | |
localVirial1[get_local_id(0)].z = 0.0f; | |
localVirial2[get_local_id(0)].x = 0.0f; | |
localVirial2[get_local_id(0)].y = 0.0f; | |
localVirial2[get_local_id(0)].z = 0.0f; | |
localVirial3[get_local_id(0)].x = 0.0f; | |
localVirial3[get_local_id(0)].y = 0.0f; | |
localVirial3[get_local_id(0)].z = 0.0f; | |
#endif | |
barrier(CLK_LOCAL_MEM_FENCE); | |
// Compute the full set of interactions in this tile. | |
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1); | |
#ifdef USE_EXCLUSIONS | |
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF); | |
excl = (excl >> tj) | (excl << (TILE_SIZE - tj)); | |
#endif | |
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { | |
#ifdef USE_EXCLUSIONS | |
bool isExcluded = !(excl & 0x1); | |
#endif | |
float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q); | |
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); | |
#ifdef USE_PERIODIC | |
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; | |
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; | |
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; | |
#endif | |
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; | |
float invR = RSQRT(r2); | |
float r = RECIP(invR); | |
int atom2 = tj; | |
LOAD_ATOM2_PARAMETERS | |
atom2 = y*TILE_SIZE+tj; | |
#ifdef USE_SYMMETRIC | |
float dEdR = 0.0f; | |
#else | |
float4 dEdR1 = (float4) 0.0f; | |
float4 dEdR2 = (float4) 0.0f; | |
#endif | |
#ifdef CALCULATE_VIRIAL | |
float4 forceContribution = (float4) 0.0f;; | |
float4 virialContribution1 = (float4) 0.0f; | |
float4 virialContribution2 = (float4) 0.0f; | |
float4 virialContribution3 = (float4) 0.0f; | |
#endif | |
float tempEnergy = 0.0f; | |
COMPUTE_INTERACTION | |
energy += tempEnergy; | |
#ifdef USE_SYMMETRIC | |
#ifdef CALCULATE_VIRIAL | |
forceContribution.xyz = delta.xyz*dEdR; | |
force.xyz -= forceContribution.xyz; | |
localForce[tj+localForceOffset].x += forceContribution.x; | |
localForce[tj+localForceOffset].y += forceContribution.y; | |
localForce[tj+localForceOffset].z += forceContribution.z; | |
#else | |
delta.xyz *= dEdR; | |
force.xyz -= delta.xyz; | |
localForce[tj+localForceOffset].x += delta.x; | |
localForce[tj+localForceOffset].y += delta.y; | |
localForce[tj+localForceOffset].z += delta.z; | |
#endif | |
#else | |
#ifdef CALCULATE_VIRIAL | |
forceContribution.xyz = dEdR1.xyz; | |
force.xyz -= dEdR1.xyz; | |
localForce[tj+localForceOffset].x += dEdR2.x; | |
localForce[tj+localForceOffset].y += dEdR2.y; | |
localForce[tj+localForceOffset].z += dEdR2.z; | |
#else | |
force.xyz -= dEdR1.xyz; | |
localForce[tj+localForceOffset].x += dEdR2.x; | |
localForce[tj+localForceOffset].y += dEdR2.y; | |
localForce[tj+localForceOffset].z += dEdR2.z; | |
#endif | |
#endif | |
barrier(CLK_LOCAL_MEM_FENCE); | |
#ifdef CALCULATE_VIRIAL | |
// This is an atomistic simulation. Use a simplified virial equation to avoid unnecessary roundoff errors: | |
if(!isExcluded) | |
{ | |
virialContribution1 = (float4) (forceContribution.x*delta.x,forceContribution.x*delta.y,forceContribution.x*delta.z,0.0); | |
virialContribution2 = (float4) (forceContribution.y*delta.x,forceContribution.y*delta.y,forceContribution.y*delta.z,0.0); | |
virialContribution3 = (float4) (forceContribution.z*delta.x,forceContribution.z*delta.y,forceContribution.z*delta.z,0.0); | |
} | |
else | |
{ | |
virialContribution1 = (float4) (0.0,0.0,0.0,0.0); | |
virialContribution2 = (float4) (0.0,0.0,0.0,0.0); | |
virialContribution3 = (float4) (0.0,0.0,0.0,0.0); | |
} | |
virial1.xyz += virialContribution1.xyz; | |
virial2.xyz += virialContribution2.xyz; | |
virial3.xyz += virialContribution3.xyz; | |
localVirial1[tj+localForceOffset].x += (virialContribution1.x); | |
localVirial1[tj+localForceOffset].y += (virialContribution1.y); | |
localVirial1[tj+localForceOffset].z += (virialContribution1.z); | |
localVirial2[tj+localForceOffset].x += (virialContribution2.x); | |
localVirial2[tj+localForceOffset].y += (virialContribution2.y); | |
localVirial2[tj+localForceOffset].z += (virialContribution2.z); | |
localVirial3[tj+localForceOffset].x += (virialContribution3.x); | |
localVirial3[tj+localForceOffset].y += (virialContribution3.y); | |
localVirial3[tj+localForceOffset].z += (virialContribution3.z); | |
#endif | |
#ifdef USE_EXCLUSIONS | |
excl >>= 1; | |
#endif | |
tj = (tj+1) & (TILE_SIZE-1); | |
} | |
// Sum the forces and write results. | |
if (get_local_id(0) >= TILE_SIZE) { | |
localData[tgx].fx = force.x; | |
localData[tgx].fy = force.y; | |
localData[tgx].fz = force.z; | |
#ifdef CALCULATE_VIRIAL | |
localData[tgx].rf0 = virial1.x; | |
localData[tgx].rf1 = virial1.y; | |
localData[tgx].rf2 = virial1.z; | |
localData[tgx].rf3 = virial2.x; | |
localData[tgx].rf4 = virial2.y; | |
localData[tgx].rf5 = virial2.z; | |
localData[tgx].rf6 = virial3.x; | |
localData[tgx].rf7 = virial3.y; | |
localData[tgx].rf8 = virial3.z; | |
#endif | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (get_local_id(0) < TILE_SIZE) { | |
#ifdef SUPPORTS_64_BIT_ATOMICS | |
const unsigned int offset1 = x*TILE_SIZE + tgx; | |
const unsigned int offset2 = y*TILE_SIZE + tgx; | |
atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].fx)*0xFFFFFFFF)); | |
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0xFFFFFFFF)); | |
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0xFFFFFFFF)); | |
atom_add(&forceBuffers[offset2], (long) ((localForce[tgx].x + localForce[tgx+TILE_SIZE].x)*0xFFFFFFFF)); | |
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localForce[tgx].y + localForce[tgx+TILE_SIZE].y)*0xFFFFFFFF)); | |
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localForce[tgx].z + localForce[tgx+TILE_SIZE].z)*0xFFFFFFFF)); | |
#ifdef CALCULATE_VIRIAL | |
// S // Virial tensor for atom 1: | |
atom_add(&virialBuffers1[offset1], (long) ((virial1.x + localData[tgx].rf0)*0xFFFFFFFF)); | |
atom_add(&virialBuffers1[offset1+PADDED_NUM_ATOMS], (long) ((virial1.y + localData[tgx].rf1)*0xFFFFFFFF)); | |
atom_add(&virialBuffers1[offset1+2*PADDED_NUM_ATOMS],(long) ((virial1.z + localData[tgx].rf2)*0xFFFFFFFF)); | |
atom_add(&virialBuffers2[offset1], (long) ((virial2.x + localData[tgx].rf3)*0xFFFFFFFF)); | |
atom_add(&virialBuffers2[offset1+PADDED_NUM_ATOMS], (long) ((virial2.y + localData[tgx].rf4)*0xFFFFFFFF)); | |
atom_add(&virialBuffers2[offset1+2*PADDED_NUM_ATOMS],(long) ((virial2.z + localData[tgx].rf5)*0xFFFFFFFF)); | |
atom_add(&virialBuffers3[offset1], (long) ((virial3.x + localData[tgx].rf6)*0xFFFFFFFF)); | |
atom_add(&virialBuffers3[offset1+PADDED_NUM_ATOMS], (long) ((virial3.y + localData[tgx].rf7)*0xFFFFFFFF)); | |
atom_add(&virialBuffers3[offset1+2*PADDED_NUM_ATOMS],(long) ((virial3.z + localData[tgx].rf8)*0xFFFFFFFF)); | |
// S // Virial tensor for atom 2: | |
atom_add(&virialBuffers1[offset2], (long) ((localVirial1[tgx].x + localVirial1[tgx+TILE_SIZE].x)*0xFFFFFFFF)); | |
atom_add(&virialBuffers1[offset2+PADDED_NUM_ATOMS], (long) ((localVirial1[tgx].y + localVirial1[tgx+TILE_SIZE].y)*0xFFFFFFFF)); | |
atom_add(&virialBuffers1[offset2+2*PADDED_NUM_ATOMS],(long) ((localVirial1[tgx].z + localVirial1[tgx+TILE_SIZE].z)*0xFFFFFFFF)); | |
atom_add(&virialBuffers2[offset2], (long) ((localVirial2[tgx].x + localVirial2[tgx+TILE_SIZE].x)*0xFFFFFFFF)); | |
atom_add(&virialBuffers2[offset2+PADDED_NUM_ATOMS], (long) ((localVirial2[tgx].y + localVirial2[tgx+TILE_SIZE].y)*0xFFFFFFFF)); | |
atom_add(&virialBuffers2[offset2+2*PADDED_NUM_ATOMS],(long) ((localVirial2[tgx].z + localVirial2[tgx+TILE_SIZE].z)*0xFFFFFFFF)); | |
atom_add(&virialBuffers3[offset2], (long) ((localVirial3[tgx].x + localVirial3[tgx+TILE_SIZE].x)*0xFFFFFFFF)); | |
atom_add(&virialBuffers3[offset2+PADDED_NUM_ATOMS], (long) ((localVirial3[tgx].y + localVirial3[tgx+TILE_SIZE].y)*0xFFFFFFFF)); | |
atom_add(&virialBuffers3[offset2+2*PADDED_NUM_ATOMS],(long) ((localVirial3[tgx].z + localVirial3[tgx+TILE_SIZE].z)*0xFFFFFFFF)); | |
#endif | |
#else | |
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK | |
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS; | |
const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; | |
#else | |
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; | |
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; | |
#endif | |
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits. | |
float4 sum1 = forceBuffers[offset1]; | |
float4 sum2 = forceBuffers[offset2]; | |
sum1.x += localData[tgx].fx + force.x; | |
sum1.y += localData[tgx].fy + force.y; | |
sum1.z += localData[tgx].fz + force.z; | |
sum2.x += localForce[tgx].x + localForce[tgx+TILE_SIZE].x; | |
sum2.y += localForce[tgx].y + localForce[tgx+TILE_SIZE].y; | |
sum2.z += localForce[tgx].z + localForce[tgx+TILE_SIZE].z; | |
forceBuffers[offset1] = sum1; | |
forceBuffers[offset2] = sum2; | |
#ifdef CALCULATE_VIRIAL | |
// Virial for ATOM 1 & ATOM 2: | |
sum1 = virialBuffers1[offset1]; // S // Re-using the sum1 variable declared above. | |
float4 sum12 = virialBuffers2[offset1]; // S // Adding new variables, sum12 and sum13. | |
float4 sum13 = virialBuffers3[offset1]; // S // | |
sum2 = virialBuffers1[offset2]; // S // Re-using the sum2 variable declared above. | |
float4 sum22 = virialBuffers2[offset2]; // S // Adding new variables, sum22 and sum23 | |
float4 sum23 = virialBuffers3[offset2]; | |
sum1.x += localData[tgx].rf0 + virial1.x; | |
sum1.y += localData[tgx].rf1 + virial1.y; | |
sum1.z += localData[tgx].rf2 + virial1.z; | |
sum12.x += localData[tgx].rf3 + virial2.x; | |
sum12.y += localData[tgx].rf4 + virial2.y; | |
sum12.z += localData[tgx].rf5 + virial2.z; | |
sum13.x += localData[tgx].rf6 + virial3.x; | |
sum13.y += localData[tgx].rf7 + virial3.y; | |
sum13.z += localData[tgx].rf8 + virial3.z; | |
sum2.x += localVirial1[tgx].x + localVirial1[tgx+TILE_SIZE].x; | |
sum2.y += localVirial1[tgx].y + localVirial1[tgx+TILE_SIZE].y; | |
sum2.z += localVirial1[tgx].z + localVirial1[tgx+TILE_SIZE].z; | |
sum22.x += localVirial2[tgx].x + localVirial2[tgx+TILE_SIZE].x; | |
sum22.y += localVirial2[tgx].y + localVirial2[tgx+TILE_SIZE].y; | |
sum22.z += localVirial2[tgx].z + localVirial2[tgx+TILE_SIZE].z; | |
sum23.x += localVirial3[tgx].x + localVirial3[tgx+TILE_SIZE].x; | |
sum23.y += localVirial3[tgx].y + localVirial3[tgx+TILE_SIZE].y; | |
sum23.z += localVirial3[tgx].z + localVirial3[tgx+TILE_SIZE].z; | |
virialBuffers1[offset1] = sum1; | |
virialBuffers2[offset1] = sum12; | |
virialBuffers3[offset1] = sum13; | |
virialBuffers1[offset2] = sum2; | |
virialBuffers2[offset2] = sum22; | |
virialBuffers3[offset2] = sum23; | |
#endif | |
#endif | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
lasty = y; | |
pos++; | |
} | |
energyBuffer[get_global_id(0)] += energy; | |
} | |
__kernel void reduceFloat4VirialBuffer(__global float4* restrict buffer, int bufferSize, int numBuffers) { | |
int index = get_global_id(0); | |
int totalSize = bufferSize*numBuffers; | |
while (index < bufferSize) { | |
float4 sum = buffer[index]; | |
for (int i = index+bufferSize; i < totalSize; i += bufferSize) | |
sum += buffer[i]; | |
buffer[index] = sum; | |
index += get_global_size(0); | |
} | |
} | |
/** | |
* Sum the various buffers containing forces. | |
*/ | |
__kernel void reduceVirials(__global const long* restrict longBuffer, __global float4* restrict buffer, int bufferSize, int numBuffers) { | |
int totalSize = bufferSize*numBuffers; | |
float scale = 1.0f/(float) 0xFFFFFFFF; | |
for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) { | |
float4 sum = (float4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f); | |
for (int i = index; i < totalSize; i += bufferSize) | |
sum += buffer[i]; | |
buffer[index] = sum; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment