Skip to content

Instantly share code, notes, and snippets.

@qfgaohao
Last active October 17, 2017 04:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save qfgaohao/ce5420faef59dd0fc627fc12c54fac4a to your computer and use it in GitHub Desktop.
Save qfgaohao/ce5420faef59dd0fc627fc12c54fac4a to your computer and use it in GitHub Desktop.
It tests the time overhead of shared memory bank conflicts.
#include <stdio.h>
// Threads per block used for the kernel launch (32 = one warp).
#define N (32)
// Times one shared-memory read-modify-write for each stride 1..32.
//
// For stride k = i + 1, every thread executes s[threadIdx.x * k] += 1 and
// thread 0 stores the number of clock() ticks that access took in time[i].
//
// Launch layout: a single block of at most 32 threads. The largest index
// touched is (blockDim.x - 1) * 32, which must stay below the 1024-element
// array, and every block's thread 0 writes the same time[] slots.
// Shared memory: 1024 floats, statically allocated.
//
// time: device buffer with room for at least 32 ints (one per stride).
__global__ void increment(int* time) {
    const int kWords = 1024;
    const int kStrides = 32;

    // volatile so the compiler must actually issue each load/store and cannot
    // elide the never-read update or move it out of the clock() window.
    __shared__ volatile float s[kWords];

    // Cooperative fill: each element is written by exactly one thread instead
    // of every thread redundantly writing the whole array.
    for (int i = threadIdx.x; i < kWords; i += blockDim.x) {
        s[i] = 1.0f;
    }
    __syncthreads();

    for (int i = 0; i < kStrides; i++) {
        const int idx = threadIdx.x * (i + 1);  // stride: i + 1
        const clock_t start = clock();
        // enable broadcast by accessing the same element in a bank:
        // s[threadIdx.x * (i + 1) % 32] += 1.0f;
        s[idx] = s[idx] + 1.0f;
        const clock_t end = clock();
        if (threadIdx.x == 0) {
            // Subtract in clock_t first; only the small delta is narrowed.
            time[i] = (int)(end - start);
        }
    }
}
// Launches the bank-conflict benchmark once and prints the 32 per-stride
// timings on a single line, separated by spaces.
//
// Returns 0 on success and 1 if any allocation, the launch, the execution or
// the copy back fails; in that case nothing is printed to stdout.
int main() {
    // The kernel always writes one timing per stride (32 of them), so both
    // buffers are sized by the stride count, not by the thread count N.
    const int kNumStrides = 32;
    const size_t bytes = kNumStrides * sizeof(int);

    int* d_time = NULL;
    int* h_time = (int*)malloc(bytes);
    if (h_time == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    cudaError_t err = cudaMalloc(&d_time, bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc error: %s\n", cudaGetErrorString(err));
    }

    if (err == cudaSuccess) {
        // Set up and launch the kernel; launch-configuration problems are
        // reported by cudaGetLastError().
        increment<<<1, N>>>(d_time);
        err = cudaGetLastError();
        if (err != cudaSuccess) {
            fprintf(stderr, "Sync error: %s\n", cudaGetErrorString(err));
        }
    }

    if (err == cudaSuccess) {
        // Wait for the kernel; faults during execution surface here.
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            fprintf(stderr, "Async error: %s\n", cudaGetErrorString(err));
        }
    }

    if (err == cudaSuccess) {
        err = cudaMemcpy(h_time, d_time, bytes, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy error: %s\n", cudaGetErrorString(err));
        }
    }

    if (err == cudaSuccess) {
        for (int i = 0; i < kNumStrides; i++) {
            printf("%d ", h_time[i]);
        }
        printf("%s", "\n");
    }

    // cudaFree on a null pointer is a no-op, so this is safe on every path.
    cudaFree(d_time);
    free(h_time);
    return err == cudaSuccess ? 0 : 1;
}
@qfgaohao
Copy link
Author

Result of a single run on an AWS g2 instance (clock ticks for strides 1 through 32):
106 106 106 163 106 160 106 272 106 163 106 272 106 160 106 496 106 163 106 272 106 160 106 496 106 163 106 272 106 160 106 944

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment