Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Created December 21, 2020 01:36
Show Gist options
  • Save sandeepkumar-skb/2e2254fe1ec05d2dd18180d362bd3eab to your computer and use it in GitHub Desktop.
Save sandeepkumar-skb/2e2254fe1ec05d2dd18180d362bd3eab to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <iostream>
#define BLOCK_SIZE 128
// Terminate the program with a diagnostic when a CUDA runtime call failed.
//
//   err   status code returned by the CUDA runtime call being checked
//   file  source file of the call site (gpuErrchk passes __FILE__)
//   line  source line of the call site (gpuErrchk passes __LINE__)
//
// Does nothing when err == cudaSuccess. Otherwise prints the CUDA error
// string together with the call-site location and exits with EXIT_FAILURE.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess){
        // Report the caller's location (the file/line arguments); using
        // __FILE__/__LINE__ here would always point at this helper instead.
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
// Check the status of a CUDA runtime call: forwards the result plus the call
// site's file and line to gpuAssert. The body is wrapped in do { } while (0)
// so that `gpuErrchk(x);` expands to exactly one statement and stays valid
// inside an unbraced if/else.
#define gpuErrchk(ans) \
    do { \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)
// Gather kernel: every thread whose global x index is below `size` writes
// out[i] = inp[idx[i]]. Only the x dimension of the launch is read.
// Unlike copy_restrict_d, the pointers carry no const/__restrict__ qualifiers.
//
//   inp   source values
//   idx   for each output position, the position in inp to read from
//   out   destination values
//   size  number of elements to process
__global__
void copy_d(int* inp,
            int* idx,
            int* out,
            int size)
{
    const int gid = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads that fall past the end of the data have nothing to do.
    if (gid >= size) return;

    const int src = idx[gid];
    out[gid] = inp[src];
}
// Gather kernel with const/__restrict__-qualified pointers: every thread whose
// global x index is below `size` writes out[i] = inp[idx[i]]. Only the x
// dimension of the launch is read.
//
//   inp   source values (read-only, declared non-aliasing)
//   idx   for each output position, the position in inp to read from
//         (read-only, declared non-aliasing)
//   out   destination values (declared non-aliasing)
//   size  number of elements to process
__global__
void copy_restrict_d(const int* __restrict__ inp,
                     const int* __restrict__ idx,
                     int* __restrict__ out,
                     int size)
{
    const int gid = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads that fall past the end of the data have nothing to do.
    if (gid >= size) return;

    const int src = idx[gid];
    out[gid] = inp[src];
}
// Host reference implementation of the gather: for every position k in
// [0, size) stores out[k] = inp[idx[k]], walking the positions in ascending
// order. Does nothing when size <= 0.
//
//   inp   source values
//   idx   for each output position, the position in inp to read from
//   out   destination values
//   size  number of elements to process
void copy_cpu(int* inp,
              int* idx,
              int* out,
              int size)
{
    int k = 0;
    while (k < size){
        const int src = idx[k];
        out[k] = inp[src];
        ++k;
    }
}
// Run `launch` (a callable that enqueues exactly one kernel launch) for
// `warmup` untimed iterations followed by `num_iter` timed iterations and
// return the mean elapsed time per timed launch in milliseconds, as measured
// between the `start` and `stop` events. Returns 0 when num_iter <= 0.
template <typename LaunchFn>
static float average_kernel_time_ms(LaunchFn launch,
                                    cudaEvent_t start,
                                    cudaEvent_t stop,
                                    int warmup,
                                    int num_iter)
{
    if (num_iter <= 0)
        return 0.0f;

    float total_ms = 0.0f;
    for (int i = 0; i < warmup + num_iter; ++i){
        gpuErrchk(cudaEventRecord(start));
        launch();
        gpuErrchk(cudaEventRecord(stop));
        // A kernel launch returns no status of its own; query it explicitly.
        // Done after recording `stop` so the check is not part of the timed span.
        gpuErrchk(cudaGetLastError());
        gpuErrchk(cudaEventSynchronize(stop));

        float ms = 0.0f;
        gpuErrchk(cudaEventElapsedTime(&ms, start, stop));
        // Iterations 0 .. warmup-1 are warm-up; exactly num_iter samples are
        // accumulated so the divisor below matches the sample count.
        if (i >= warmup)
            total_ms += ms;
    }
    return total_ms / num_iter;
}

// Benchmark driver: times copy_d and copy_restrict_d on a reversed-index
// gather over 1M ints, times the CPU reference, and compares the CPU result
// with the contents of `out` left by the last kernel run (copy_restrict_d).
// Returns EXIT_SUCCESS when all elements match, EXIT_FAILURE otherwise.
int main(){
    const int size = 1024*1024;
    const size_t bytes = (size_t)size*sizeof(int);

    // Managed allocations: the same pointers are used by host code and kernels.
    int *inp = nullptr, *out = nullptr, *idx = nullptr;
    gpuErrchk(cudaMallocManaged(&inp, bytes));
    gpuErrchk(cudaMallocManaged(&out, bytes));
    gpuErrchk(cudaMallocManaged(&idx, bytes));

    int *out_cpu = (int*) malloc(bytes);
    if (out_cpu == nullptr){
        printf("Failed to allocate %zu bytes of host memory\n", bytes);
        exit(EXIT_FAILURE);
    }

    // inp is the identity sequence and idx reverses it, so the expected
    // result is out[i] == size - i - 1.
    for (int i=0; i<size; ++i){
        inp[i] = i;
        out[i] = 0;
        idx[i] = size - i -1;
        out_cpu[i] = 0;
    }

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    dim3 num_threads(BLOCK_SIZE);
    dim3 num_blocks((size-1)/BLOCK_SIZE + 1);   // ceil(size / BLOCK_SIZE)
    const int warmup = 20;
    const int num_iter = 100;

    float avg_ms = average_kernel_time_ms(
        [&]{ copy_d<<<num_blocks, num_threads>>>(inp, idx, out, size); },
        start, stop, warmup, num_iter);
    printf("Effective time orig: %f us\n", avg_ms*1000);

    avg_ms = average_kernel_time_ms(
        [&]{ copy_restrict_d<<<num_blocks, num_threads>>>(inp, idx, out, size); },
        start, stop, warmup, num_iter);
    printf("Effective time with restrict: %f us\n", avg_ms*1000);

    // Make sure all device work has finished (and surface any asynchronous
    // error) before the host touches the managed buffers again.
    gpuErrchk(cudaDeviceSynchronize());

    std::chrono::high_resolution_clock::time_point ch_start;
    std::chrono::high_resolution_clock::time_point ch_end ;
    std::chrono::duration<double> span;
    ch_start = std::chrono::high_resolution_clock::now();
    copy_cpu(inp, idx, out_cpu, size);
    ch_end = std::chrono::high_resolution_clock::now();
    span = std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);
    std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

    // Count mismatches and report once instead of printing a line per element.
    int mismatches = 0;
    for(int i=0; i<size; ++i){
        if (out_cpu[i] != out[i])
            ++mismatches;
    }
    if (mismatches != 0)
        printf("Error!! %d of %d elements differ\n", mismatches, size);

    // Release everything acquired above.
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(inp));
    gpuErrchk(cudaFree(out));
    gpuErrchk(cudaFree(idx));
    free(out_cpu);

    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
@sandeepkumar-skb
Copy link
Author

Compile & Run: nvcc pointer_aliasing_demo.cu -o out && ./out

Effective time orig: 25.583040 us
Effective time with restrict: 25.582722 us
CPU Time: 3.34411ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment