Skip to content

Instantly share code, notes, and snippets.

@gravitino
Last active February 16, 2016 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gravitino/3b694b6bd6af6937a2b2 to your computer and use it in GitHub Desktop.
Save gravitino/3b694b6bd6af6937a2b2 to your computer and use it in GitHub Desktop.
Branch-divergence-free `if` in CUDA considered harmful (benchmarked on a Titan X)
#include <iostream>
// Error-check macro.
// Fetches the most recent CUDA runtime error with cudaGetLastError(). If it is
// anything other than cudaSuccess, the error string plus the file name and
// line number of the expansion site are written to std::cout and the process
// is terminated with exit(1).
// The expansion is a braced block, so call sites write `CUERR` without a
// trailing semicolon.
#define CUERR {                                                            \
    const cudaError_t cuerr_status = cudaGetLastError();                   \
    if (cuerr_status != cudaSuccess) {                                     \
        std::cout << "CUDA error: " << cudaGetErrorString(cuerr_status)    \
                  << " : " << __FILE__ << ", line " << __LINE__            \
                  << std::endl;                                            \
        exit(1);                                                           \
    }                                                                      \
}
// Convenience timers built on CUDA events.
// TIMERSTART(label) declares the variables start<label>, stop<label> and
// time<label> in the current scope, creates both events and records the start
// event on stream 0. TIMERSTOP(label) with the same label ends the interval.
// The macro expands to declarations plus statements, so it has to appear where
// declarations are allowed and only once per label within a scope.
#define TIMERSTART(label)                                                  \
    float time##label;                                                     \
    cudaEvent_t start##label;                                              \
    cudaEvent_t stop##label;                                               \
    cudaEventCreate(&start##label);                                        \
    cudaEventCreate(&stop##label);                                         \
    cudaEventRecord(start##label, 0);
// TIMERSTOP(label) ends the interval opened by TIMERSTART(label): it records
// the stop event on stream 0, blocks the host until that event has completed,
// stores the elapsed time between the two events in time<label> and prints
// "<time> ms (<label>)" to std::cout.
// Both events are destroyed afterwards so that timing sections do not leak
// event handles; time<label> remains readable after the macro.
#define TIMERSTOP(label)                                                   \
    cudaEventRecord(stop##label, 0);                                       \
    cudaEventSynchronize(stop##label);                                     \
    cudaEventElapsedTime(&time##label, start##label, stop##label);         \
    std::cout << time##label << " ms (" << #label << ")" << std::endl;     \
    cudaEventDestroy(start##label);                                        \
    cudaEventDestroy(stop##label);
// Applies n_iters successive cos (sigma == true) or sin (sigma == false)
// evaluations to every element of data[0 .. length), in place.
//
// Template parameters:
//   value_t  element type of data
//   index_t  integer type used for element indices and length
//   sigma    selects cos or sin; being a template argument, it is a
//            compile-time constant inside each instantiation
//   n_iters  number of times the function is applied per element (default 100)
//
// Launch layout: only the x dimension of the grid and block is used. Each
// thread starts at its global x index and advances by the total number of
// launched threads until it passes length, so any 1D launch configuration
// covers the whole array.
//
// The built-in index values are widened to index_t *before* multiplying, so
// the start index and the stride are not wrapped to 32 bits when
// blockDim.x * gridDim.x exceeds the range of unsigned int.
template <class value_t, class index_t, bool sigma, int n_iters=100> __global__
void templated_kernel(value_t * data, index_t length) {
    const index_t stride = static_cast<index_t>(blockDim.x) * gridDim.x;
    const index_t first  = static_cast<index_t>(blockDim.x) * blockIdx.x
                         + threadIdx.x;

    for (index_t id = first; id < length; id += stride) {
        value_t value = data[id];
        // counter has the same type as n_iters (no signed/unsigned comparison)
        for (int iter = 0; iter < n_iters; iter++) {
            if (sigma)
                value = cos(value);
            else
                value = sin(value);
        }
        data[id] = value;
    }
}
// Applies n_iters successive cos (sigma == true) or sin (sigma == false)
// evaluations to every element of data[0 .. length), in place.
//
// Same computation as templated_kernel, except that sigma is a kernel
// argument: the branch is evaluated at run time, although every thread of the
// launch receives the same value.
//
// Template parameters:
//   value_t  element type of data
//   index_t  integer type used for element indices and length
//   n_iters  number of times the function is applied per element (default 100)
//
// Launch layout: only the x dimension of the grid and block is used. Each
// thread starts at its global x index and advances by the total number of
// launched threads until it passes length, so any 1D launch configuration
// covers the whole array.
//
// The built-in index values are widened to index_t *before* multiplying, so
// the start index and the stride are not wrapped to 32 bits when
// blockDim.x * gridDim.x exceeds the range of unsigned int.
template <class value_t, class index_t, int n_iters=100> __global__
void parametrized_kernel(value_t * data, index_t length, bool sigma) {
    const index_t stride = static_cast<index_t>(blockDim.x) * gridDim.x;
    const index_t first  = static_cast<index_t>(blockDim.x) * blockIdx.x
                         + threadIdx.x;

    for (index_t id = first; id < length; id += stride) {
        value_t value = data[id];
        // counter has the same type as n_iters (no signed/unsigned comparison)
        for (int iter = 0; iter < n_iters; iter++) {
            if (sigma)
                value = cos(value);
            else
                value = sin(value);
        }
        data[id] = value;
    }
}
// compile with nvcc -O3 -std=c++11 -arch=sm_35 cuda_if.cu
//
// Benchmark driver: allocates and zeroes a device array of 2^30 floats, then
// times two launches (sigma = false, then sigma = true) of templated_kernel
// followed by two launches of parametrized_kernel on that array, printing the
// elapsed time of each pair.
int main () {
    typedef size_t index_t;
    typedef float value_t;

    // 2^30 elements; the shift is done in index_t rather than int
    const index_t length = index_t(1) << 30;
    value_t * data = nullptr;
    cudaMalloc(&data, sizeof(value_t)*length); CUERR
    cudaMemset(data, 0, sizeof(value_t)*length); CUERR

    // timings noted by the original author for this section:
    // 2263.26 ms
    // 2300.9 ms
    // 2290.04 ms
    TIMERSTART(templated)
    templated_kernel<value_t, index_t, false><<<1024, 1024>>>(data, length); CUERR
    templated_kernel<value_t, index_t, true><<<1024, 1024>>>(data, length); CUERR
    TIMERSTOP(templated)
    // CUERR right after a launch only reports launch errors; TIMERSTOP has
    // waited for the kernels, so check again for faults raised while running.
    CUERR

    // timings noted by the original author for this section:
    // 2745.73 ms
    // 2729.74 ms
    // 2745.31 ms
    TIMERSTART(parametrized)
    parametrized_kernel<<<1024, 1024>>>(data, length, false); CUERR
    parametrized_kernel<<<1024, 1024>>>(data, length, true); CUERR
    TIMERSTOP(parametrized)
    CUERR

    // release the device buffer allocated above
    cudaFree(data); CUERR
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment