Skip to content

Instantly share code, notes, and snippets.

/-

Created January 31, 2017 23:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/0ac6f0b0f994339838f5452f96e77cff to your computer and use it in GitHub Desktop.
Save anonymous/0ac6f0b0f994339838f5452f96e77cff to your computer and use it in GitHub Desktop.
#include <iostream>
// Baseline kernel: each launched thread computes sqrt(a) and stores it in the
// single double that `out` points to (all threads write the same location).
__global__ void precise_sqrt(double a, double* out) {
  const double root = sqrt(a);
  out[0] = root;
}
// Each launched thread evaluates the __nvvm_rsqrt_approx_d builtin (approximate
// double-precision reciprocal square root) on `a` and stores the result in the
// single double that `out` points to.
__global__ void approx_rsqrt(double a, double* out) {
  const double inv_root = __nvvm_rsqrt_approx_d(a);
  out[0] = inv_root;
  // Debugging aid, left disabled: print the stored value from one thread only.
  // if (blockIdx.x == 0 && threadIdx.x == 0)
  //   printf("approx_rsqrt: %f\n", out[0]);
}
// Square-root estimate formed as a * rsqrt_approx(a). Inputs that are not
// strictly positive (including zero) produce 0 instead of going through the
// builtin. Every launched thread stores the result in the single double that
// `out` points to.
__global__ void approx_rsqrt_mul(double a, double* out) {
  double result = 0;
  if (a > 0) {
    result = a * __nvvm_rsqrt_approx_d(a);
  }
  out[0] = result;
  // Debugging aid, left disabled: print the stored value from one thread only.
  // if (blockIdx.x == 0 && threadIdx.x == 0)
  //   printf("approx_rsqrt_mul: %f\n", out[0]);
}
// Square-root estimate formed by chaining two builtins: the approximate
// reciprocal (__nvvm_rcp_approx_ftz_d) of the approximate reciprocal square
// root (__nvvm_rsqrt_approx_d) of `a`. Every launched thread stores the result
// in the single double that `out` points to.
__global__ void approx_rsqrt_recip(double a, double* out) {
  const double inv_root = __nvvm_rsqrt_approx_d(a);
  out[0] = __nvvm_rcp_approx_ftz_d(inv_root);
  // Debugging aid, left disabled: print the stored value from one thread only.
  // if (blockIdx.x == 0 && threadIdx.x == 0)
  //   printf("approx_rsqrt_rcp: %f\n", out[0]);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what) {
  if (status == cudaSuccess) {
    return true;
  }
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Launches each of the four square-root kernels once with a 1024x1024
// configuration on the constant input 42, all writing to the same one-double
// device buffer, then waits for completion.
//
// Returns 0 when every CUDA call succeeded and 1 otherwise. Command-line
// arguments are not used.
int main(int argc, char* argv[]) {
  // Allocate the single-double device slot that every kernel writes into.
  double* device_x = 0;
  if (!cuda_ok(cudaMalloc(&device_x, sizeof(double)), "cudaMalloc")) {
    return 1;
  }

  // A kernel launch returns no status itself; configuration errors are only
  // visible through cudaGetLastError() right after the launch.
  precise_sqrt<<<1024, 1024>>>(42, device_x);
  bool ok = cuda_ok(cudaGetLastError(), "precise_sqrt launch");

  approx_rsqrt<<<1024, 1024>>>(42, device_x);
  ok = cuda_ok(cudaGetLastError(), "approx_rsqrt launch") && ok;

  approx_rsqrt_mul<<<1024, 1024>>>(42, device_x);
  ok = cuda_ok(cudaGetLastError(), "approx_rsqrt_mul launch") && ok;

  approx_rsqrt_recip<<<1024, 1024>>>(42, device_x);
  ok = cuda_ok(cudaGetLastError(), "approx_rsqrt_recip launch") && ok;

  // Faults raised while the kernels execute are reported by the next
  // synchronizing call, so check the synchronize result as well.
  ok = cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

  // Release the buffer explicitly before tearing the device context down.
  ok = cuda_ok(cudaFree(device_x), "cudaFree") && ok;
  ok = cuda_ok(cudaDeviceReset(), "cudaDeviceReset") && ok;

  return ok ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment