Skip to content

Instantly share code, notes, and snippets.

/-
Created Jan 31, 2017

Embed
What would you like to do?
#include <iostream>
// Baseline kernel: stores sqrt(a) into *out.
//
// The kernel does no thread indexing, so every launched thread computes the
// same value and writes it to the same location.
__global__ void precise_sqrt(double a, double* out) {
  const double root = sqrt(a);
  out[0] = root;
}
// Stores the result of the __nvvm_rsqrt_approx_d builtin applied to `a`
// into *out (going by the builtin's name, an approximate reciprocal square
// root, i.e. NOT sqrt(a) itself).
//
// The kernel does no thread indexing, so every launched thread writes the
// same value to the same location.
__global__ void approx_rsqrt(double a, double* out) {
  const double inv_root = __nvvm_rsqrt_approx_d(a);
  out[0] = inv_root;
  // Disabled debug aid: print the result from a single thread.
  // if (threadIdx.x == 0 && blockIdx.x == 0)
  //   printf("approx_rsqrt: %f\n", *out);
}
// Stores a * __nvvm_rsqrt_approx_d(a) into *out when a > 0, and 0 otherwise
// (which includes zero, negative and NaN inputs, since `a > 0` is false for
// all of them).
//
// The kernel does no thread indexing, so every launched thread writes the
// same value to the same location.
__global__ void approx_rsqrt_mul(double a, double* out) {
  double result = 0;
  if (a > 0) {
    result = a * __nvvm_rsqrt_approx_d(a);
  }
  out[0] = result;
  // Disabled debug aid: print the result from a single thread.
  // if (threadIdx.x == 0 && blockIdx.x == 0)
  //   printf("approx_rsqrt_mul: %f\n", *out);
}
// Stores __nvvm_rcp_approx_ftz_d(__nvvm_rsqrt_approx_d(a)) into *out: the
// rsqrt builtin is applied to `a` first and the rcp builtin is then applied
// to that intermediate result.
//
// The kernel does no thread indexing, so every launched thread writes the
// same value to the same location.
__global__ void approx_rsqrt_recip(double a, double* out) {
  const double inv_root = __nvvm_rsqrt_approx_d(a);
  const double root = __nvvm_rcp_approx_ftz_d(inv_root);
  out[0] = root;
  // Disabled debug aid: print the result from a single thread.
  // if (threadIdx.x == 0 && blockIdx.x == 0)
  //   printf("approx_rsqrt_rcp: %f\n", *out);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCudaCall(cudaError_t status, const char* what) {
  if (status == cudaSuccess) {
    return true;
  }
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Launches each sqrt variant once with a 1024x1024 configuration and the
// input 42, all writing into a single device-side double.
//
// Nothing is copied to or from the host; the result value is never read back.
// Returns 0 when every CUDA call succeeded and 1 otherwise.
int main(int argc, char* argv[]) {
  (void)argc;  // Command-line arguments are not used.
  (void)argv;

  // Single output slot shared by all kernels.
  double* device_x = NULL;
  if (!checkCudaCall(cudaMalloc(&device_x, sizeof(double)), "cudaMalloc")) {
    return 1;
  }

  bool ok = true;

  // Kernel launches do not return a status; launch-configuration errors are
  // only visible through cudaGetLastError() right after each launch.
  precise_sqrt<<<1024, 1024>>>(42, device_x);
  ok = checkCudaCall(cudaGetLastError(), "precise_sqrt launch") && ok;

  approx_rsqrt<<<1024, 1024>>>(42, device_x);
  ok = checkCudaCall(cudaGetLastError(), "approx_rsqrt launch") && ok;

  approx_rsqrt_mul<<<1024, 1024>>>(42, device_x);
  ok = checkCudaCall(cudaGetLastError(), "approx_rsqrt_mul launch") && ok;

  approx_rsqrt_recip<<<1024, 1024>>>(42, device_x);
  ok = checkCudaCall(cudaGetLastError(), "approx_rsqrt_recip launch") && ok;

  // Errors raised while the kernels execute surface at the next
  // synchronizing call.
  ok = checkCudaCall(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

  ok = checkCudaCall(cudaFree(device_x), "cudaFree") && ok;
  ok = checkCudaCall(cudaDeviceReset(), "cudaDeviceReset") && ok;

  return ok ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.