-
-
Save anonymous/0ac6f0b0f994339838f5452f96e77cff to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
// Baseline kernel: stores the full-precision square root of `a` through `out`.
//
// No thread or block index is consulted, so every thread of the launch
// computes the same value and writes it to the same location.
__global__ void precise_sqrt(double a, double* out) {
  const double root = sqrt(a);
  *out = root;
}
// Stores the result of the `__nvvm_rsqrt_approx_d` builtin applied to `a`
// through `out` (the reciprocal square root, not the square root itself).
//
// No thread or block index is consulted, so every thread of the launch
// writes the same value to the same location.
__global__ void approx_rsqrt(double a, double* out) {
  const double inv_root = __nvvm_rsqrt_approx_d(a);
  *out = inv_root;
  // Disabled debug output:
  // if (threadIdx.x == 0 && blockIdx.x == 0)
  //   printf("approx_rsqrt: %f\n", *out);
}
// Approximates sqrt(a) as a * rsqrt(a) using the `__nvvm_rsqrt_approx_d`
// builtin and stores the result through `out`.
//
// Any input that is not strictly positive (zero, negative, NaN) produces 0
// without evaluating the builtin's result.
__global__ void approx_rsqrt_mul(double a, double* out) {
  double result = 0;
  if (a > 0) {
    result = a * __nvvm_rsqrt_approx_d(a);
  }
  *out = result;
  // Disabled debug output:
  // if (threadIdx.x == 0 && blockIdx.x == 0)
  //   printf("approx_rsqrt_mul: %f\n", *out);
}
// Approximates sqrt(a) as 1 / rsqrt(a): the `__nvvm_rsqrt_approx_d` result is
// fed to `__nvvm_rcp_approx_ftz_d`, and the outcome is stored through `out`.
//
// No thread or block index is consulted, so every thread of the launch
// writes the same value to the same location.
__global__ void approx_rsqrt_recip(double a, double* out) {
  const double inv_root = __nvvm_rsqrt_approx_d(a);
  *out = __nvvm_rcp_approx_ftz_d(inv_root);
  // Disabled debug output:
  // if (threadIdx.x == 0 && blockIdx.x == 0)
  //   printf("approx_rsqrt_rcp: %f\n", *out);
}
// Prints a diagnostic to stderr when `status` is not cudaSuccess.
// Returns true on success so callers can accumulate an overall result.
static bool reportCudaStatus(cudaError_t status, const char* what) {
  if (status == cudaSuccess) {
    return true;
  }
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Launches each square-root kernel once with the input 42 on a
// 1024-block x 1024-thread grid, all writing to a single device double.
//
// The result is never copied back to the host; the program exists to run the
// kernels (e.g. under a profiler). Returns 0 when every CUDA call and launch
// succeeded, 1 otherwise.
int main(int argc, char* argv[]) {
  (void)argc;  // Command-line arguments are not used.
  (void)argv;

  // Single-element output buffer shared by all kernels.
  double* device_x = nullptr;
  if (!reportCudaStatus(cudaMalloc(&device_x, sizeof(double)), "cudaMalloc")) {
    return 1;
  }

  bool ok = true;

  // Kernel launches return no status; cudaGetLastError() surfaces launch
  // failures (invalid configuration, missing device code, ...).
  precise_sqrt<<<1024, 1024>>>(42, device_x);
  ok = reportCudaStatus(cudaGetLastError(), "precise_sqrt launch") && ok;

  approx_rsqrt<<<1024, 1024>>>(42, device_x);
  ok = reportCudaStatus(cudaGetLastError(), "approx_rsqrt launch") && ok;

  approx_rsqrt_mul<<<1024, 1024>>>(42, device_x);
  ok = reportCudaStatus(cudaGetLastError(), "approx_rsqrt_mul launch") && ok;

  approx_rsqrt_recip<<<1024, 1024>>>(42, device_x);
  ok = reportCudaStatus(cudaGetLastError(), "approx_rsqrt_recip launch") && ok;

  // Errors raised while the kernels execute are reported by the next
  // synchronizing call.
  ok = reportCudaStatus(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

  ok = reportCudaStatus(cudaFree(device_x), "cudaFree") && ok;
  ok = reportCudaStatus(cudaDeviceReset(), "cudaDeviceReset") && ok;
  return ok ? 0 : 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment