Skip to content

Instantly share code, notes, and snippets.

@juniorprincewang
Last active October 10, 2022 13:38
Show Gist options
  • Save juniorprincewang/0d5ec955a71f46087db5f76ad42716cc to your computer and use it in GitHub Desktop.
Save juniorprincewang/0d5ec955a71f46087db5f76ad42716cc to your computer and use it in GitHub Desktop.
cudaLaunchKernel usage
// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
//
// This work is made available under the Nvidia Source Code License-NC.
// To view a copy of this license, visit
// https://nvlabs.github.io/stylegan2/license.html
// From https://github.com/NVlabs/stylegan2/blob/master/test_nvcc.cu
#include <cstdio>
// Terminate the program if a CUDA runtime call did not succeed.
//
// err: status code returned by a CUDA runtime API call.
//
// When err is cudaSuccess the function simply returns. For any other value it
// prints "<error name>: <error description>" on stdout and exits with code 1.
void checkCudaError(cudaError_t err)
{
    // Guard clause: nothing to report on success.
    if (err == cudaSuccess)
        return;

    const char *errName = cudaGetErrorName(err);
    const char *errText = cudaGetErrorString(err);
    printf("%s: %s\n", errName, errText);
    exit(1);
}
// Device entry point used to confirm that a kernel launch actually runs.
// Takes no arguments; each launched thread emits one greeting line through
// device-side printf.
__global__ void cudaKernel(void) {
    printf("GPU says hello.\n");
}
// Host entry point: prints a greeting from the CPU, launches cudaKernel
// through the cudaLaunchKernel runtime API (instead of the <<<...>>> syntax),
// and waits for the device before exiting. Any CUDA failure aborts the
// process via checkCudaError.
int main(void)
{
    printf("CPU says hello.\n");

    // Launch configuration: a single block holding a single thread.
    const dim3 oneBlock(1);
    const dim3 oneThread(1);

    // The kernel has no parameters, so the argument array is NULL; the last
    // two arguments are the dynamic shared-memory size (0) and a NULL stream.
    const cudaError_t launchStatus =
        cudaLaunchKernel((void*)cudaKernel, oneBlock, oneThread, NULL, 0, NULL);
    checkCudaError(launchStatus);

    // Block the host until the device has finished the launched work.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    checkCudaError(syncStatus);

    return 0;
}
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// CHECK(call): evaluate a CUDA runtime call exactly once and abort on failure.
//
// call: an expression yielding a cudaError_t.
//
// If the result is not cudaSuccess, the macro prints the CUDA error string
// together with the source file and line of the CHECK invocation to stderr and
// terminates the process with exit code 1.
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` is a single
// statement and stays valid inside an unbraced if/else. The local status
// variable has a macro-private name so it cannot shadow an identifier (for
// example a caller's own `err`) that appears inside `call`.
#define CHECK(call)                                                         \
    do {                                                                    \
        const cudaError_t check_err_ = (call);                              \
        if (check_err_ != cudaSuccess) {                                    \
            fprintf(stderr, "Got error %s at %s:%d\n",                      \
                    cudaGetErrorString(check_err_), __FILE__, __LINE__);    \
            exit(1);                                                        \
        }                                                                   \
    } while (0)
// Adds `value` to one element of g_data per thread.
//
// g_data: array updated in place; each thread touches the element at its flat
//         1D global thread index (presumably device memory - the caller in
//         this file passes a cudaMalloc'd buffer).
// value:  addend applied to every touched element.
//
// Precondition: the kernel performs no bounds check, so the launch must not
// create more threads (gridDim.x * blockDim.x) than g_data has elements.
__global__ void kernel(float *g_data, float value)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    g_data[tid] += value;
}
// Verifies that the first n elements of data are all exactly equal to x.
//
// data: host array to inspect.
// n:    number of leading elements to compare; n <= 0 compares nothing.
// x:    reference value (compared with exact floating-point equality).
//
// Returns 1 when every inspected element equals x (including the case where
// nothing is inspected). On the first mismatch, prints the offending index,
// its value and the reference to stdout and returns 0.
int checkResult(float *data, const int n, const float x)
{
    // Advance past the leading run of matching elements.
    int pos = 0;
    while (pos < n && data[pos] == x)
        ++pos;

    // Ran off the end (or there was nothing to check): all good.
    if (pos >= n)
        return 1;

    printf("Error! data[%d] = %f, ref = %f\n", pos, data[pos], x);
    return 0;
}
// Parameterless smoke-test kernel: each launched thread prints one greeting
// line through device-side printf.
__global__ void cudaKernel(void) {
    printf("GPU says hello!\n");
}
// Host driver demonstrating cudaLaunchKernel as a replacement for the
// <<<grid, block>>> launch syntax.
//
// Steps:
//   1. Select device 0 and print the device count and its name / compute
//      capability.
//   2. Allocate a zero-filled host buffer and a zero-filled device buffer of
//      `num` floats and copy the host buffer to the device.
//   3. Launch the parameterless cudaKernel (NULL argument array) and then
//      kernel(d_a, value) through an argument-pointer array.
//   4. Copy the buffer back and verify that every element equals `value`.
//
// Returns EXIT_SUCCESS when the verification passes and EXIT_FAILURE when it
// fails or the host allocation fails. Any CUDA API error aborts via CHECK.
int main()
{
    const int devID = 0;
    int count = 0;
    struct cudaDeviceProp props;
    float *d_a = 0;
    float *h_a = 0;
    dim3 block, grid;
    const int num = 1 << 22;
    // Keep the byte count in size_t: that is the type every allocation and
    // copy call below takes, and it cannot overflow for larger `num`.
    const size_t nbytes = (size_t)num * sizeof(float);
    float value = 41;

    CHECK(cudaSetDevice(devID));
    CHECK(cudaGetDeviceCount(&count));
    printf("cuda count=%d\n", count);
    CHECK(cudaGetDeviceProperties(&props, devID));
    printf("Device %d: \"%s\" with Compute %d.%d capability\n",devID, props.name, props.major, props.minor);

    h_a = (float*)malloc(nbytes);
    if (h_a == NULL)
    {
        // Without this check the memset below would dereference NULL.
        fprintf(stderr, "Failed to allocate %zu bytes of host memory\n", nbytes);
        return EXIT_FAILURE;
    }
    memset(h_a, 0, nbytes);

    CHECK(cudaMalloc((void**)&d_a, nbytes));
    CHECK(cudaMemset(d_a, 0, nbytes));

    // set kernel launch configuration
    // `kernel` has no bounds check; num (1 << 22) is a multiple of block.x
    // (32), so the rounded-up grid covers exactly num elements.
    block = dim3(32,1,1);
    grid = dim3((num + block.x - 1) / block.x);

    CHECK(cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice));

    // cudaKernel<<<1, 1>>>();
    CHECK(cudaLaunchKernel((void*)cudaKernel, 1, 1, NULL, 0, NULL));

    // kernel<<<grid, block>>>(d_a, value);
    // cudaLaunchKernel takes an array holding the ADDRESS of each kernel
    // argument, in parameter order.
    void *args[] = {&d_a, &value};
    CHECK(cudaLaunchKernel((void*)kernel, grid, block, args, 0, NULL));

    // Blocking copy back to the host before the result is inspected.
    CHECK(cudaMemcpy(h_a, d_a, nbytes, cudaMemcpyDeviceToHost));

    const bool bFinalResults = checkResult(h_a, num, value) != 0;
    printf("result:%s\n", bFinalResults? "PASS" : "FAILED");

    CHECK(cudaFree(d_a));
    free(h_a);

    // Propagate the verification outcome through the process exit status.
    return bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment