/*
 * Source: GitHub gist FernandoS27/b1ab4f8e861c9162f54952ed8273431b
 * Author: @FernandoS27
 * Created: September 1, 2018 05:15
 *
 * (The GitHub page navigation text that was pasted here, "Skip to content",
 * "Instantly share code, notes, and snippets.", "Show Gist options" and
 * "Save ... to your computer and use it in GitHub Desktop.", is not part of
 * the program.)
 */
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
/* Abort with a diagnostic if a CUDA driver API call did not return CUDA_SUCCESS. */
static void checkCu(CUresult status, const char* what)
{
    if (status != CUDA_SUCCESS) {
        const char* msg = NULL;
        if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == NULL) {
            msg = "unknown error";
        }
        fprintf(stderr, "%s failed: %s (%d)\n", what, msg, (int)status);
        exit(EXIT_FAILURE);
    }
}

/* Wraps a driver API call so the failing expression is named in the message. */
#define CU_CHECK(call) checkCu((call), #call)

/*
 * Host driver for the kernel "_Z4TestPfS_S_" stored in "test.cubin".
 *
 * Fills two host vectors with pseudo-random floats, copies them to the
 * device, launches the kernel through the CUDA driver API, copies the output
 * vector back and prints the first results next to the matching h_B inputs.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any allocation or CUDA
 * driver call fails.
 */
int main()
{
    int N = 1000;
    size_t size = (size_t)N * sizeof(float);
    srand((unsigned int)time(NULL));

    // Allocate input vectors h_A and h_B (and output h_C) in host memory
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host memory.\n");
        exit(EXIT_FAILURE);
    }

    // Initialize input vectors: h_A in (-1, 1], h_B in (-64, 192]
    for (int i = 0; i < N; i++) {
        h_A[i] = 1.0f - (float)(rand() % 20000) / 10000.0f;
        float f = 1.0f - (float)(rand() % 20000) / 10000.0f;
        h_B[i] = 64.0f + 128.0f * f;
    }

    // Initialize the driver API
    CU_CHECK(cuInit(0));

    // Get number of devices supporting CUDA
    int deviceCount = 0;
    CU_CHECK(cuDeviceGetCount(&deviceCount));
    if (deviceCount == 0) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    // Get handle for device 0
    CUdevice cuDevice;
    CU_CHECK(cuDeviceGet(&cuDevice, 0));

    // Create context
    CUcontext cuContext;
    CU_CHECK(cuCtxCreate(&cuContext, 0, cuDevice));

    // Create module from binary file
    CUmodule cuModule;
    if (cuModuleLoad(&cuModule, "test.cubin") != CUDA_SUCCESS) {
        fprintf(stderr, "Failed to load cubin.\n");
        exit(EXIT_FAILURE);
    }

    // Allocate vectors in device memory
    CUdeviceptr d_A;
    CUdeviceptr d_B;
    CUdeviceptr d_C;
    CU_CHECK(cuMemAlloc(&d_A, size));
    CU_CHECK(cuMemAlloc(&d_B, size));
    CU_CHECK(cuMemAlloc(&d_C, size));

    // Copy vectors from host memory to device memory
    CU_CHECK(cuMemcpyHtoD(d_A, h_A, size));
    CU_CHECK(cuMemcpyHtoD(d_B, h_B, size));

    // Get function handle from module
    CUfunction test;
    const char name[] = "_Z4TestPfS_S_";
    CUresult lookup = cuModuleGetFunction(&test, cuModule, name);
    if (lookup != CUDA_SUCCESS) {
        fprintf(stderr, "Failed to get Function. %d\n", (int)lookup);
        exit(EXIT_FAILURE);
    }

    // Invoke kernel: one thread per element, grid size rounded up
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    // NOTE(review): the mangled name encodes Test(float*, float*, float*), so
    // the trailing &N is presumably ignored by the driver - confirm against
    // the kernel source. The grid also launches more threads than N elements;
    // verify the kernel does not index past the buffers.
    void* args[] = { &d_A, &d_B, &d_C, &N };
    CU_CHECK(cuLaunchKernel(test,
                            blocksPerGrid, 1, 1,
                            threadsPerBlock, 1, 1,
                            0, NULL, args, NULL));

    // Wait for the kernel; execution faults are reported here
    CU_CHECK(cuCtxSynchronize());
    CU_CHECK(cuMemcpyDtoH(h_C, d_C, size));

    // Print at most the first 256 results, never reading past N elements
    int shown = (N < 256) ? N : 256;
    for (int i = 0; i < shown; i++) {
        printf("Result: RRO %f -> %.12f\n", h_B[i], h_C[i]);
    }

    // Release device and host resources
    CU_CHECK(cuMemFree(d_A));
    CU_CHECK(cuMemFree(d_B));
    CU_CHECK(cuMemFree(d_C));
    CU_CHECK(cuModuleUnload(cuModule));
    CU_CHECK(cuCtxDestroy(cuContext));
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
// (GitHub page footer, not part of the program: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")