FernandoS27/test.c

## test.c
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>

int main()
{
    int N = 1000;
    size_t size = N * sizeof(float);

    srand(time(NULL));
    // Allocate input vectors h_A and h_B in host memory
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);

    // Initialize input vectors
    unsigned int i;
    for (i = 0; i < N; i++ ) {
        h_A[i] = 1.0f - (float)(rand() % 20000) / 10000.0f;

        float f = 1.0 - (float)(rand() % 20000) / 10000.0f;
        h_B[i] = 64.0 + 128.0 * f;
    }

    // Initialize
    cuInit(0);

    // Get number of devices supporting CUDA
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        printf("There is no device supporting CUDA.\n");
        exit (0);
    }

    // Get handle for device 0
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);

    // Create context
    CUcontext cuContext;
    cuCtxCreate(&cuContext, 0, cuDevice);

    // Create module from binary file
    CUmodule cuModule;
    if (cuModuleLoad(&cuModule, "test.cubin") != 0)  {
        printf("Failed to load cubin.\n");
        exit (0);
    }

    // Allocate vectors in device memory
    CUdeviceptr d_A;
    cuMemAlloc(&d_A, size);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, size);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, size);

    // Copy vectors from host memory to device memory
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);

    // Get function handle from module
    CUfunction test;
    const char name[] = "_Z4TestPfS_S_";
    unsigned int j = cuModuleGetFunction(&test, cuModule, name);
    if (j != 0) {
        printf("Failed to get Function. %d\n", j);
        exit (0);
    }

    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid =
            (N + threadsPerBlock - 1) / threadsPerBlock;
    void* args[] = { &d_A, &d_B, &d_C, &N };
    cuLaunchKernel(test,
                   blocksPerGrid, 1, 1, threadsPerBlock, 1, 1,
                   0, 0, args, 0);

    cuCtxSynchronize();
    cuMemcpyDtoH(h_C, d_C, size);


    for (i = 0; i < 256; i++ ) {
        printf("Result: RRO %f -> %.12f\n", h_B[i], h_C[i]);
    }
}
	#include <cuda.h>
	#include <stdio.h>
	#include <time.h>
	#include <stdlib.h>
	#include <math.h>

	int main()
	{
	int N = 1000;
	size_t size = N * sizeof(float);

	srand(time(NULL));
	// Allocate input vectors h_A and h_B in host memory
	float* h_A = (float*)malloc(size);
	float* h_B = (float*)malloc(size);
	float* h_C = (float*)malloc(size);

	// Initialize input vectors
	unsigned int i;
	for (i = 0; i < N; i++ ) {
	h_A[i] = 1.0f - (float)(rand() % 20000) / 10000.0f;

	float f = 1.0 - (float)(rand() % 20000) / 10000.0f;
	h_B[i] = 64.0 + 128.0 * f;
	}

	// Initialize
	cuInit(0);

	// Get number of devices supporting CUDA
	int deviceCount = 0;
	cuDeviceGetCount(&deviceCount);
	if (deviceCount == 0) {
	printf("There is no device supporting CUDA.\n");
	exit (0);
	}

	// Get handle for device 0
	CUdevice cuDevice;
	cuDeviceGet(&cuDevice, 0);

	// Create context
	CUcontext cuContext;
	cuCtxCreate(&cuContext, 0, cuDevice);

	// Create module from binary file
	CUmodule cuModule;
	if (cuModuleLoad(&cuModule, "test.cubin") != 0) {
	printf("Failed to load cubin.\n");
	exit (0);
	}

	// Allocate vectors in device memory
	CUdeviceptr d_A;
	cuMemAlloc(&d_A, size);
	CUdeviceptr d_B;
	cuMemAlloc(&d_B, size);
	CUdeviceptr d_C;
	cuMemAlloc(&d_C, size);

	// Copy vectors from host memory to device memory
	cuMemcpyHtoD(d_A, h_A, size);
	cuMemcpyHtoD(d_B, h_B, size);

	// Get function handle from module
	CUfunction test;
	const char name[] = "_Z4TestPfS_S_";
	unsigned int j = cuModuleGetFunction(&test, cuModule, name);
	if (j != 0) {
	printf("Failed to get Function. %d\n", j);
	exit (0);
	}

	// Invoke kernel
	int threadsPerBlock = 256;
	int blocksPerGrid =
	(N + threadsPerBlock - 1) / threadsPerBlock;
	void* args[] = { &d_A, &d_B, &d_C, &N };
	cuLaunchKernel(test,
	blocksPerGrid, 1, 1, threadsPerBlock, 1, 1,
	0, 0, args, 0);

	cuCtxSynchronize();
	cuMemcpyDtoH(h_C, d_C, size);


	for (i = 0; i < 256; i++ ) {
	printf("Result: RRO %f -> %.12f\n", h_B[i], h_C[i]);
	}
	}