@delijati
Created November 22, 2022 12:21

cuDNN on Manjaro

#include <iostream>
#include <cuda_runtime.h>
#include <cudnn.h>

/**
 * Minimal example applying a sigmoid activation to a tensor
 * using cuDNN.
 **/
int main(int argc, char** argv)
{
    int numGPUs;
    int driverVersion = 0, runtimeVersion = 0;
    cudaGetDeviceCount(&numGPUs);
    std::cout << "Found " << numGPUs << " GPUs." << std::endl;
    cudaSetDevice(0); // use GPU 0
    int device;
    struct cudaDeviceProp devProp;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&devProp, device);
    cudaDriverGetVersion(&driverVersion);   // encoded, e.g. 11080 == CUDA 11.8
    cudaRuntimeGetVersion(&runtimeVersion);
    std::cout << "Device: " << devProp.name << std::endl;
    std::cout << "Driver Version: " << driverVersion << "\n";
    std::cout << "Runtime Version: " << runtimeVersion << "\n";
    std::cout << "Compute capability: " << devProp.major << "." << devProp.minor << std::endl;
    std::cout << "Total amount of global memory: " << (unsigned long long)devProp.totalGlobalMem << " bytes\n";
    std::cout << "Total amount of constant memory: " << devProp.totalConstMem << " bytes\n";
    std::cout << "Total amount of shared memory per block: " << devProp.sharedMemPerBlock << " bytes\n";
    std::cout << "Total number of registers available per block: " << devProp.regsPerBlock << "\n";
    std::cout << "Warp size: " << devProp.warpSize << "\n";

    cudnnHandle_t handle_;
    cudnnCreate(&handle_);
    std::cout << "Created cuDNN handle" << std::endl;

    // Create the tensor descriptor: a 1x1x1x10 NCHW float tensor.
    cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
    int n = 1, c = 1, h = 1, w = 10;
    int NUM_ELEMENTS = n * c * h * w;
    cudnnTensorDescriptor_t x_desc;
    cudnnCreateTensorDescriptor(&x_desc);
    cudnnSetTensor4dDescriptor(x_desc, format, dtype, n, c, h, w);

    // Allocate the tensor in managed memory so host and device can both access it.
    float *x;
    cudaMallocManaged(&x, NUM_ELEMENTS * sizeof(float));
    for (int i = 0; i < NUM_ELEMENTS; i++) x[i] = i * 1.0f;
    std::cout << "Original array: ";
    for (int i = 0; i < NUM_ELEMENTS; i++) std::cout << x[i] << " ";

    // Create the activation descriptor for an in-place sigmoid:
    // y = alpha * sigmoid(x) + beta * y, with alpha = 1 and beta = 0.
    float alpha[1] = {1.0f};
    float beta[1] = {0.0f};
    cudnnActivationDescriptor_t sigmoid_activation;
    cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID;
    cudnnNanPropagation_t prop = CUDNN_NOT_PROPAGATE_NAN;
    cudnnCreateActivationDescriptor(&sigmoid_activation);
    cudnnSetActivationDescriptor(sigmoid_activation, mode, prop, 0.0);
    cudnnActivationForward(
        handle_,
        sigmoid_activation,
        alpha,
        x_desc,
        x,
        beta,
        x_desc,
        x
    );
    // Wait for the kernel to finish before reading x back on the host.
    cudaDeviceSynchronize();

    cudnnDestroyActivationDescriptor(sigmoid_activation);
    cudnnDestroyTensorDescriptor(x_desc);
    cudnnDestroy(handle_);
    std::cout << std::endl << "Destroyed cuDNN handle." << std::endl;

    std::cout << "New array: ";
    for (int i = 0; i < NUM_ELEMENTS; i++) std::cout << x[i] << " ";
    std::cout << std::endl;
    cudaFree(x);
    return 0;
}
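
// ---------------------------------------------------------------------------
// Optional: minimal status checking (a sketch -- CHECK_CUDA and CHECK_CUDNN
// are illustrative names, not part of the CUDA or cuDNN APIs). The example
// above discards every return value; in a real program these macros would be
// defined before main() and wrap each call, e.g.:
//     CHECK_CUDNN(cudnnCreate(&handle_));
//     CHECK_CUDA(cudaMallocManaged(&x, NUM_ELEMENTS * sizeof(float)));
// ---------------------------------------------------------------------------
#include <cstdlib>

#define CHECK_CUDA(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error: " << cudaGetErrorString(err_)         \
                      << " at " << __FILE__ << ":" << __LINE__ << "\n";     \
            std::exit(EXIT_FAILURE);                                        \
        }                                                                   \
    } while (0)

#define CHECK_CUDNN(call)                                                   \
    do {                                                                    \
        cudnnStatus_t status_ = (call);                                     \
        if (status_ != CUDNN_STATUS_SUCCESS) {                              \
            std::cerr << "cuDNN error: " << cudnnGetErrorString(status_)    \
                      << " at " << __FILE__ << ":" << __LINE__ << "\n";     \
            std::exit(EXIT_FAILURE);                                        \
        }                                                                   \
    } while (0)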
/*
Info:
$ lsb_release -a
LSB Version: n/a
Distributor ID: ManjaroLinux
Description: Manjaro Linux
Release: 22.0.0
Codename: Sikaris
$ uname -a
Linux papagayo 5.15.78-1-MANJARO #1 SMP PREEMPT Thu Nov 10 20:50:09 UTC 2022 x86_64 GNU/Linux
$ nvidia-smi -L
GPU 0: NVIDIA GeForce MX450
Build:
$ g++ -I/opt/cuda/include -I/opt/cuda/targets/ppc64le-linux/include -o hw.o -c hw.cpp
$ nvcc -ccbin g++ -m64 -gencode arch=compute_80,code=sm_80 -o hw hw.o -I/opt/cuda/include -I/opt/cuda/targets/ppc64le-linux/include -L/opt/cuda/lib64 -L/opt/cuda/targets/ppc64le-linux/lib -lcublasLt -lcudart -lcublas -lcudnn -lstdc++ -lm
Note: -gencode arch=compute_80,code=sm_80 does not match the MX450 (compute
capability 7.5); it is harmless here only because hw.cpp contains no device
code, so nvcc merely links. The ppc64le-linux paths also look copied from a
ppc64le setup; on x86_64 the equivalent directory is
/opt/cuda/targets/x86_64-linux, and something like
"g++ hw.cpp -I/opt/cuda/include -L/opt/cuda/lib64 -lcudart -lcudnn -o hw"
would likely suffice for this host-only program.
$ ./hw
Found 1 GPUs.
Device: NVIDIA GeForce MX450
Driver Version: 11080
Runtime Version: 11080
Compute capability: 7.5
Total amount of global memory: 1969815552 bytes
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Created cuDNN handle
Original array: 0 1 2 3 4 5 6 7 8 9
Destroyed cuDNN handle.
New array: 0.5 0.731059 0.880797 0.952574 0.982014 0.993307 0.997527 0.999089 0.999665 0.999877
*/
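
As a sanity check, the "New array" values match the sigmoid computed on the
host, sigmoid(x) = 1 / (1 + e^-x). A minimal host-only reference (no GPU
required), assuming the same 0..9 input as above:

#include <cmath>
#include <iostream>

// Host-side reference for the cuDNN output above: sigmoid(x) = 1 / (1 + e^-x).
int main()
{
    for (int i = 0; i < 10; i++)
        std::cout << 1.0 / (1.0 + std::exp(-double(i))) << " ";
    // Prints: 0.5 0.731059 0.880797 0.952574 ... matching the run log.
    std::cout << std::endl;
    return 0;
}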