FantasyVR/cusparse_spmv_dll.cpp

## cusparse_spmv_dll.cpp
#include <dlfcn.h>
#include <stdint.h>
#include <stdio.h>

// Copy-direction selector; it is the type of the last parameter of the `copy`
// function-pointer type. This file does not include the CUDA headers, so the
// enumeration is declared locally with explicit numeric values.
enum cudaMemcpyKind {
  cudaMemcpyHostToHost     = 0,  // host   -> host
  cudaMemcpyHostToDevice   = 1,  // host   -> device
  cudaMemcpyDeviceToHost   = 2,  // device -> host
  cudaMemcpyDeviceToDevice = 3,  // device -> device
  cudaMemcpyDefault        = 4
};


// Function-pointer types for CUDA runtime entry points that are resolved at
// run time with dlsym ("cudaMalloc", "cudaFree" and "cudaMemcpy" in main).
// Those runtime functions return a cudaError_t status code; it is modelled as
// int here (0 means success) so callers can inspect it. Callers that ignore
// the result keep working unchanged.
// NOTE(review): `malloc` and `free` reuse the names of the libc functions, so
// <stdlib.h> must stay out of this translation unit.
// NOTE(review): `gc` is not referenced by the visible code and the symbol it
// is meant for is not shown, so its signature is left as written.
typedef void (*gc)(int *count);
typedef int (*malloc) ( void** devPtr, size_t size );
typedef int (*free) ( void* devPtr );
typedef int (*copy) ( void* dst, const void* src, size_t count, cudaMemcpyKind kind );


// Opaque cuSPARSE object types. Only pointers to these structs are ever used
// in this file, so incomplete declarations are sufficient.
struct cusparseContext;
struct cusparseDnVecDescr;
struct cusparseSpMatDescr;

// Handle aliases: each one is a pointer to the matching opaque struct.
typedef struct cusparseContext    *cusparseHandle_t;
typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t;
typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t;
// Element-type tags handed to the cuSPARSE descriptor and SpMV calls.
// Listed in ascending numeric order; "R" entries are real scalars and "C"
// entries are complex values stored as a pair of the named scalar.
typedef enum cudaDataType_t {
  CUDA_R_32F  = 0,   /* real, float */
  CUDA_R_64F  = 1,   /* real, double */
  CUDA_R_16F  = 2,   /* real, half */
  CUDA_R_8I   = 3,   /* real, signed 8-bit int */
  CUDA_C_32F  = 4,   /* complex, pair of float */
  CUDA_C_64F  = 5,   /* complex, pair of double */
  CUDA_C_16F  = 6,   /* complex, pair of half */
  CUDA_C_8I   = 7,   /* complex, pair of signed 8-bit int */
  CUDA_R_8U   = 8,   /* real, unsigned 8-bit int */
  CUDA_C_8U   = 9,   /* complex, pair of unsigned 8-bit int */
  CUDA_R_32I  = 10,  /* real, signed 32-bit int */
  CUDA_C_32I  = 11,  /* complex, pair of signed 32-bit int */
  CUDA_R_32U  = 12,  /* real, unsigned 32-bit int */
  CUDA_C_32U  = 13,  /* complex, pair of unsigned 32-bit int */
  CUDA_R_16BF = 14,  /* real, nv_bfloat16 */
  CUDA_C_16BF = 15,  /* complex, pair of nv_bfloat16 */
  CUDA_R_4I   = 16,  /* real, signed 4-bit int */
  CUDA_C_4I   = 17,  /* complex, pair of signed 4-bit int */
  CUDA_R_4U   = 18,  /* real, unsigned 4-bit int */
  CUDA_C_4U   = 19,  /* complex, pair of unsigned 4-bit int */
  CUDA_R_16I  = 20,  /* real, signed 16-bit int */
  CUDA_C_16I  = 21,  /* complex, pair of signed 16-bit int */
  CUDA_R_16U  = 22,  /* real, unsigned 16-bit int */
  CUDA_C_16U  = 23,  /* complex, pair of unsigned 16-bit int */
  CUDA_R_64I  = 24,  /* real, signed 64-bit int */
  CUDA_C_64I  = 25,  /* complex, pair of signed 64-bit int */
  CUDA_R_64U  = 26,  /* real, unsigned 64-bit int */
  CUDA_C_64U  = 27   /* complex, pair of unsigned 64-bit int */
} cudaDataType;
// Integer width used for matrix/vector index arrays.
typedef enum {
  CUSPARSE_INDEX_16U = 1,  ///< indices are 16-bit unsigned integers
  CUSPARSE_INDEX_32I = 2,  ///< indices are 32-bit signed integers
  CUSPARSE_INDEX_64I = 3   ///< indices are 64-bit signed integers
} cusparseIndexType_t;
// Base of the index arrays: the enumerator value equals the first index.
typedef enum {
  CUSPARSE_INDEX_BASE_ZERO = 0,  // zero-based indices
  CUSPARSE_INDEX_BASE_ONE  = 1   // one-based indices
} cusparseIndexBase_t;
// Operation applied to the sparse matrix operand (the `opA` argument of the
// SpMV function-pointer types).
typedef enum {
  CUSPARSE_OPERATION_NON_TRANSPOSE       = 0,  // matrix used as stored
  CUSPARSE_OPERATION_TRANSPOSE           = 1,  // transposed matrix
  CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2   // conjugate-transposed matrix
} cusparseOperation_t;

// Algorithm selector for the sparse matrix-vector product (the `alg` argument
// of the SpMV function-pointer types).
typedef enum {
  CUSPARSE_SPMV_ALG_DEFAULT = 0,  // default algorithm choice
  CUSPARSE_SPMV_COO_ALG1    = 1,  // COO format, algorithm 1
  CUSPARSE_SPMV_CSR_ALG1    = 2,  // CSR format, algorithm 1
  CUSPARSE_SPMV_CSR_ALG2    = 3,  // CSR format, algorithm 2
  CUSPARSE_SPMV_COO_ALG2    = 4   // COO format, algorithm 2
} cusparseSpMVAlg_t;

typedef int (*create)(cusparseHandle_t *handle);
typedef int (*destroy)(cusparseHandle_t handle);
typedef int (*createDnVec)(cusparseDnVecDescr_t* dnVecDescr,
                    int               size,
                    void*                 values,
                    cudaDataType          valueType);
typedef int (*destoryDnVec)(cusparseDnVecDescr_t dnVecDescr);
typedef int (*createCSRSpMat)(cusparseSpMatDescr_t* spMatDescr,
                  int               rows,
                  int               cols,
                  int               nnz,
                  void*                 csrRowOffsets,
                  void*                 csrColInd,
                  void*                 csrValues,
                  cusparseIndexType_t   csrRowOffsetsType,
                  cusparseIndexType_t   csrColIndType,
                  cusparseIndexBase_t   idxBase,
                  cudaDataType          valueType);
typedef int(*destoryCSRSpMat)(cusparseSpMatDescr_t spMatDescr);

// Pointer type main binds to the symbol "cusparseSpMV_bufferSize".
// Takes the same operands as `spmv` below and reports, through *bufferSize,
// how many bytes of workspace the product needs.
typedef int (*csr_buffersize)(
    cusparseHandle_t handle, cusparseOperation_t opA,
    const void* alpha,
    const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
    const void* beta,
    const cusparseDnVecDescr_t vecY,
    cudaDataType computeType, cusparseSpMVAlg_t alg,
    size_t* bufferSize);

// Pointer type main binds to the symbol "cusparseSpMV".
// `alpha` and `beta` point at the scalars, `matA`/`vecX`/`vecY` are the
// operand descriptors and `externalBuffer` is the caller-provided workspace.
typedef int (*spmv)(
    cusparseHandle_t handle, cusparseOperation_t opA,
    const void* alpha,
    cusparseSpMatDescr_t matA, cusparseDnVecDescr_t vecX,
    const void* beta,
    cusparseDnVecDescr_t vecY,
    cudaDataType computeType, cusparseSpMVAlg_t alg,
    void* externalBuffer);

// Computes y = alpha*A*x + beta*y for a 4x4 CSR matrix through cuSPARSE and
// compares the outcome with a precomputed vector. Every CUDA runtime and
// cuSPARSE entry point is resolved with dlopen/dlsym instead of being linked.
//
// Returns 1 when a library or symbol cannot be loaded or when a cuSPARSE call
// reports a non-zero status. Otherwise returns 0 after printing the
// PASSED/FAILED verdict and the resulting vector.
int main(){
	// Absolute locations of the shared objects to load.
	const char *cudart_path =
		"/home/py/anaconda3/pkgs/cudatoolkit-11.2.2-he111cf0_8/lib/libcudart.so";
	const char *cusparse_path =
		"/home/py/anaconda3/pkgs/cudatoolkit-11.2.2-he111cf0_8/lib/libcusparse.so";

	// load cuda runtime library
	void *cuda_so = dlopen(cudart_path, RTLD_NOW);
	if (cuda_so == NULL) {
		const char *why = dlerror();
		fprintf(stderr, "dlopen(%s) failed: %s\n", cudart_path,
		        why ? why : "unknown error");
		return 1;
	}

	// load cusparse library
	void *cusparse_so = dlopen(cusparse_path, RTLD_NOW);
	if (cusparse_so == NULL) {
		const char *why = dlerror();
		fprintf(stderr, "dlopen(%s) failed: %s\n", cusparse_path,
		        why ? why : "unknown error");
		dlclose(cuda_so);
		return 1;
	}

	// dlsym returns void*, and C++ has no implicit conversion from void* to a
	// function pointer, so every lookup is cast to its pointer type.
	malloc cu_malloc = (malloc) dlsym(cuda_so, "cudaMalloc");
	free   cu_free   = (free)   dlsym(cuda_so, "cudaFree");
	copy   cu_cpy    = (copy)   dlsym(cuda_so, "cudaMemcpy");

	create          cp_create          = (create)          dlsym(cusparse_so, "cusparseCreate");
	destroy         cp_destory         = (destroy)         dlsym(cusparse_so, "cusparseDestroy");
	createDnVec     cp_createDnVec     = (createDnVec)     dlsym(cusparse_so, "cusparseCreateDnVec");
	destoryDnVec    cp_destoryDnVec    = (destoryDnVec)    dlsym(cusparse_so, "cusparseDestroyDnVec");
	createCSRSpMat  cp_createCSRSpMat  = (createCSRSpMat)  dlsym(cusparse_so, "cusparseCreateCsr");
	csr_buffersize  cp_csr_buffersize  = (csr_buffersize)  dlsym(cusparse_so, "cusparseSpMV_bufferSize");
	destoryCSRSpMat cp_destoryCSRSpMat = (destoryCSRSpMat) dlsym(cusparse_so, "cusparseDestroySpMat");
	spmv            cp_spmv            = (spmv)            dlsym(cusparse_so, "cusparseSpMV");

	// Calling through a null pointer would crash, so stop if any lookup failed.
	if (!cu_malloc || !cu_free || !cu_cpy || !cp_create || !cp_destory ||
	    !cp_createDnVec || !cp_destoryDnVec || !cp_createCSRSpMat ||
	    !cp_csr_buffersize || !cp_destoryCSRSpMat || !cp_spmv) {
		const char *why = dlerror();
		fprintf(stderr, "dlsym failed: %s\n", why ? why : "required symbol not found");
		dlclose(cusparse_so);
		dlclose(cuda_so);
		return 1;
	}

	// Host data
	const int A_num_rows      = 4;
	const int A_num_cols      = 4;
	const int A_nnz           = 9;
	int       hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
	int       hA_columns[]    = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
	float     hA_values[]     = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
	                              6.0f, 7.0f, 8.0f, 9.0f };
	float     hX[]            = { 1.0f, 2.0f, 3.0f, 4.0f };
	float     hY[]            = { 1.0f, 0.0f, 0.0f, 0.0f };
	float     hY_result[]     = { 20.0f, 8.0f, 51.0f, 52.0f };
	float     alpha           = 1.0f;
	float     beta            = 1.0f;

	// Device memory management. The return values of the runtime calls are
	// not inspected here.
	int   *dA_csrOffsets = NULL, *dA_columns = NULL;
	float *dA_values = NULL, *dX = NULL, *dY = NULL;
	cu_malloc((void**) &dA_csrOffsets, (A_num_rows + 1) * sizeof(int));
	cu_malloc((void**) &dA_columns,    A_nnz * sizeof(int));
	cu_malloc((void**) &dA_values,     A_nnz * sizeof(float));
	cu_malloc((void**) &dX,            A_num_cols * sizeof(float));
	cu_malloc((void**) &dY,            A_num_rows * sizeof(float));
	cu_cpy(dA_csrOffsets, hA_csrOffsets, (A_num_rows + 1) * sizeof(int),
	       cudaMemcpyHostToDevice);
	cu_cpy(dA_columns, hA_columns, A_nnz * sizeof(int), cudaMemcpyHostToDevice);
	cu_cpy(dA_values, hA_values, A_nnz * sizeof(float), cudaMemcpyHostToDevice);
	cu_cpy(dX, hX, A_num_cols * sizeof(float), cudaMemcpyHostToDevice);
	cu_cpy(dY, hY, A_num_rows * sizeof(float), cudaMemcpyHostToDevice);

	// cuSPARSE objects start out null so cleanup can tell which ones exist.
	cusparseHandle_t     handle     = NULL;
	cusparseSpMatDescr_t matA       = NULL;
	cusparseDnVecDescr_t vecX       = NULL, vecY = NULL;
	void*                dBuffer    = NULL;
	size_t               bufferSize = 0;

	// Each cuSPARSE step runs only while all earlier steps returned 0.
	int status = cp_create(&handle);
	if (status == 0)
		status = cp_createDnVec(&vecX, A_num_cols, dX, CUDA_R_32F);
	if (status == 0)
		status = cp_createDnVec(&vecY, A_num_rows, dY, CUDA_R_32F);
	if (status == 0)
		status = cp_createCSRSpMat(&matA, A_num_rows, A_num_cols, A_nnz,
		                           dA_csrOffsets, dA_columns, dA_values,
		                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
		                           CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
	if (status == 0)
		status = cp_csr_buffersize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
		                           &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
		                           CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
	if (status == 0) {
		cu_malloc((void**) &dBuffer, bufferSize);
		// %zu is the printf conversion that matches size_t.
		printf("buffersize %zu\n", bufferSize);
		status = cp_spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
		                 &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
		                 CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
	}

	// Release only the cuSPARSE objects that were actually created.
	if (vecX != NULL) cp_destoryDnVec(vecX);
	if (vecY != NULL) cp_destoryDnVec(vecY);
	if (matA != NULL) cp_destoryCSRSpMat(matA);
	if (handle != NULL) cp_destory(handle);

	if (status == 0) {
		cu_cpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost);

		int correct = 1;
		for (int i = 0; i < A_num_rows; i++) {
			if (hY[i] != hY_result[i]) { // direct floating point comparison is not
				correct = 0;             // reliable
				break;
			}
		}
		if (correct)
			printf("spmv_csr_example test PASSED\n");
		else
			printf("spmv_csr_example test FAILED: wrong result\n");

		printf("%f %f %f %f\n", hY[0], hY[1], hY[2], hY[3]);
	} else {
		fprintf(stderr, "cuSPARSE call failed with status %d\n", status);
	}

	// Free memory
	cu_free(dBuffer);
	cu_free(dA_csrOffsets);
	cu_free(dA_columns);
	cu_free(dA_values);
	cu_free(dX);
	cu_free(dY);

	dlclose(cuda_so);
	dlclose(cusparse_so);
	return status == 0 ? 0 : 1;
}
	#include <dlfcn.h>
	#include <stdio.h>

	// Copy-direction selector; it is the type of the last parameter of the
	// `copy` function-pointer type. The numeric values are given explicitly.
	enum cudaMemcpyKind {
		cudaMemcpyHostToHost     = 0,  // host   -> host
		cudaMemcpyHostToDevice   = 1,  // host   -> device
		cudaMemcpyDeviceToHost   = 2,  // device -> host
		cudaMemcpyDeviceToDevice = 3,  // device -> device
		cudaMemcpyDefault        = 4
	};


	// Function-pointer types for CUDA runtime entry points resolved with dlsym
	// ("cudaMalloc", "cudaFree" and "cudaMemcpy" in main). The pointer
	// declarators are required: without them these lines declare function
	// types with `void` parameters and do not compile.
	// The runtime functions return a cudaError_t status code, modelled as int
	// here (0 means success) so callers can inspect it.
	// NOTE(review): `gc` is not referenced by the visible code and the symbol
	// it is meant for is not shown, so its return type is left as written.
	typedef void (*gc)(int *count);
	typedef int (*malloc) ( void** devPtr, size_t size );
	typedef int (*free) ( void* devPtr );
	typedef int (*copy) ( void* dst, const void* src, size_t count, cudaMemcpyKind kind );


	// Opaque cuSPARSE object types. Only pointers to these structs are used,
	// so incomplete declarations are sufficient.
	struct cusparseContext;
	struct cusparseDnVecDescr;
	struct cusparseSpMatDescr;

	// Handle aliases: each one is a pointer to the matching opaque struct.
	typedef struct cusparseContext    *cusparseHandle_t;
	typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t;
	typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t;
	// Element-type tags handed to the cuSPARSE descriptor and SpMV calls.
	// Listed in ascending numeric order; "R" entries are real scalars and "C"
	// entries are complex values stored as a pair of the named scalar.
	typedef enum cudaDataType_t {
		CUDA_R_32F  = 0,   /* real, float */
		CUDA_R_64F  = 1,   /* real, double */
		CUDA_R_16F  = 2,   /* real, half */
		CUDA_R_8I   = 3,   /* real, signed 8-bit int */
		CUDA_C_32F  = 4,   /* complex, pair of float */
		CUDA_C_64F  = 5,   /* complex, pair of double */
		CUDA_C_16F  = 6,   /* complex, pair of half */
		CUDA_C_8I   = 7,   /* complex, pair of signed 8-bit int */
		CUDA_R_8U   = 8,   /* real, unsigned 8-bit int */
		CUDA_C_8U   = 9,   /* complex, pair of unsigned 8-bit int */
		CUDA_R_32I  = 10,  /* real, signed 32-bit int */
		CUDA_C_32I  = 11,  /* complex, pair of signed 32-bit int */
		CUDA_R_32U  = 12,  /* real, unsigned 32-bit int */
		CUDA_C_32U  = 13,  /* complex, pair of unsigned 32-bit int */
		CUDA_R_16BF = 14,  /* real, nv_bfloat16 */
		CUDA_C_16BF = 15,  /* complex, pair of nv_bfloat16 */
		CUDA_R_4I   = 16,  /* real, signed 4-bit int */
		CUDA_C_4I   = 17,  /* complex, pair of signed 4-bit int */
		CUDA_R_4U   = 18,  /* real, unsigned 4-bit int */
		CUDA_C_4U   = 19,  /* complex, pair of unsigned 4-bit int */
		CUDA_R_16I  = 20,  /* real, signed 16-bit int */
		CUDA_C_16I  = 21,  /* complex, pair of signed 16-bit int */
		CUDA_R_16U  = 22,  /* real, unsigned 16-bit int */
		CUDA_C_16U  = 23,  /* complex, pair of unsigned 16-bit int */
		CUDA_R_64I  = 24,  /* real, signed 64-bit int */
		CUDA_C_64I  = 25,  /* complex, pair of signed 64-bit int */
		CUDA_R_64U  = 26,  /* real, unsigned 64-bit int */
		CUDA_C_64U  = 27   /* complex, pair of unsigned 64-bit int */
	} cudaDataType;
	// Integer width used for matrix/vector index arrays.
	typedef enum {
		CUSPARSE_INDEX_16U = 1,  ///< indices are 16-bit unsigned integers
		CUSPARSE_INDEX_32I = 2,  ///< indices are 32-bit signed integers
		CUSPARSE_INDEX_64I = 3   ///< indices are 64-bit signed integers
	} cusparseIndexType_t;
	// Base of the index arrays: the enumerator value equals the first index.
	typedef enum {
		CUSPARSE_INDEX_BASE_ZERO = 0,  // zero-based indices
		CUSPARSE_INDEX_BASE_ONE  = 1   // one-based indices
	} cusparseIndexBase_t;
	// Operation applied to the sparse matrix operand (the `opA` argument of
	// the SpMV function-pointer types).
	typedef enum {
		CUSPARSE_OPERATION_NON_TRANSPOSE       = 0,  // matrix used as stored
		CUSPARSE_OPERATION_TRANSPOSE           = 1,  // transposed matrix
		CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2   // conjugate-transposed matrix
	} cusparseOperation_t;

	// Algorithm selector for the sparse matrix-vector product (the `alg`
	// argument of the SpMV function-pointer types).
	typedef enum {
		CUSPARSE_SPMV_ALG_DEFAULT = 0,  // default algorithm choice
		CUSPARSE_SPMV_COO_ALG1    = 1,  // COO format, algorithm 1
		CUSPARSE_SPMV_CSR_ALG1    = 2,  // CSR format, algorithm 1
		CUSPARSE_SPMV_CSR_ALG2    = 3,  // CSR format, algorithm 2
		CUSPARSE_SPMV_COO_ALG2    = 4   // COO format, algorithm 2
	} cusparseSpMVAlg_t;

	typedef int (create)(cusparseHandle_t handle);
	typedef int (*destroy)(cusparseHandle_t handle);
	typedef int (createDnVec)(cusparseDnVecDescr_t dnVecDescr,
	int size,
	void* values,
	cudaDataType valueType);
	typedef int (*destoryDnVec)(cusparseDnVecDescr_t dnVecDescr);
	typedef int (createCSRSpMat)(cusparseSpMatDescr_t spMatDescr,
	int rows,
	int cols,
	int nnz,
	void* csrRowOffsets,
	void* csrColInd,
	void* csrValues,
	cusparseIndexType_t csrRowOffsetsType,
	cusparseIndexType_t csrColIndType,
	cusparseIndexBase_t idxBase,
	cudaDataType valueType);
	typedef int(*destoryCSRSpMat)(cusparseSpMatDescr_t spMatDescr);

	// Pointer type main binds to the symbol "cusparseSpMV_bufferSize".
	// Takes the same operands as `spmv` below and reports, through
	// *bufferSize, how many bytes of workspace the product needs.
	typedef int (*csr_buffersize)(
		cusparseHandle_t handle, cusparseOperation_t opA,
		const void* alpha,
		const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
		const void* beta,
		const cusparseDnVecDescr_t vecY,
		cudaDataType computeType, cusparseSpMVAlg_t alg,
		size_t* bufferSize);

	// Pointer type main binds to the symbol "cusparseSpMV".
	// `alpha` and `beta` point at the scalars, `matA`/`vecX`/`vecY` are the
	// operand descriptors and `externalBuffer` is the caller-provided workspace.
	typedef int (*spmv)(
		cusparseHandle_t handle, cusparseOperation_t opA,
		const void* alpha,
		cusparseSpMatDescr_t matA, cusparseDnVecDescr_t vecX,
		const void* beta,
		cusparseDnVecDescr_t vecY,
		cudaDataType computeType, cusparseSpMVAlg_t alg,
		void* externalBuffer);

	// Computes y = alpha*A*x + beta*y for a 4x4 CSR matrix through cuSPARSE
	// and compares the outcome with a precomputed vector. Every CUDA runtime
	// and cuSPARSE entry point is resolved with dlopen/dlsym instead of being
	// linked.
	//
	// Returns 1 when a library or symbol cannot be loaded or when a cuSPARSE
	// call reports a non-zero status. Otherwise returns 0 after printing the
	// PASSED/FAILED verdict and the resulting vector.
	int main(){
		// Absolute locations of the shared objects to load.
		const char *cudart_path =
			"/home/py/anaconda3/pkgs/cudatoolkit-11.2.2-he111cf0_8/lib/libcudart.so";
		const char *cusparse_path =
			"/home/py/anaconda3/pkgs/cudatoolkit-11.2.2-he111cf0_8/lib/libcusparse.so";

		// load cuda runtime library
		void *cuda_so = dlopen(cudart_path, RTLD_NOW);
		if (cuda_so == NULL) {
			const char *why = dlerror();
			fprintf(stderr, "dlopen(%s) failed: %s\n", cudart_path,
			        why ? why : "unknown error");
			return 1;
		}

		// load cusparse library
		void *cusparse_so = dlopen(cusparse_path, RTLD_NOW);
		if (cusparse_so == NULL) {
			const char *why = dlerror();
			fprintf(stderr, "dlopen(%s) failed: %s\n", cusparse_path,
			        why ? why : "unknown error");
			dlclose(cuda_so);
			return 1;
		}

		// dlsym returns void*, and C++ has no implicit conversion from void*
		// to a function pointer, so every lookup is cast to its pointer type.
		malloc cu_malloc = (malloc) dlsym(cuda_so, "cudaMalloc");
		free   cu_free   = (free)   dlsym(cuda_so, "cudaFree");
		copy   cu_cpy    = (copy)   dlsym(cuda_so, "cudaMemcpy");

		create          cp_create          = (create)          dlsym(cusparse_so, "cusparseCreate");
		destroy         cp_destory         = (destroy)         dlsym(cusparse_so, "cusparseDestroy");
		createDnVec     cp_createDnVec     = (createDnVec)     dlsym(cusparse_so, "cusparseCreateDnVec");
		destoryDnVec    cp_destoryDnVec    = (destoryDnVec)    dlsym(cusparse_so, "cusparseDestroyDnVec");
		createCSRSpMat  cp_createCSRSpMat  = (createCSRSpMat)  dlsym(cusparse_so, "cusparseCreateCsr");
		csr_buffersize  cp_csr_buffersize  = (csr_buffersize)  dlsym(cusparse_so, "cusparseSpMV_bufferSize");
		destoryCSRSpMat cp_destoryCSRSpMat = (destoryCSRSpMat) dlsym(cusparse_so, "cusparseDestroySpMat");
		spmv            cp_spmv            = (spmv)            dlsym(cusparse_so, "cusparseSpMV");

		// Calling through a null pointer would crash, so stop if any lookup
		// failed.
		if (!cu_malloc || !cu_free || !cu_cpy || !cp_create || !cp_destory ||
		    !cp_createDnVec || !cp_destoryDnVec || !cp_createCSRSpMat ||
		    !cp_csr_buffersize || !cp_destoryCSRSpMat || !cp_spmv) {
			const char *why = dlerror();
			fprintf(stderr, "dlsym failed: %s\n", why ? why : "required symbol not found");
			dlclose(cusparse_so);
			dlclose(cuda_so);
			return 1;
		}

		// Host data
		const int A_num_rows      = 4;
		const int A_num_cols      = 4;
		const int A_nnz           = 9;
		int       hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
		int       hA_columns[]    = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
		float     hA_values[]     = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
		                              6.0f, 7.0f, 8.0f, 9.0f };
		float     hX[]            = { 1.0f, 2.0f, 3.0f, 4.0f };
		float     hY[]            = { 1.0f, 0.0f, 0.0f, 0.0f };
		float     hY_result[]     = { 20.0f, 8.0f, 51.0f, 52.0f };
		float     alpha           = 1.0f;
		float     beta            = 1.0f;

		// Device memory management. The device buffers are pointers, and each
		// byte count is element count times element size. The return values of
		// the runtime calls are not inspected here.
		int   *dA_csrOffsets = NULL, *dA_columns = NULL;
		float *dA_values = NULL, *dX = NULL, *dY = NULL;
		cu_malloc((void**) &dA_csrOffsets, (A_num_rows + 1) * sizeof(int));
		cu_malloc((void**) &dA_columns,    A_nnz * sizeof(int));
		cu_malloc((void**) &dA_values,     A_nnz * sizeof(float));
		cu_malloc((void**) &dX,            A_num_cols * sizeof(float));
		cu_malloc((void**) &dY,            A_num_rows * sizeof(float));
		cu_cpy(dA_csrOffsets, hA_csrOffsets, (A_num_rows + 1) * sizeof(int),
		       cudaMemcpyHostToDevice);
		cu_cpy(dA_columns, hA_columns, A_nnz * sizeof(int), cudaMemcpyHostToDevice);
		cu_cpy(dA_values, hA_values, A_nnz * sizeof(float), cudaMemcpyHostToDevice);
		cu_cpy(dX, hX, A_num_cols * sizeof(float), cudaMemcpyHostToDevice);
		cu_cpy(dY, hY, A_num_rows * sizeof(float), cudaMemcpyHostToDevice);

		// cuSPARSE objects start out null so cleanup can tell which ones exist.
		cusparseHandle_t     handle     = NULL;
		cusparseSpMatDescr_t matA       = NULL;
		cusparseDnVecDescr_t vecX       = NULL, vecY = NULL;
		void*                dBuffer    = NULL;
		size_t               bufferSize = 0;

		// Each cuSPARSE step runs only while all earlier steps returned 0.
		int status = cp_create(&handle);
		if (status == 0)
			status = cp_createDnVec(&vecX, A_num_cols, dX, CUDA_R_32F);
		if (status == 0)
			status = cp_createDnVec(&vecY, A_num_rows, dY, CUDA_R_32F);
		if (status == 0)
			status = cp_createCSRSpMat(&matA, A_num_rows, A_num_cols, A_nnz,
			                           dA_csrOffsets, dA_columns, dA_values,
			                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
			                           CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
		if (status == 0)
			status = cp_csr_buffersize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
			                           &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
			                           CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
		if (status == 0) {
			cu_malloc((void**) &dBuffer, bufferSize);
			// %zu is the printf conversion that matches size_t.
			printf("buffersize %zu\n", bufferSize);
			status = cp_spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
			                 &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
			                 CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
		}

		// Release only the cuSPARSE objects that were actually created.
		if (vecX != NULL) cp_destoryDnVec(vecX);
		if (vecY != NULL) cp_destoryDnVec(vecY);
		if (matA != NULL) cp_destoryCSRSpMat(matA);
		if (handle != NULL) cp_destory(handle);

		if (status == 0) {
			cu_cpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost);

			int correct = 1;
			for (int i = 0; i < A_num_rows; i++) {
				if (hY[i] != hY_result[i]) { // direct floating point comparison is not
					correct = 0;             // reliable
					break;
				}
			}
			if (correct)
				printf("spmv_csr_example test PASSED\n");
			else
				printf("spmv_csr_example test FAILED: wrong result\n");

			printf("%f %f %f %f\n", hY[0], hY[1], hY[2], hY[3]);
		} else {
			fprintf(stderr, "cuSPARSE call failed with status %d\n", status);
		}

		// Free memory
		cu_free(dBuffer);
		cu_free(dA_csrOffsets);
		cu_free(dA_columns);
		cu_free(dA_values);
		cu_free(dX);
		cu_free(dY);

		dlclose(cuda_so);
		dlclose(cusparse_so);
		return status == 0 ? 0 : 1;
	}