Skip to content

Instantly share code, notes, and snippets.

@FantasyVR
Created August 14, 2022 17:44
Show Gist options
  • Save FantasyVR/cf717d18fe0dddff640478c02b397b9a to your computer and use it in GitHub Desktop.
Save FantasyVR/cf717d18fe0dddff640478c02b397b9a to your computer and use it in GitHub Desktop.
#include <dlfcn.h>
#include <stdint.h>
#include <stdio.h>
/*
 * Direction of a memory copy, with the same enumerator names and values as
 * the CUDA runtime's cudaMemcpyKind, declared locally so that no CUDA header
 * is needed at build time.
 *
 * The typedef makes the bare name `cudaMemcpyKind` a usable type when this
 * file is compiled as C, where an enum tag on its own is not a type name.
 * `enum cudaMemcpyKind` keeps working as before.
 */
typedef enum cudaMemcpyKind {
    cudaMemcpyHostToHost = 0,     /* host   -> host   */
    cudaMemcpyHostToDevice = 1,   /* host   -> device */
    cudaMemcpyDeviceToHost = 2,   /* device -> host   */
    cudaMemcpyDeviceToDevice = 3, /* device -> device */
    cudaMemcpyDefault = 4         /* direction left to the runtime to determine */
} cudaMemcpyKind;
/*
 * Function-pointer types for CUDA runtime entry points resolved with dlsym().
 *
 * The runtime functions report a cudaError_t status (0 means success). It is
 * declared as int here so that callers can inspect it; a void return type
 * would silently discard every error.
 *
 * NOTE: the type names `malloc` and `free` collide with the C library
 * functions of the same name, so <stdlib.h> must not be included in this
 * translation unit.
 */
/* Not used in this file; presumably cudaGetDeviceCount -- TODO confirm. */
typedef int (*gc)(int *count);
/* cudaMalloc: allocates `size` bytes and stores the pointer in *devPtr. */
typedef int (*malloc)(void **devPtr, size_t size);
/* cudaFree: releases memory obtained through cudaMalloc. */
typedef int (*free)(void *devPtr);
/* cudaMemcpy: copies `count` bytes from src to dst in the direction `kind`. */
typedef int (*copy)(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind);
/*
 * Opaque cuSPARSE object types. The structures are never defined in this
 * file; the program only passes pointers to them through the library calls.
 */
struct cusparseContext;
struct cusparseSpMatDescr;
struct cusparseDnVecDescr;

typedef struct cusparseContext *cusparseHandle_t;         /* library handle           */
typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t;  /* sparse matrix descriptor */
typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t;  /* dense vector descriptor  */
/*
 * Element type tags passed to the library (R = real, C = complex pair),
 * listed here in ascending numeric order.
 */
typedef enum cudaDataType_t {
    CUDA_R_32F  = 0,  /* real float */
    CUDA_R_64F  = 1,  /* real double */
    CUDA_R_16F  = 2,  /* real half */
    CUDA_R_8I   = 3,  /* real signed 8-bit int */
    CUDA_C_32F  = 4,  /* complex pair of floats */
    CUDA_C_64F  = 5,  /* complex pair of doubles */
    CUDA_C_16F  = 6,  /* complex pair of halves */
    CUDA_C_8I   = 7,  /* complex pair of signed 8-bit ints */
    CUDA_R_8U   = 8,  /* real unsigned 8-bit int */
    CUDA_C_8U   = 9,  /* complex pair of unsigned 8-bit ints */
    CUDA_R_32I  = 10, /* real signed 32-bit int */
    CUDA_C_32I  = 11, /* complex pair of signed 32-bit ints */
    CUDA_R_32U  = 12, /* real unsigned 32-bit int */
    CUDA_C_32U  = 13, /* complex pair of unsigned 32-bit ints */
    CUDA_R_16BF = 14, /* real nv_bfloat16 */
    CUDA_C_16BF = 15, /* complex pair of nv_bfloat16 */
    CUDA_R_4I   = 16, /* real signed 4-bit int */
    CUDA_C_4I   = 17, /* complex pair of signed 4-bit ints */
    CUDA_R_4U   = 18, /* real unsigned 4-bit int */
    CUDA_C_4U   = 19, /* complex pair of unsigned 4-bit ints */
    CUDA_R_16I  = 20, /* real signed 16-bit int */
    CUDA_C_16I  = 21, /* complex pair of signed 16-bit ints */
    CUDA_R_16U  = 22, /* real unsigned 16-bit int */
    CUDA_C_16U  = 23, /* complex pair of unsigned 16-bit ints */
    CUDA_R_64I  = 24, /* real signed 64-bit int */
    CUDA_C_64I  = 25, /* complex pair of signed 64-bit ints */
    CUDA_R_64U  = 26, /* real unsigned 64-bit int */
    CUDA_C_64U  = 27  /* complex pair of unsigned 64-bit ints */
} cudaDataType;
/* Integer type of the index arrays handed to the library. */
typedef enum {
    CUSPARSE_INDEX_16U = 1, /* 16-bit unsigned integer for matrix/vector indices */
    CUSPARSE_INDEX_32I = 2, /* 32-bit signed integer for matrix/vector indices */
    CUSPARSE_INDEX_64I = 3  /* 64-bit signed integer for matrix/vector indices */
} cusparseIndexType_t;
/* First value used by the index arrays: zero-based or one-based. */
typedef enum {
    CUSPARSE_INDEX_BASE_ZERO = 0, /* indices start at 0 */
    CUSPARSE_INDEX_BASE_ONE  = 1  /* indices start at 1 */
} cusparseIndexBase_t;
/* Operation selector passed as `opA` to the SpMV entry points. */
typedef enum {
    CUSPARSE_OPERATION_NON_TRANSPOSE       = 0, /* use the matrix as stored */
    CUSPARSE_OPERATION_TRANSPOSE           = 1, /* use its transpose */
    CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2  /* use its conjugate transpose */
} cusparseOperation_t;
/* Algorithm selector passed as `alg` to the SpMV entry points. */
typedef enum {
    /* library default */
    CUSPARSE_SPMV_ALG_DEFAULT = 0,
    /* COO variants */
    CUSPARSE_SPMV_COO_ALG1 = 1,
    CUSPARSE_SPMV_COO_ALG2 = 4,
    /* CSR variants */
    CUSPARSE_SPMV_CSR_ALG1 = 2,
    CUSPARSE_SPMV_CSR_ALG2 = 3
} cusparseSpMVAlg_t;
/*
 * Handle lifecycle entry points (main binds them to cusparseCreate and
 * cusparseDestroy). The int return value is the library status code.
 */
typedef int (*create)(cusparseHandle_t *handle);  /* writes a new handle to *handle */
typedef int (*destroy)(cusparseHandle_t handle);  /* releases a handle */
/*
 * Dense-vector descriptor entry points (main binds them to
 * cusparseCreateDnVec and cusparseDestroyDnVec). The int return value is
 * the library status code.
 *
 * `size` is int64_t to match the library prototype. Declaring it as int
 * passes a narrower argument than the callee reads, which leaves the upper
 * 32 bits of the value unspecified.
 */
typedef int (*createDnVec)(cusparseDnVecDescr_t *dnVecDescr, /* out: new descriptor */
                           int64_t size,                     /* number of elements  */
                           void *values,                     /* element storage     */
                           cudaDataType valueType);          /* element type tag    */
typedef int (*destoryDnVec)(cusparseDnVecDescr_t dnVecDescr);
/*
 * CSR sparse-matrix descriptor entry points (main binds them to
 * cusparseCreateCsr and cusparseDestroySpMat). The int return value is the
 * library status code.
 *
 * `rows`, `cols` and `nnz` are int64_t to match the library prototype.
 * Declaring them as int passes narrower arguments than the callee reads,
 * which leaves the upper 32 bits of each value unspecified.
 */
typedef int (*createCSRSpMat)(cusparseSpMatDescr_t *spMatDescr, /* out: new descriptor */
                              int64_t rows,
                              int64_t cols,
                              int64_t nnz,
                              void *csrRowOffsets,
                              void *csrColInd,
                              void *csrValues,
                              cusparseIndexType_t csrRowOffsetsType,
                              cusparseIndexType_t csrColIndType,
                              cusparseIndexBase_t idxBase,
                              cudaDataType valueType);
typedef int (*destoryCSRSpMat)(cusparseSpMatDescr_t spMatDescr);
/*
 * SpMV entry points (main binds them to cusparseSpMV_bufferSize and
 * cusparseSpMV). Both take the same operands; the first reports the
 * workspace size through `bufferSize`, the second receives that workspace
 * as `externalBuffer`. The int return value is the library status code.
 */
typedef int (*csr_buffersize)(
    cusparseHandle_t handle,
    cusparseOperation_t opA,
    const void *alpha,
    const cusparseSpMatDescr_t matA,
    const cusparseDnVecDescr_t vecX,
    const void *beta,
    const cusparseDnVecDescr_t vecY,
    cudaDataType computeType,
    cusparseSpMVAlg_t alg,
    size_t *bufferSize);

typedef int (*spmv)(
    cusparseHandle_t handle,
    cusparseOperation_t opA,
    const void *alpha,
    cusparseSpMatDescr_t matA,
    cusparseDnVecDescr_t vecX,
    const void *beta,
    cusparseDnVecDescr_t vecY,
    cudaDataType computeType,
    cusparseSpMVAlg_t alg,
    void *externalBuffer);
int main(){
// load cuda runtime api functions
// void *cuda_so = dlopen("/usr/lib/x86_64-linux-gnu/libcuda.so", RTLD_NOW);
void *cuda_so = dlopen("/home/py/anaconda3/pkgs/cudatoolkit-11.2.2-he111cf0_8/lib/libcudart.so", RTLD_NOW);
malloc cu_malloc = dlsym(cuda_so, "cudaMalloc");
free cu_free = dlsym(cuda_so, "cudaFree");
copy cu_cpy = dlsym(cuda_so, "cudaMemcpy");
// load cusparse api functions
void *cusparse_so = dlopen("/home/py/anaconda3/pkgs/cudatoolkit-11.2.2-he111cf0_8/lib/libcusparse.so", RTLD_NOW);
create cp_create = dlsym(cusparse_so, "cusparseCreate");
destroy cp_destory = dlsym(cusparse_so, "cusparseDestroy");
createDnVec cp_createDnVec = dlsym(cusparse_so, "cusparseCreateDnVec");
destoryDnVec cp_destoryDnVec = dlsym(cusparse_so, "cusparseDestroyDnVec");
createCSRSpMat cp_createCSRSpMat = dlsym(cusparse_so, "cusparseCreateCsr");
csr_buffersize cp_csr_buffersize = dlsym(cusparse_so, "cusparseSpMV_bufferSize");
destoryCSRSpMat cp_destoryCSRSpMat = dlsym(cusparse_so, "cusparseDestroySpMat");
spmv cp_spmv = dlsym(cusparse_so, "cusparseSpMV");
// Host data
const int A_num_rows = 4;
const int A_num_cols = 4;
const int A_nnz = 9;
int hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
int hA_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
float hA_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f };
float hX[] = { 1.0f, 2.0f, 3.0f, 4.0f };
float hY[] = { 1.0f, 0.0f, 0.0f, 0.0f };
float hY_result[] = { 20.0f, 8.0f, 51.0f, 52.0f };
float hY_result_2[] = { 0.0f, 0.0f, 0.0f, 0.0f };
float alpha = 1.0f;
float beta = 1.0f;
// Device memory management
int *dA_csrOffsets, *dA_columns;
float *dA_values, *dX, *dY;
cu_malloc((void**) &dA_csrOffsets,(A_num_rows + 1) * sizeof(int));
cu_malloc((void**) &dA_columns, A_nnz * sizeof(int));
cu_malloc((void**) &dA_values, A_nnz * sizeof(float));
cu_malloc((void**) &dX, A_num_cols * sizeof(float));
cu_malloc((void**) &dY, A_num_rows * sizeof(float));
cu_cpy(dA_csrOffsets, hA_csrOffsets, (A_num_rows + 1) * sizeof(int),
cudaMemcpyHostToDevice);
cu_cpy(dA_columns, hA_columns, A_nnz * sizeof(int),cudaMemcpyHostToDevice);
cu_cpy(dA_values, hA_values, A_nnz * sizeof(float),cudaMemcpyHostToDevice) ;
cu_cpy(dX, hX, A_num_cols * sizeof(float),cudaMemcpyHostToDevice);
cu_cpy(dY, hY, A_num_rows * sizeof(float), cudaMemcpyHostToDevice);
// create cusparse handle
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA;
cusparseDnVecDescr_t vecX, vecY;
void* dBuffer = NULL;
size_t bufferSize = 0;
cp_create(&handle);
cp_createDnVec(&vecX, A_num_cols, dX, CUDA_R_32F);
cp_createDnVec(&vecY, A_num_rows, dY, CUDA_R_32F);
cp_createCSRSpMat(&matA, A_num_rows, A_num_cols, A_nnz,
dA_csrOffsets, dA_columns, dA_values,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
cp_csr_buffersize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
cu_malloc((void**) &dBuffer, bufferSize);
printf("buffersize %lu\n", bufferSize);
cp_spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);
cp_destoryDnVec(vecX);
cp_destoryDnVec(vecY);
cp_destoryCSRSpMat(matA);
cp_destory(handle);
cu_cpy(hY, dY, A_num_rows * sizeof(float),
cudaMemcpyDeviceToHost);
int correct = 1;
for (int i = 0; i < A_num_rows; i++) {
if (hY[i] != hY_result[i]) { // direct floating point comparison is not
correct = 0; // reliable
break;
}
}
if (correct)
printf("spmv_csr_example test PASSED\n");
else
printf("spmv_csr_example test FAILED: wrong result\n");
printf("%f %f %f %f\n", hY[0], hY[1], hY[2], hY[3]);
// Free memory
cu_free(dBuffer);
cu_free(dA_csrOffsets);
cu_free(dA_columns) ;
cu_free(dA_values) ;
cu_free(dX) ;
cu_free(dY) ;
dlclose(cuda_so);
dlclose(cusparse_so);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment