Created
August 14, 2022 17:44
-
-
Save FantasyVR/cf717d18fe0dddff640478c02b397b9a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <dlfcn.h>
#include <stdint.h>
#include <stdio.h>
/*
 * Copy-direction selector handed to the dynamically loaded cudaMemcpy.
 * The values are passed through unchanged, so they have to agree with the
 * CUDA runtime's own definition of this enum.
 */
enum cudaMemcpyKind {
    cudaMemcpyHostToHost     = 0,  /* host   -> host   */
    cudaMemcpyHostToDevice   = 1,  /* host   -> device */
    cudaMemcpyDeviceToHost   = 2,  /* device -> host   */
    cudaMemcpyDeviceToDevice = 3,  /* device -> device */
    cudaMemcpyDefault        = 4
};
/*
 * Function-pointer types for the CUDA runtime entry points that main()
 * resolves with dlsym().
 *
 * The runtime functions report a cudaError_t status; it is declared as plain
 * int here because no CUDA header is included. Declaring the return type as
 * void would make every call go through an incompatible function type.
 *
 * NOTE: the names `malloc` and `free` collide with <stdlib.h>; do not include
 * that header in this translation unit.
 */
/* NOTE(review): `gc` is not used in this file; presumably cudaGetDeviceCount - confirm. */
typedef int (*gc)(int *count);
/* cudaMalloc: stores the address of `size` bytes of device memory in *devPtr. */
typedef int (*malloc)(void **devPtr, size_t size);
/* cudaFree: releases memory obtained through cudaMalloc. */
typedef int (*free)(void *devPtr);
/* cudaMemcpy: copies `count` bytes from src to dst in the direction given by `kind`.
 * The explicit `enum` keyword keeps this declaration valid in C as well as C++. */
typedef int (*copy)(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind);
/*
 * Opaque cuSPARSE object handles. Only pointers to these structs are ever
 * used, so the struct types stay incomplete; each typedef also serves as the
 * forward declaration of its struct tag.
 */
typedef struct cusparseContext    *cusparseHandle_t;      /* library context      */
typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t;  /* dense-vector descr.  */
typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t;  /* sparse-matrix descr. */
/*
 * Element-type tags passed to the cuSPARSE descriptor and SpMV calls.
 * R = real, C = complex (a pair of the named scalar type).
 * Listed in ascending numeric order; the values themselves are unchanged.
 */
typedef enum cudaDataType_t {
    CUDA_R_32F  = 0,   /* real as a float */
    CUDA_R_64F  = 1,   /* real as a double */
    CUDA_R_16F  = 2,   /* real as a half */
    CUDA_R_8I   = 3,   /* real as a signed 8-bit int */
    CUDA_C_32F  = 4,   /* complex as a pair of float numbers */
    CUDA_C_64F  = 5,   /* complex as a pair of double numbers */
    CUDA_C_16F  = 6,   /* complex as a pair of half numbers */
    CUDA_C_8I   = 7,   /* complex as a pair of signed 8-bit int numbers */
    CUDA_R_8U   = 8,   /* real as a unsigned 8-bit int */
    CUDA_C_8U   = 9,   /* complex as a pair of unsigned 8-bit int numbers */
    CUDA_R_32I  = 10,  /* real as a signed 32-bit int */
    CUDA_C_32I  = 11,  /* complex as a pair of signed 32-bit int numbers */
    CUDA_R_32U  = 12,  /* real as a unsigned 32-bit int */
    CUDA_C_32U  = 13,  /* complex as a pair of unsigned 32-bit int numbers */
    CUDA_R_16BF = 14,  /* real as a nv_bfloat16 */
    CUDA_C_16BF = 15,  /* complex as a pair of nv_bfloat16 numbers */
    CUDA_R_4I   = 16,  /* real as a signed 4-bit int */
    CUDA_C_4I   = 17,  /* complex as a pair of signed 4-bit int numbers */
    CUDA_R_4U   = 18,  /* real as a unsigned 4-bit int */
    CUDA_C_4U   = 19,  /* complex as a pair of unsigned 4-bit int numbers */
    CUDA_R_16I  = 20,  /* real as a signed 16-bit int */
    CUDA_C_16I  = 21,  /* complex as a pair of signed 16-bit int numbers */
    CUDA_R_16U  = 22,  /* real as a unsigned 16-bit int */
    CUDA_C_16U  = 23,  /* complex as a pair of unsigned 16-bit int numbers */
    CUDA_R_64I  = 24,  /* real as a signed 64-bit int */
    CUDA_C_64I  = 25,  /* complex as a pair of signed 64-bit int numbers */
    CUDA_R_64U  = 26,  /* real as a unsigned 64-bit int */
    CUDA_C_64U  = 27   /* complex as a pair of unsigned 64-bit int numbers */
} cudaDataType;
/* Integer width used for the row-offset and column-index arrays of a sparse matrix. */
typedef enum {
    CUSPARSE_INDEX_16U = 1,  ///< 16-bit unsigned integer for matrix/vector indices
    CUSPARSE_INDEX_32I = 2,  ///< 32-bit signed integer for matrix/vector indices
    CUSPARSE_INDEX_64I = 3   ///< 64-bit signed integer for matrix/vector indices
} cusparseIndexType_t;
/* Whether the index arrays of a sparse matrix start counting at 0 or at 1. */
typedef enum {
    CUSPARSE_INDEX_BASE_ZERO = 0,  /* zero-based indices */
    CUSPARSE_INDEX_BASE_ONE  = 1   /* one-based indices  */
} cusparseIndexBase_t;
/* Operation applied to the matrix operand before it is multiplied. */
typedef enum {
    CUSPARSE_OPERATION_NON_TRANSPOSE       = 0,  /* use the matrix as stored      */
    CUSPARSE_OPERATION_TRANSPOSE           = 1,  /* use the transpose             */
    CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2   /* use the conjugate transpose   */
} cusparseOperation_t;
/* Algorithm selector for the sparse matrix-vector product. */
typedef enum {
    CUSPARSE_SPMV_ALG_DEFAULT = 0,  /* selector used by main() in this file */
    CUSPARSE_SPMV_COO_ALG1    = 1,
    CUSPARSE_SPMV_CSR_ALG1    = 2,
    CUSPARSE_SPMV_CSR_ALG2    = 3,
    CUSPARSE_SPMV_COO_ALG2    = 4
} cusparseSpMVAlg_t;
/*
 * Handle lifecycle entry points; main() binds these to cusparseCreate and
 * cusparseDestroy. The int return value carries the library's status code.
 */
typedef int (*create)(
    cusparseHandle_t *handle);   /* out: receives the new library handle */
typedef int (*destroy)(
    cusparseHandle_t handle);    /* in: handle to release */
typedef int (*createDnVec)(cusparseDnVecDescr_t* dnVecDescr, | |
int size, | |
void* values, | |
cudaDataType valueType); | |
typedef int (*destoryDnVec)(cusparseDnVecDescr_t dnVecDescr); | |
typedef int (*createCSRSpMat)(cusparseSpMatDescr_t* spMatDescr, | |
int rows, | |
int cols, | |
int nnz, | |
void* csrRowOffsets, | |
void* csrColInd, | |
void* csrValues, | |
cusparseIndexType_t csrRowOffsetsType, | |
cusparseIndexType_t csrColIndType, | |
cusparseIndexBase_t idxBase, | |
cudaDataType valueType); | |
typedef int(*destoryCSRSpMat)(cusparseSpMatDescr_t spMatDescr); | |
typedef int(*csr_buffersize)(cusparseHandle_t handle, | |
cusparseOperation_t opA, | |
const void* alpha, | |
const cusparseSpMatDescr_t matA, | |
const cusparseDnVecDescr_t vecX, | |
const void* beta, | |
const cusparseDnVecDescr_t vecY, | |
cudaDataType computeType, | |
cusparseSpMVAlg_t alg, | |
size_t* bufferSize); | |
typedef int(*spmv)(cusparseHandle_t handle, | |
cusparseOperation_t opA, | |
const void* alpha, | |
cusparseSpMatDescr_t matA, | |
cusparseDnVecDescr_t vecX, | |
const void* beta, | |
cusparseDnVecDescr_t vecY, | |
cudaDataType computeType, | |
cusparseSpMVAlg_t alg, | |
void* externalBuffer); | |
int main(){ | |
// load cuda runtime api functions | |
// void *cuda_so = dlopen("/usr/lib/x86_64-linux-gnu/libcuda.so", RTLD_NOW); | |
void *cuda_so = dlopen("/home/py/anaconda3/pkgs/cudatoolkit-11.2.2-he111cf0_8/lib/libcudart.so", RTLD_NOW); | |
malloc cu_malloc = dlsym(cuda_so, "cudaMalloc"); | |
free cu_free = dlsym(cuda_so, "cudaFree"); | |
copy cu_cpy = dlsym(cuda_so, "cudaMemcpy"); | |
// load cusparse api functions | |
void *cusparse_so = dlopen("/home/py/anaconda3/pkgs/cudatoolkit-11.2.2-he111cf0_8/lib/libcusparse.so", RTLD_NOW); | |
create cp_create = dlsym(cusparse_so, "cusparseCreate"); | |
destroy cp_destory = dlsym(cusparse_so, "cusparseDestroy"); | |
createDnVec cp_createDnVec = dlsym(cusparse_so, "cusparseCreateDnVec"); | |
destoryDnVec cp_destoryDnVec = dlsym(cusparse_so, "cusparseDestroyDnVec"); | |
createCSRSpMat cp_createCSRSpMat = dlsym(cusparse_so, "cusparseCreateCsr"); | |
csr_buffersize cp_csr_buffersize = dlsym(cusparse_so, "cusparseSpMV_bufferSize"); | |
destoryCSRSpMat cp_destoryCSRSpMat = dlsym(cusparse_so, "cusparseDestroySpMat"); | |
spmv cp_spmv = dlsym(cusparse_so, "cusparseSpMV"); | |
// Host data | |
const int A_num_rows = 4; | |
const int A_num_cols = 4; | |
const int A_nnz = 9; | |
int hA_csrOffsets[] = { 0, 3, 4, 7, 9 }; | |
int hA_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3 }; | |
float hA_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, | |
6.0f, 7.0f, 8.0f, 9.0f }; | |
float hX[] = { 1.0f, 2.0f, 3.0f, 4.0f }; | |
float hY[] = { 1.0f, 0.0f, 0.0f, 0.0f }; | |
float hY_result[] = { 20.0f, 8.0f, 51.0f, 52.0f }; | |
float hY_result_2[] = { 0.0f, 0.0f, 0.0f, 0.0f }; | |
float alpha = 1.0f; | |
float beta = 1.0f; | |
// Device memory management | |
int *dA_csrOffsets, *dA_columns; | |
float *dA_values, *dX, *dY; | |
cu_malloc((void**) &dA_csrOffsets,(A_num_rows + 1) * sizeof(int)); | |
cu_malloc((void**) &dA_columns, A_nnz * sizeof(int)); | |
cu_malloc((void**) &dA_values, A_nnz * sizeof(float)); | |
cu_malloc((void**) &dX, A_num_cols * sizeof(float)); | |
cu_malloc((void**) &dY, A_num_rows * sizeof(float)); | |
cu_cpy(dA_csrOffsets, hA_csrOffsets, (A_num_rows + 1) * sizeof(int), | |
cudaMemcpyHostToDevice); | |
cu_cpy(dA_columns, hA_columns, A_nnz * sizeof(int),cudaMemcpyHostToDevice); | |
cu_cpy(dA_values, hA_values, A_nnz * sizeof(float),cudaMemcpyHostToDevice) ; | |
cu_cpy(dX, hX, A_num_cols * sizeof(float),cudaMemcpyHostToDevice); | |
cu_cpy(dY, hY, A_num_rows * sizeof(float), cudaMemcpyHostToDevice); | |
// create cusparse handle | |
cusparseHandle_t handle = NULL; | |
cusparseSpMatDescr_t matA; | |
cusparseDnVecDescr_t vecX, vecY; | |
void* dBuffer = NULL; | |
size_t bufferSize = 0; | |
cp_create(&handle); | |
cp_createDnVec(&vecX, A_num_cols, dX, CUDA_R_32F); | |
cp_createDnVec(&vecY, A_num_rows, dY, CUDA_R_32F); | |
cp_createCSRSpMat(&matA, A_num_rows, A_num_cols, A_nnz, | |
dA_csrOffsets, dA_columns, dA_values, | |
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, | |
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); | |
cp_csr_buffersize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, | |
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F, | |
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize); | |
cu_malloc((void**) &dBuffer, bufferSize); | |
printf("buffersize %lu\n", bufferSize); | |
cp_spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, | |
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F, | |
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer); | |
cp_destoryDnVec(vecX); | |
cp_destoryDnVec(vecY); | |
cp_destoryCSRSpMat(matA); | |
cp_destory(handle); | |
cu_cpy(hY, dY, A_num_rows * sizeof(float), | |
cudaMemcpyDeviceToHost); | |
int correct = 1; | |
for (int i = 0; i < A_num_rows; i++) { | |
if (hY[i] != hY_result[i]) { // direct floating point comparison is not | |
correct = 0; // reliable | |
break; | |
} | |
} | |
if (correct) | |
printf("spmv_csr_example test PASSED\n"); | |
else | |
printf("spmv_csr_example test FAILED: wrong result\n"); | |
printf("%f %f %f %f\n", hY[0], hY[1], hY[2], hY[3]); | |
// Free memory | |
cu_free(dBuffer); | |
cu_free(dA_csrOffsets); | |
cu_free(dA_columns) ; | |
cu_free(dA_values) ; | |
cu_free(dX) ; | |
cu_free(dY) ; | |
dlclose(cuda_so); | |
dlclose(cusparse_so); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment