-
-
Save IvanYashchuk/82a493190b89fd4da7e74ab6331eea8b to your computer and use it in GitHub Desktop.
Verifying SpMV with float16/bfloat16
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc. | |
#include <cusparse.h> // cusparseSpMV | |
#include <cuda_fp16.h> // __half data types | |
#include <cuda_bf16.h> // __nv_bfloat16 | |
#include <stdio.h> // printf | |
#include <stdlib.h> // EXIT_FAILURE | |
// #define SCALAR_T __nv_bfloat16 | |
// #define TO_FLOAT __bfloat162float | |
// #define A_CUDA_DTYPE CUDA_R_16BF | |
// #define X_CUDA_DTYPE CUDA_R_16BF | |
// #define Y_CUDA_DTYPE CUDA_R_16BF | |
// #define CUDA_COMPUTE_TYPE CUDA_R_32F | |
#define SCALAR_T __half | |
#define TO_FLOAT __half2float | |
#define A_CUDA_DTYPE CUDA_R_16F | |
#define X_CUDA_DTYPE CUDA_R_16F | |
#define Y_CUDA_DTYPE CUDA_R_16F | |
#define CUDA_COMPUTE_TYPE CUDA_R_32F | |
#define CHECK_CUDA(func) \ | |
{ \ | |
cudaError_t status = (func); \ | |
if (status != cudaSuccess) { \ | |
printf("CUDA API failed at line %d with error: %s (%d)\n", \ | |
__LINE__, cudaGetErrorString(status), status); \ | |
return EXIT_FAILURE; \ | |
} \ | |
} | |
#define CHECK_CUSPARSE(func) \ | |
{ \ | |
cusparseStatus_t status = (func); \ | |
if (status != CUSPARSE_STATUS_SUCCESS) { \ | |
printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \ | |
__LINE__, cusparseGetErrorString(status), status); \ | |
return EXIT_FAILURE; \ | |
} \ | |
} | |
int main(void) { | |
// Host problem definition | |
const int A_num_rows = 4; | |
const int A_num_cols = 4; | |
const int A_nnz = 9; | |
int hA_csrOffsets[] = { 0, 3, 4, 7, 9 }; | |
int hA_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3 }; | |
SCALAR_T hA_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, | |
6.0f, 7.0f, 8.0f, 9.0f }; | |
SCALAR_T hX[] = { 1.0f, 2.0f, 3.0f, 4.0f }; | |
SCALAR_T hY[] = { 0.0f, 0.0f, 0.0f, 0.0f }; | |
SCALAR_T hY_result[] = { 19.0f, 8.0f, 51.0f, 52.0f }; | |
float alpha = 1.0f; | |
float beta = 0.0f; | |
//-------------------------------------------------------------------------- | |
// Device memory management | |
int *dA_csrOffsets, *dA_columns; | |
SCALAR_T *dA_values, *dX, *dY; | |
CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets, | |
(A_num_rows + 1) * sizeof(int)) ) | |
CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_nnz * sizeof(int)) ) | |
CHECK_CUDA( cudaMalloc((void**) &dA_values, A_nnz * sizeof(SCALAR_T)) ) | |
CHECK_CUDA( cudaMalloc((void**) &dX, A_num_cols * sizeof(SCALAR_T)) ) | |
CHECK_CUDA( cudaMalloc((void**) &dY, A_num_rows * sizeof(SCALAR_T)) ) | |
CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets, | |
(A_num_rows + 1) * sizeof(int), | |
cudaMemcpyHostToDevice) ) | |
CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int), | |
cudaMemcpyHostToDevice) ) | |
CHECK_CUDA( cudaMemcpy(dA_values, hA_values, A_nnz * sizeof(SCALAR_T), | |
cudaMemcpyHostToDevice) ) | |
CHECK_CUDA( cudaMemcpy(dX, hX, A_num_cols * sizeof(SCALAR_T), | |
cudaMemcpyHostToDevice) ) | |
CHECK_CUDA( cudaMemcpy(dY, hY, A_num_rows * sizeof(SCALAR_T), | |
cudaMemcpyHostToDevice) ) | |
//-------------------------------------------------------------------------- | |
// CUSPARSE APIs | |
cusparseHandle_t handle = NULL; | |
cusparseSpMatDescr_t matA; | |
cusparseDnVecDescr_t vecX, vecY; | |
void* dBuffer = NULL; | |
size_t bufferSize = 0; | |
CHECK_CUSPARSE( cusparseCreate(&handle) ) | |
// Create sparse matrix A in CSR format | |
CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz, | |
dA_csrOffsets, dA_columns, dA_values, | |
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, | |
CUSPARSE_INDEX_BASE_ZERO, A_CUDA_DTYPE) ) | |
// Create dense vector X | |
CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, A_num_cols, dX, X_CUDA_DTYPE) ) | |
// Create dense vector y | |
CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, A_num_rows, dY, Y_CUDA_DTYPE) ) | |
// allocate an external buffer if needed | |
CHECK_CUSPARSE( cusparseSpMV_bufferSize( | |
handle, CUSPARSE_OPERATION_NON_TRANSPOSE, | |
&alpha, matA, vecX, &beta, vecY, CUDA_COMPUTE_TYPE, | |
CUSPARSE_MV_ALG_DEFAULT, &bufferSize) ) | |
CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) ) | |
// execute SpMV | |
CHECK_CUSPARSE( cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, | |
&alpha, matA, vecX, &beta, vecY, CUDA_COMPUTE_TYPE, | |
CUSPARSE_MV_ALG_DEFAULT, dBuffer) ) | |
// destroy matrix/vector descriptors | |
CHECK_CUSPARSE( cusparseDestroySpMat(matA) ) | |
CHECK_CUSPARSE( cusparseDestroyDnVec(vecX) ) | |
CHECK_CUSPARSE( cusparseDestroyDnVec(vecY) ) | |
CHECK_CUSPARSE( cusparseDestroy(handle) ) | |
//-------------------------------------------------------------------------- | |
// device result check | |
CHECK_CUDA( cudaMemcpy(hY, dY, A_num_rows * sizeof(SCALAR_T), | |
cudaMemcpyDeviceToHost) ) | |
int correct = 1; | |
for (int i = 0; i < A_num_rows; i++) { | |
if (hY[i] != hY_result[i]) { // direct floating point comparison is not | |
correct = 0; // reliable | |
break; | |
} | |
} | |
if (correct) | |
printf("spmv_csr_example test PASSED\n"); | |
else | |
printf("spmv_csr_example test FAILED: wrong result\n"); | |
for (int i = 0; i < A_num_rows; i++) { | |
printf("%f ", TO_FLOAT(hY[i])); | |
} | |
printf("\n"); | |
//-------------------------------------------------------------------------- | |
// device memory deallocation | |
CHECK_CUDA( cudaFree(dBuffer) ) | |
CHECK_CUDA( cudaFree(dA_csrOffsets) ) | |
CHECK_CUDA( cudaFree(dA_columns) ) | |
CHECK_CUDA( cudaFree(dA_values) ) | |
CHECK_CUDA( cudaFree(dX) ) | |
CHECK_CUDA( cudaFree(dY) ) | |
return EXIT_SUCCESS; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment