Skip to content

Instantly share code, notes, and snippets.

@maiorpa
Created December 27, 2016 13:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maiorpa/d2b853f87a0c4d58163ba4de8e06bac8 to your computer and use it in GitHub Desktop.
Save maiorpa/d2b853f87a0c4d58163ba4de8e06bac8 to your computer and use it in GitHub Desktop.
cuda question
bool LsSol::Solve_PCG(double& conv_ratio, unsigned int& iteration,
LsSol::ILinearSystemPreconditioner_Sol& ls)
{
const double conv_ratio_tol = conv_ratio;
const unsigned int mx_iter = iteration;
if( ls.GetTmpVectorArySize() < 2 ){ ls.ReSizeTmpVecSolver(2); }
const int ix = -2;
const int ir = -1;
const int ip = 0;
const int iz = 1;
// x = 0.0
ls.SCAL(0.0,ix);
double sq_inv_norm_res0;
{
const double sq_norm_res0 = ls.DOT(ir,ir);
if( sq_norm_res0 < 1.0e-30 ){
conv_ratio = 0.0;
iteration = 0;
return true;
}
sq_inv_norm_res0 = 1.0 / sq_norm_res0;
}
ls.COPY(ir,iz);
ls.SolvePrecond(iz);
ls.COPY(iz,ip);
double inpro_rz = ls.DOT(ir,iz);
for(unsigned int iitr=0;iitr<mx_iter;iitr++){
double alpha;
{ // calc alpha
ls.MATVEC(1.0,ip,0.0,iz);
const double val_pAp = ls.DOT(ip,iz);
alpha = inpro_rz / val_pAp;
}
ls.AXPY(-alpha,iz,ir);
ls.AXPY(alpha,ip,ix); // Converge Judgement‚М‘O‚Й“ь‚к‚й
{ // Converge Judgement
double sq_norm_res = ls.DOT(ir,ir);
// std::cout << iitr << " " << sqrt(sq_norm_res * sq_inv_norm_res0) << std::endl;
if( sq_norm_res * sq_inv_norm_res0 < conv_ratio_tol*conv_ratio_tol ){
conv_ratio = sqrt( sq_norm_res * sq_inv_norm_res0 );
//conv_ratio = sq_norm_res;
iteration = iitr;
return true;
}
}
double beta;
{ // calc beta
ls.COPY(ir,iz);
ls.SolvePrecond(iz);
const double inpro_rz_new = ls.DOT(ir,iz);
beta = inpro_rz_new/inpro_rz;
inpro_rz = inpro_rz_new;
}
ls.SCAL(beta,ip);
ls.AXPY(1.0,iz,ip);
}
// Converge Judgement
double sq_norm_res = ls.DOT(ir,ir);
conv_ratio = sqrt( sq_norm_res * sq_inv_norm_res0 );
//conv_ratio = sq_norm_res;
return false;
}
std::pair<unsigned int, double> StrategyBsrWithSepareteLUMatrices::Solve(unsigned int max_iterate, double tolerance)
{
assert(mInstaller->mCusparseHandle != nullptr);
assert(mInstaller->mCublasHandle != nullptr);
checkCudaErrors(cudaMemcpy(mInstaller->mDeviceCol, mBsrColInd, mNnzb*sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mInstaller->mDeviceRow, mBsrRowPtr, (mMb+1)*sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mInstaller->mDeviceData, mData, mDataSize*sizeof(double), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mInstaller->mDeviceX, mX, mResSize*sizeof(double), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mInstaller->mDeviceRes, mRes, mResSize*sizeof(double), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mDeviceLCol, mLColInd, mLNnzb*sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mDeviceLRow, mLRowPtr, (mMb+1)*sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mDeviceValsL, mValsL, mLNnzb*mBlockDim*mBlockDim*sizeof(double), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mDeviceUCol, mUColInd, mUNnzb*sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mDeviceURow, mURowPtr, (mMb+1)*sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(mDeviceValsU, mValsU, mUNnzb*mBlockDim*mBlockDim*sizeof(double), cudaMemcpyHostToDevice));
double alpha = 1.0;
double beta = 0.0;
int structural_zero=0;
int numerical_zero=0;
const double double_one = 1.0;
const double double_zero = 0.0;
cusparseStatus_t status;
// x = 0
cublasDscal(mInstaller->mCublasHandle, mResSize, &beta, mInstaller->mDeviceX, 1);
double sq_inv_norm_res0;
double inpro_rz;
double val_pAp;
double sq_norm_res;
double inpro_rz_new;
double sq_norm_res0;
{
cublasDdot(mInstaller->mCublasHandle, mResSize, mInstaller->mDeviceRes, 1, mInstaller->mDeviceRes, 1, &sq_norm_res0);
if( sq_norm_res0 < 1.0e-30 ){
cudaMemcpy(mX, mInstaller->mDeviceX, mResSize*sizeof(double), cudaMemcpyDeviceToHost);
return std::make_pair(0, 0.0);
}
sq_inv_norm_res0 = 1.0 / sq_norm_res0;
}
{
// Solve precondition system
// Forward Solve, we can re-use infoA since the sparsity pattern of A matches that of L
status = cusparseDbsrsv2_solve(mInstaller->mCusparseHandle, CUSPARSE_DIRECTION_ROW, CUSPARSE_OPERATION_NON_TRANSPOSE, mMb, mLNnzb, &double_one, mDescrL,
mDeviceValsL, mDeviceLRow, mDeviceLCol, mBlockDim,
mInfo_L, mInstaller->mDeviceRes, mDeviceY, /*CUSPARSE_SOLVE_POLICY_NO_LEVEL*/CUSPARSE_SOLVE_POLICY_USE_LEVEL, mBuffer);
checkCudaErrors(status);
// Back Substitution
status = cusparseDbsrsv2_solve(mInstaller->mCusparseHandle, CUSPARSE_DIRECTION_ROW, CUSPARSE_OPERATION_NON_TRANSPOSE, mMb, mUNnzb, &double_one, mDescrU,
mDeviceValsU, mDeviceURow, mDeviceUCol, mBlockDim,
mInfo_U, mDeviceY, mDeviceZ, /*CUSPARSE_SOLVE_POLICY_NO_LEVEL*/CUSPARSE_SOLVE_POLICY_USE_LEVEL, mBuffer);
checkCudaErrors(status);
// End Solve precondition system
cublasDdot(mInstaller->mCublasHandle, mResSize, mInstaller->mDeviceRes, 1, mDeviceZ, 1, &inpro_rz);
cublasDcopy(mInstaller->mCublasHandle, mResSize, mDeviceZ, 1, mInstaller->mDeviceP, 1);
unsigned int iitr;
for(iitr=0;iitr<max_iterate;iitr++){
{ // calc alpha
cusparseDbsrmv(mInstaller->mCusparseHandle, CUSPARSE_DIRECTION_ROW, CUSPARSE_OPERATION_NON_TRANSPOSE, mMb, mNb, mNnzb, &double_one, mInstaller->mMatDescr
, mInstaller->mDeviceData, mInstaller->mDeviceRow, mInstaller->mDeviceCol, mBlockDim
, mInstaller->mDeviceP, &double_zero, mInstaller->mDeviceAp);
cublasDdot(mInstaller->mCublasHandle, mResSize, mInstaller->mDeviceP, 1, mInstaller->mDeviceAp, 1, &val_pAp);
alpha = inpro_rz / val_pAp;
}
{
// Converge Judgement
cublasDaxpy(mInstaller->mCublasHandle, mResSize, &alpha, mInstaller->mDeviceP, 1, mInstaller->mDeviceX, 1);
const double nalpha = -alpha;
cublasDaxpy(mInstaller->mCublasHandle, mResSize, &nalpha, mInstaller->mDeviceAp, 1, mInstaller->mDeviceRes, 1);
}
{ // check converge
cublasDdot(mInstaller->mCublasHandle, mResSize, mInstaller->mDeviceRes, 1, mInstaller->mDeviceRes, 1, &sq_norm_res);
if( sq_norm_res * sq_inv_norm_res0 < tolerance*tolerance /*|| sq_norm_res < 1e-7*/){
tolerance = sqrt( sq_norm_res * sq_inv_norm_res0 );
cudaMemcpy(mX, mInstaller->mDeviceX, mResSize*sizeof(double), cudaMemcpyDeviceToHost);
return std::make_pair(iitr,tolerance);
}
}
{ // calc beta
{
// Solve precondition system
// Forward Solve, we can re-use infoA since the sparsity pattern of A matches that of L
status = cusparseDbsrsv2_solve(mInstaller->mCusparseHandle, CUSPARSE_DIRECTION_ROW, CUSPARSE_OPERATION_NON_TRANSPOSE, mMb, mLNnzb, &double_one, mDescrL,
mDeviceValsL, mDeviceLRow, mDeviceLCol, mBlockDim,
mInfo_L, mInstaller->mDeviceRes, mDeviceY,/*CUSPARSE_SOLVE_POLICY_NO_LEVEL*/CUSPARSE_SOLVE_POLICY_USE_LEVEL, mBuffer);
checkCudaErrors(status);
// Back Substitution
status = cusparseDbsrsv2_solve(mInstaller->mCusparseHandle, CUSPARSE_DIRECTION_ROW, CUSPARSE_OPERATION_NON_TRANSPOSE, mMb, mUNnzb, &double_one, mDescrU,
mDeviceValsU, mDeviceURow, mDeviceUCol, mBlockDim,
mInfo_U, mDeviceY, mDeviceZ, /*CUSPARSE_SOLVE_POLICY_NO_LEVEL*/CUSPARSE_SOLVE_POLICY_USE_LEVEL, mBuffer);
checkCudaErrors(status);
// End Solve precondition system
}
cublasDdot(mInstaller->mCublasHandle, mResSize, mInstaller->mDeviceRes, 1, mDeviceZ, 1, &inpro_rz_new);
beta = inpro_rz_new/inpro_rz;
inpro_rz = inpro_rz_new;
}
cublasDscal(mInstaller->mCublasHandle, mResSize, &beta, mInstaller->mDeviceP, 1);
cublasDaxpy(mInstaller->mCublasHandle, mResSize, &double_one, mDeviceZ, 1, mInstaller->mDeviceP, 1);
}
cudaMemcpy(mX, mInstaller->mDeviceX, mResSize*sizeof(double), cudaMemcpyDeviceToHost);
return std::make_pair(iitr, sqrt( sq_norm_res * sq_inv_norm_res0 ));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment