CUDA vs. CPU sparse solver in Python
# ### Interface cuSOLVER PyCUDA
from __future__ import print_function
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import scipy.sparse as sp
import ctypes

## Wrap the cuSOLVER cusolverSpDcsrlsvqr() using ctypes
## http://docs.nvidia.com/cuda/cusolver/#cusolver-lt-t-gt-csrlsvqr

# cuSPARSE
_libcusparse = ctypes.cdll.LoadLibrary('libcusparse.so')

_libcusparse.cusparseCreate.restype = int
_libcusparse.cusparseCreate.argtypes = [ctypes.c_void_p]

_libcusparse.cusparseDestroy.restype = int
_libcusparse.cusparseDestroy.argtypes = [ctypes.c_void_p]

_libcusparse.cusparseCreateMatDescr.restype = int
_libcusparse.cusparseCreateMatDescr.argtypes = [ctypes.c_void_p]

# cuSOLVER
_libcusolver = ctypes.cdll.LoadLibrary('libcusolver.so')

_libcusolver.cusolverSpCreate.restype = int
_libcusolver.cusolverSpCreate.argtypes = [ctypes.c_void_p]

_libcusolver.cusolverSpDestroy.restype = int
_libcusolver.cusolverSpDestroy.argtypes = [ctypes.c_void_p]

_libcusolver.cusolverSpDcsrlsvqr.restype = int
_libcusolver.cusolverSpDcsrlsvqr.argtypes = [ctypes.c_void_p,  # handle
                                             ctypes.c_int,     # m
                                             ctypes.c_int,     # nnz
                                             ctypes.c_void_p,  # descrA
                                             ctypes.c_void_p,  # csrVal
                                             ctypes.c_void_p,  # csrRowPtr
                                             ctypes.c_void_p,  # csrColInd
                                             ctypes.c_void_p,  # b
                                             ctypes.c_double,  # tol
                                             ctypes.c_int,     # reorder
                                             ctypes.c_void_p,  # x
                                             ctypes.c_void_p]  # singularity
def cuspsolve(A, b):
    Acsr = sp.csr_matrix(A, dtype=float)
    b = np.asarray(b, dtype=float)
    x = np.empty_like(b)

    # Copy arrays to GPU
    dcsrVal = gpuarray.to_gpu(Acsr.data)
    dcsrColInd = gpuarray.to_gpu(Acsr.indices)
    dcsrIndPtr = gpuarray.to_gpu(Acsr.indptr)
    dx = gpuarray.to_gpu(x)
    db = gpuarray.to_gpu(b)

    # Create solver parameters
    m = ctypes.c_int(Acsr.shape[0])  # should check that A is square
    nnz = ctypes.c_int(Acsr.nnz)
    descrA = ctypes.c_void_p()
    reorder = ctypes.c_int(0)
    tol = ctypes.c_double(1e-10)
    singularity = ctypes.c_int(0)  # set to -1 by the solver if A is not singular

    # create cusparse handle
    _cusp_handle = ctypes.c_void_p()
    status = _libcusparse.cusparseCreate(ctypes.byref(_cusp_handle))
    assert status == 0
    cusp_handle = _cusp_handle.value

    # create MatDescriptor
    status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(descrA))
    assert status == 0

    # create cusolver handle
    _cuso_handle = ctypes.c_void_p()
    status = _libcusolver.cusolverSpCreate(ctypes.byref(_cuso_handle))
    assert status == 0
    cuso_handle = _cuso_handle.value

    # Solve
    res = _libcusolver.cusolverSpDcsrlsvqr(cuso_handle,
                                           m,
                                           nnz,
                                           descrA,
                                           int(dcsrVal.gpudata),
                                           int(dcsrIndPtr.gpudata),
                                           int(dcsrColInd.gpudata),
                                           int(db.gpudata),
                                           tol,
                                           reorder,
                                           int(dx.gpudata),
                                           ctypes.byref(singularity))
    assert res == 0
    if singularity.value != -1:
        raise ValueError('Singular matrix!')
    x = dx.get()  # Get result as numpy array

    # Destroy handles
    status = _libcusolver.cusolverSpDestroy(cuso_handle)
    assert status == 0
    status = _libcusparse.cusparseDestroy(cusp_handle)
    assert status == 0

    # Return result
    return x
# Test
if __name__ == '__main__':
    A = np.diag(np.arange(1, 5))
    b = np.ones(4)
    x = cuspsolve(A, b)
    np.testing.assert_almost_equal(x, np.array([1., 0.5, 0.33333333, 0.25]))

    # Timing comparison
    from scipy.sparse import rand
    from scipy.sparse.linalg import spsolve
    import time

    n = 10000
    i = j = np.arange(n)
    diag = np.ones(n)
    A = rand(n, n, density=0.001)
    A = A.tocsr()
    A[i, j] = diag  # ones on the diagonal keep A nonsingular

    b = np.ones(n)
    t0 = time.time()
    x = spsolve(A, b)
    dt1 = time.time() - t0
    print("scipy.sparse.linalg.spsolve time: %s" % dt1)

    t0 = time.time()
    x = cuspsolve(A, b)
    dt2 = time.time() - t0
    print("cuspsolve time: %s" % dt2)

    ratio = dt1/dt2
    if ratio > 1:
        print("CUDA is %s times faster than CPU." % ratio)
    else:
        print("CUDA is %s times slower than CPU." % (1./ratio))
@Stanpol commented Sep 19, 2017

CPU: Intel(R) Xeon(R) CPU E5-2690 v3 @ 2.60GHz, 6 cores
GPU: Tesla M60, 8 GB, CUDA 8.0
Python 3.5

Results:
scipy.sparse.linalg.spsolve time: 149.61404728889465
cuspsolve time: 134.93718433380127
CUDA is 1.1087681133080889 times faster than CPU.

@lotusk commented Oct 12, 2018

(quoting Stanpol's results above)

Just a little faster????

@SanPen commented Dec 4, 2019

Hi

CPU: Intel® Core™ i7-6700K CPU @ 4.00GHz × 8
GPU: GeForce GTX 1060 6GB/PCIe/SSE2, with CUDA 10.1
RAM: 32 GB

scipy.sparse.linalg.spsolve time: 71.13084483146667
cuspsolve time: 115.17455720901489
CUDA is 1.6191928759162477 times slower than CPU.

My only other comment is that my SciPy uses MKL under Ubuntu.

@mrkwjc (owner) commented Dec 6, 2019

This is probably because of transferring data to and from the GPU. Maybe only the _libcusolver.cusolverSpDcsrlsvqr call should be wrapped with time measurements. That would make sense if we are keeping data on the GPU...
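A rough, untested sketch of such an instrumented variant (cuspsolve_timed is a hypothetical name; it reuses the ctypes wrappers and imports from the gist above) that separates transfer time from raw solve time:

import time

def cuspsolve_timed(A, b):
    Acsr = sp.csr_matrix(A, dtype=float)
    b = np.asarray(b, dtype=float)
    x = np.empty_like(b)

    t0 = time.time()
    dcsrVal = gpuarray.to_gpu(Acsr.data)
    dcsrColInd = gpuarray.to_gpu(Acsr.indices)
    dcsrIndPtr = gpuarray.to_gpu(Acsr.indptr)
    dx = gpuarray.to_gpu(x)
    db = gpuarray.to_gpu(b)
    t_in = time.time() - t0

    # handles and descriptor, exactly as in cuspsolve
    m = ctypes.c_int(Acsr.shape[0])
    nnz = ctypes.c_int(Acsr.nnz)
    descrA = ctypes.c_void_p()
    singularity = ctypes.c_int(0)
    cusp_handle = ctypes.c_void_p()
    assert _libcusparse.cusparseCreate(ctypes.byref(cusp_handle)) == 0
    assert _libcusparse.cusparseCreateMatDescr(ctypes.byref(descrA)) == 0
    cuso_handle = ctypes.c_void_p()
    assert _libcusolver.cusolverSpCreate(ctypes.byref(cuso_handle)) == 0

    # time only the solver call; synchronize so the GPU has finished
    # before the clock is read
    t0 = time.time()
    res = _libcusolver.cusolverSpDcsrlsvqr(cuso_handle.value, m, nnz, descrA,
                                           int(dcsrVal.gpudata),
                                           int(dcsrIndPtr.gpudata),
                                           int(dcsrColInd.gpudata),
                                           int(db.gpudata),
                                           ctypes.c_double(1e-10),
                                           ctypes.c_int(0),
                                           int(dx.gpudata),
                                           ctypes.byref(singularity))
    cuda.Context.synchronize()
    t_solve = time.time() - t0
    assert res == 0

    t0 = time.time()
    x = dx.get()
    t_out = time.time() - t0

    assert _libcusolver.cusolverSpDestroy(cuso_handle.value) == 0
    assert _libcusparse.cusparseDestroy(cusp_handle.value) == 0
    print("copy in: %.4f s  solve: %.4f s  copy out: %.4f s"
          % (t_in, t_solve, t_out))
    return x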

@mrkwjc (owner) commented Dec 6, 2019

Well, I tested things and this is definitely NOT a data-copying issue. CUDA is simply slower!

To see this even more spectacularly, I highly recommend installing scikit-umfpack (via pip). The default SuperLU solver used by spsolve from scipy works on one core only, whereas umfpack speeds up the solution using all of your CPU cores. A sketch of how to route spsolve through it is below.
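For reference, a minimal sketch of routing SciPy through UMFPACK, assuming scikit-umfpack is installed (pip install scikit-umfpack):

from scipy.sparse import csc_matrix
from scipy.sparse.linalg import spsolve, use_solver

use_solver(useUmfpack=True)    # dispatch spsolve to UMFPACK instead of SuperLU
x = spsolve(csc_matrix(A), b)  # UMFPACK expects CSC format with float64 data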
