Skip to content

Instantly share code, notes, and snippets.

@brunodantas
Last active October 16, 2019 10:31
Show Gist options
  • Save brunodantas/f5e6d0d1e8ba84766da707a3f00f9b74 to your computer and use it in GitHub Desktop.
Save brunodantas/f5e6d0d1e8ba84766da707a3f00f9b74 to your computer and use it in GitHub Desktop.
matrix multiplication: numba cuda vs numpy
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Matrix multiplication"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Numba GPU"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from numba import cuda\n",
"@cuda.jit('void(float64[:, :], float64[:, :], float64[:, :], int32)')\n",
"def cu_matmul(a, b, c, n):\n",
"    \"\"\"Naive square matrix product: each thread computes one element of c = a @ b.\n",
"\n",
"    a, b and c are 2-D float64 arrays (see the signature above); only indices\n",
"    below n are read from a and b or written to c.\n",
"    \"\"\"\n",
"    x, y = cuda.grid(2)\n",
"    # The launch grid may contain more threads than n x n; surplus threads exit.\n",
"    if x >= n or y >= n:\n",
"        return\n",
"    # Accumulate in a local variable and store once, instead of doing a\n",
"    # read-modify-write on the output array in every iteration.\n",
"    acc = 0.0\n",
"    for i in range(n):\n",
"        acc += a[x, i] * b[i, y]\n",
"    c[x, y] = acc"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"14.7 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"device = cuda.get_current_device()\n",
"# tpb is the block edge length (threads per dimension), so each block holds tpb * tpb threads.\n",
"tpb = device.WARP_SIZE\n",
"n = 320\n",
"# Ceiling division so the grid still covers all n rows/columns when n % tpb != 0.\n",
"bpg = (n + tpb - 1) // tpb\n",
"grid_dim = (bpg, bpg)\n",
"block_dim = (tpb, tpb)\n",
"\n",
"np.random.seed(0)  # fixed seed so the benchmark inputs are reproducible\n",
"A = np.random.random((n, n))  # np.random.random already returns float64\n",
"B = np.random.random((n, n))\n",
"C = np.empty((n, n), dtype=np.float64)\n",
"\n",
"dev_A = cuda.to_device(A)\n",
"dev_B = cuda.to_device(B)\n",
"dev_C = cuda.to_device(C, copy=False)  # allocate the output on the device without uploading C\n",
"\n",
"# Kernel launches are asynchronous: synchronize inside the timed statement so the\n",
"# measurement covers kernel execution and not only the launch call.\n",
"result_cuda = %timeit -o cu_matmul[grid_dim, block_dim](dev_A, dev_B, dev_C, n); cuda.synchronize()\n",
"\n",
"dev_C.copy_to_host(C)\n",
"# Same tolerances as np.allclose, but a failure reports the mismatching elements.\n",
"np.testing.assert_allclose(C, np.dot(A, B), rtol=1e-5, atol=1e-8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NumPy"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"525 µs ± 16.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"# The assignment inside %timeit never reached the notebook namespace, so it is dropped;\n",
"# the timing is captured (like result_cuda) so both runs can be compared directly.\n",
"result_numpy = %timeit -o np.dot(A, B)\n",
"print(f'CUDA / NumPy mean time ratio: {result_cuda.average / result_numpy.average:.1f}x')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Matrix multiplication"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Numba GPU"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from numba import cuda\n",
"@cuda.jit('void(float64[:, :], float64[:, :], float64[:, :], int32)')\n",
"def cu_matmul(a, b, c, n):\n",
"    \"\"\"Naive square matrix product: each thread computes one element of c = a @ b.\n",
"\n",
"    a, b and c are 2-D float64 arrays (see the signature above); only indices\n",
"    below n are read from a and b or written to c.\n",
"    \"\"\"\n",
"    x, y = cuda.grid(2)\n",
"    # The launch grid may contain more threads than n x n; surplus threads exit.\n",
"    if x >= n or y >= n:\n",
"        return\n",
"    # Accumulate in a local variable and store once, instead of doing a\n",
"    # read-modify-write on the output array in every iteration.\n",
"    acc = 0.0\n",
"    for i in range(n):\n",
"        acc += a[x, i] * b[i, y]\n",
"    c[x, y] = acc"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"14.7 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"device = cuda.get_current_device()\n",
"# tpb is the block edge length (threads per dimension), so each block holds tpb * tpb threads.\n",
"tpb = device.WARP_SIZE\n",
"n = 320\n",
"# Ceiling division so the grid still covers all n rows/columns when n % tpb != 0.\n",
"bpg = (n + tpb - 1) // tpb\n",
"grid_dim = (bpg, bpg)\n",
"block_dim = (tpb, tpb)\n",
"\n",
"np.random.seed(0)  # fixed seed so the benchmark inputs are reproducible\n",
"A = np.random.random((n, n))  # np.random.random already returns float64\n",
"B = np.random.random((n, n))\n",
"C = np.empty((n, n), dtype=np.float64)\n",
"\n",
"dev_A = cuda.to_device(A)\n",
"dev_B = cuda.to_device(B)\n",
"dev_C = cuda.to_device(C, copy=False)  # allocate the output on the device without uploading C\n",
"\n",
"# Kernel launches are asynchronous: synchronize inside the timed statement so the\n",
"# measurement covers kernel execution and not only the launch call.\n",
"result_cuda = %timeit -o cu_matmul[grid_dim, block_dim](dev_A, dev_B, dev_C, n); cuda.synchronize()\n",
"\n",
"dev_C.copy_to_host(C)\n",
"# Same tolerances as np.allclose, but a failure reports the mismatching elements.\n",
"np.testing.assert_allclose(C, np.dot(A, B), rtol=1e-5, atol=1e-8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NumPy"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"525 µs ± 16.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"# The assignment inside %timeit never reached the notebook namespace, so it is dropped;\n",
"# the timing is captured (like result_cuda) so both runs can be compared directly.\n",
"result_numpy = %timeit -o np.dot(A, B)\n",
"print(f'CUDA / NumPy mean time ratio: {result_cuda.average / result_numpy.average:.1f}x')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment