Last active
December 16, 2021 06:35
-
-
Save carstenbauer/33a5b57de78cce6a3cecef88d4e2c5e7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "202a91a0-8ac9-4d47-a628-85786980cbb2", | |
"metadata": {}, | |
"source": [ | |
"# CPU\n", | |
"\n", | |
"## Single-threaded sparse mat-vec product" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "c3eee86d-1ade-401b-bb8f-b892fa7090af", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"using DifferentialEquations\n", | |
"using BenchmarkTools\n", | |
"\n", | |
"using SparseArrays\n", | |
"using LinearAlgebra" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "486c494c-7f5d-42c6-945a-69603dd79269", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Random sparse 10_000 x 10_000 system matrix with density 0.01; `const` because `f!` reads it as a global.\n", | |
"const A = sprand(10_000,10_000,0.01);\n", | |
"# Random initial state.\n", | |
"u0 = rand(10_000);\n", | |
"# Scratch output buffer for the `mul!` benchmarks.\n", | |
"u0_tmp = similar(u0);\n", | |
"# Floating-point end points, matching eltype(u0) instead of relying on integer promotion.\n", | |
"tspan = (0.0,1.0);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "1b080292-9e42-4740-b88b-6a282ca310ff", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"f! (generic function with 1 method)" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# In-place ODE right-hand side: writes A*u into du (p and t are unused).\n", | |
"function f!(du, u, p, t)\n", | |
"    return mul!(du, A, u)\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "e2efe370-7059-467f-be3f-a18a3d8c1357", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"prob = ODEProblem{true}(f!,u0,tspan);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "bdca209e-c95e-4249-b63f-f0af8d841756", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 980.351 μs (0 allocations: 0 bytes)\n" | |
] | |
} | |
], | |
"source": [ | |
"@btime mul!($u0_tmp, $A, $u0);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "5f664b91-373a-451b-9254-a70591bb8635", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 0.345203 seconds (951 allocations: 33.146 MiB, 2.20% gc time)\n" | |
] | |
} | |
], | |
"source": [ | |
"@time sol = solve(prob, Tsit5());" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "9ee1c776-0327-4970-9c92-87ed2f6f86de", | |
"metadata": {}, | |
"source": [ | |
"## Multithreaded sparse mat-vec product (MKLSparse)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "3e58a193-d9a8-45f9-9a32-06c2f4d4dcbb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"using MKLSparse" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "c66c1b46-e84c-425b-b620-16fc49bd2b0e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 239.171 μs (0 allocations: 0 bytes)\n" | |
] | |
} | |
], | |
"source": [ | |
"@btime mul!($u0_tmp, $A, $u0);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "39a1db83-9cd7-4a37-be43-5466b15b6361", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 0.126366 seconds (951 allocations: 33.146 MiB)\n" | |
] | |
} | |
], | |
"source": [ | |
"@time sol = solve(prob, Tsit5());" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "8fdd126f-069d-4cb3-980a-1cdc893ffba2", | |
"metadata": {}, | |
"source": [ | |
"# DGX (NVIDIA A100 GPU)\n", | |
"\n", | |
"**`Float32` precision**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "d4224cde-723b-4589-b53f-c99251638ec9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"using CUDA\n", | |
"using CUDA.CUSPARSE" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "90978bc4-e2b2-4120-89d6-12d576776458", | |
"metadata": { | |
"scrolled": true, | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CUDA toolkit 11.4, artifact installation\n", | |
"NVIDIA driver 495.29.5, for CUDA 11.5\n", | |
"CUDA driver 11.5\n", | |
"\n", | |
"Libraries: \n", | |
"- CUBLAS: 11.5.4\n", | |
"- CURAND: 10.2.5\n", | |
"- CUFFT: 10.5.1\n", | |
"- CUSOLVER: 11.2.0\n", | |
"- CUSPARSE: 11.6.0\n", | |
"- CUPTI: 14.0.0\n", | |
"- NVML: 11.0.0+495.29.5\n", | |
"- CUDNN: 8.20.2 (for CUDA 11.4.0)\n", | |
"- CUTENSOR: 1.3.0 (for CUDA 11.2.0)\n", | |
"\n", | |
"Toolchain:\n", | |
"- Julia: 1.7.0\n", | |
"- LLVM: 12.0.1\n", | |
"- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0\n", | |
"- Device capability support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80\n", | |
"\n", | |
"8 devices:\n", | |
" 0: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)\n", | |
" 1: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)\n", | |
" 2: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)\n", | |
" 3: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)\n", | |
" 4: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)\n", | |
" 5: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)\n", | |
" 6: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)\n", | |
" 7: NVIDIA A100-SXM4-40GB (sm_80, 39.583 GiB / 39.586 GiB available)\n" | |
] | |
} | |
], | |
"source": [ | |
"CUDA.versioninfo()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "1c41a6c1-94b3-491d-8897-54ec75e48408", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"const A_cuda32 = CuSparseMatrixCSC(SparseMatrixCSC{Float32}(A));\n", | |
"u0_cuda32 = CuArray(Vector{Float32}(u0));\n", | |
"u0_cuda32_tmp = CuArray(Vector{Float32}(u0));\n", | |
"tspan=(0.f0,1.f0);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "6672ef53-34a7-4add-a9d4-0f05ada19a2a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"f_cuda32! (generic function with 1 method)" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# In-place ODE right-hand side on the GPU: writes A_cuda32*u into du (p and t are unused).\n", | |
"function f_cuda32!(du, u, p, t)\n", | |
"    return mul!(du, A_cuda32, u)\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "7d8722a7-6d47-4868-9390-485f82e40af9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"prob_cuda32 = ODEProblem{true}(f_cuda32!,u0_cuda32,tspan);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "ab34f2ee-1f7e-4265-8868-c8dfa3dd6a49", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 14.107 μs (42 allocations: 768 bytes)\n" | |
] | |
} | |
], | |
"source": [ | |
"# GPU work is launched asynchronously; synchronize inside the benchmark so the timing\n", | |
"# covers the sparse mat-vec itself rather than possibly only the launch overhead.\n", | |
"@btime CUDA.@sync mul!($u0_cuda32_tmp, $A_cuda32, $u0_cuda32);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "bbfda17d-449c-4b49-a020-a804e6e441a6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 0.047797 seconds (34.61 k allocations: 2.085 MiB)\n" | |
] | |
} | |
], | |
"source": [ | |
"@time sol = solve(prob_cuda32, Tsit5());" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a68ea4ef-e747-493e-8a1d-2392c18e713a", | |
"metadata": {}, | |
"source": [ | |
"## `CuSparseMatrixCSR`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "e701f26d-e8cd-4fca-bcfc-2974b43e74a5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"const A_cuda32_csr = CuSparseMatrixCSR(SparseMatrixCSC{Float32}(A));\n", | |
"f_cuda32_csr!(du,u,p,t) = mul!(du, A_cuda32_csr, u)\n", | |
"prob_cuda32_csr = ODEProblem{true}(f_cuda32_csr!,u0_cuda32,tspan);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"id": "de0c04d6-0482-4eaa-8be1-2b7d73ca6e7e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 14.017 μs (43 allocations: 784 bytes)\n" | |
] | |
} | |
], | |
"source": [ | |
"# GPU work is launched asynchronously; synchronize inside the benchmark so the timing\n", | |
"# covers the CSR sparse mat-vec itself rather than possibly only the launch overhead.\n", | |
"@btime CUDA.@sync mul!($u0_cuda32_tmp, $A_cuda32_csr, $u0_cuda32);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "be9ff39e-4ceb-4b9e-804b-0f465baa29f7", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 0.019357 seconds (34.93 k allocations: 2.090 MiB)\n" | |
] | |
} | |
], | |
"source": [ | |
"@time sol = solve(prob_cuda32_csr, Tsit5());" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Julia 1.7.0 (multithreaded)", | |
"language": "julia", | |
"name": "julia-1.7-multithreaded" | |
}, | |
"language_info": { | |
"file_extension": ".jl", | |
"mimetype": "application/julia", | |
"name": "julia", | |
"version": "1.7.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment