Last active
March 20, 2024 10:33
-
-
Save carstenbauer/7f8147c5b0a469ef1f86904fd77cc5a9 to your computer and use it in GitHub Desktop.
parallel matrix-vector multiplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Test: @test | |
using Base.Threads: nthreads | |
using BenchmarkTools: @btime | |
using OhMyThreads: @tasks, tmap!, tmapreduce, chunks | |
using ThreadPinning | |
pinthreads(:cores) | |
# naive, math implementation | |
function matvec_row!(y, A, x) | |
fill!(y, zero(length(y))) | |
for r in axes(A, 1) | |
for c in axes(A, 2) | |
@inbounds y[r] += A[r, c] * x[c] | |
end | |
end | |
return y | |
end | |
matvec_row(A, x) = matvec_row!(similar(x, size(A, 1)), A, x) | |
# respect column-major order | |
function matvec_col!(y, A, x) | |
fill!(y, zero(length(y))) | |
for c in axes(A, 2) | |
for r in axes(A, 1) | |
@inbounds y[r] += A[r, c] * x[c] | |
end | |
end | |
return y | |
end | |
matvec_col(A, x) = matvec_col!(similar(x, size(A, 1)), A, x) | |
function matvec_col_attasks(A, x; ntasks = nthreads()) | |
@tasks for c in axes(A, 2) | |
@set reducer = + | |
@set ntasks = ntasks | |
y_local = zeros(eltype(x), length(x)) | |
for r in axes(A, 1) | |
@inbounds y_local[r] += A[r, c] * x[c] | |
end | |
y_local | |
end | |
end | |
function matvec_col_attasks_chunks(A, x; ntasks = nthreads()) | |
@tasks for c_idcs in chunks(axes(A, 2); n = ntasks) | |
@set chunking = false | |
@set reducer = + | |
y_local = @views matvec_col(A[:, c_idcs], x[c_idcs]) | |
end | |
end | |
function matvec_col_tmapreduce_chunks(A, x; ntasks = nthreads()) | |
tmapreduce(+, chunks(axes(A, 2); n = ntasks); chunking = false) do c_idcs | |
y_local = @views matvec_col(A[:, c_idcs], x[c_idcs]) | |
end | |
end | |
function matvec_row_attasks!(y, A, x; ntasks = nthreads()) | |
@tasks for r in axes(A, 1) | |
@set ntasks = ntasks | |
s = zero(eltype(y)) | |
for c in axes(A, 2) | |
@inbounds s += A[r, c] * x[c] | |
end | |
y[r] = s | |
end | |
y | |
end | |
matvec_row_attasks(A, x; kwargs...) = matvec_row_attasks!(similar(x), A, x; kwargs...) | |
function matvec_row_tmap!(y, A, x; ntasks = nthreads()) | |
tmap!(y, axes(A, 1); ntasks) do r | |
s = zero(eltype(y)) | |
for c in axes(A, 2) | |
@inbounds s += A[r, c] * x[c] | |
end | |
s | |
end | |
end | |
matvec_row_tmap(A, x; kwargs...) = matvec_row_tmap!(similar(x), A, x; kwargs...) | |
function matvec_row_tmapreduce(A, x; ntasks = nthreads()) | |
tmapreduce(vcat, axes(A, 1); ntasks) do r | |
s = zero(eltype(x)) | |
for c in axes(A, 2) | |
@inbounds s += A[r, c] * x[c] | |
end | |
[s] | |
end | |
end | |
let | |
N = 10_000 | |
# N = 1000 | |
x = rand(N) | |
A = rand(N, N) | |
res = A * x | |
@test matvec_row(A, x) ≈ res | |
@test matvec_row_attasks(A, x) ≈ res | |
@test matvec_row_tmap(A, x) ≈ res | |
@test matvec_row_tmapreduce(A, x) ≈ res | |
@test matvec_col(A, x) ≈ res | |
@test matvec_col_attasks(A, x) ≈ res | |
@test matvec_col_attasks_chunks(A, x) ≈ res | |
@test matvec_col_tmapreduce_chunks(A, x) ≈ res | |
@show N | |
@show nthreads() | |
println("matvec_row") | |
@btime matvec_row($A, $x) samples=32 evals=3 | |
println("matvec_row_attasks") | |
@btime matvec_row_attasks($A, $x) samples=32 evals=3 | |
println("matvec_row_tmap") | |
@btime matvec_row_tmap($A, $x) samples=32 evals=3 | |
println("matvec_row_tmapreduce") | |
@btime matvec_row_tmapreduce($A, $x) samples=32 evals=3 | |
println("matvec_col") | |
@btime matvec_col($A, $x) samples=32 evals=3 | |
println("matvec_col_attasks") | |
@btime matvec_col_attasks($A, $x) samples=32 evals=3 | |
println("matvec_col_attasks_chunks") | |
@btime matvec_col_attasks_chunks($A, $x) samples=32 evals=3 | |
println("matvec_col_tmapreduce_chunks") | |
@btime matvec_col_tmapreduce_chunks($A, $x) samples=32 evals=3 | |
nothing | |
end | |
# N = 10_000 | |
# nthreads() = 10 | |
# matvec_row | |
# 498.986 ms (2 allocations: 78.17 KiB) | |
# matvec_row_attasks | |
# 57.265 ms (70 allocations: 84.06 KiB) | |
# matvec_row_tmap | |
# 53.439 ms (70 allocations: 84.27 KiB) | |
# matvec_row_tmapreduce | |
# 55.067 ms (20075 allocations: 40.57 MiB) | |
# matvec_col | |
# 53.024 ms (2 allocations: 78.17 KiB) | |
# matvec_col_attasks | |
# 108.179 ms (40066 allocations: 1.49 GiB) | |
# matvec_col_attasks_chunks | |
# 19.793 ms (109 allocations: 1.46 MiB) | |
# matvec_col_tmapreduce_chunks | |
# 19.601 ms (109 allocations: 1.46 MiB) | |
# N = 1000 | |
# nthreads() = 10 | |
# matvec_row | |
# 809.858 μs (1 allocation: 7.94 KiB) | |
# matvec_row_attasks | |
# 112.115 μs (69 allocations: 13.83 KiB) | |
# matvec_row_tmap | |
# 112.198 μs (69 allocations: 14.03 KiB) | |
# matvec_row_tmapreduce | |
# 153.875 μs (2067 allocations: 577.92 KiB) | |
# matvec_col | |
# 115.778 μs (1 allocation: 7.94 KiB) | |
# matvec_col_attasks | |
# 776.681 μs (2067 allocations: 15.50 MiB) | |
# matvec_col_attasks_chunks | |
# 36.344 μs (90 allocations: 156.91 KiB) | |
# matvec_col_tmapreduce_chunks | |
# 36.967 μs (90 allocations: 156.91 KiB) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment