Skip to content

Instantly share code, notes, and snippets.

@carstenbauer
Last active March 20, 2024 10:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save carstenbauer/7f8147c5b0a469ef1f86904fd77cc5a9 to your computer and use it in GitHub Desktop.
Save carstenbauer/7f8147c5b0a469ef1f86904fd77cc5a9 to your computer and use it in GitHub Desktop.
parallel matrix-vector multiplication
using Test: @test
using Base.Threads: nthreads
using BenchmarkTools: @btime
using OhMyThreads: @tasks, tmap!, tmapreduce, chunks
using ThreadPinning
pinthreads(:cores)
# naive, math implementation
function matvec_row!(y, A, x)
fill!(y, zero(length(y)))
for r in axes(A, 1)
for c in axes(A, 2)
@inbounds y[r] += A[r, c] * x[c]
end
end
return y
end
matvec_row(A, x) = matvec_row!(similar(x, size(A, 1)), A, x)
# respect column-major order
function matvec_col!(y, A, x)
fill!(y, zero(length(y)))
for c in axes(A, 2)
for r in axes(A, 1)
@inbounds y[r] += A[r, c] * x[c]
end
end
return y
end
matvec_col(A, x) = matvec_col!(similar(x, size(A, 1)), A, x)
function matvec_col_attasks(A, x; ntasks = nthreads())
@tasks for c in axes(A, 2)
@set reducer = +
@set ntasks = ntasks
y_local = zeros(eltype(x), length(x))
for r in axes(A, 1)
@inbounds y_local[r] += A[r, c] * x[c]
end
y_local
end
end
function matvec_col_attasks_chunks(A, x; ntasks = nthreads())
@tasks for c_idcs in chunks(axes(A, 2); n = ntasks)
@set chunking = false
@set reducer = +
y_local = @views matvec_col(A[:, c_idcs], x[c_idcs])
end
end
function matvec_col_tmapreduce_chunks(A, x; ntasks = nthreads())
tmapreduce(+, chunks(axes(A, 2); n = ntasks); chunking = false) do c_idcs
y_local = @views matvec_col(A[:, c_idcs], x[c_idcs])
end
end
function matvec_row_attasks!(y, A, x; ntasks = nthreads())
@tasks for r in axes(A, 1)
@set ntasks = ntasks
s = zero(eltype(y))
for c in axes(A, 2)
@inbounds s += A[r, c] * x[c]
end
y[r] = s
end
y
end
matvec_row_attasks(A, x; kwargs...) = matvec_row_attasks!(similar(x), A, x; kwargs...)
function matvec_row_tmap!(y, A, x; ntasks = nthreads())
tmap!(y, axes(A, 1); ntasks) do r
s = zero(eltype(y))
for c in axes(A, 2)
@inbounds s += A[r, c] * x[c]
end
s
end
end
matvec_row_tmap(A, x; kwargs...) = matvec_row_tmap!(similar(x), A, x; kwargs...)
function matvec_row_tmapreduce(A, x; ntasks = nthreads())
tmapreduce(vcat, axes(A, 1); ntasks) do r
s = zero(eltype(x))
for c in axes(A, 2)
@inbounds s += A[r, c] * x[c]
end
[s]
end
end
let
N = 10_000
# N = 1000
x = rand(N)
A = rand(N, N)
res = A * x
@test matvec_row(A, x) ≈ res
@test matvec_row_attasks(A, x) ≈ res
@test matvec_row_tmap(A, x) ≈ res
@test matvec_row_tmapreduce(A, x) ≈ res
@test matvec_col(A, x) ≈ res
@test matvec_col_attasks(A, x) ≈ res
@test matvec_col_attasks_chunks(A, x) ≈ res
@test matvec_col_tmapreduce_chunks(A, x) ≈ res
@show N
@show nthreads()
println("matvec_row")
@btime matvec_row($A, $x) samples=32 evals=3
println("matvec_row_attasks")
@btime matvec_row_attasks($A, $x) samples=32 evals=3
println("matvec_row_tmap")
@btime matvec_row_tmap($A, $x) samples=32 evals=3
println("matvec_row_tmapreduce")
@btime matvec_row_tmapreduce($A, $x) samples=32 evals=3
println("matvec_col")
@btime matvec_col($A, $x) samples=32 evals=3
println("matvec_col_attasks")
@btime matvec_col_attasks($A, $x) samples=32 evals=3
println("matvec_col_attasks_chunks")
@btime matvec_col_attasks_chunks($A, $x) samples=32 evals=3
println("matvec_col_tmapreduce_chunks")
@btime matvec_col_tmapreduce_chunks($A, $x) samples=32 evals=3
nothing
end
# N = 10_000
# nthreads() = 10
# matvec_row
# 498.986 ms (2 allocations: 78.17 KiB)
# matvec_row_attasks
# 57.265 ms (70 allocations: 84.06 KiB)
# matvec_row_tmap
# 53.439 ms (70 allocations: 84.27 KiB)
# matvec_row_tmapreduce
# 55.067 ms (20075 allocations: 40.57 MiB)
# matvec_col
# 53.024 ms (2 allocations: 78.17 KiB)
# matvec_col_attasks
# 108.179 ms (40066 allocations: 1.49 GiB)
# matvec_col_attasks_chunks
# 19.793 ms (109 allocations: 1.46 MiB)
# matvec_col_tmapreduce_chunks
# 19.601 ms (109 allocations: 1.46 MiB)
# N = 1000
# nthreads() = 10
# matvec_row
# 809.858 μs (1 allocation: 7.94 KiB)
# matvec_row_attasks
# 112.115 μs (69 allocations: 13.83 KiB)
# matvec_row_tmap
# 112.198 μs (69 allocations: 14.03 KiB)
# matvec_row_tmapreduce
# 153.875 μs (2067 allocations: 577.92 KiB)
# matvec_col
# 115.778 μs (1 allocation: 7.94 KiB)
# matvec_col_attasks
# 776.681 μs (2067 allocations: 15.50 MiB)
# matvec_col_attasks_chunks
# 36.344 μs (90 allocations: 156.91 KiB)
# matvec_col_tmapreduce_chunks
# 36.967 μs (90 allocations: 156.91 KiB)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment